linux-brain/fs/btrfs/super.c
Filipe Manana d5f996bea4 btrfs: fix transaction leak and crash after RO remount caused by qgroup rescan
[ Upstream commit cb13eea3b49055bd78e6ddf39defd6340f7379fc ]

If we remount a filesystem in RO mode while the qgroup rescan worker is
running, we can end up having it still running after the remount is done,
and at unmount time we may end up with an open transaction that ends up
never getting committed. If that happens we end up with several memory
leaks and can crash when hardware acceleration is unavailable for crc32c.
Possibly it can lead to other nasty surprises too, due to use-after-free
issues.

The following steps explain how the problem happens.

1) We have a filesystem mounted in RW mode and the qgroup rescan worker is
   running;

2) We remount the filesystem in RO mode, and never stop/pause the rescan
   worker, so after the remount the rescan worker is still running. The
   important detail here is that the rescan task is still running after
   the remount operation committed any ongoing transaction through its
   call to btrfs_commit_super();

3) The rescan is still running, and after the remount completed, the
   rescan worker started a transaction, after it finished iterating all
   leaves of the extent tree, to update the qgroup status item in the
   quotas tree. It does not commit the transaction, it only releases its
   handle on the transaction;

4) A filesystem unmount operation starts shortly after;

5) The unmount task, at close_ctree(), stops the transaction kthread,
   which had not had a chance to commit the open transaction since it was
   sleeping and the commit interval (default of 30 seconds) has not yet
   elapsed since the last time it committed a transaction;

6) So after stopping the transaction kthread we still have the transaction
   used to update the qgroup status item open. At close_ctree(), when the
   filesystem is in RO mode and no transaction abort happened (or the
   filesystem is in error mode), we do not expect to have any transaction
   open, so we do not call btrfs_commit_super();

7) We then proceed to destroy the work queues, free the roots and block
   groups, etc. After that we drop the last reference on the btree inode
   by calling iput() on it. Since there are dirty pages for the btree
   inode, corresponding to the COWed extent buffer for the quotas btree,
   btree_write_cache_pages() is invoked to flush those dirty pages. This
   results in creating a bio and submitting it, which makes us end up at
   btrfs_submit_metadata_bio();

8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch
   that calls btrfs_wq_submit_bio(), because check_async_write() returned
   a value of 1. This value of 1 is because we did not have hardware
   acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not
   set in fs_info->flags;

9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the
   workqueue at fs_info->workers, which was already freed before by the
   call to btrfs_stop_all_workers() at close_ctree(). This results in an
   invalid memory access due to a use-after-free, leading to a crash.

When this happens, before the crash there are several warnings triggered,
since we have reserved metadata space in a block group, the delayed refs
reservation, etc:

  ------------[ cut here ]------------
  WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs]
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  CPU: 4 PID: 1729896 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs]
  Code: f0 01 00 00 48 39 c2 75 (...)
  RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
  RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8
  RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800
  RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
  R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100
  FS:  00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   btrfs_free_block_groups+0x17f/0x2f0 [btrfs]
   close_ctree+0x2ba/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f15ee221ee7
  Code: ff 0b 00 f7 d8 64 89 01 48 (...)
  RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
  RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
  R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
  irq event stamp: 0
  hardirqs last  enabled at (0): [<0000000000000000>] 0x0
  hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last  enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  ---[ end trace dd74718fef1ed5c6 ]---
  ------------[ cut here ]------------
  WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  CPU: 2 PID: 1729896 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
  Code: 48 83 bb b0 03 00 00 00 (...)
  RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
  RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff
  RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
  R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
  FS:  00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   btrfs_free_block_groups+0x24c/0x2f0 [btrfs]
   close_ctree+0x2ba/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f15ee221ee7
  Code: ff 0b 00 f7 d8 64 89 01 (...)
  RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
  RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
  R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
  irq event stamp: 0
  hardirqs last  enabled at (0): [<0000000000000000>] 0x0
  hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last  enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  ---[ end trace dd74718fef1ed5c7 ]---
  ------------[ cut here ]------------
  WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  CPU: 5 PID: 1729896 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
  Code: ad de 49 be 22 01 00 (...)
  RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206
  RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246
  RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00
  R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
  FS:  00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   close_ctree+0x2ba/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f15ee221ee7
  Code: ff 0b 00 f7 d8 64 89 (...)
  RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
  RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
  R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
  irq event stamp: 0
  hardirqs last  enabled at (0): [<0000000000000000>] 0x0
  hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last  enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  ---[ end trace dd74718fef1ed5c8 ]---
  BTRFS info (device sdc): space_info 4 has 268238848 free, is not full
  BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536
  BTRFS info (device sdc): global_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0

And the crash, which only happens when we do not have crc32c hardware
acceleration, produces the following trace immediately after those
warnings:

  stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
  CPU: 2 PID: 1749129 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs]
  Code: 54 55 53 48 89 f3 (...)
  RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282
  RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0
  RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8
  R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000
  FS:  00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   btrfs_wq_submit_bio+0xb3/0xd0 [btrfs]
   btrfs_submit_metadata_bio+0x44/0xc0 [btrfs]
   submit_one_bio+0x61/0x70 [btrfs]
   btree_write_cache_pages+0x414/0x450 [btrfs]
   ? kobject_put+0x9a/0x1d0
   ? trace_hardirqs_on+0x1b/0xf0
   ? _raw_spin_unlock_irqrestore+0x3c/0x60
   ? free_debug_processing+0x1e1/0x2b0
   do_writepages+0x43/0xe0
   ? lock_acquired+0x199/0x490
   __writeback_single_inode+0x59/0x650
   writeback_single_inode+0xaf/0x120
   write_inode_now+0x94/0xd0
   iput+0x187/0x2b0
   close_ctree+0x2c6/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f3cfebabee7
  Code: ff 0b 00 f7 d8 64 89 01 (...)
  RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000
  RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0
  R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  ---[ end trace dd74718fef1ed5cc ]---

Finally when we remove the btrfs module (rmmod btrfs), there are several
warnings about objects that were allocated from our slabs but were never
freed, consequence of the transaction that was never committed and got
leaked:

  =============================================================================
  BUG btrfs_delayed_ref_head (Tainted: G    B   W        ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown()
  -----------------------------------------------------------------------------

  INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200
  CPU: 5 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   slab_err+0xb7/0xdc
   ? lock_acquired+0x199/0x490
   __kmem_cache_shutdown+0x1ac/0x3c0
   ? lock_release+0x20e/0x4c0
   kmem_cache_destroy+0x55/0x120
   btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
   exit_btrfs_fs+0xa/0x59 [btrfs]
   __x64_sys_delete_module+0x194/0x260
   ? fpregs_assert_state_consistent+0x1e/0x40
   ? exit_to_user_mode_prepare+0x55/0x1c0
   ? trace_hardirqs_on+0x1b/0xf0
   do_syscall_64+0x33/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f693e305897
  Code: 73 01 c3 48 8b 0d f9 f5 (...)
  RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
  RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
  RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
  RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
  R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
  R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
  INFO: Object 0x0000000050cbdd61 @offset=12104
  INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873
	__slab_alloc.isra.0+0x109/0x1c0
	kmem_cache_alloc+0x7bb/0x830
	btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
	btrfs_free_tree_block+0x128/0x360 [btrfs]
	__btrfs_cow_block+0x489/0x5f0 [btrfs]
	btrfs_cow_block+0xf7/0x220 [btrfs]
	btrfs_search_slot+0x62a/0xc40 [btrfs]
	btrfs_del_orphan_item+0x65/0xd0 [btrfs]
	btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
	open_ctree+0x125a/0x18a0 [btrfs]
	btrfs_mount_root.cold+0x13/0xed [btrfs]
	legacy_get_tree+0x30/0x60
	vfs_get_tree+0x28/0xe0
	fc_mount+0xe/0x40
	vfs_kern_mount.part.0+0x71/0x90
	btrfs_mount+0x13b/0x3e0 [btrfs]
  INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526
	kmem_cache_free+0x34c/0x3c0
	__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
	btrfs_run_delayed_refs+0x81/0x210 [btrfs]
	commit_cowonly_roots+0xfb/0x300 [btrfs]
	btrfs_commit_transaction+0x367/0xc40 [btrfs]
	sync_filesystem+0x74/0x90
	generic_shutdown_super+0x22/0x100
	kill_anon_super+0x14/0x30
	btrfs_kill_super+0x12/0x20 [btrfs]
	deactivate_locked_super+0x31/0x70
	cleanup_mnt+0x100/0x160
	task_work_run+0x68/0xb0
	exit_to_user_mode_prepare+0x1bb/0x1c0
	syscall_exit_to_user_mode+0x4b/0x260
	entry_SYSCALL_64_after_hwframe+0x44/0xa9
  INFO: Object 0x0000000086e9b0ff @offset=12776
  INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873
	__slab_alloc.isra.0+0x109/0x1c0
	kmem_cache_alloc+0x7bb/0x830
	btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
	btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
	alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
	__btrfs_cow_block+0x12d/0x5f0 [btrfs]
	btrfs_cow_block+0xf7/0x220 [btrfs]
	btrfs_search_slot+0x62a/0xc40 [btrfs]
	btrfs_del_orphan_item+0x65/0xd0 [btrfs]
	btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
	open_ctree+0x125a/0x18a0 [btrfs]
	btrfs_mount_root.cold+0x13/0xed [btrfs]
	legacy_get_tree+0x30/0x60
	vfs_get_tree+0x28/0xe0
	fc_mount+0xe/0x40
	vfs_kern_mount.part.0+0x71/0x90
  INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803
	kmem_cache_free+0x34c/0x3c0
	__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
	btrfs_run_delayed_refs+0x81/0x210 [btrfs]
	btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs]
	commit_cowonly_roots+0x248/0x300 [btrfs]
	btrfs_commit_transaction+0x367/0xc40 [btrfs]
	close_ctree+0x113/0x2fa [btrfs]
	generic_shutdown_super+0x6c/0x100
	kill_anon_super+0x14/0x30
	btrfs_kill_super+0x12/0x20 [btrfs]
	deactivate_locked_super+0x31/0x70
	cleanup_mnt+0x100/0x160
	task_work_run+0x68/0xb0
	exit_to_user_mode_prepare+0x1bb/0x1c0
	syscall_exit_to_user_mode+0x4b/0x260
	entry_SYSCALL_64_after_hwframe+0x44/0xa9
  kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects
  CPU: 5 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   kmem_cache_destroy+0x119/0x120
   btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
   exit_btrfs_fs+0xa/0x59 [btrfs]
   __x64_sys_delete_module+0x194/0x260
   ? fpregs_assert_state_consistent+0x1e/0x40
   ? exit_to_user_mode_prepare+0x55/0x1c0
   ? trace_hardirqs_on+0x1b/0xf0
   do_syscall_64+0x33/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f693e305897
  Code: 73 01 c3 48 8b 0d f9 f5 0b (...)
  RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
  RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
  RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
  RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
  R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
  R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
  =============================================================================
  BUG btrfs_delayed_tree_ref (Tainted: G    B   W        ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown()
  -----------------------------------------------------------------------------

  INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200
  CPU: 3 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   slab_err+0xb7/0xdc
   ? lock_acquired+0x199/0x490
   __kmem_cache_shutdown+0x1ac/0x3c0
   ? lock_release+0x20e/0x4c0
   kmem_cache_destroy+0x55/0x120
   btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
   exit_btrfs_fs+0xa/0x59 [btrfs]
   __x64_sys_delete_module+0x194/0x260
   ? fpregs_assert_state_consistent+0x1e/0x40
   ? exit_to_user_mode_prepare+0x55/0x1c0
   ? trace_hardirqs_on+0x1b/0xf0
   do_syscall_64+0x33/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f693e305897
  Code: 73 01 c3 48 8b 0d f9 f5 (...)
  RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
  RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
  RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
  RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
  R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
  R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
  INFO: Object 0x000000001a340018 @offset=4408
  INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873
	__slab_alloc.isra.0+0x109/0x1c0
	kmem_cache_alloc+0x7bb/0x830
	btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
	btrfs_free_tree_block+0x128/0x360 [btrfs]
	__btrfs_cow_block+0x489/0x5f0 [btrfs]
	btrfs_cow_block+0xf7/0x220 [btrfs]
	btrfs_search_slot+0x62a/0xc40 [btrfs]
	btrfs_del_orphan_item+0x65/0xd0 [btrfs]
	btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
	open_ctree+0x125a/0x18a0 [btrfs]
	btrfs_mount_root.cold+0x13/0xed [btrfs]
	legacy_get_tree+0x30/0x60
	vfs_get_tree+0x28/0xe0
	fc_mount+0xe/0x40
	vfs_kern_mount.part.0+0x71/0x90
	btrfs_mount+0x13b/0x3e0 [btrfs]
  INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795
	kmem_cache_free+0x34c/0x3c0
	__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
	btrfs_run_delayed_refs+0x81/0x210 [btrfs]
	btrfs_commit_transaction+0x60/0xc40 [btrfs]
	create_subvol+0x56a/0x990 [btrfs]
	btrfs_mksubvol+0x3fb/0x4a0 [btrfs]
	__btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs]
	btrfs_ioctl_snap_create+0x58/0x80 [btrfs]
	btrfs_ioctl+0x1a92/0x36f0 [btrfs]
	__x64_sys_ioctl+0x83/0xb0
	do_syscall_64+0x33/0x80
	entry_SYSCALL_64_after_hwframe+0x44/0xa9
  INFO: Object 0x000000002b46292a @offset=13648
  INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873
	__slab_alloc.isra.0+0x109/0x1c0
	kmem_cache_alloc+0x7bb/0x830
	btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
	btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
	alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
	__btrfs_cow_block+0x12d/0x5f0 [btrfs]
	btrfs_cow_block+0xf7/0x220 [btrfs]
	btrfs_search_slot+0x62a/0xc40 [btrfs]
	btrfs_del_orphan_item+0x65/0xd0 [btrfs]
	btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
	open_ctree+0x125a/0x18a0 [btrfs]
	btrfs_mount_root.cold+0x13/0xed [btrfs]
	legacy_get_tree+0x30/0x60
	vfs_get_tree+0x28/0xe0
	fc_mount+0xe/0x40
	vfs_kern_mount.part.0+0x71/0x90
  INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803
	kmem_cache_free+0x34c/0x3c0
	__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
	btrfs_run_delayed_refs+0x81/0x210 [btrfs]
	commit_cowonly_roots+0xfb/0x300 [btrfs]
	btrfs_commit_transaction+0x367/0xc40 [btrfs]
	close_ctree+0x113/0x2fa [btrfs]
	generic_shutdown_super+0x6c/0x100
	kill_anon_super+0x14/0x30
	btrfs_kill_super+0x12/0x20 [btrfs]
	deactivate_locked_super+0x31/0x70
	cleanup_mnt+0x100/0x160
	task_work_run+0x68/0xb0
	exit_to_user_mode_prepare+0x1bb/0x1c0
	syscall_exit_to_user_mode+0x4b/0x260
	entry_SYSCALL_64_after_hwframe+0x44/0xa9
  kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects
  CPU: 5 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   kmem_cache_destroy+0x119/0x120
   btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
   exit_btrfs_fs+0xa/0x59 [btrfs]
   __x64_sys_delete_module+0x194/0x260
   ? fpregs_assert_state_consistent+0x1e/0x40
   ? exit_to_user_mode_prepare+0x55/0x1c0
   ? trace_hardirqs_on+0x1b/0xf0
   do_syscall_64+0x33/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f693e305897
  Code: 73 01 c3 48 8b 0d f9 f5 (...)
  RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
  RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
  RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
  RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
  R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
  R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
  =============================================================================
  BUG btrfs_delayed_extent_op (Tainted: G    B   W        ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown()
  -----------------------------------------------------------------------------

  INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200
  CPU: 5 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   slab_err+0xb7/0xdc
   ? lock_acquired+0x199/0x490
   __kmem_cache_shutdown+0x1ac/0x3c0
   ? __mutex_unlock_slowpath+0x45/0x2a0
   kmem_cache_destroy+0x55/0x120
   exit_btrfs_fs+0xa/0x59 [btrfs]
   __x64_sys_delete_module+0x194/0x260
   ? fpregs_assert_state_consistent+0x1e/0x40
   ? exit_to_user_mode_prepare+0x55/0x1c0
   ? trace_hardirqs_on+0x1b/0xf0
   do_syscall_64+0x33/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f693e305897
  Code: 73 01 c3 48 8b 0d f9 f5 (...)
  RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
  RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
  RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
  RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
  R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
  R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
  INFO: Object 0x000000004cf95ea8 @offset=6264
  INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873
	__slab_alloc.isra.0+0x109/0x1c0
	kmem_cache_alloc+0x7bb/0x830
	btrfs_alloc_tree_block+0x1e0/0x360 [btrfs]
	alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
	__btrfs_cow_block+0x12d/0x5f0 [btrfs]
	btrfs_cow_block+0xf7/0x220 [btrfs]
	btrfs_search_slot+0x62a/0xc40 [btrfs]
	btrfs_del_orphan_item+0x65/0xd0 [btrfs]
	btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
	open_ctree+0x125a/0x18a0 [btrfs]
	btrfs_mount_root.cold+0x13/0xed [btrfs]
	legacy_get_tree+0x30/0x60
	vfs_get_tree+0x28/0xe0
	fc_mount+0xe/0x40
	vfs_kern_mount.part.0+0x71/0x90
	btrfs_mount+0x13b/0x3e0 [btrfs]
  INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803
	kmem_cache_free+0x34c/0x3c0
	__btrfs_run_delayed_refs+0xabd/0x1290 [btrfs]
	btrfs_run_delayed_refs+0x81/0x210 [btrfs]
	commit_cowonly_roots+0xfb/0x300 [btrfs]
	btrfs_commit_transaction+0x367/0xc40 [btrfs]
	close_ctree+0x113/0x2fa [btrfs]
	generic_shutdown_super+0x6c/0x100
	kill_anon_super+0x14/0x30
	btrfs_kill_super+0x12/0x20 [btrfs]
	deactivate_locked_super+0x31/0x70
	cleanup_mnt+0x100/0x160
	task_work_run+0x68/0xb0
	exit_to_user_mode_prepare+0x1bb/0x1c0
	syscall_exit_to_user_mode+0x4b/0x260
	entry_SYSCALL_64_after_hwframe+0x44/0xa9
  kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects
  CPU: 3 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   kmem_cache_destroy+0x119/0x120
   exit_btrfs_fs+0xa/0x59 [btrfs]
   __x64_sys_delete_module+0x194/0x260
   ? fpregs_assert_state_consistent+0x1e/0x40
   ? exit_to_user_mode_prepare+0x55/0x1c0
   ? trace_hardirqs_on+0x1b/0xf0
   do_syscall_64+0x33/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f693e305897
  Code: 73 01 c3 48 8b 0d f9 (...)
  RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
  RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
  RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
  RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
  R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
  R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
  BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1

Fix this issue by having the remount path stop the qgroup rescan worker
when we are remounting RO and teach the rescan worker to stop when a
remount is in progress. If later a remount in RW mode happens, we are
already resuming the qgroup rescan worker through the call to
btrfs_qgroup_rescan_resume(), so we do not need to worry about that.

Tested-by: Fabian Vogt <fvogt@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2021-01-19 18:26:14 +01:00

2487 lines
66 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/parser.h>
#include <linux/ctype.h>
#include <linux/namei.h>
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "props.h"
#include "xattr.h"
#include "volumes.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
#include "space-info.h"
#include "sysfs.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "qgroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
/*
* Types for mounting the default subvolume and a subvolume explicitly
* requested by subvol=/path. That way the callchain is straightforward and we
* don't have to play tricks with the mount options and recursive calls to
* btrfs_mount.
*
* The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
*/
static struct file_system_type btrfs_fs_type;
static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
switch (errno) {
case -EIO:
errstr = "IO failure";
break;
case -ENOMEM:
errstr = "Out of memory";
break;
case -EROFS:
errstr = "Readonly filesystem";
break;
case -EEXIST:
errstr = "Object already exists";
break;
case -ENOSPC:
errstr = "No space left";
break;
case -ENOENT:
errstr = "No such entry";
break;
}
return errstr;
}
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
* invokes the appropriate error response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
const char *errstr;
#endif
/*
* Special case: if the error is EROFS, and we're already
* under SB_RDONLY, then it is safe here.
*/
if (errno == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
if (fmt) {
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
sb->s_id, function, line, errno, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
sb->s_id, function, line, errno, errstr);
}
#endif
/*
* Today we only save the error info to memory. Long term we'll
* also send it down to the disk
*/
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/* Don't go through full error handling during mount */
if (!(sb->s_flags & SB_BORN))
return;
if (sb_rdonly(sb))
return;
/* btrfs handle error by forcing the filesystem readonly */
sb->s_flags |= SB_RDONLY;
btrfs_info(fs_info, "forced readonly");
/*
* Note that a running device replace operation is not canceled here
* although there is no way to update the progress. It would add the
* risk of a deadlock, therefore the canceling is omitted. The only
* penalty is that some I/O remains active until the procedure
* completes. The next time when the filesystem is mounted writable
* again, the device replace operation continues.
*/
}
#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
"critical",
"error",
"warning",
"notice",
"info",
"debug",
};
/*
* Use one ratelimit state per log level so that a flood of less important
* messages doesn't cause more important ones to be dropped.
*/
static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
int kern_level;
const char *type = logtypes[4];
struct ratelimit_state *ratelimit = &printk_limits[4];
va_start(args, fmt);
while ((kern_level = printk_get_level(fmt)) != 0) {
size_t size = printk_skip_level(fmt) - fmt;
if (kern_level >= '0' && kern_level <= '7') {
memcpy(lvl, fmt, size);
lvl[size] = '\0';
type = logtypes[kern_level - '0'];
ratelimit = &printk_limits[kern_level - '0'];
}
fmt += size;
}
vaf.fmt = fmt;
vaf.va = &args;
if (__ratelimit(ratelimit))
printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
fs_info ? fs_info->sb->s_id : "<unknown>", &vaf);
va_end(args);
}
#endif
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
* one.
*
* This means that error recovery at the call site is limited to freeing
* any local memory allocations and passing the error code up without
* further cleanup. The transaction should complete as it normally would
* in the call path but will return -EIO.
*
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
unsigned int line, int errno)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
if (!trans->dirty && list_empty(&trans->new_bgs)) {
const char *errstr;
errstr = btrfs_decode_error(errno);
btrfs_warn(fs_info,
"%s:%d: Aborting unused transaction(%s).",
function, line, errstr);
return;
}
WRITE_ONCE(trans->transaction->aborted, errno);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
__btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
}
/*
* __btrfs_panic decodes unexpected, fatal errors from the caller,
* issues an alert, and either panics or BUGs, depending on mount options.
*/
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
struct va_format vaf = { .fmt = fmt };
va_list args;
if (fs_info)
s_id = fs_info->sb->s_id;
va_start(args, fmt);
vaf.va = &args;
errstr = btrfs_decode_error(errno);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
s_id, function, line, &vaf, errno, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
function, line, &vaf, errno, errstr);
va_end(args);
/* Caller calls BUG() */
}
static void btrfs_put_super(struct super_block *sb)
{
close_ctree(btrfs_sb(sb));
}
enum {
Opt_acl, Opt_noacl,
Opt_clear_cache,
Opt_commit_interval,
Opt_compress,
Opt_compress_force,
Opt_compress_force_type,
Opt_compress_type,
Opt_degraded,
Opt_device,
Opt_fatal_errors,
Opt_flushoncommit, Opt_noflushoncommit,
Opt_inode_cache, Opt_noinode_cache,
Opt_max_inline,
Opt_barrier, Opt_nobarrier,
Opt_datacow, Opt_nodatacow,
Opt_datasum, Opt_nodatasum,
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
Opt_nologreplay,
Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
Opt_space_cache, Opt_no_space_cache,
Opt_space_cache_version,
Opt_ssd, Opt_nossd,
Opt_ssd_spread, Opt_nossd_spread,
Opt_subvol,
Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
Opt_usebackuproot,
Opt_user_subvol_rm_allowed,
/* Deprecated options */
Opt_alloc_start,
Opt_recovery,
Opt_subvolrootid,
/* Debugging options */
Opt_check_integrity,
Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask,
Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
#endif
Opt_err,
};
static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_clear_cache, "clear_cache"},
{Opt_commit_interval, "commit=%u"},
{Opt_compress, "compress"},
{Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
{Opt_compress_force_type, "compress-force=%s"},
{Opt_degraded, "degraded"},
{Opt_device, "device=%s"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_noflushoncommit, "noflushoncommit"},
{Opt_inode_cache, "inode_cache"},
{Opt_noinode_cache, "noinode_cache"},
{Opt_max_inline, "max_inline=%s"},
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_datacow, "datacow"},
{Opt_nodatacow, "nodatacow"},
{Opt_datasum, "datasum"},
{Opt_nodatasum, "nodatasum"},
{Opt_defrag, "autodefrag"},
{Opt_nodefrag, "noautodefrag"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_nologreplay, "nologreplay"},
{Opt_norecovery, "norecovery"},
{Opt_ratio, "metadata_ratio=%u"},
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_skip_balance, "skip_balance"},
{Opt_space_cache, "space_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_space_cache_version, "space_cache=%s"},
{Opt_ssd, "ssd"},
{Opt_nossd, "nossd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd_spread, "nossd_spread"},
{Opt_subvol, "subvol=%s"},
{Opt_subvol_empty, "subvol="},
{Opt_subvolid, "subvolid=%s"},
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
{Opt_notreelog, "notreelog"},
{Opt_usebackuproot, "usebackuproot"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
/* Deprecated options */
{Opt_alloc_start, "alloc_start=%s"},
{Opt_recovery, "recovery"},
{Opt_subvolrootid, "subvolrootid=%d"},
/* Debugging options */
{Opt_check_integrity, "check_int"},
{Opt_check_integrity_including_extent_data, "check_int_data"},
{Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
{Opt_enospc_debug, "enospc_debug"},
{Opt_noenospc_debug, "noenospc_debug"},
#ifdef CONFIG_BTRFS_DEBUG
{Opt_fragment_data, "fragment=data"},
{Opt_fragment_metadata, "fragment=metadata"},
{Opt_fragment_all, "fragment=all"},
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
{Opt_ref_verify, "ref_verify"},
#endif
{Opt_err, NULL},
};
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
* XXX JDM: This needs to be cleaned up for remount.
*/
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags)
{
substring_t args[MAX_OPT_ARGS];
char *p, *num;
u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
enum btrfs_compression_type saved_compress_type;
int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
cache_gen = btrfs_super_cache_generation(info->super_copy);
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
/*
* Even the options are empty, we still need to do extra check
* against new flags
*/
if (!options)
goto check;
while ((p = strsep(&options, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args);
switch (token) {
case Opt_degraded:
btrfs_info(info, "allowing degraded mounts");
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
case Opt_subvol_empty:
case Opt_subvolid:
case Opt_subvolrootid:
case Opt_device:
/*
* These are parsed by btrfs_parse_subvol_options or
* btrfs_parse_device_options and can be ignored here.
*/
break;
case Opt_nodatasum:
btrfs_set_and_info(info, NODATASUM,
"setting nodatasum");
break;
case Opt_datasum:
if (btrfs_test_opt(info, NODATASUM)) {
if (btrfs_test_opt(info, NODATACOW))
btrfs_info(info,
"setting datasum, datacow enabled");
else
btrfs_info(info, "setting datasum");
}
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
break;
case Opt_nodatacow:
if (!btrfs_test_opt(info, NODATACOW)) {
if (!btrfs_test_opt(info, COMPRESS) ||
!btrfs_test_opt(info, FORCE_COMPRESS)) {
btrfs_info(info,
"setting nodatacow, compression disabled");
} else {
btrfs_info(info, "setting nodatacow");
}
}
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
case Opt_datacow:
btrfs_clear_and_info(info, NODATACOW,
"setting datacow");
break;
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
saved_compress_type = btrfs_test_opt(info,
COMPRESS) ?
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
btrfs_test_opt(info, FORCE_COMPRESS);
saved_compress_level = info->compress_level;
if (token == Opt_compress ||
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
compress_type = "zlib";
info->compress_type = BTRFS_COMPRESS_ZLIB;
info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
/*
* args[0] contains uninitialized data since
* for these tokens we don't expect any
* parameter.
*/
if (token != Opt_compress &&
token != Opt_compress_force)
info->compress_level =
btrfs_compress_str2level(
BTRFS_COMPRESS_ZLIB,
args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
no_compress = 0;
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
no_compress = 0;
} else if (strncmp(args[0].from, "zstd", 4) == 0) {
compress_type = "zstd";
info->compress_type = BTRFS_COMPRESS_ZSTD;
info->compress_level =
btrfs_compress_str2level(
BTRFS_COMPRESS_ZSTD,
args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
info->compress_level = 0;
info->compress_type = 0;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
no_compress++;
} else {
ret = -EINVAL;
goto out;
}
if (compress_force) {
btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
} else {
/*
* If we remount from compress-force=xxx to
* compress=xxx, we need clear FORCE_COMPRESS
* flag, otherwise, there is no way for users
* to disable forcible compression separately.
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
if (no_compress == 1) {
btrfs_info(info, "use no compression");
} else if ((info->compress_type != saved_compress_type) ||
(compress_force != saved_compress_force) ||
(info->compress_level != saved_compress_level)) {
btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
compress_type, info->compress_level);
}
compress_force = false;
break;
case Opt_ssd:
btrfs_set_and_info(info, SSD,
"enabling ssd optimizations");
btrfs_clear_opt(info->mount_opt, NOSSD);
break;
case Opt_ssd_spread:
btrfs_set_and_info(info, SSD,
"enabling ssd optimizations");
btrfs_set_and_info(info, SSD_SPREAD,
"using spread ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, NOSSD);
break;
case Opt_nossd:
btrfs_set_opt(info->mount_opt, NOSSD);
btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
/* Fallthrough */
case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
break;
case Opt_barrier:
btrfs_clear_and_info(info, NOBARRIER,
"turning on barriers");
break;
case Opt_nobarrier:
btrfs_set_and_info(info, NOBARRIER,
"turning off barriers");
break;
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
goto out;
} else if (intarg == 0) {
ret = -EINVAL;
goto out;
}
info->thread_pool_size = intarg;
break;
case Opt_max_inline:
num = match_strdup(&args[0]);
if (num) {
info->max_inline = memparse(num, NULL);
kfree(num);
if (info->max_inline) {
info->max_inline = min_t(u64,
info->max_inline,
info->sectorsize);
}
btrfs_info(info, "max_inline at %llu",
info->max_inline);
} else {
ret = -ENOMEM;
goto out;
}
break;
case Opt_alloc_start:
btrfs_info(info,
"option alloc_start is obsolete, ignored");
break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
info->sb->s_flags |= SB_POSIXACL;
break;
#else
btrfs_err(info, "support for ACL not compiled in!");
ret = -EINVAL;
goto out;
#endif
case Opt_noacl:
info->sb->s_flags &= ~SB_POSIXACL;
break;
case Opt_notreelog:
btrfs_set_and_info(info, NOTREELOG,
"disabling tree log");
break;
case Opt_treelog:
btrfs_clear_and_info(info, NOTREELOG,
"enabling tree log");
break;
case Opt_norecovery:
case Opt_nologreplay:
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
case Opt_flushoncommit:
btrfs_set_and_info(info, FLUSHONCOMMIT,
"turning on flush-on-commit");
break;
case Opt_noflushoncommit:
btrfs_clear_and_info(info, FLUSHONCOMMIT,
"turning off flush-on-commit");
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
if (ret)
goto out;
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
break;
case Opt_discard:
btrfs_set_and_info(info, DISCARD,
"turning on discard");
break;
case Opt_nodiscard:
btrfs_clear_and_info(info, DISCARD,
"turning off discard");
break;
case Opt_space_cache:
case Opt_space_cache_version:
if (token == Opt_space_cache ||
strcmp(args[0].from, "v1") == 0) {
btrfs_clear_opt(info->mount_opt,
FREE_SPACE_TREE);
btrfs_set_and_info(info, SPACE_CACHE,
"enabling disk space caching");
} else if (strcmp(args[0].from, "v2") == 0) {
btrfs_clear_opt(info->mount_opt,
SPACE_CACHE);
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
ret = -EINVAL;
goto out;
}
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
if (btrfs_test_opt(info, SPACE_CACHE)) {
btrfs_clear_and_info(info, SPACE_CACHE,
"disabling disk space caching");
}
if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
btrfs_clear_and_info(info, FREE_SPACE_TREE,
"disabling free space tree");
}
break;
case Opt_inode_cache:
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
"enabling inode map caching");
break;
case Opt_noinode_cache:
btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
"disabling inode map caching");
break;
case Opt_clear_cache:
btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
case Opt_user_subvol_rm_allowed:
btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
break;
case Opt_enospc_debug:
btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
break;
case Opt_noenospc_debug:
btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
break;
case Opt_defrag:
btrfs_set_and_info(info, AUTO_DEFRAG,
"enabling auto defrag");
break;
case Opt_nodefrag:
btrfs_clear_and_info(info, AUTO_DEFRAG,
"disabling auto defrag");
break;
case Opt_recovery:
btrfs_warn(info,
"'recovery' is deprecated, use 'usebackuproot' instead");
/* fall through */
case Opt_usebackuproot:
btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
break;
case Opt_skip_balance:
btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
break;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
case Opt_check_integrity_including_extent_data:
btrfs_info(info,
"enabling check integrity including extent data");
btrfs_set_opt(info->mount_opt,
CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
btrfs_info(info, "enabling check integrity");
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
if (ret)
goto out;
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
break;
#else
case Opt_check_integrity_including_extent_data:
case Opt_check_integrity:
case Opt_check_integrity_print_mask:
btrfs_err(info,
"support for check_integrity* not compiled in!");
ret = -EINVAL;
goto out;
#endif
case Opt_fatal_errors:
if (strcmp(args[0].from, "panic") == 0)
btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
else if (strcmp(args[0].from, "bug") == 0)
btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
else {
ret = -EINVAL;
goto out;
}
break;
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
if (ret)
goto out;
if (intarg == 0) {
btrfs_info(info,
"using default commit interval %us",
BTRFS_DEFAULT_COMMIT_INTERVAL);
intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
} else if (intarg > 300) {
btrfs_warn(info, "excessive commit interval %d",
intarg);
}
info->commit_interval = intarg;
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
break;
case Opt_fragment_metadata:
btrfs_info(info, "fragmenting metadata");
btrfs_set_opt(info->mount_opt,
FRAGMENT_METADATA);
break;
case Opt_fragment_data:
btrfs_info(info, "fragmenting data");
btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
case Opt_ref_verify:
btrfs_info(info, "doing ref verification");
btrfs_set_opt(info->mount_opt, REF_VERIFY);
break;
#endif
case Opt_err:
btrfs_info(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
break;
}
}
check:
/*
* Extra check for current option against current flag
*/
if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
btrfs_err(info,
"nologreplay must be used with ro mount option");
ret = -EINVAL;
}
out:
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, CLEAR_CACHE)) {
btrfs_err(info, "cannot disable free space tree");
ret = -EINVAL;
}
if (!ret && btrfs_test_opt(info, SPACE_CACHE))
btrfs_info(info, "disk space caching is enabled");
if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
btrfs_info(info, "using free space tree");
return ret;
}
/*
* Parse mount options that are required early in the mount process.
*
* All other options will be parsed on much later in the mount process and
* only when we need to allocate a new super block.
*/
static int btrfs_parse_device_options(const char *options, fmode_t flags,
void *holder)
{
substring_t args[MAX_OPT_ARGS];
char *device_name, *opts, *orig, *p;
struct btrfs_device *device = NULL;
int error = 0;
lockdep_assert_held(&uuid_mutex);
if (!options)
return 0;
/*
* strsep changes the string, duplicate it because btrfs_parse_options
* gets called later
*/
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
return -ENOMEM;
orig = opts;
while ((p = strsep(&opts, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args);
if (token == Opt_device) {
device_name = match_strdup(&args[0]);
if (!device_name) {
error = -ENOMEM;
goto out;
}
device = btrfs_scan_one_device(device_name, flags,
holder);
kfree(device_name);
if (IS_ERR(device)) {
error = PTR_ERR(device);
goto out;
}
}
}
out:
kfree(orig);
return error;
}
/*
* Parse mount options that are related to subvolume id
*
* The value is later passed to mount_subvol()
*/
static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
u64 *subvol_objectid)
{
substring_t args[MAX_OPT_ARGS];
char *opts, *orig, *p;
int error = 0;
u64 subvolid;
if (!options)
return 0;
/*
* strsep changes the string, duplicate it because
* btrfs_parse_device_options gets called later
*/
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
return -ENOMEM;
orig = opts;
while ((p = strsep(&opts, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args);
switch (token) {
case Opt_subvol:
kfree(*subvol_name);
*subvol_name = match_strdup(&args[0]);
if (!*subvol_name) {
error = -ENOMEM;
goto out;
}
break;
case Opt_subvolid:
error = match_u64(&args[0], &subvolid);
if (error)
goto out;
/* we want the original fs_tree */
if (subvolid == 0)
subvolid = BTRFS_FS_TREE_OBJECTID;
*subvol_objectid = subvolid;
break;
case Opt_subvolrootid:
pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
break;
default:
break;
}
}
out:
kfree(orig);
return error;
}
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *fs_root;
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
struct btrfs_path *path = NULL;
char *name = NULL, *ptr;
u64 dirid;
int len;
int ret;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto err;
}
path->leave_spinning = 1;
name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
goto err;
}
ptr = name + PATH_MAX - 1;
ptr[0] = '\0';
/*
* Walk up the subvolume trees in the tree of tree roots by root
* backrefs until we hit the top-level subvolume.
*/
while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
key.objectid = subvol_objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto err;
} else if (ret > 0) {
ret = btrfs_previous_item(root, path, subvol_objectid,
BTRFS_ROOT_BACKREF_KEY);
if (ret < 0) {
goto err;
} else if (ret > 0) {
ret = -ENOENT;
goto err;
}
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
subvol_objectid = key.offset;
root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_root_ref);
len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
ptr -= len + 1;
if (ptr < name) {
ret = -ENAMETOOLONG;
goto err;
}
read_extent_buffer(path->nodes[0], ptr + 1,
(unsigned long)(root_ref + 1), len);
ptr[0] = '/';
dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
btrfs_release_path(path);
key.objectid = subvol_objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
goto err;
}
/*
* Walk up the filesystem tree by inode refs until we hit the
* root directory.
*/
while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = dirid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
if (ret < 0) {
goto err;
} else if (ret > 0) {
ret = btrfs_previous_item(fs_root, path, dirid,
BTRFS_INODE_REF_KEY);
if (ret < 0) {
goto err;
} else if (ret > 0) {
ret = -ENOENT;
goto err;
}
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
dirid = key.offset;
inode_ref = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(path->nodes[0],
inode_ref);
ptr -= len + 1;
if (ptr < name) {
ret = -ENAMETOOLONG;
goto err;
}
read_extent_buffer(path->nodes[0], ptr + 1,
(unsigned long)(inode_ref + 1), len);
ptr[0] = '/';
btrfs_release_path(path);
}
}
btrfs_free_path(path);
if (ptr == name + PATH_MAX - 1) {
name[0] = '/';
name[1] = '\0';
} else {
memmove(name, ptr, name + PATH_MAX - ptr);
}
return name;
err:
btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
}
static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_dir_item *di;
struct btrfs_path *path;
struct btrfs_key location;
u64 dir_id;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->leave_spinning = 1;
/*
* Find the "default" dir item which points to the root item that we
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
return PTR_ERR(di);
}
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
* it's always been there, but don't freak out, just try and
* mount the top-level subvolume.
*/
btrfs_free_path(path);
*objectid = BTRFS_FS_TREE_OBJECTID;
return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
btrfs_free_path(path);
*objectid = location.objectid;
return 0;
}
static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
void *data)
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_key key;
int err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
#endif
sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
if (err) {
btrfs_err(fs_info, "super_setup_bdi failed");
return err;
}
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
return err;
}
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
}
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
err = -ENOMEM;
goto fail_close;
}
cleancache_init_fs(sb);
sb->s_flags |= SB_ACTIVE;
return 0;
fail_close:
close_ctree(fs_info);
return err;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
trace_btrfs_sync_fs(fs_info, wait);
if (!wait) {
filemap_flush(fs_info->btree_inode->i_mapping);
return 0;
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT) {
/*
* Exit unless we have some pending changes
* that need to go through commit
*/
if (fs_info->pending_changes == 0)
return 0;
/*
* A non-blocking test if the fs is frozen. We must not
* start a new transaction here otherwise a deadlock
* happens. The pending operations are delayed to the
* next commit after thawing.
*/
if (sb_start_write_trylock(sb))
sb_end_write(sb);
else
return 0;
trans = btrfs_start_transaction(root, 0);
}
if (IS_ERR(trans))
return PTR_ERR(trans);
}
return btrfs_commit_transaction(trans);
}
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
if (btrfs_test_opt(info, NODATASUM))
seq_puts(seq, ",nodatasum");
if (btrfs_test_opt(info, NODATACOW))
seq_puts(seq, ",nodatacow");
if (btrfs_test_opt(info, NOBARRIER))
seq_puts(seq, ",nobarrier");
if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
if (btrfs_test_opt(info, COMPRESS)) {
compress_type = btrfs_compress_type2str(info->compress_type);
if (btrfs_test_opt(info, FORCE_COMPRESS))
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
if (info->compress_level)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
seq_puts(seq, ",nossd");
if (btrfs_test_opt(info, SSD_SPREAD))
seq_puts(seq, ",ssd_spread");
else if (btrfs_test_opt(info, SSD))
seq_puts(seq, ",ssd");
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
seq_puts(seq, ",nologreplay");
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD))
seq_puts(seq, ",discard");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
if (btrfs_test_opt(info, SPACE_CACHE))
seq_puts(seq, ",space_cache");
else if (btrfs_test_opt(info, FREE_SPACE_TREE))
seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(info, RESCAN_UUID_TREE))
seq_puts(seq, ",rescan_uuid_tree");
if (btrfs_test_opt(info, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
seq_puts(seq, ",user_subvol_rm_allowed");
if (btrfs_test_opt(info, ENOSPC_DEBUG))
seq_puts(seq, ",enospc_debug");
if (btrfs_test_opt(info, AUTO_DEFRAG))
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(info, INODE_MAP_CACHE))
seq_puts(seq, ",inode_cache");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
seq_puts(seq, ",check_int_data");
else if (btrfs_test_opt(info, CHECK_INTEGRITY))
seq_puts(seq, ",check_int");
if (info->check_integrity_print_mask)
seq_printf(seq, ",check_int_print_mask=%d",
info->check_integrity_print_mask);
#endif
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%u", info->commit_interval);
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
if (btrfs_test_opt(info, FRAGMENT_METADATA))
seq_puts(seq, ",fragment=metadata");
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
subvol_name = btrfs_get_subvol_name_from_objectid(info,
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
if (!IS_ERR(subvol_name)) {
seq_puts(seq, ",subvol=");
seq_escape(seq, subvol_name, " \t\n\\");
kfree(subvol_name);
}
return 0;
}
static int btrfs_test_super(struct super_block *s, void *data)
{
struct btrfs_fs_info *p = data;
struct btrfs_fs_info *fs_info = btrfs_sb(s);
return fs_info->fs_devices == p->fs_devices;
}
static int btrfs_set_super(struct super_block *s, void *data)
{
int err = set_anon_super(s, data);
if (!err)
s->s_fs_info = data;
return err;
}
/*
* subvolumes are identified by ino 256
*/
static inline int is_subvolume_inode(struct inode *inode)
{
if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return 1;
return 0;
}
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
struct vfsmount *mnt)
{
struct dentry *root;
int ret;
if (!subvol_name) {
if (!subvol_objectid) {
ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
&subvol_objectid);
if (ret) {
root = ERR_PTR(ret);
goto out;
}
}
subvol_name = btrfs_get_subvol_name_from_objectid(
btrfs_sb(mnt->mnt_sb), subvol_objectid);
if (IS_ERR(subvol_name)) {
root = ERR_CAST(subvol_name);
subvol_name = NULL;
goto out;
}
}
root = mount_subtree(mnt, subvol_name);
/* mount_subtree() drops our reference on the vfsmount. */
mnt = NULL;
if (!IS_ERR(root)) {
struct super_block *s = root->d_sb;
struct btrfs_fs_info *fs_info = btrfs_sb(s);
struct inode *root_inode = d_inode(root);
u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
ret = 0;
if (!is_subvolume_inode(root_inode)) {
btrfs_err(fs_info, "'%s' is not a valid subvolume",
subvol_name);
ret = -EINVAL;
}
if (subvol_objectid && root_objectid != subvol_objectid) {
/*
* This will also catch a race condition where a
* subvolume which was passed by ID is renamed and
* another subvolume is renamed over the old location.
*/
btrfs_err(fs_info,
"subvol '%s' does not match subvolid %llu",
subvol_name, subvol_objectid);
ret = -EINVAL;
}
if (ret) {
dput(root);
root = ERR_PTR(ret);
deactivate_locked_super(s);
}
}
out:
mntput(mnt);
kfree(subvol_name);
return root;
}
/*
* Find a superblock for the given device / mount point.
*
* Note: This is based on mount_bdev from fs/super.c with a few additions
* for multiple device setup. Make sure to keep it in sync.
*/
static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
int flags, const char *device_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
struct btrfs_device *device = NULL;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
void *new_sec_opts = NULL;
fmode_t mode = FMODE_READ;
int error = 0;
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
if (data) {
error = security_sb_eat_lsm_opts(data, &new_sec_opts);
if (error)
return ERR_PTR(error);
}
/*
* Setup a dummy root and fs_info for test/set super. This is because
* we don't actually fill this stuff out until open_ctree, but we need
* it for searching for existing supers, so this lets us do that and
* then open_ctree will properly initialize everything later.
*/
fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
if (!fs_info) {
error = -ENOMEM;
goto error_sec_opts;
}
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
}
mutex_lock(&uuid_mutex);
error = btrfs_parse_device_options(data, mode, fs_type);
if (error) {
mutex_unlock(&uuid_mutex);
goto error_fs_info;
}
device = btrfs_scan_one_device(device_name, mode, fs_type);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
error = PTR_ERR(device);
goto error_fs_info;
}
fs_devices = device->fs_devices;
fs_info->fs_devices = fs_devices;
error = btrfs_open_devices(fs_devices, mode, fs_type);
mutex_unlock(&uuid_mutex);
if (error)
goto error_fs_info;
if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
error = -EACCES;
goto error_close_devices;
}
bdev = fs_devices->latest_bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto error_close_devices;
}
if (s->s_root) {
btrfs_close_devices(fs_devices);
free_fs_info(fs_info);
if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
security_free_mnt_opts(&new_sec_opts);
if (error) {
deactivate_locked_super(s);
return ERR_PTR(error);
}
return dget(s->s_root);
error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
free_fs_info(fs_info);
error_sec_opts:
security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
}
/*
* Mount function which is called by VFS layer.
*
* In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
* which needs vfsmount* of device's root (/). This means device's root has to
* be mounted internally in any case.
*
* Operation flow:
* 1. Parse subvol id related options for later use in mount_subvol().
*
* 2. Mount device's root (/) by calling vfs_kern_mount().
*
* NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
* first place. In order to avoid calling btrfs_mount() again, we use
* different file_system_type which is not registered to VFS by
* register_filesystem() (btrfs_root_fs_type). As a result,
* btrfs_mount_root() is called. The return value will be used by
* mount_subtree() in mount_subvol().
*
* 3. Call mount_subvol() to get the dentry of subvolume. Since there is
* "btrfs subvolume set-default", mount_subvol() is called always.
*/
static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
const char *device_name, void *data)
{
struct vfsmount *mnt_root;
struct dentry *root;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
int error = 0;
error = btrfs_parse_subvol_options(data, &subvol_name,
&subvol_objectid);
if (error) {
kfree(subvol_name);
return ERR_PTR(error);
}
/* mount device's root (/) */
mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
if (flags & SB_RDONLY) {
mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
flags & ~SB_RDONLY, device_name, data);
} else {
mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
flags | SB_RDONLY, device_name, data);
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
kfree(subvol_name);
goto out;
}
down_write(&mnt_root->mnt_sb->s_umount);
error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
up_write(&mnt_root->mnt_sb->s_umount);
if (error < 0) {
root = ERR_PTR(error);
mntput(mnt_root);
kfree(subvol_name);
goto out;
}
}
}
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
kfree(subvol_name);
goto out;
}
/* mount_subvol() will free subvol_name and mnt_root */
root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
out:
return root;
}
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
u32 new_pool_size, u32 old_pool_size)
{
if (new_pool_size == old_pool_size)
return;
fs_info->thread_pool_size = new_pool_size;
btrfs_info(fs_info, "resize thread pool %d -> %d",
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
new_pool_size);
}
static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
unsigned long old_opts, int flags)
{
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
(flags & SB_RDONLY))) {
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
if (flags & SB_RDONLY)
sync_filesystem(fs_info->sb);
}
}
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
unsigned long old_opts)
{
/*
* We need to cleanup all defragable inodes if the autodefragment is
* close or the filesystem is read only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) {
btrfs_cleanup_defrag_inodes(fs_info);
}
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
unsigned old_flags = sb->s_flags;
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
u64 old_max_inline = fs_info->max_inline;
u32 old_thread_pool_size = fs_info->thread_pool_size;
u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
sync_filesystem(sb);
btrfs_remount_prepare(fs_info);
if (data) {
void *new_sec_opts = NULL;
ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
if (!ret)
ret = security_sb_remount(sb, new_sec_opts);
security_free_mnt_opts(&new_sec_opts);
if (ret)
goto restore;
}
ret = btrfs_parse_options(fs_info, data, *flags);
if (ret)
goto restore;
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
if (*flags & SB_RDONLY) {
/*
* this also happens on 'umount -rf' or on shutdown, when
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al. */
up(&fs_info->uuid_tree_rescan_sem);
sb->s_flags |= SB_RDONLY;
/*
* Setting SB_RDONLY will put the cleaner thread to
* sleep at the next loop if it's already active.
* If it's already asleep, we'll leave unused block
* groups on disk until we're mounted read-write again
* unless we clean them up here.
*/
btrfs_delete_unused_bgs(fs_info);
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
/*
* Pause the qgroup rescan worker if it is running. We don't want
* it to be still running after we are in RO mode, as after that,
* by the time we unmount, it might have left a transaction open,
* so we would leak the transaction and/or crash.
*/
btrfs_qgroup_wait_for_completion(fs_info, false);
ret = btrfs_commit_super(fs_info);
if (ret)
goto restore;
} else {
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
goto restore;
}
if (fs_info->fs_devices->rw_devices == 0) {
ret = -EACCES;
goto restore;
}
if (!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"too many missing devices, writable remount is not allowed");
ret = -EACCES;
goto restore;
}
if (btrfs_super_log_root(fs_info->super_copy) != 0) {
btrfs_warn(fs_info,
"mount required to replay tree-log, cannot remount read-write");
ret = -EINVAL;
goto restore;
}
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
goto restore;
/* recover relocation */
mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(root);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret)
goto restore;
ret = btrfs_resume_balance_async(fs_info);
if (ret)
goto restore;
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
btrfs_warn(fs_info, "failed to resume dev_replace");
goto restore;
}
btrfs_qgroup_rescan_resume(fs_info);
if (!fs_info->uuid_root) {
btrfs_info(fs_info, "creating UUID tree");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
btrfs_warn(fs_info,
"failed to create the UUID tree %d",
ret);
goto restore;
}
}
sb->s_flags &= ~SB_RDONLY;
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
out:
/*
* We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
* since the absence of the flag means it can be toggled off by remount.
*/
*flags |= SB_I_VERSION;
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
return 0;
restore:
/* We've hit an error - don't reset SB_RDONLY */
if (sb_rdonly(sb))
old_flags |= SB_RDONLY;
sb->s_flags = old_flags;
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
btrfs_resize_thread_pool(fs_info,
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
btrfs_remount_cleanup(fs_info, old_opts);
return ret;
}
/* Used to sort the devices by max_avail(descending sort) */
static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
const void *dev_info2)
{
if (((struct btrfs_device_info *)dev_info1)->max_avail >
((struct btrfs_device_info *)dev_info2)->max_avail)
return -1;
else if (((struct btrfs_device_info *)dev_info1)->max_avail <
((struct btrfs_device_info *)dev_info2)->max_avail)
return 1;
else
return 0;
}
/*
* sort the devices by max_avail, in which max free extent size of each device
* is stored.(Descending Sort)
*/
static inline void btrfs_descending_sort_devices(
struct btrfs_device_info *devices,
size_t nr_devices)
{
sort(devices, nr_devices, sizeof(struct btrfs_device_info),
btrfs_cmp_device_free_bytes, NULL);
}
/*
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
u64 *free_bytes)
{
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 type;
u64 avail_space;
u64 min_stripe_size;
int num_stripes = 1;
int i = 0, nr_devices;
const struct btrfs_raid_attr *rattr;
/*
* We aren't under the device list lock, so this is racy-ish, but good
* enough for our purposes.
*/
nr_devices = fs_info->fs_devices->open_devices;
if (!nr_devices) {
smp_mb();
nr_devices = fs_info->fs_devices->open_devices;
ASSERT(nr_devices);
if (!nr_devices) {
*free_bytes = 0;
return 0;
}
}
devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
GFP_KERNEL);
if (!devices_info)
return -ENOMEM;
/* calc min stripe number for data space allocation */
type = btrfs_data_alloc_profile(fs_info);
rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
if (type & BTRFS_BLOCK_GROUP_RAID0)
num_stripes = nr_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_stripes = 2;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
/* Adjust for more than 1 stripe per device */
min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state) ||
!device->bdev ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (i >= nr_devices)
break;
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
/*
* In order to avoid overwriting the superblock on the drive,
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*
* This ensures we have at least min_stripe_size free space
* after excluding 1MB.
*/
if (avail_space <= SZ_1M + min_stripe_size)
continue;
avail_space -= SZ_1M;
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
i++;
}
rcu_read_unlock();
nr_devices = i;
btrfs_descending_sort_devices(devices_info, nr_devices);
i = nr_devices - 1;
avail_space = 0;
while (nr_devices >= rattr->devs_min) {
num_stripes = min(num_stripes, nr_devices);
if (devices_info[i].max_avail >= min_stripe_size) {
int j;
u64 alloc_size;
avail_space += devices_info[i].max_avail * num_stripes;
alloc_size = devices_info[i].max_avail;
for (j = i + 1 - num_stripes; j <= i; j++)
devices_info[j].max_avail -= alloc_size;
}
i--;
nr_devices--;
}
kfree(devices_info);
*free_bytes = avail_space;
return 0;
}
/*
* Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
*
* If there's a redundant raid level at DATA block groups, use the respective
* multiplier to scale the sizes.
*
* Unused device space usage is based on simulating the chunk allocator
* algorithm that respects the device sizes and order of allocations. This is
* a close approximation of the actual use but there are other factors that may
* change the result (like a new metadata chunk).
*
* If metadata is exhausted, f_bavail will be 0.
*/
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_super_block *disk_super = fs_info->super_copy;
struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
u64 thresh = 0;
int mixed = 0;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
total_free_data += found->disk_total - found->disk_used;
total_free_data -=
btrfs_account_ro_block_groups_free_space(found);
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
if (!list_empty(&found->block_groups[i]))
factor = btrfs_bg_type_to_factor(
btrfs_raid_array[i].bg_flag);
}
}
/*
* Metadata in mixed block goup profiles are accounted in data
*/
if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA)
mixed = 1;
else
total_free_meta += found->disk_total -
found->disk_used;
}
total_used += found->disk_used;
}
rcu_read_unlock();
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
/* Account global block reserve as used, it's in logical size already */
spin_lock(&block_rsv->lock);
/* Mixed block groups accounting is not byte-accurate, avoid overflow */
if (buf->f_bfree >= block_rsv->size >> bits)
buf->f_bfree -= block_rsv->size >> bits;
else
buf->f_bfree = 0;
spin_unlock(&block_rsv->lock);
buf->f_bavail = div_u64(total_free_data, factor);
ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
if (ret)
return ret;
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
/*
* We calculate the remaining metadata space minus global reserve. If
* this is (supposedly) smaller than zero, there's no space. But this
* does not hold in practice, the exhausted state happens where's still
* some positive delta. So we apply some guesswork and compare the
* delta to a 4M threshold. (Practically observed delta was ~2M.)
*
* We probably cannot calculate the exact threshold value because this
* depends on the internal reservations requested by various
* operations, so some operations that consume a few metadata will
* succeed even if the Avail is zero. But this is better than the other
* way around.
*/
thresh = SZ_4M;
/*
* We only want to claim there's no available space if we can no longer
* allocate chunks for our metadata profile and our global reserve will
* not fit in the free metadata space. If we aren't ->full then we
* still can allocate chunks and thus are fine using the currently
* calculated f_bavail.
*/
if (!mixed && block_rsv->space_info->full &&
total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
on a big-endian or little-endian host */
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
buf->f_fsid.val[0] ^=
BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
buf->f_fsid.val[1] ^=
BTRFS_I(d_inode(dentry))->root->root_key.objectid;
return 0;
}
static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
kill_anon_super(sb);
free_fs_info(fs_info);
}
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
static struct file_system_type btrfs_root_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
{
/*
* The control file's private_data is used to hold the
* transaction when it is started and is used to keep
* track of whether a transaction is already in progress.
*/
file->private_data = NULL;
return 0;
}
/*
* used by btrfsctl to scan devices when no FS is mounted
*/
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct btrfs_ioctl_vol_args *vol;
struct btrfs_device *device = NULL;
int ret = -ENOTTY;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
vol->name[BTRFS_PATH_NAME_MAX] = '\0';
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
&btrfs_root_fs_type);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_FORGET_DEV:
ret = btrfs_forget_devices(vol->name);
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
&btrfs_root_fs_type);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
ret = PTR_ERR(device);
break;
}
ret = !(device->fs_devices->num_devices ==
device->fs_devices->total_devices);
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
ret = btrfs_ioctl_get_supported_features((void __user*)arg);
break;
}
kfree(vol);
return ret;
}
static int btrfs_freeze(struct super_block *sb)
{
struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
/*
* We don't need a barrier here, we'll wait for any transaction that
* could be in progress on other threads (and do delayed iputs that
* we want to avoid on a frozen filesystem), or do the commit
* ourselves.
*/
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT)
return 0;
return PTR_ERR(trans);
}
return btrfs_commit_transaction(trans);
}
static int btrfs_unfreeze(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
return 0;
}
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
struct btrfs_device *dev, *first_dev = NULL;
/*
* Lightweight locking of the devices. We should not need
* device_list_mutex here as we only read the device data and the list
* is protected by RCU. Even if a device is deleted during the list
* traversals, we'll get valid data, the freeing callback will wait at
* least until the rcu_read_unlock.
*/
rcu_read_lock();
list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->name)
continue;
if (!first_dev || dev->devid < first_dev->devid)
first_dev = dev;
}
if (first_dev)
seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
else
WARN_ON(1);
rcu_read_unlock();
return 0;
}
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
.show_devname = btrfs_show_devname,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
};
static const struct file_operations btrfs_ctl_fops = {
.open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
.compat_ioctl = btrfs_control_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
static struct miscdevice btrfs_misc = {
.minor = BTRFS_MINOR,
.name = "btrfs-control",
.fops = &btrfs_ctl_fops
};
MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
MODULE_ALIAS("devname:btrfs-control");
static int __init btrfs_interface_init(void)
{
return misc_register(&btrfs_misc);
}
static __cold void btrfs_interface_exit(void)
{
misc_deregister(&btrfs_misc);
}
static void __init btrfs_print_mod_info(void)
{
static const char options[] = ""
#ifdef CONFIG_BTRFS_DEBUG
", debug=on"
#endif
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
", integrity-checker=on"
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
static int __init init_btrfs_fs(void)
{
int err;
btrfs_props_init();
err = btrfs_init_sysfs();
if (err)
return err;
btrfs_init_compress();
err = btrfs_init_cachep();
if (err)
goto free_compress;
err = extent_io_init();
if (err)
goto free_cachep;
err = extent_map_init();
if (err)
goto free_extent_io;
err = ordered_data_init();
if (err)
goto free_extent_map;
err = btrfs_delayed_inode_init();
if (err)
goto free_ordered_data;
err = btrfs_auto_defrag_init();
if (err)
goto free_delayed_inode;
err = btrfs_delayed_ref_init();
if (err)
goto free_auto_defrag;
err = btrfs_prelim_ref_init();
if (err)
goto free_delayed_ref;
err = btrfs_end_io_wq_init();
if (err)
goto free_prelim_ref;
err = btrfs_interface_init();
if (err)
goto free_end_io_wq;
btrfs_init_lockdep();
btrfs_print_mod_info();
err = btrfs_run_sanity_tests();
if (err)
goto unregister_ioctl;
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
return 0;
unregister_ioctl:
btrfs_interface_exit();
free_end_io_wq:
btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
btrfs_delayed_ref_exit();
free_auto_defrag:
btrfs_auto_defrag_exit();
free_delayed_inode:
btrfs_delayed_inode_exit();
free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
free_extent_io:
extent_io_exit();
free_cachep:
btrfs_destroy_cachep();
free_compress:
btrfs_exit_compress();
btrfs_exit_sysfs();
return err;
}
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
btrfs_delayed_ref_exit();
btrfs_auto_defrag_exit();
btrfs_delayed_inode_exit();
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
extent_io_exit();
btrfs_interface_exit();
btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
}
late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");