From 4e34323135ec0752af152ac588a4f96495074849 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Sun, 11 Aug 2019 16:27:41 -0400 Subject: [PATCH 01/21] ext4: fix warning when turn on dioread_nolock and inline_data mkfs.ext4 -O inline_data /dev/vdb mount -o dioread_nolock /dev/vdb /mnt echo "some inline data..." >> /mnt/test-file echo "some inline data..." >> /mnt/test-file sync The above script will trigger "WARN_ON(!io_end->handle && sbi->s_journal)" because ext4_should_dioread_nolock() returns false for a file with inline data. Move the check to a place after we have already removed the inline data and prepared inode to write normal pages. Reviewed-by: Jan Kara Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..a6523516d681 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2785,15 +2785,6 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; } - if (ext4_should_dioread_nolock(inode)) { - /* - * We may need to convert up to one extent per block in - * the page and we may dirty the inode. - */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); - } - /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so @@ -2812,6 +2803,15 @@ static int ext4_writepages(struct address_space *mapping, ext4_journal_stop(handle); } + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, + PAGE_SIZE >> inode->i_blkbits); + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; From 991f52306ab8b5d8427f6f953162b5ab92e88e51 Mon Sep 17 00:00:00 2001 From: Shi Siyuan Date: Sun, 11 Aug 2019 16:28:41 -0400 Subject: [PATCH 02/21] ext4: remove unnecessary error check Remove unnecessary error check in ext4_file_write_iter(), because this check will be done in upcoming later function -- ext4_write_checks() -> generic_write_checks() Change-Id: I7b0ab27f693a50765c15b5eaa3f4e7c38f42e01e Signed-off-by: shisiyuan Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 70b0438dbc94..bc378a7ac950 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -230,8 +230,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif - if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) From 547b9ad698b434eadca46319cb47e5875b55ef03 Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Sun, 11 Aug 2019 16:29:41 -0400 Subject: [PATCH 03/21] jbd2: flush_descriptor(): Do not decrease buffer head's ref count When executing generic/388 on a ppc64le machine, we notice the following call trace, VFS: brelse: Trying to free free buffer WARNING: CPU: 0 PID: 6637 at /root/repos/linux/fs/buffer.c:1195 __brelse+0x84/0xc0 Call Trace: __brelse+0x80/0xc0 (unreliable) invalidate_bh_lru+0x78/0xc0 on_each_cpu_mask+0xa8/0x130 on_each_cpu_cond_mask+0x130/0x170 invalidate_bh_lrus+0x44/0x60 invalidate_bdev+0x38/0x70 ext4_put_super+0x294/0x560 generic_shutdown_super+0xb0/0x170 kill_block_super+0x38/0xb0 deactivate_locked_super+0xa4/0xf0 cleanup_mnt+0x164/0x1d0 task_work_run+0x110/0x160 do_notify_resume+0x414/0x460 ret_from_except_lite+0x70/0x74 The warning happens because flush_descriptor() drops bh reference it does not own. The bh reference acquired by jbd2_journal_get_descriptor_buffer() is owned by the log_bufs list and gets released when this list is processed. The reference for doing IO is only acquired in write_dirty_buffer() later in flush_descriptor(). Reported-by: Harish Sriram Reviewed-by: Jan Kara Signed-off-by: Chandan Rajendra Signed-off-by: Theodore Ts'o --- fs/jbd2/revoke.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 69b9bc329964..f08073d7bbf5 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -638,10 +638,8 @@ static void flush_descriptor(journal_t *journal, { jbd2_journal_revoke_header_t *header; - if (is_journal_aborted(journal)) { - put_bh(descriptor); + if (is_journal_aborted(journal)) return; - } header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); From b0c013e2928d3696ceb6401311dbc1d7fcccd6dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:30:41 -0400 Subject: [PATCH 04/21] ext4: add a new ioctl EXT4_IOC_CLEAR_ES_CACHE The new ioctl EXT4_IOC_CLEAR_ES_CACHE will force an inode's extent status cache to be cleared out. This is intended for use for debugging. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents_status.c | 28 ++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 1 + fs/ext4/ioctl.c | 9 +++++++++ 4 files changed, 40 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..b22f24f1d365 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -649,6 +649,8 @@ enum { #define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 7521de2dcf3a..02cc8eb3eb0e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1374,6 +1374,34 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) return nr_shrunk; } +/* + * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove + * discretionary entries from the extent status cache. (Some entries + * must be present for proper operations.) + */ +void ext4_clear_inode_es(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct extent_status *es; + struct ext4_es_tree *tree; + struct rb_node *node; + + write_lock(&ei->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + tree->cache_es = NULL; + node = rb_first(&tree->root); + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(node); + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + } + } + ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); + write_unlock(&ei->i_es_lock); +} + #ifdef ES_DEBUG__ static void ext4_print_pending_tree(struct inode *inode) { diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 131a8b7df265..e16785f431e7 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -248,5 +248,6 @@ extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 442f7ef873fc..15b1047878ab 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1115,6 +1115,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT4_IOC_GET_ENCRYPTION_POLICY: return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_CLEAR_ES_CACHE: + { + if (!inode_owner_or_capable(inode)) + return -EACCES; + ext4_clear_inode_es(inode); + return 0; + } + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1233,6 +1241,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GET_ENCRYPTION_POLICY: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: + case EXT4_IOC_CLEAR_ES_CACHE: break; default: return -ENOIOCTLCMD; From 1ad3ea6e0a694b0486eb2cbe60378ad0fbf23642 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:31:41 -0400 Subject: [PATCH 05/21] ext4: add a new ioctl EXT4_IOC_GETSTATE The new ioctl EXT4_IOC_GETSTATE returns some of the dynamic state of an ext4 inode for debugging purposes. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 +++++++++++ fs/ext4/ioctl.c | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b22f24f1d365..ee296797bcd2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -651,6 +651,7 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -664,6 +665,16 @@ enum { #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 15b1047878ab..ffb7bde4900d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1123,6 +1123,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return 0; } + case EXT4_IOC_GETSTATE: + { + __u32 state = 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) + state |= EXT4_STATE_FLAG_EXT_PRECACHED; + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + state |= EXT4_STATE_FLAG_NEW; + if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + state |= EXT4_STATE_FLAG_NEWENTRY; + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) + state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; + + return put_user(state, (__u32 __user *) arg); + } + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1242,6 +1258,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: case EXT4_IOC_CLEAR_ES_CACHE: + case EXT4_IOC_GETSTATE: break; default: return -ENOIOCTLCMD; From bb5835edcdf8bf78bbe51cff13e332c439bc0567 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:32:41 -0400 Subject: [PATCH 06/21] ext4: add new ioctl EXT4_IOC_GET_ES_CACHE For debugging reasons, it's useful to know the contents of the extent cache. Since the extent cache contains much of what is in the fiemap ioctl, use an fiemap-style interface to return this information. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 10 +++++ fs/ext4/extents.c | 94 ++++++++++++++++++++++++++++++++++++---- fs/ext4/extents_status.c | 10 +++++ fs/ext4/extents_status.h | 1 + fs/ext4/inode.c | 6 +-- fs/ext4/ioctl.c | 72 ++++++++++++++++++++++++++++++ 6 files changed, 182 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ee296797bcd2..e2d8ad27f4d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -652,6 +652,7 @@ enum { /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -692,6 +693,12 @@ enum { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -3258,6 +3265,9 @@ extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92266a2da7d6..0620d495fd8a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2315,6 +2315,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode, return err; } +static int ext4_fill_es_cache_info(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) +{ + ext4_lblk_t next, end = block + num - 1; + struct extent_status es; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned int flags; + int err; + + while (block <= end) { + next = 0; + flags = 0; + if (!ext4_es_lookup_extent(inode, block, &next, &es)) + break; + if (ext4_es_is_unwritten(&es)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + if (ext4_es_is_delayed(&es)) + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ext4_es_is_hole(&es)) + flags |= EXT4_FIEMAP_EXTENT_HOLE; + if (next == 0) + flags |= FIEMAP_EXTENT_LAST; + if (flags & (FIEMAP_EXTENT_DELALLOC| + EXT4_FIEMAP_EXTENT_HOLE)) + es.es_pblk = 0; + else + es.es_pblk = ext4_es_pblock(&es); + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (next == 0) + break; + block = next; + if (err < 0) + return err; + if (err == 1) + return 0; + } + return 0; +} + + /* * ext4_ext_determine_hole - determine hole around given block * @inode: inode we lookup in @@ -5017,8 +5063,6 @@ static int ext4_find_delayed_extent(struct inode *inode, return next_del; } -/* fiemap flags we can handle specified here */ -#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) static int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) @@ -5055,10 +5099,16 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) +static int _ext4_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, + int (*fill)(struct inode *, ext4_lblk_t, + ext4_lblk_t, + struct fiemap_extent_info *)) { ext4_lblk_t start_blk; + u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR; + int error = 0; if (ext4_has_inline_data(inode)) { @@ -5075,14 +5125,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_ext_precache(inode); if (error) return error; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* fallback to generic here if not in extents fmt */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && + fill == ext4_fill_fiemap_extents) return generic_block_fiemap(inode, fieinfo, start, len, ext4_get_block); - if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + if (fill == ext4_fill_es_cache_info) + ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; + if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { @@ -5101,12 +5155,36 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information * and pushing extents back to the user. */ - error = ext4_fill_fiemap_extents(inode, start_blk, - len_blks, fieinfo); + error = fill(inode, start_blk, len_blks, fieinfo); } return error; } +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + return _ext4_fiemap(inode, fieinfo, start, len, + ext4_fill_fiemap_extents); +} + +int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + if (ext4_has_inline_data(inode)) { + int has_inline; + + down_read(&EXT4_I(inode)->xattr_sem); + has_inline = ext4_has_inline_data(inode); + up_read(&EXT4_I(inode)->xattr_sem); + if (has_inline) + return 0; + } + + return _ext4_fiemap(inode, fieinfo, start, len, + ext4_fill_es_cache_info); +} + + /* * ext4_access_path: * Function to access the path buffer for marking it dirty. diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 02cc8eb3eb0e..a959adc59bcd 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -899,6 +899,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, struct extent_status *es) { struct ext4_es_tree *tree; @@ -948,6 +949,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); stats->es_stats_cache_hits++; + if (next_lblk) { + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, + rb_node); + *next_lblk = es1->es_lblk; + } else + *next_lblk = 0; + } } else { stats->es_stats_cache_misses++; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index e16785f431e7..eb56a1289031 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -140,6 +140,7 @@ extern void ext4_es_find_extent_range(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6523516d681..4b92c7603907 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -527,7 +527,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EFSCORRUPTED; /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -695,7 +695,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } @@ -1868,7 +1868,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, &es)) { + if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ffb7bde4900d..d6242b7b8718 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -745,6 +745,74 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } +/* copied from fs/ioctl.c */ +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + u64 maxbytes = (u64) sb->s_maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + if (fiemap.fm_extent_count != 0 && + !access_ok(fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1139,6 +1207,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return put_user(state, (__u32 __user *) arg); } + case EXT4_IOC_GET_ES_CACHE: + return ext4_ioctl_get_es_cache(filp, arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1259,6 +1330,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: + case EXT4_IOC_GET_ES_CACHE: break; default: return -ENOIOCTLCMD; From cd2d99229dc96219547e6349841e1aad851c6acc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 12 Aug 2019 13:44:49 -0400 Subject: [PATCH 07/21] ext4: drop legacy pre-1970 encoding workaround Originally, support for expanded timestamps had a bug in that pre-1970 times were erroneously encoded as being in the the 24th century. This was fixed in commit a4dad1ae24f8 ("ext4: Fix handling of extended tv_sec") which landed in 4.4. Starting with 4.4, pre-1970 timestamps were correctly encoded, but for backwards compatibility those incorrectly encoded timestamps were mapped back to the pre-1970 dates. Given that backwards compatibility workaround has been around for 4 years, and given that running e2fsck from e2fsprogs 1.43.2 and later will offer to fix these timestamps (which has been released for 3 years), it's past time to drop the legacy workaround from the kernel. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e2d8ad27f4d1..17cc2dc13174 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -828,21 +828,8 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time) static inline void ext4_decode_extra_time(struct timespec64 *time, __le32 extra) { - if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { - -#if 1 - /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. (This backwards compatibility may be removed - * at the discretion of the ext4 developers.) - */ - u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; - if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) - extra_bits = 0; - time->tv_sec += extra_bits << 32; -#else + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; -#endif - } time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } From 7a14826ede1d714f0bb56de8167c0e519041eeda Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 12 Aug 2019 14:29:38 -0400 Subject: [PATCH 08/21] ext4: set error return correctly when ext4_htree_store_dirent fails Currently when the call to ext4_htree_store_dirent fails the error return variable 'ret' is is not being set to the error code and variable count is instead, hence the error code is not being returned. Fix this by assigning ret to the error return code. Addresses-Coverity: ("Unused value") Fixes: 8af0f0822797 ("ext4: fix readdir error in the case of inline_data+dir_index") Signed-off-by: Colin Ian King Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 88cdf3c90bd1..2fec62d764fa 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1416,7 +1416,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { - count = err; + ret = err; goto out; } count++; From e3d550c2c4f2f3dba469bc3c4b83d9332b4e99e1 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Thu, 22 Aug 2019 22:53:46 -0400 Subject: [PATCH 09/21] ext4: fix warning inside ext4_convert_unwritten_extents_endio Really enable warning when CONFIG_EXT4_DEBUG is set and fix missing first argument. This was introduced in commit ff95ec22cd7f ("ext4: add warning to ext4_convert_unwritten_extents_endio") and splitting extents inside endio would trigger it. Fixes: ff95ec22cd7f ("ext4: add warning to ext4_convert_unwritten_extents_endio") Signed-off-by: Rakesh Pandit Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0620d495fd8a..fb0f99dc8c22 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3859,8 +3859,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { -#ifdef EXT4_DEBUG - ext4_warning("Inode (%ld) finished: extent logical block %llu," +#ifdef CONFIG_EXT4_DEBUG + ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); From 7963e5ac901251c7a3b36fe7c987623a3f309393 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Thu, 22 Aug 2019 23:00:32 -0400 Subject: [PATCH 10/21] ext4: treat buffers with write errors as containing valid data I got some errors when I repair an ext4 volume which stacked by an iscsi target: Entry 'test60' in / (2) has deleted/unused inode 73750. Clear? It can be reproduced when the network not good enough. When I debug this I found ext4 will read entry buffer from disk and the buffer is marked with write_io_error. If the buffer is marked with write_io_error, it means it already wroten to journal, and not checked out to disk. IOW, the journal is newer than the data in disk. If this journal record 'delete test60', it means the 'test60' still on the disk metadata. In this case, if we read the buffer from disk successfully and create file continue, the new journal record will overwrite the journal which record 'delete test60', then the entry corruptioned. So, use the buffer rather than read from disk if the buffer is marked with write_io_error. Signed-off-by: Zhang Xiaoxu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 13 +++++++++++++ fs/ext4/inode.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17cc2dc13174..2348be3d66b7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3344,6 +3344,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) extern const struct iomap_ops ext4_iomap_ops; +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b92c7603907..9db896fc6af8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1024,7 +1024,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; - if (!bh || buffer_uptodate(bh)) + if (!bh || ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); @@ -1051,7 +1051,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ - if (bhs[i] && !buffer_uptodate(bhs[i])) + if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bhs[i]); From 219db95bbe796277c739cca17fe134f6c2ce6676 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Thu, 22 Aug 2019 23:18:33 -0400 Subject: [PATCH 11/21] ext4: documentation fixes This commit aims to fix the following issues in ext4 documentation: - Flexible block group docs said that the aim was to group block metadata together instead of block group metadata. - The documentation consistly uses "location" instead of "block number". It is easy to confuse location to be an absolute offset on disk. Added a line to clarify all location values are in terms of block numbers. - Dirent2 docs said that the rec_len field is shortened instead of the name_len field. - Typo in bg_checksum description. - Inode size is 160 bytes now, and hence i_extra_isize is now 32. - Cluster size formula was incorrect, it did not include the +10 to s_log_cluster_size value. - Typo: there were two s_wtime_hi in the superblock struct. - Superblock struct was outdated, added the new fields which were part of s_reserved earlier. - Multiple mount protection seems to be implemented in fs/ext4/mmp.c. Signed-off-by: Ayush Ranjan Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- Documentation/filesystems/ext4/blockgroup.rst | 10 +++++----- Documentation/filesystems/ext4/blocks.rst | 4 +++- Documentation/filesystems/ext4/directory.rst | 2 +- .../filesystems/ext4/group_descr.rst | 9 ++++++--- Documentation/filesystems/ext4/inodes.rst | 4 ++-- Documentation/filesystems/ext4/super.rst | 20 +++++++++++++------ 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/Documentation/filesystems/ext4/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst index baf888e4c06a..3da156633339 100644 --- a/Documentation/filesystems/ext4/blockgroup.rst +++ b/Documentation/filesystems/ext4/blockgroup.rst @@ -71,11 +71,11 @@ if the flex\_bg size is 4, then group 0 will contain (in order) the superblock, group descriptors, data block bitmaps for groups 0-3, inode bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining space in group 0 is for file data. The effect of this is to group the -block metadata close together for faster loading, and to enable large -files to be continuous on disk. Backup copies of the superblock and -group descriptors are always at the beginning of block groups, even if -flex\_bg is enabled. The number of block groups that make up a flex\_bg -is given by 2 ^ ``sb.s_log_groups_per_flex``. +block group metadata close together for faster loading, and to enable +large files to be continuous on disk. Backup copies of the superblock +and group descriptors are always at the beginning of block groups, even +if flex\_bg is enabled. The number of block groups that make up a +flex\_bg is given by 2 ^ ``sb.s_log_groups_per_flex``. Meta Block Groups ----------------- diff --git a/Documentation/filesystems/ext4/blocks.rst b/Documentation/filesystems/ext4/blocks.rst index 73d4dc0f7bda..bd722ecd92d6 100644 --- a/Documentation/filesystems/ext4/blocks.rst +++ b/Documentation/filesystems/ext4/blocks.rst @@ -10,7 +10,9 @@ block groups. Block size is specified at mkfs time and typically is 4KiB. You may experience mounting problems if block size is greater than page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory pages). By default a filesystem can contain 2^32 blocks; if the '64bit' -feature is enabled, then a filesystem can have 2^64 blocks. +feature is enabled, then a filesystem can have 2^64 blocks. The location +of structures is stored in terms of the block number the structure lives +in and not the absolute offset on disk. For 32-bit filesystems, limits are as follows: diff --git a/Documentation/filesystems/ext4/directory.rst b/Documentation/filesystems/ext4/directory.rst index 614034e24669..073940cc64ed 100644 --- a/Documentation/filesystems/ext4/directory.rst +++ b/Documentation/filesystems/ext4/directory.rst @@ -59,7 +59,7 @@ is at most 263 bytes long, though on disk you'll need to reference - File name. Since file names cannot be longer than 255 bytes, the new directory -entry format shortens the rec\_len field and uses the space for a file +entry format shortens the name\_len field and uses the space for a file type flag, probably to avoid having to load every inode during directory tree traversal. This format is ``ext4_dir_entry_2``, which is at most 263 bytes long, though on disk you'll need to reference diff --git a/Documentation/filesystems/ext4/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst index 0f783ed88592..7ba6114e7f5c 100644 --- a/Documentation/filesystems/ext4/group_descr.rst +++ b/Documentation/filesystems/ext4/group_descr.rst @@ -99,9 +99,12 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. * - 0x1E - \_\_le16 - bg\_checksum - - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the - RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) & - 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. + - Group descriptor checksum; crc16(sb\_uuid+group\_num+bg\_desc) if the + RO\_COMPAT\_GDT\_CSUM feature is set, or + crc32c(sb\_uuid+group\_num+bg\_desc) & 0xFFFF if the + RO\_COMPAT\_METADATA\_CSUM feature is set. The bg\_checksum + field in bg\_desc is skipped when calculating crc16 checksum, + and set to zero if crc32c checksum is used. * - - - diff --git a/Documentation/filesystems/ext4/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index 6bd35e506b6f..34f62928cebc 100644 --- a/Documentation/filesystems/ext4/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst @@ -470,8 +470,8 @@ inode, which allows struct ext4\_inode to grow for a new kernel without having to upgrade all of the on-disk inodes. Access to fields beyond EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within ``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as -of October 2013) the inode structure is 156 bytes -(``i_extra_isize = 28``). The extra space between the end of the inode +of August 2019) the inode structure is 160 bytes +(``i_extra_isize = 32``). The extra space between the end of the inode structure and the end of the inode record can be used to store extended attributes. Each inode record can be as large as the filesystem block size, though this is not terribly efficient. diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst index 04ff079a2acf..48b6c78fc38e 100644 --- a/Documentation/filesystems/ext4/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -58,7 +58,7 @@ The ext4 superblock is laid out as follows in * - 0x1C - \_\_le32 - s\_log\_cluster\_size - - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is + - Cluster size is 2 ^ (10 + s\_log\_cluster\_size) blocks if bigalloc is enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. * - 0x20 - \_\_le32 @@ -447,7 +447,7 @@ The ext4 superblock is laid out as follows in - Upper 8 bits of the s_wtime field. * - 0x275 - \_\_u8 - - s\_wtime_hi + - s\_mtime_hi - Upper 8 bits of the s_mtime field. * - 0x276 - \_\_u8 @@ -466,12 +466,20 @@ The ext4 superblock is laid out as follows in - s\_last_error_time_hi - Upper 8 bits of the s_last_error_time_hi field. * - 0x27A - - \_\_u8[2] - - s\_pad + - \_\_u8 + - s\_pad[2] - Zero padding. * - 0x27C + - \_\_le16 + - s\_encoding + - Filename charset encoding. + * - 0x27E + - \_\_le16 + - s\_encoding_flags + - Filename charset encoding flags. + * - 0x280 - \_\_le32 - - s\_reserved[96] + - s\_reserved[95] - Padding to the end of the block. * - 0x3FC - \_\_le32 @@ -617,7 +625,7 @@ following: * - 0x80 - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). * - 0x100 - - Multiple mount protection. Not implemented (INCOMPAT\_MMP). + - Multiple mount protection (INCOMPAT\_MMP). * - 0x200 - Flexible block groups. See the earlier discussion of this feature (INCOMPAT\_FLEX\_BG). From 8fcc3a580651cceb94a9f48e1914491400d5146b Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 22 Aug 2019 23:22:14 -0400 Subject: [PATCH 12/21] ext4: rework reserved cluster accounting when invalidating pages The goal of this patch is to remove two references to the buffer delay bit in ext4_da_page_release_reservation() as part of a larger effort to remove all such references from ext4. These two references are principally used to reduce the reserved block/cluster count when pages are invalidated as a result of truncating, punching holes, or collapsing a block range in a file. The entire function is removed and replaced with code in ext4_es_remove_extent() that reduces the reserved count as a side effect of removing a block range from delayed and not unwritten extents in the extent status tree as is done when truncating, punching holes, or collapsing ranges. The code is written to minimize the number of searches descending from rb tree roots for scalability. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 + fs/ext4/extents_status.c | 446 ++++++++++++++++++++++++++++++--------- fs/ext4/extents_status.h | 2 - fs/ext4/inode.c | 63 +----- 4 files changed, 353 insertions(+), 161 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2348be3d66b7..0664c43cc9dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -284,6 +284,9 @@ struct ext4_io_submit { ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index a959adc59bcd..5efbb116fba0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end); + ext4_lblk_t end, int *reserved); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); @@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, NULL); if (err != 0) goto error; retry: @@ -968,8 +968,322 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, return found; } +struct rsvd_count { + int ndelonly; + bool first_do_lblk_found; + ext4_lblk_t first_do_lblk; + ext4_lblk_t last_do_lblk; + struct extent_status *left_es; + bool partial; + ext4_lblk_t lclu; +}; + +/* + * init_rsvd - initialize reserved count data before removing block range + * in file from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @es - pointer to first extent in range + * @rc - pointer to reserved count data + * + * Assumes es is not NULL + */ +static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + + rc->ndelonly = 0; + + /* + * for bigalloc, note the first delonly block in the range has not + * been found, record the extent containing the block to the left of + * the region to be removed, if any, and note that there's no partial + * cluster to track + */ + if (sbi->s_cluster_ratio > 1) { + rc->first_do_lblk_found = false; + if (lblk > es->es_lblk) { + rc->left_es = es; + } else { + node = rb_prev(&es->rb_node); + rc->left_es = node ? rb_entry(node, + struct extent_status, + rb_node) : NULL; + } + rc->partial = false; + } +} + +/* + * count_rsvd - count the clusters containing delayed and not unwritten + * (delonly) blocks in a range within an extent and add to + * the running tally in rsvd_count + * + * @inode - file containing extent + * @lblk - first block in range + * @len - length of range in blocks + * @es - pointer to extent containing clusters to be counted + * @rc - pointer to reserved count data + * + * Tracks partial clusters found at the beginning and end of extents so + * they aren't overcounted when they span adjacent extents + */ +static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t i, end, nclu; + + if (!ext4_es_is_delonly(es)) + return; + + WARN_ON(len <= 0); + + if (sbi->s_cluster_ratio == 1) { + rc->ndelonly += (int) len; + return; + } + + /* bigalloc */ + + i = (lblk < es->es_lblk) ? es->es_lblk : lblk; + end = lblk + (ext4_lblk_t) len - 1; + end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; + + /* record the first block of the first delonly extent seen */ + if (rc->first_do_lblk_found == false) { + rc->first_do_lblk = i; + rc->first_do_lblk_found = true; + } + + /* update the last lblk in the region seen so far */ + rc->last_do_lblk = end; + + /* + * if we're tracking a partial cluster and the current extent + * doesn't start with it, count it and stop tracking + */ + if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { + rc->ndelonly++; + rc->partial = false; + } + + /* + * if the first cluster doesn't start on a cluster boundary but + * ends on one, count it + */ + if (EXT4_LBLK_COFF(sbi, i) != 0) { + if (end >= EXT4_LBLK_CFILL(sbi, i)) { + rc->ndelonly++; + rc->partial = false; + i = EXT4_LBLK_CFILL(sbi, i) + 1; + } + } + + /* + * if the current cluster starts on a cluster boundary, count the + * number of whole delonly clusters in the extent + */ + if ((i + sbi->s_cluster_ratio - 1) <= end) { + nclu = (end - i + 1) >> sbi->s_cluster_bits; + rc->ndelonly += nclu; + i += nclu << sbi->s_cluster_bits; + } + + /* + * start tracking a partial cluster if there's a partial at the end + * of the current extent and we're not already tracking one + */ + if (!rc->partial && i <= end) { + rc->partial = true; + rc->lclu = EXT4_B2C(sbi, i); + } +} + +/* + * __pr_tree_search - search for a pending cluster reservation + * + * @root - root of pending reservation tree + * @lclu - logical cluster to search for + * + * Returns the pending reservation for the cluster identified by @lclu + * if found. If not, returns a reservation for the next cluster if any, + * and if not, returns NULL. + */ +static struct pending_reservation *__pr_tree_search(struct rb_root *root, + ext4_lblk_t lclu) +{ + struct rb_node *node = root->rb_node; + struct pending_reservation *pr = NULL; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else + return pr; + } + if (pr && lclu < pr->lclu) + return pr; + if (pr && lclu > pr->lclu) { + node = rb_next(&pr->rb_node); + return node ? rb_entry(node, struct pending_reservation, + rb_node) : NULL; + } + return NULL; +} + +/* + * get_rsvd - calculates and returns the number of cluster reservations to be + * released when removing a block range from the extent status tree + * and releases any pending reservations within the range + * + * @inode - file containing block range + * @end - last block in range + * @right_es - pointer to extent containing next block beyond end or NULL + * @rc - pointer to reserved count data + * + * The number of reservations to be released is equal to the number of + * clusters containing delayed and not unwritten (delonly) blocks within + * the range, minus the number of clusters still containing delonly blocks + * at the ends of the range, and minus the number of pending reservations + * within the range. + */ +static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, + struct extent_status *right_es, + struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + bool left_delonly, right_delonly, count_pending; + struct extent_status *es; + + if (sbi->s_cluster_ratio > 1) { + /* count any remaining partial cluster */ + if (rc->partial) + rc->ndelonly++; + + if (rc->ndelonly == 0) + return 0; + + first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); + last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); + + /* + * decrease the delonly count by the number of clusters at the + * ends of the range that still contain delonly blocks - + * these clusters still need to be reserved + */ + left_delonly = right_delonly = false; + + es = rc->left_es; + while (es && ext4_es_end(es) >= + EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + left_delonly = true; + break; + } + node = rb_prev(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + if (right_es && (!left_delonly || first_lclu != last_lclu)) { + if (end < ext4_es_end(right_es)) { + es = right_es; + } else { + node = rb_next(&right_es->rb_node); + es = node ? rb_entry(node, struct extent_status, + rb_node) : NULL; + } + while (es && es->es_lblk <= + EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + right_delonly = true; + break; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, + rb_node); + } + } + + /* + * Determine the block range that should be searched for + * pending reservations, if any. Clusters on the ends of the + * original removed range containing delonly blocks are + * excluded. They've already been accounted for and it's not + * possible to determine if an associated pending reservation + * should be released with the information available in the + * extents status tree. + */ + if (first_lclu == last_lclu) { + if (left_delonly | right_delonly) + count_pending = false; + else + count_pending = true; + } else { + if (left_delonly) + first_lclu++; + if (right_delonly) + last_lclu--; + if (first_lclu <= last_lclu) + count_pending = true; + else + count_pending = false; + } + + /* + * a pending reservation found between first_lclu and last_lclu + * represents an allocated cluster that contained at least one + * delonly block, so the delonly total must be reduced by one + * for each pending reservation found and released + */ + if (count_pending) { + pr = __pr_tree_search(&tree->root, first_lclu); + while (pr && pr->lclu <= last_lclu) { + rc->ndelonly--; + node = rb_next(&pr->rb_node); + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + if (!node) + break; + pr = rb_entry(node, struct pending_reservation, + rb_node); + } + } + } + return rc->ndelonly; +} + + +/* + * __es_remove_extent - removes block range from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @end - last block in range + * @reserved - number of cluster reservations released + * + * If @reserved is not NULL and delayed allocation is enabled, counts + * block/cluster reservations freed by removing range and if bigalloc + * enabled cancels pending reservations as needed. Returns 0 on success, + * error code on failure. + */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end) + ext4_lblk_t end, int *reserved) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; @@ -978,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len1, len2; ext4_fsblk_t block; int err; + bool count_reserved = true; + struct rsvd_count rc; + if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) + count_reserved = false; retry: err = 0; + es = __es_tree_search(&tree->root, lblk); if (!es) goto out; @@ -989,6 +1308,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, /* Simply invalidate cache_es. */ tree->cache_es = NULL; + if (count_reserved) + init_rsvd(inode, lblk, es, &rc); orig_es.es_lblk = es->es_lblk; orig_es.es_len = es->es_len; @@ -1030,10 +1351,16 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_store_pblock(es, block); } } + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, + &orig_es, &rc); goto out; } if (len1 > 0) { + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1, + &orig_es, &rc); node = rb_next(&es->rb_node); if (node) es = rb_entry(node, struct extent_status, rb_node); @@ -1042,6 +1369,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } while (es && ext4_es_end(es) <= end) { + if (count_reserved) + count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); @@ -1056,6 +1385,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t orig_len = es->es_len; len1 = ext4_es_end(es) - end; + if (count_reserved) + count_rsvd(inode, es->es_lblk, orig_len - len1, + es, &rc); es->es_lblk = end + 1; es->es_len = len1; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { @@ -1064,20 +1396,28 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } } + if (count_reserved) + *reserved = get_rsvd(inode, end, es, &rc); out: return err; } /* - * ext4_es_remove_extent() removes a space from a extent status tree. + * ext4_es_remove_extent - removes block range from extent status tree * - * Return 0 on success, error code on failure. + * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + * Reduces block/cluster reservation count and for bigalloc cancels pending + * reservations as needed. Returns 0 on success, error code on failure. */ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { ext4_lblk_t end; int err = 0; + int reserved = 0; trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", @@ -1095,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, &reserved); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); + ext4_da_release_space(inode, reserved); return err; } @@ -1327,6 +1668,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap; + while (*nr_to_scan > 0) { if (es->es_lblk > end) { ei->i_es_shrink_lblk = end + 1; @@ -1628,7 +1970,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, lblk); + err = __es_remove_extent(inode, lblk, lblk, NULL); if (err != 0) goto error; retry: @@ -1817,93 +2159,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, __remove_pending(inode, last); } } - -/* - * ext4_es_remove_blks - remove block range from extents status tree and - * reduce reservation count or cancel pending - * reservation as needed - * - * @inode - file containing range - * @lblk - first block in range - * @len - number of blocks to remove - * - */ -void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int clu_size, reserved = 0; - ext4_lblk_t last_lclu, first, length, remainder, last; - bool delonly; - int err = 0; - struct pending_reservation *pr; - struct ext4_pending_tree *tree; - - /* - * Process cluster by cluster for bigalloc - there may be up to - * two clusters in a 4k page with a 1k block size and two blocks - * per cluster. Also necessary for systems with larger page sizes - * and potentially larger block sizes. - */ - clu_size = sbi->s_cluster_ratio; - last_lclu = EXT4_B2C(sbi, lblk + len - 1); - - write_lock(&EXT4_I(inode)->i_es_lock); - - for (first = lblk, remainder = len; - remainder > 0; - first += length, remainder -= length) { - - if (EXT4_B2C(sbi, first) == last_lclu) - length = remainder; - else - length = clu_size - EXT4_LBLK_COFF(sbi, first); - - /* - * The BH_Delay flag, which triggers calls to this function, - * and the contents of the extents status tree can be - * inconsistent due to writepages activity. So, note whether - * the blocks to be removed actually belong to an extent with - * delayed only status. - */ - delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); - - /* - * because of the writepages effect, written and unwritten - * blocks could be removed here - */ - last = first + length - 1; - err = __es_remove_extent(inode, first, last); - if (err) - ext4_warning(inode->i_sb, - "%s: couldn't remove page (err = %d)", - __func__, err); - - /* non-bigalloc case: simply count the cluster for release */ - if (sbi->s_cluster_ratio == 1 && delonly) { - reserved++; - continue; - } - - /* - * bigalloc case: if all delayed allocated only blocks have - * just been removed from a cluster, either cancel a pending - * reservation if it exists or count a cluster for release - */ - if (delonly && - !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { - pr = __get_pending(inode, EXT4_B2C(sbi, first)); - if (pr != NULL) { - tree = &EXT4_I(inode)->i_pending_tree; - rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); - } else { - reserved++; - } - } - } - - write_unlock(&EXT4_I(inode)->i_es_lock); - - ext4_da_release_space(inode, reserved); -} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index eb56a1289031..5e5c4a40d863 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -247,8 +247,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9db896fc6af8..2b1c58da8d1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1646,49 +1646,6 @@ void ext4_da_release_space(struct inode *inode, int to_free) dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } -static void ext4_da_page_release_reservation(struct page *page, - unsigned int offset, - unsigned int length) -{ - int contiguous_blks = 0; - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - struct inode *inode = page->mapping->host; - unsigned int stop = offset + length; - ext4_fsblk_t lblk; - - BUG_ON(stop > PAGE_SIZE || stop < length); - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - if (next_off > stop) - break; - - if ((offset <= curr_off) && (buffer_delay(bh))) { - contiguous_blks++; - clear_buffer_delay(bh); - } else if (contiguous_blks) { - lblk = page->index << - (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - contiguous_blks = 0; - } - curr_off = next_off; - } while ((bh = bh->b_this_page) != head); - - if (contiguous_blks) { - lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - } - -} - /* * Delayed allocation stuff */ @@ -3227,24 +3184,6 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * Drop reserved blocks - */ - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - ext4_da_page_release_reservation(page, offset, length); - -out: - ext4_invalidatepage(page, offset, length); - - return; -} - /* * Force all delayed allocation blocks to be allocated for a given inode. */ @@ -3985,7 +3924,7 @@ static const struct address_space_operations ext4_da_aops = { .write_end = ext4_da_write_end, .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, + .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, From c1e8220bd316d8ae8e524df39534b8a412a45d5e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 23 Aug 2019 22:38:00 -0400 Subject: [PATCH 13/21] ext4: fix punch hole for inline_data file systems If a program attempts to punch a hole on an inline data file, we need to convert it to a normal file first. This was detected using ext4/032 using the adv configuration. Simple reproducer: mke2fs -Fq -t ext4 -O inline_data /dev/vdc mount /vdc echo "" > /vdc/testfile xfs_io -c 'truncate 33554432' /vdc/testfile xfs_io -c 'fpunch 0 1048576' /vdc/testfile umount /vdc e2fsck -fy /dev/vdc Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b1c58da8d1e..e567f0229d4e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4236,6 +4236,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + if (ext4_has_inline_data(inode)) { + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_convert_inline_data(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); + if (ret) + return ret; + } + /* * Write out all dirty pages to avoid race conditions * Then release them. From 4c273352bb4583750bf511fe24fe410610414496 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Sat, 24 Aug 2019 23:10:17 -0400 Subject: [PATCH 14/21] jbd2: add missing tracepoint for reserved handle This issue was found when I use ebpf to trace every jbd2 handle's running info in dioread_nolock case. Signed-off-by: Xiaoguang Wang Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 990e7b5062e7..afc06daee5bb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -569,6 +569,9 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, } handle->h_type = type; handle->h_line_no = line_no; + trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, type, + line_no, handle->h_buffer_credits); return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); From 7727ae52975d4f4ef7ff69ed8e6e25f6a4168158 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 28 Aug 2019 11:13:24 -0400 Subject: [PATCH 15/21] ext4: fix potential use after free after remounting with noblock_validity Remount process will release system zone which was allocated before if "noblock_validity" is specified. If we mount an ext4 file system to two mountpoints with default mount options, and then remount one of them with "noblock_validity", it may trigger a use after free problem when someone accessing the other one. # mount /dev/sda foo # mount /dev/sda bar User access mountpoint "foo" | Remount mountpoint "bar" | ext4_map_blocks() | ext4_remount() check_block_validity() | ext4_setup_system_zone() ext4_data_block_valid() | ext4_release_system_zone() | free system_blks rb nodes access system_blks rb nodes | trigger use after free | This problem can also be reproduced by one mountpint, At the same time, add_system_zone() can get called during remount as well so there can be racing ext4_data_block_valid() reading the rbtree at the same time. This patch add RCU to protect system zone from releasing or building when doing a remount which inverse current "noblock_validity" mount option. It assign the rbtree after the whole tree was complete and do actual freeing after rcu grace period, avoid any intermediate state. Reported-by: syzbot+1e470567330b7ad711d5@syzkaller.appspotmail.com Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/block_validity.c | 197 ++++++++++++++++++++++++++++----------- fs/ext4/ext4.h | 10 +- 2 files changed, 151 insertions(+), 56 deletions(-) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8e83741b02e0..d4d4fdfac1a6 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void) void ext4_exit_system_zone(void) { + rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } @@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1, return 0; } +static void release_system_zone(struct ext4_system_blocks *system_blks) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &system_blks->root, node) + kmem_cache_free(ext4_system_zone_cachep, entry); +} + /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ -static int add_system_zone(struct ext4_sb_info *sbi, +static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_system_zone *new_entry = NULL, *entry; - struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; while (*n) { @@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, new_node = &new_entry->node; rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &sbi->system_blks); + rb_insert_color(new_node, &system_blks->root); } /* Can we merge to the left? */ @@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -126,7 +136,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks); + node = rb_first(&sbi->system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", @@ -137,7 +147,47 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_CONT "\n"); } -static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, + struct ext4_system_blocks *system_blks, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + + if (system_blks == NULL) + return 1; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + +static int ext4_protect_reserved_inode(struct super_block *sb, + struct ext4_system_blocks *system_blks, + u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -163,14 +213,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) if (n == 0) { i++; } else { - if (!ext4_data_block_valid(sbi, map.m_pblk, n)) { + if (!ext4_data_block_valid_rcu(sbi, system_blks, + map.m_pblk, n)) { ext4_error(sb, "blocks %llu-%llu from inode %u " "overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1, ino); err = -EFSCORRUPTED; break; } - err = add_system_zone(sbi, map.m_pblk, n); + err = add_system_zone(system_blks, map.m_pblk, n); if (err < 0) break; i += n; @@ -180,94 +231,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) return err; } +static void ext4_destroy_system_zone(struct rcu_head *rcu) +{ + struct ext4_system_blocks *system_blks; + + system_blks = container_of(rcu, struct ext4_system_blocks, rcu); + release_system_zone(system_blks); + kfree(system_blks); +} + +/* + * Build system zone rbtree which is used for block validity checking. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. That's why we first build the rbtree and then + * swap it in place. + */ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int flex_size = ext4_flex_bg_size(sbi); int ret; if (!test_opt(sb, BLOCK_VALIDITY)) { - if (sbi->system_blks.rb_node) + if (sbi->system_blks) ext4_release_system_zone(sb); return 0; } - if (sbi->system_blks.rb_node) + if (sbi->system_blks) return 0; + system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); + if (!system_blks) + return -ENOMEM; + for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(sbi, ext4_group_first_block_no(sb, i), + add_system_zone(system_blks, + ext4_group_first_block_no(sb, i), ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); - ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + ret = add_system_zone(system_blks, + ext4_block_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + goto err; + ret = add_system_zone(system_blks, + ext4_inode_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + goto err; + ret = add_system_zone(system_blks, + ext4_inode_table(sb, gdp), sbi->s_itb_per_group); if (ret) - return ret; + goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { - ret = ext4_protect_reserved_inode(sb, + ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) - return ret; + goto err; } + /* + * System blks rbtree complete, announce it once to prevent racing + * with ext4_data_block_valid() accessing the rbtree at the same + * time. + */ + rcu_assign_pointer(sbi->system_blks, system_blks); + if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; -} - -/* Called when the filesystem is unmounted */ -void ext4_release_system_zone(struct super_block *sb) -{ - struct ext4_system_zone *entry, *n; - - rbtree_postorder_for_each_entry_safe(entry, n, - &EXT4_SB(sb)->system_blks, node) - kmem_cache_free(ext4_system_zone_cachep, entry); - - EXT4_SB(sb)->system_blks = RB_ROOT; +err: + release_system_zone(system_blks); + kfree(system_blks); + return ret; } /* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. + * Called when the filesystem is unmounted or when remounting it with + * noblock_validity specified. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. So we first clear the system_blks pointer and + * then free the rbtree only after RCU grace period expires. */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct ext4_system_blocks *system_blks; + + system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks, + lockdep_is_held(&sb->s_umount)); + rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL); + + if (system_blks) + call_rcu(&system_blks->rcu, ext4_destroy_system_zone); +} + int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count) { - struct ext4_system_zone *entry; - struct rb_node *n = sbi->system_blks.rb_node; + struct ext4_system_blocks *system_blks; + int ret; - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - } - return 1; + /* + * Lock the system zone to prevent it being released concurrently + * when doing a remount which inverse current "[no]block_validity" + * mount option. + */ + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, + count); + rcu_read_unlock(); + return ret; } int ext4_check_blockref(const char *function, unsigned int line, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0664c43cc9dc..c35bb8d734df 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -184,6 +184,14 @@ struct ext4_map_blocks { unsigned int m_flags; }; +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + /* * Flags for ext4_io_end->flags */ @@ -1431,7 +1439,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct rb_root system_blks; + struct ext4_system_blocks __rcu *system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ From 520f897a3554b0665af1ae5d5ba286f290cecf5c Mon Sep 17 00:00:00 2001 From: Yang Guo Date: Wed, 28 Aug 2019 11:19:23 -0400 Subject: [PATCH 16/21] ext4: use percpu_counters for extent_status cache hits/misses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @es_stats_cache_hits and @es_stats_cache_misses are accessed frequently in ext4_es_lookup_extent function, it would influence the ext4 read/write performance in NUMA system. Let's optimize it using percpu_counter, it is profitable for the performance. The test command is as below: fio -name=randwrite -numjobs=8 -filename=/mnt/test1 -rw=randwrite -ioengine=libaio -direct=1 -iodepth=64 -sync=0 -norandommap -group_reporting -runtime=120 -time_based -bs=4k -size=5G And the result is better 10% than the initial implement: without the patch,IOPS=197k, BW=770MiB/s (808MB/s)(90.3GiB/120002msec) with the patch, IOPS=218k, BW=852MiB/s (894MB/s)(99.9GiB/120002msec) Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Eric Biggers Signed-off-by: Yang Guo Signed-off-by: Shaokun Zhang --- fs/ext4/extents_status.c | 37 ++++++++++++++++++++++++------------- fs/ext4/extents_status.h | 4 ++-- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 5efbb116fba0..d996b44d2265 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -948,7 +948,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, es->es_pblk = es1->es_pblk; if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); - stats->es_stats_cache_hits++; + percpu_counter_inc(&stats->es_stats_cache_hits); if (next_lblk) { node = rb_next(&es1->rb_node); if (node) { @@ -959,7 +959,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, *next_lblk = 0; } } else { - stats->es_stats_cache_misses++; + percpu_counter_inc(&stats->es_stats_cache_misses); } read_unlock(&EXT4_I(inode)->i_es_lock); @@ -1586,9 +1586,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); - seq_printf(seq, " %lu/%lu cache hits/misses\n", - es_stats->es_stats_cache_hits, - es_stats->es_stats_cache_misses); + seq_printf(seq, " %lld/%lld cache hits/misses\n", + percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), + percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); if (inode_cnt) seq_printf(seq, " %d inodes on list\n", inode_cnt); @@ -1615,35 +1615,46 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) sbi->s_es_nr_inode = 0; spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; - sbi->s_es_stats.es_stats_cache_hits = 0; - sbi->s_es_stats.es_stats_cache_misses = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, + GFP_KERNEL); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, + GFP_KERNEL); + if (err) + goto err1; sbi->s_es_stats.es_stats_scan_time = 0; sbi->s_es_stats.es_stats_max_scan_time = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) - return err; + goto err2; err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) - goto err1; + goto err3; sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; err = register_shrinker(&sbi->s_es_shrinker); if (err) - goto err2; + goto err4; return 0; - -err2: +err4: percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); -err1: +err3: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 5e5c4a40d863..825313c59752 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -70,8 +70,8 @@ struct ext4_es_tree { struct ext4_es_stats { unsigned long es_stats_shrunk; - unsigned long es_stats_cache_hits; - unsigned long es_stats_cache_misses; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; From 9ba55543fc0c6bb1cf8edd63be8802d9ab7e1202 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 28 Aug 2019 11:25:01 -0400 Subject: [PATCH 17/21] ext4: fix integer overflow when calculating commit interval If user specify a large enough value of "commit=" option, it may trigger signed integer overflow which may lead to sbi->s_commit_interval becomes a large or small value, zero in particular. UBSAN: Undefined behaviour in ../fs/ext4/super.c:1592:31 signed integer overflow: 536870912 * 1000 cannot be represented in type 'int' [...] Call trace: [...] [] ubsan_epilogue+0x34/0x9c lib/ubsan.c:166 [] handle_overflow+0x228/0x280 lib/ubsan.c:197 [] __ubsan_handle_mul_overflow+0x4c/0x68 lib/ubsan.c:218 [] handle_mount_opt fs/ext4/super.c:1592 [inline] [] parse_options+0x1724/0x1a40 fs/ext4/super.c:1773 [] ext4_remount+0x2ec/0x14a0 fs/ext4/super.c:4834 [...] Although it is not a big deal, still silence the UBSAN by limit the input value. Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4079605d437a..7310facffa9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1874,6 +1874,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } else if (token == Opt_commit) { if (arg == 0) arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + else if (arg > INT_MAX / HZ) { + ext4_msg(sb, KERN_ERR, + "Invalid commit interval %d, " + "must be smaller than %d", + arg, INT_MAX / HZ); + return -1; + } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { sbi->s_want_extra_isize = arg; From 6456ca6520ab6c9aec589b4640169cd6da378c68 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 3 Sep 2019 01:43:17 -0400 Subject: [PATCH 18/21] ext4: fix kernel oops caused by spurious casefold flag If an directory has the a casefold flag set without the casefold feature set, s_encoding will not be initialized, and this will cause the kernel to dereference a NULL pointer. In addition to adding checks to avoid these kernel oops, attempts to load inodes with the casefold flag when the casefold feature is not enable will cause the file system to be declared corrupted. Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 7 ++++--- fs/ext4/hash.c | 2 +- fs/ext4/inode.c | 3 +++ fs/ext4/namei.c | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 86054f31fe4d..9fdd2b269d61 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -668,14 +668,15 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; + struct inode *inode = dentry->d_parent->d_inode; - if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { + if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { if (len != name->len) return -1; return memcmp(str, name->name, len); } - return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); + return ext4_ci_compare(inode, name, &qstr, false); } static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) @@ -685,7 +686,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode)) + if (!IS_CASEFOLDED(dentry->d_inode) || !um) return 0; norm = kmalloc(PATH_MAX, GFP_ATOMIC); diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index d358bfcb6b3f..3e133793a5a3 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -280,7 +280,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir)) { + if (len && IS_CASEFOLDED(dir) && um) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e567f0229d4e..4e271b509af1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5067,6 +5067,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + ext4_error_inode(inode, function, line, 0, + "casefold flag without casefold feature"); brelse(iloc.bh); unlock_new_inode(inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 129029534075..a427d2031a8d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1312,7 +1312,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, { int len; - if (!IS_CASEFOLDED(dir)) { + if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) { cf_name->name = NULL; return; } @@ -2183,7 +2183,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #ifdef CONFIG_UNICODE if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && - utf8_validate(sbi->s_encoding, &dentry->d_name)) + sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) return -EINVAL; #endif From e85526404ca70fa49f3d13b3898b067392522b24 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Sat, 7 Sep 2019 11:56:47 -0400 Subject: [PATCH 19/21] ext4: add missing bigalloc documentation. There was a broken link for bigalloc. The page https://ext4.wiki.kernel.org/index.php/Bigalloc was not migrated into the current documentation sources. This patch adds the contents of that missing page into the section for Bigalloc itself. Signed-off-by: Ayush Ranjan Link: https://lore.kernel.org/r/20190831154419.GA30357@fa19-cs241-404.cs.illinois.edu Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/bigalloc.rst | 32 ++++++++++++++------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/ext4/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst index c6d88557553c..72075aa608e4 100644 --- a/Documentation/filesystems/ext4/bigalloc.rst +++ b/Documentation/filesystems/ext4/bigalloc.rst @@ -9,14 +9,26 @@ ext4 code is not prepared to handle the case where the block size exceeds the page size. However, for a filesystem of mostly huge files, it is desirable to be able to allocate disk blocks in units of multiple blocks to reduce both fragmentation and metadata overhead. The -`bigalloc `__ feature provides exactly this ability. The -administrator can set a block cluster size at mkfs time (which is stored -in the s\_log\_cluster\_size field in the superblock); from then on, the -block bitmaps track clusters, not individual blocks. This means that -block groups can be several gigabytes in size (instead of just 128MiB); -however, the minimum allocation unit becomes a cluster, not a block, -even for directories. TaoBao had a patchset to extend the “use units of -clusters instead of blocks” to the extent tree, though it is not clear -where those patches went-- they eventually morphed into “extent tree v2” -but that code has not landed as of May 2015. +bigalloc feature provides exactly this ability. + +The bigalloc feature (EXT4_FEATURE_RO_COMPAT_BIGALLOC) changes ext4 to +use clustered allocation, so that each bit in the ext4 block allocation +bitmap addresses a power of two number of blocks. For example, if the +file system is mainly going to be storing large files in the 4-32 +megabyte range, it might make sense to set a cluster size of 1 megabyte. +This means that each bit in the block allocation bitmap now addresses +256 4k blocks. This shrinks the total size of the block allocation +bitmaps for a 2T file system from 64 megabytes to 256 kilobytes. It also +means that a block group addresses 32 gigabytes instead of 128 megabytes, +also shrinking the amount of file system overhead for metadata. + +The administrator can set a block cluster size at mkfs time (which is +stored in the s\_log\_cluster\_size field in the superblock); from then +on, the block bitmaps track clusters, not individual blocks. This means +that block groups can be several gigabytes in size (instead of just +128MiB); however, the minimum allocation unit becomes a cluster, not a +block, even for directories. TaoBao had a patchset to extend the “use +units of clusters instead of blocks” to the extent tree, though it is +not clear where those patches went-- they eventually morphed into +“extent tree v2” but that code has not landed as of May 2015. From 334b427e96d10f6e07117a785636dac81c430141 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Fri, 30 Aug 2019 15:13:49 +0200 Subject: [PATCH 20/21] unicode: Move static keyword to the front of declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static keyword to the front of declarations of nfdi_test_data and nfdicf_test_data, and resolve the following compiler warnings that can be seen when building with warnings enabled (W=1): fs/unicode/utf8-selftest.c:38:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] fs/unicode/utf8-selftest.c:92:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Krzysztof Wilczynski Signed-off-by: Gabriel Krisman Bertazi --- fs/unicode/utf8-selftest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index 6c1a36bbf6ad..6fe8af7edccb 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -35,7 +35,7 @@ unsigned int total_tests; #define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) #define test(cond) _test(cond, __func__, __LINE__, "") -const static struct { +static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. */ unsigned char str[10]; unsigned char dec[10]; @@ -89,7 +89,7 @@ const static struct { }; -const static struct { +static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. */ unsigned char str[30]; unsigned char ncf[30]; From aa28b98d6dbcdb1c822b53f90e509565dfc450b0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 6 Sep 2019 14:58:07 +0100 Subject: [PATCH 21/21] unicode: make array 'token' static const, makes object smaller Don't populate the array 'token' on the stack but instead make it static const. Makes the object code smaller by 234 bytes. Before: text data bss dec hex filename 5371 272 0 5643 160b fs/unicode/utf8-core.o After: text data bss dec hex filename 5041 368 0 5409 1521 fs/unicode/utf8-core.o (gcc version 9.2.1, amd64) Signed-off-by: Colin Ian King Reviewed-by: Theodore Ts'o Signed-off-by: Gabriel Krisman Bertazi --- fs/unicode/utf8-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 71ca4d047d65..2a878b739115 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -154,7 +154,7 @@ static int utf8_parse_version(const char *version, unsigned int *maj, { substring_t args[3]; char version_string[12]; - const struct match_token token[] = { + static const struct match_token token[] = { {1, "%d.%d.%d"}, {0, NULL} };