From 1c6eb026f7555e21199565d16b375b00f8ac7cb3 Mon Sep 17 00:00:00 2001 From: Nishad Kamdar Date: Sat, 25 Apr 2020 18:49:08 +0530 Subject: [PATCH 01/38] f2fs: Use the correct style for SPDX License Identifier This patch corrects the SPDX License Identifier style in header files related to F2FS File System support. For C header files Documentation/process/license-rules.rst mandates C-like comments (opposed to C source files where C++ style should be used). Changes made by using a script provided by Joe Perches here: https://lkml.org/lkml/2019/2/7/46. Suggested-by: Joe Perches Signed-off-by: Nishad Kamdar Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.h | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.h | 2 +- fs/f2fs/node.h | 2 +- fs/f2fs/segment.h | 2 +- fs/f2fs/trace.h | 2 +- fs/f2fs/xattr.h | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index b96823c59b15..124868c13f80 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/acl.h * diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1eebeb3a11d9..ddfd6e76c922 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/f2fs.h * diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index bbac9d3787bd..db3c61046aa4 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/gc.h * diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e05af5df5648..6a2011deea23 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/node.h * diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7a83bd530812..cba16cca5189 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -1,4 +1,4 @@ -// 
SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/segment.h * diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h index e8075fc5b228..789f6aa727fc 100644 --- a/fs/f2fs/trace.h +++ b/fs/f2fs/trace.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * f2fs IO tracer * diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 938fcd20565d..6a192e6c7a9e 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/xattr.h * From 3b68c989a370635249551e8dee5ce16c0e67ad44 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 May 2020 16:35:23 -0700 Subject: [PATCH 02/38] f2fs: use strcmp() in parse_options() Remove the pointless string length checks. Just use strcmp(). Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c47e765aa430..01c5ea7b05b5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -442,11 +442,11 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 2 && !strncmp(name, "on", 2)) { + if (!strcmp(name, "on")) { F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; - } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { + } else if (!strcmp(name, "off")) { F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; - } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { + } else if (!strcmp(name, "sync")) { F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; } else { kvfree(name); @@ -606,16 +606,14 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 8 && - !strncmp(name, "adaptive", 8)) { + if (!strcmp(name, "adaptive")) { if (f2fs_sb_has_blkzoned(sbi)) { f2fs_warn(sbi, "adaptive mode is not 
allowed with zoned block device feature"); kvfree(name); return -EINVAL; } F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; - } else if (strlen(name) == 3 && - !strncmp(name, "lfs", 3)) { + } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; } else { kvfree(name); @@ -740,14 +738,11 @@ static int parse_options(struct super_block *sb, char *options) name = match_strdup(&args[0]); if (!name) return -ENOMEM; - if (strlen(name) == 10 && - !strncmp(name, "user-based", 10)) { + if (!strcmp(name, "user-based")) { F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (strlen(name) == 3 && - !strncmp(name, "off", 3)) { + } else if (!strcmp(name, "off")) { F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (strlen(name) == 8 && - !strncmp(name, "fs-based", 8)) { + } else if (!strcmp(name, "fs-based")) { F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; } else { kvfree(name); @@ -760,11 +755,9 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 7 && - !strncmp(name, "default", 7)) { + if (!strcmp(name, "default")) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; - } else if (strlen(name) == 5 && - !strncmp(name, "reuse", 5)) { + } else if (!strcmp(name, "reuse")) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { kvfree(name); @@ -776,14 +769,11 @@ static int parse_options(struct super_block *sb, char *options) name = match_strdup(&args[0]); if (!name) return -ENOMEM; - if (strlen(name) == 5 && - !strncmp(name, "posix", 5)) { + if (!strcmp(name, "posix")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - } else if (strlen(name) == 6 && - !strncmp(name, "strict", 6)) { + } else if (!strcmp(name, "strict")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; - } else if (strlen(name) == 9 && - !strncmp(name, "nobarrier", 9)) { + } else if (!strcmp(name, "nobarrier")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_NOBARRIER; } else { @@ -833,15 +823,13 @@ static int 
parse_options(struct super_block *sb, char *options) name = match_strdup(&args[0]); if (!name) return -ENOMEM; - if (strlen(name) == 3 && !strcmp(name, "lzo")) { + if (!strcmp(name, "lzo")) { F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; - } else if (strlen(name) == 3 && - !strcmp(name, "lz4")) { + } else if (!strcmp(name, "lz4")) { F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; - } else if (strlen(name) == 4 && - !strcmp(name, "zstd")) { + } else if (!strcmp(name, "zstd")) { F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; } else { From 69ff0dd3e9426413c657e5059b9b3077ae8302a3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Feb 2020 18:49:06 +0800 Subject: [PATCH 03/38] f2fs: remove redundant compress inode check due to f2fs_post_read_required() has did that. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ddfd6e76c922..bda7f6195605 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4081,8 +4081,6 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return true; if (f2fs_is_multi_device(sbi)) return true; - if (f2fs_compressed_file(inode)) - return true; /* * for blkzoned device, fallback direct IO to buffered IO, so * all IOs can be serialized by log-structured write. From bcdfc98a52769f5161456e73b495d7f72b0f4abc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 Mar 2020 16:22:59 +0800 Subject: [PATCH 04/38] f2fs: support partial truncation on compressed inode Supports to truncate compressed/normal cluster partially on compressed inode. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 19 +++++++++++++----- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 26b071afe48a..d6283e351c95 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -954,6 +954,55 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, return first_index; } +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) +{ + void *fsdata = NULL; + struct page *pagep; + int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << + log_cluster_size; + int err; + + err = f2fs_is_compressed_cluster(inode, start_idx); + if (err < 0) + return err; + + /* truncate normal cluster */ + if (!err) + return f2fs_do_truncate_blocks(inode, from, lock); + + /* truncate compressed cluster */ + err = f2fs_prepare_compress_overwrite(inode, &pagep, + start_idx, &fsdata); + + /* should not be a normal cluster */ + f2fs_bug_on(F2FS_I_SB(inode), err == 0); + + if (err <= 0) + return err; + + if (err > 0) { + struct page **rpages = fsdata; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int i; + + for (i = cluster_size - 1; i >= 0; i--) { + loff_t start = rpages[i]->index << PAGE_SHIFT; + + if (from <= start) { + zero_user_segment(rpages[i], 0, PAGE_SIZE); + } else { + zero_user_segment(rpages[i], from - start, + PAGE_SIZE); + break; + } + } + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + } + return 0; +} + static int f2fs_write_compressed_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bda7f6195605..0b5d25823c70 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3142,6 +3142,7 @@ static inline void f2fs_clear_page_private(struct page *page) */ int f2fs_sync_file(struct file *file, 
loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); +int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, @@ -3859,6 +3860,7 @@ int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied); +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5548fab243f9..4b24bfde693d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -647,9 +647,6 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } - if (f2fs_compressed_file(inode)) - return 0; - page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); @@ -665,7 +662,7 @@ truncate_out: return 0; } -static int do_truncate_blocks(struct inode *inode, u64 from, bool lock) +int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -733,7 +730,9 @@ free_partial: int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { u64 free_from = from; + int err; +#ifdef CONFIG_F2FS_FS_COMPRESSION /* * for compressed file, only support cluster size * aligned truncation. 
@@ -748,8 +747,18 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from++; free_from <<= cluster_shift; } +#endif - return do_truncate_blocks(inode, free_from, lock); + err = f2fs_do_truncate_blocks(inode, free_from, lock); + if (err) + return err; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (from != free_from) + err = f2fs_truncate_partial_cluster(inode, from, lock); +#endif + + return err; } int f2fs_truncate(struct inode *inode) From f4636b905af6ed189fc750d05ac4b31828fab7c3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Mar 2020 17:40:40 +0800 Subject: [PATCH 05/38] f2fs: support fiemap on compressed inode Map normal/compressed cluster of compressed inode correctly, and give the right fiemap flag FIEMAP_EXTENT_ENCODED on mapped compressed extent. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 39bebb095bd5..acb5f42bfa41 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1832,6 +1832,25 @@ static int f2fs_xattr_fiemap(struct inode *inode, return (err < 0 ? 
err : 0); } +static loff_t max_inode_blocks(struct inode *inode) +{ + loff_t result = ADDRS_PER_INODE(inode); + loff_t leaf_count = ADDRS_PER_BLOCK(inode); + + /* two direct node blocks */ + result += (leaf_count * 2); + + /* two indirect node blocks */ + leaf_count *= NIDS_PER_BLOCK; + result += (leaf_count * 2); + + /* one double indirect node block */ + leaf_count *= NIDS_PER_BLOCK; + result += leaf_count; + + return result; +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1841,6 +1860,8 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; + bool compr_cluster = false; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); @@ -1875,6 +1896,9 @@ next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; + if (compr_cluster) + map_bh.b_size = blk_to_logical(inode, cluster_size - 1); + ret = get_data_block(inode, start_blk, &map_bh, 0, F2FS_GET_BLOCK_FIEMAP, &next_pgofs); if (ret) @@ -1885,7 +1909,7 @@ next: start_blk = next_pgofs; if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, - F2FS_I_SB(inode)->max_file_blocks)) + max_inode_blocks(inode))) goto prep_next; flags |= FIEMAP_EXTENT_LAST; @@ -1897,11 +1921,38 @@ next: ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); + if (ret) + goto out; + size = 0; } - if (start_blk > last_blk || ret) + if (start_blk > last_blk) goto out; + if (compr_cluster) { + compr_cluster = false; + + + logical = blk_to_logical(inode, start_blk - 1); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = blk_to_logical(inode, cluster_size); + + flags |= FIEMAP_EXTENT_ENCODED; + + start_blk += cluster_size - 1; + + if (start_blk > last_blk) + goto out; + + goto prep_next; + } + + if (map_bh.b_blocknr == COMPRESS_ADDR) { + compr_cluster = true; + start_blk++; + goto 
prep_next; + } + logical = blk_to_logical(inode, start_blk); phys = blk_to_logical(inode, map_bh.b_blocknr); size = map_bh.b_size; From 2211c9c08592a2f90d1cb751bbb3216f7f20a77d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Mar 2020 17:13:29 +0800 Subject: [PATCH 06/38] f2fs: introduce f2fs_bmap_compress() to support bmap() on compressed inode: if queried block locates in non-compressed cluster, return its physical block address. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index acb5f42bfa41..6ea6d0355033 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3666,6 +3666,37 @@ static int f2fs_set_data_page_dirty(struct page *page) return 0; } + +static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct dnode_of_data dn; + sector_t start_idx, blknr = 0; + int ret; + + start_idx = round_down(block, F2FS_I(inode)->i_cluster_size); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + return 0; + + if (dn.data_blkaddr != COMPRESS_ADDR) { + dn.ofs_in_node += block - start_idx; + blknr = f2fs_data_blkaddr(&dn); + if (!__is_valid_data_blkaddr(blknr)) + blknr = 0; + } + + f2fs_put_dnode(&dn); + + return blknr; +#else + return -EOPNOTSUPP; +#endif +} + + static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; @@ -3677,6 +3708,9 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); + if (f2fs_compressed_file(inode)) + return f2fs_bmap_compress(inode, block); + return generic_block_bmap(mapping, block, get_data_block_bmap); } From aa39675a81e6cfdf4d5d0593fc6ec5fe5b8fe134 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Apr 2020 19:56:05 +0800 
Subject: [PATCH 07/38] f2fs: introduce mempool for {,de}compress intermediate page allocation If the compression feature is on and there is not enough free memory, the page refault ratio is higher than before. The root cause is: - the {,de}compression flow needs to allocate intermediate pages to store the compressed data of a cluster, so during their allocation the VM may reclaim mmapped pages. - if the reclaimed pages belong to a compressed cluster, refaulting them may cause more intermediate page allocations, resulting in more mmapped pages being reclaimed. So this patch introduces a mempool for intermediate page allocation in order to avoid the high refault ratio. By default, the number of preallocated pages in the pool is 512; the user can change the number by setting the 'num_compress_pages' parameter during module initialization. Ma Feng found warnings in the original patch and fixed them as below. Fix the following sparse warnings: fs/f2fs/compress.c:501:5: warning: symbol 'num_compress_pages' was not declared. Should it be static? fs/f2fs/compress.c:530:6: warning: symbol 'f2fs_compress_free_page' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: Chao Yu Signed-off-by: Ma Feng Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 65 ++++++++++++++++++++++++++++++---------------- fs/f2fs/f2fs.h | 4 +++ fs/f2fs/super.c | 6 +++++ 3 files changed, 53 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d6283e351c95..b726aad898de 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -65,15 +66,6 @@ static void f2fs_set_compressed_page(struct page *page, page->mapping = inode->i_mapping; } -static void f2fs_put_compressed_page(struct page *page) -{ - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); - page->mapping = NULL; - unlock_page(page); - put_page(page); -} - static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) { int i; @@ -476,17 +468,47 @@ bool f2fs_is_compress_backend_ready(struct inode *inode) return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; } -static struct page *f2fs_grab_page(void) +static mempool_t *compress_page_pool = NULL; +static int num_compress_pages = 512; +module_param(num_compress_pages, uint, 0444); +MODULE_PARM_DESC(num_compress_pages, + "Number of intermediate compress pages to preallocate"); + +int f2fs_init_compress_mempool(void) +{ + compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); + if (!compress_page_pool) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_compress_mempool(void) +{ + mempool_destroy(compress_page_pool); +} + +static struct page *f2fs_compress_alloc_page(void) { struct page *page; - page = alloc_page(GFP_NOFS); - if (!page) - return NULL; + page = mempool_alloc(compress_page_pool, GFP_NOFS); lock_page(page); + return page; } +static void f2fs_compress_free_page(struct page *page) +{ + if (!page) + return; + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + page->mapping = NULL; + unlock_page(page); + 
mempool_free(page, compress_page_pool); +} + static int f2fs_compress_pages(struct compress_ctx *cc) { struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); @@ -516,7 +538,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) } for (i = 0; i < cc->nr_cpages; i++) { - cc->cpages[i] = f2fs_grab_page(); + cc->cpages[i] = f2fs_compress_alloc_page(); if (!cc->cpages[i]) { ret = -ENOMEM; goto out_free_cpages; @@ -561,7 +583,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vunmap(cc->rbuf); for (i = nr_cpages; i < cc->nr_cpages; i++) { - f2fs_put_compressed_page(cc->cpages[i]); + f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -581,7 +603,7 @@ out_vunmap_rbuf: out_free_cpages: for (i = 0; i < cc->nr_cpages; i++) { if (cc->cpages[i]) - f2fs_put_compressed_page(cc->cpages[i]); + f2fs_compress_free_page(cc->cpages[i]); } kfree(cc->cpages); cc->cpages = NULL; @@ -1183,7 +1205,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) if (unlikely(bio->bi_status)) mapping_set_error(cic->inode->i_mapping, -EIO); - f2fs_put_compressed_page(page); + f2fs_compress_free_page(page); dec_page_count(sbi, F2FS_WB_DATA); @@ -1344,7 +1366,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) for (i = 0; i < dic->nr_cpages; i++) { struct page *page; - page = f2fs_grab_page(); + page = f2fs_compress_alloc_page(); if (!page) goto out_free; @@ -1364,7 +1386,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) continue; } - dic->tpages[i] = f2fs_grab_page(); + dic->tpages[i] = f2fs_compress_alloc_page(); if (!dic->tpages[i]) goto out_free; } @@ -1386,8 +1408,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) continue; if (!dic->tpages[i]) continue; - unlock_page(dic->tpages[i]); - put_page(dic->tpages[i]); + f2fs_compress_free_page(dic->tpages[i]); } kfree(dic->tpages); } @@ -1396,7 +1417,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) for (i = 0; i < dic->nr_cpages; i++) { if (!dic->cpages[i]) 
continue; - f2fs_put_compressed_page(dic->cpages[i]); + f2fs_compress_free_page(dic->cpages[i]); } kfree(dic->cpages); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0b5d25823c70..fd1800ea518c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3863,6 +3863,8 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); +int f2fs_init_compress_mempool(void); +void f2fs_destroy_compress_mempool(void); void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); @@ -3896,6 +3898,8 @@ static inline struct page *f2fs_compress_control_page(struct page *page) WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } +static inline int f2fs_init_compress_mempool(void) { return 0; } +static inline void f2fs_destroy_compress_mempool(void) { } #endif static inline void set_compress_context(struct inode *inode) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 01c5ea7b05b5..96b9af4277e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3909,7 +3909,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_bioset(); if (err) goto free_bio_enrty_cache; + err = f2fs_init_compress_mempool(); + if (err) + goto free_bioset; return 0; +free_bioset: + f2fs_destroy_bioset(); free_bio_enrty_cache: f2fs_destroy_bio_entry_cache(); free_post_read: @@ -3937,6 +3942,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); f2fs_destroy_post_read_processing(); From b8c0bb245eb6dcfb533a5eb16d36ac6eabee1335 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 5 May 2020 11:41:11 -0700 Subject: [PATCH 08/38] f2fs: correctly fix the parent inode number during 
fsync() fsync() may be called on a deleted file that's still open. So when fsync() tries to set the parent inode number when the inode has LOST_PINO and i_nlink == 1 (to avoid later checkpoints), it needs to make sure to get the parent directory via a non-deleted alias. Also remove the unnecessary igrab() and iput(), as the caller already holds a reference to the inode. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4b24bfde693d..2fe90872923a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -169,9 +169,11 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; - inode = igrab(inode); - dentry = d_find_any_alias(inode); - iput(inode); + /* + * Make sure to get the non-deleted alias. The alias associated with + * the open file descriptor being fsync()'ed may be deleted already. + */ + dentry = d_find_alias(inode); if (!dentry) return 0; From 0627412db60646a3bc747f090016ff133c586434 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 May 2020 17:50:20 +0800 Subject: [PATCH 09/38] f2fs: shrink spinlock coverage In f2fs_try_to_free_nids(), .nid_list_lock spinlock critical region will increase as expected shrink number increase, to avoid spining other CPUs for long time, we change to release nid caches with small batch each time under .nid_list_lock coverage. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 25 +++++++++++++++---------- fs/f2fs/node.h | 3 +++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 68422396e620..62a31f72593d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2488,7 +2488,6 @@ void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i, *next; int nr = nr_shrink; if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) @@ -2497,17 +2496,23 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || - nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) - break; + while (nr_shrink && nm_i->nid_cnt[FREE_NID] > MAX_FREE_NIDS) { + struct free_nid *i, *next; + unsigned int batch = SHRINK_NID_BATCH_SIZE; - __remove_free_nid(sbi, i, FREE_NID); - kmem_cache_free(free_nid_slab, i); - nr_shrink--; + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (!nr_shrink || !batch || + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + break; + __remove_free_nid(sbi, i, FREE_NID); + kmem_cache_free(free_nid_slab, i); + nr_shrink--; + batch--; + } + spin_unlock(&nm_i->nid_list_lock); } - spin_unlock(&nm_i->nid_list_lock); + mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 6a2011deea23..69e5859e993c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,6 +15,9 @@ #define FREE_NID_PAGES 8 #define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) +/* size of free nid batch when shrinking */ +#define SHRINK_NID_BATCH_SIZE 8 + #define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data 
blocks */ From d722726c7083308caa30f108a0ba329ecf33c21a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Mar 2020 15:36:09 +0800 Subject: [PATCH 10/38] f2fs: introduce F2FS_IOC_RELEASE_COMPRESS_BLOCKS There are still reserved blocks on compressed inode, this patch introduce a new ioctl to help release reserved blocks back to filesystem, so that userspace can reuse those freed space. ---- Daeho fixed a bug like below. Now, if writing pages and releasing compress blocks occur simultaneously, and releasing cblocks is executed more than one time to a file, then total block count of filesystem and block count of the file could be incorrect and damaged. We have to execute releasing compress blocks only one time for a file without being interfered by writepages path. --- Signed-off-by: Chao Yu Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++ fs/f2fs/file.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 174 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fd1800ea518c..166bfa0f3ad6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -428,6 +428,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) #define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) #define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64) +#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \ + _IOR(F2FS_IOCTL_MAGIC, 18, __u64) #define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL #define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL @@ -4046,6 +4048,10 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, { int diff = F2FS_I(inode)->i_cluster_size - blocks; + /* don't update i_compr_blocks if saved blocks were released */ + if (!add && !F2FS_I(inode)->i_compr_blocks) + return; + if (add) { F2FS_I(inode)->i_compr_blocks += diff; stat_add_compr_blocks(inode, diff); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 
2fe90872923a..017b3715e230 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -563,6 +563,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) bool compressed_cluster = false; int cluster_index = 0, valid_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool released = !F2FS_I(dn->inode)->i_compr_blocks; if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -601,7 +602,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); f2fs_invalidate_blocks(sbi, blkaddr); - nr_free++; + + if (!released || blkaddr != COMPRESS_ADDR) + nr_free++; } if (compressed_cluster) @@ -3437,6 +3440,167 @@ static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) return put_user(blocks, (u64 __user *)arg); } +static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int released_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) + return -EFSCORRUPTED; + } + + while (count) { + int compr_blocks = 0; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + blkaddr = f2fs_data_blkaddr(dn); + + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + continue; + dn->ofs_in_node += cluster_size; + goto next; + } + + if (__is_valid_data_blkaddr(blkaddr)) + compr_blocks++; + + if (blkaddr != NEW_ADDR) + continue; + + dn->data_blkaddr = NULL_ADDR; + f2fs_set_data_blkaddr(dn); + } + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false); + dec_valid_block_count(sbi, dn->inode, + cluster_size - compr_blocks); + + released_blocks += cluster_size - 
compr_blocks; +next: + count -= cluster_size; + } + + return released_blocks; +} + +static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int released_blocks = 0; + int ret; + int writecount; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + writecount = atomic_read(&inode->i_writecount); + if ((filp->f_mode & FMODE_WRITE && writecount != 1) || writecount) { + ret = -EBUSY; + goto out; + } + + if (IS_IMMUTABLE(inode)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + if (!F2FS_I(inode)->i_compr_blocks) + goto out; + + F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL; + f2fs_set_inode_flags(inode); + inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, true); + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = roundup(count, F2FS_I(inode)->i_cluster_size); + + ret = release_compress_blocks(&dn, count); + + f2fs_put_dnode(&dn); + + if (ret < 0) + break; + + page_idx += count; + released_blocks += ret; + 
} + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_mmap_sem); +out: + inode_unlock(inode); + + mnt_drop_write_file(filp); + + if (ret >= 0) { + ret = put_user(released_blocks, (u64 __user *)arg); + } else if (released_blocks && F2FS_I(inode)->i_compr_blocks) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + "iblocks=%llu, released=%u, compr_blocks=%llu, " + "run fsck to fix.", + __func__, inode->i_ino, (u64)inode->i_blocks, + released_blocks, + F2FS_I(inode)->i_compr_blocks); + } + + return ret; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -3519,6 +3683,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_set_volume_name(filp, arg); case F2FS_IOC_GET_COMPRESS_BLOCKS: return f2fs_get_compress_blocks(filp, arg); + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + return f2fs_release_compress_blocks(filp, arg); default: return -ENOTTY; } @@ -3685,6 +3851,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_VOLUME_NAME: case F2FS_IOC_SET_VOLUME_NAME: case F2FS_IOC_GET_COMPRESS_BLOCKS: + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: break; default: return -ENOIOCTLCMD; From ef11afbd6118208263db88083b28f40c3446ffc6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 8 May 2020 12:25:45 -0700 Subject: [PATCH 11/38] f2fs: remove blk_plugging in block_operations blk_plugging doesn't seem to give any benefit. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e3604a3a43f5..3254ca7d8814 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1166,11 +1166,8 @@ static int block_operations(struct f2fs_sb_info *sbi) .nr_to_write = LONG_MAX, .for_reclaim = 0, }; - struct blk_plug plug; int err = 0, cnt = 0; - blk_start_plug(&plug); - retry_flush_quotas: f2fs_lock_all(sbi); if (__need_flush_quota(sbi)) { @@ -1198,7 +1195,7 @@ retry_flush_dents: f2fs_unlock_all(sbi); err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); if (err) - goto out; + return err; cond_resched(); goto retry_flush_quotas; } @@ -1214,7 +1211,7 @@ retry_flush_dents: f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) - goto out; + return err; cond_resched(); goto retry_flush_quotas; } @@ -1230,7 +1227,7 @@ retry_flush_nodes: if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); - goto out; + return err; } cond_resched(); goto retry_flush_nodes; @@ -1242,8 +1239,6 @@ retry_flush_nodes: */ __prepare_cp_block(sbi); up_write(&sbi->node_change); -out: - blk_finish_plug(&plug); return err; } From 3067a8d931ab5cdf4cb269fa6f66ebe0a7c1859d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 9 May 2020 15:01:04 +0800 Subject: [PATCH 12/38] f2fs: compress: let lz4 compressor handle output buffer budget properly Commonly, in order to handle lz4 worst compress case, caller should allocate buffer with size of LZ4_compressBound(inputsize) for target compressed data storing, however in this case, if caller didn't allocate enough space, lz4 compressor still can handle output buffer budget properly, and end up compressing when left space in output buffer is not enough. 
So we don't have to allocate buffer with size for worst case, then we can avoid 2 * 4KB size intermediate buffer allocation when log_cluster_size is 2, and avoid unnecessary compressing work of compressor if we can not save at least 4KB space. Suggested-by: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index b726aad898de..fd7cdb814def 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -228,7 +228,12 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc) if (!cc->private) return -ENOMEM; - cc->clen = LZ4_compressBound(PAGE_SIZE << cc->log_cluster_size); + /* + * we do not change cc->clen to LZ4_compressBound(inputsize) to + * adapt worst compress case, because lz4 compressor can handle + * output budget properly. + */ + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; return 0; } @@ -244,11 +249,9 @@ static int lz4_compress_pages(struct compress_ctx *cc) len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); - if (!len) { - printk_ratelimited("%sF2FS-fs (%s): lz4 compress failed\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id); - return -EIO; - } + if (!len) + return -EAGAIN; + cc->clen = len; return 0; } From 4caf0692b6f30f76ae17de99e4d0c32e54e3e2cc Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 9 May 2020 19:21:35 +0800 Subject: [PATCH 13/38] f2fs: Fix wrong stub helper update_sit_info update_sit_info should be f2fs_update_sit_info, otherwise build fails while no CONFIG_F2FS_STAT_FS. 
Fixes: fc7100ea2a52 ("f2fs: Add f2fs stats to sysfs") Signed-off-by: YueHaibing Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 166bfa0f3ad6..23754ff99bb5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3726,7 +3726,7 @@ static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } -static inline void update_sit_info(struct f2fs_sb_info *sbi) {} +static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; From c8bff92f75cb9854c325fbdbe45f798f766f69a5 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 11 May 2020 09:15:18 +0300 Subject: [PATCH 14/38] f2fs: report delalloc reserve as non-free in statfs for project quota This reserved space isn't committed yet but cannot be used for allocations. For userspace it has no difference from used space. See the same fix in ext4 commit f06925c73942 ("ext4: report delalloc reserve as non-free in statfs for project quota"). 
Fixes: ddc34e328d06 ("f2fs: introduce f2fs_statfs_project") Signed-off-by: Konstantin Khlebnikov Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96b9af4277e2..68836cada44f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1298,7 +1298,8 @@ static int f2fs_statfs_project(struct super_block *sb, limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { - curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? From 4456a259714c1f962088390af68d5f0c3d161edc Mon Sep 17 00:00:00 2001 From: Sayali Lokhande Date: Thu, 30 Apr 2020 16:28:29 +0530 Subject: [PATCH 15/38] f2fs: Avoid double lock for cp_rwsem during checkpoint There could be a scenario where f2fs_sync_node_pages gets called during checkpoint, which in turn tries to flush inline data and calls iput(). This results in deadlock as iput() tries to hold cp_rwsem, which is already held at the beginning by checkpoint->block_operations(). 
Call stack : Thread A Thread B f2fs_write_checkpoint() - block_operations(sbi) - f2fs_lock_all(sbi); - down_write(&sbi->cp_rwsem); - open() - igrab() - write() write inline data - unlink() - f2fs_sync_node_pages() - if (is_inline_node(page)) - flush_inline_data() - ilookup() page = f2fs_pagecache_get_page() if (!page) goto iput_out; iput_out: -close() -iput() iput(inode); - f2fs_evict_inode() - f2fs_truncate_blocks() - f2fs_lock_op() - down_read(&sbi->cp_rwsem); Fixes: 2049d4fcb057 ("f2fs: avoid multiple node page writes due to inline_data") Signed-off-by: Sayali Lokhande Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 51 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3254ca7d8814..a70e76d160f1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1168,6 +1168,11 @@ static int block_operations(struct f2fs_sb_info *sbi) }; int err = 0, cnt = 0; + /* + * Let's flush inline_data in dirty node pages. 
+ */ + f2fs_flush_inline_data(sbi); + retry_flush_quotas: f2fs_lock_all(sbi); if (__need_flush_quota(sbi)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 23754ff99bb5..7e1223f59042 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3282,6 +3282,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); +int f2fs_flush_inline_data(struct f2fs_sb_info *sbi); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 62a31f72593d..79dc8b9e1816 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1807,6 +1807,53 @@ static bool flush_dirty_inode(struct page *page) return true; } +int f2fs_flush_inline_data(struct f2fs_sb_info *sbi) +{ + pgoff_t index = 0; + struct pagevec pvec; + int nr_pages; + int ret = 0; + + pagevec_init(&pvec); + + while ((nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!IS_DNODE(page)) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + /* flush inline_data, if it's async context. 
*/ + if (is_inline_node(page)) { + clear_inline_node(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + continue; + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} + int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) @@ -1870,8 +1917,8 @@ continue_unlock: goto continue_unlock; } - /* flush inline_data */ - if (is_inline_node(page)) { + /* flush inline_data, if it's async context. */ + if (do_balance && is_inline_node(page)) { clear_inline_node(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); From 390a0e2c6b129cd93f830b64b2fffb7215974802 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Mar 2020 14:35:33 +0800 Subject: [PATCH 16/38] f2fs: introduce F2FS_IOC_RESERVE_COMPRESS_BLOCKS This patch introduces a new ioctl to rollback all compress inode status: - add reserved blocks in dnode blocks - increase i_compr_blocks, i_blocks, total_valid_block_count - remove immutable flag Then compress inode can be restored to support overwrite functionality again. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 + fs/f2fs/file.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7e1223f59042..d3fd3e32164b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -430,6 +430,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64) #define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \ _IOR(F2FS_IOCTL_MAGIC, 18, __u64) +#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \ + _IOR(F2FS_IOCTL_MAGIC, 19, __u64) #define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL #define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 017b3715e230..add8852c1a5c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3601,6 +3601,165 @@ out: return ret; } +static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int reserved_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) + return -EFSCORRUPTED; + } + + while (count) { + int compr_blocks = 0; + blkcnt_t reserved; + int ret; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + blkaddr = f2fs_data_blkaddr(dn); + + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + continue; + dn->ofs_in_node += cluster_size; + goto next; + } + + if (__is_valid_data_blkaddr(blkaddr)) { + compr_blocks++; + continue; + } + + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); + } + + reserved = cluster_size - compr_blocks; + ret = inc_valid_block_count(sbi, dn->inode, &reserved); + if (ret) + return ret; + + if (reserved != 
cluster_size - compr_blocks) + return -ENOSPC; + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true); + + reserved_blocks += reserved; +next: + count -= cluster_size; + } + + return reserved_blocks; +} + +static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int reserved_blocks = 0; + int ret; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (F2FS_I(inode)->i_compr_blocks) + goto out; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + if (!IS_IMMUTABLE(inode)) { + ret = -EINVAL; + goto unlock_inode; + } + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = roundup(count, F2FS_I(inode)->i_cluster_size); + + ret = reserve_compress_blocks(&dn, count); + + f2fs_put_dnode(&dn); + + if (ret < 0) + break; + + page_idx += count; + reserved_blocks += ret; + } + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_mmap_sem); + + if (ret >= 0) { + F2FS_I(inode)->i_flags &= ~F2FS_IMMUTABLE_FL; + f2fs_set_inode_flags(inode); + inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, true); + } +unlock_inode: + 
inode_unlock(inode); +out: + mnt_drop_write_file(filp); + + if (ret >= 0) { + ret = put_user(reserved_blocks, (u64 __user *)arg); + } else if (reserved_blocks && F2FS_I(inode)->i_compr_blocks) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + "iblocks=%llu, reserved=%u, compr_blocks=%llu, " + "run fsck to fix.", + __func__, inode->i_ino, (u64)inode->i_blocks, + reserved_blocks, + F2FS_I(inode)->i_compr_blocks); + } + + return ret; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -3685,6 +3844,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_get_compress_blocks(filp, arg); case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: return f2fs_release_compress_blocks(filp, arg); + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + return f2fs_reserve_compress_blocks(filp, arg); default: return -ENOTTY; } @@ -3852,6 +4013,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_VOLUME_NAME: case F2FS_IOC_GET_COMPRESS_BLOCKS: case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: break; default: return -ENOIOCTLCMD; From 959c4f05fb1aec40ce7aab9106dea49031f03b0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Apr 2020 19:55:17 +0800 Subject: [PATCH 17/38] f2fs: use round_up to enhance calculation .i_cluster_size should be power of 2, so we can use round_up() instead of roundup() to enhance the calculation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index add8852c1a5c..af9ddc52bcd4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -742,16 +742,9 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) * for compressed file, only support cluster size * aligned truncation. 
*/ - if (f2fs_compressed_file(inode)) { - size_t cluster_shift = PAGE_SHIFT + - F2FS_I(inode)->i_log_cluster_size; - size_t cluster_mask = (1 << cluster_shift) - 1; - - free_from = from >> cluster_shift; - if (from & cluster_mask) - free_from++; - free_from <<= cluster_shift; - } + if (f2fs_compressed_file(inode)) + free_from = round_up(from, + F2FS_I(inode)->i_cluster_size << PAGE_SHIFT); #endif err = f2fs_do_truncate_blocks(inode, free_from, lock); @@ -3566,7 +3559,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) end_offset = ADDRS_PER_PAGE(dn.node_page, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); - count = roundup(count, F2FS_I(inode)->i_cluster_size); + count = round_up(count, F2FS_I(inode)->i_cluster_size); ret = release_compress_blocks(&dn, count); @@ -3718,7 +3711,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) end_offset = ADDRS_PER_PAGE(dn.node_page, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); - count = roundup(count, F2FS_I(inode)->i_cluster_size); + count = round_up(count, F2FS_I(inode)->i_cluster_size); ret = reserve_compress_blocks(&dn, count); From abb0096a66c3ad6991dfa1b30e6775c606bb6e2d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 31 Mar 2020 11:43:07 -0700 Subject: [PATCH 18/38] f2fs: refactor resize_fs to avoid meta updates in progress Sahitya raised an issue: - prevent meta updates while checkpoint is in progress allocate_segment_for_resize() can cause metapage updates if it requires to change the current node/data segments for resizing. Stop these meta updates when there is a checkpoint already in progress to prevent inconsistent CP data. 
Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 5 +- fs/f2fs/gc.c | 119 +++++++++++++++++++++--------------- fs/f2fs/super.c | 1 - include/trace/events/f2fs.h | 4 +- 6 files changed, 78 insertions(+), 59 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a70e76d160f1..3dc3ac6fe143 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1562,7 +1562,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return 0; f2fs_warn(sbi, "Start checkpoint disabled!"); } - mutex_lock(&sbi->cp_mutex); + if (cpc->reason != CP_RESIZE) + mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || @@ -1631,7 +1632,8 @@ stop: f2fs_update_time(sbi, CP_TIME); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: - mutex_unlock(&sbi->cp_mutex); + if (cpc->reason != CP_RESIZE) + mutex_unlock(&sbi->cp_mutex); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d3fd3e32164b..3a604d067154 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -194,6 +194,7 @@ enum { #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 #define CP_PAUSE 0x00000040 +#define CP_RESIZE 0x00000080 #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ @@ -1470,7 +1471,6 @@ struct f2fs_sb_info { unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ - struct mutex resize_mutex; /* for resize exclusion */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index af9ddc52bcd4..9ce95f63422e 100644 --- 
a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3315,7 +3315,6 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); __u64 block_count; - int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3327,9 +3326,7 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) sizeof(block_count))) return -EFAULT; - ret = f2fs_resize_fs(sbi, block_count); - - return ret; + return f2fs_resize_fs(sbi, block_count); } static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 28a8c79c8bdc..f3c45ec2a7e7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -1405,12 +1406,29 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } -static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, - unsigned int end) +static int free_segment_range(struct f2fs_sb_info *sbi, + unsigned int secs, bool gc_only) { - int type; - unsigned int segno, next_inuse; + unsigned int segno, next_inuse, start, end; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + int gc_mode, gc_type; int err = 0; + int type; + + /* Force block allocation for GC */ + MAIN_SECS(sbi) -= secs; + start = MAIN_SECS(sbi) * sbi->segs_per_sec; + end = MAIN_SEGS(sbi) - 1; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++) + if (SIT_I(sbi)->last_victim[gc_mode] >= start) + SIT_I(sbi)->last_victim[gc_mode] = 0; + + for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++) + if (sbi->next_victim_seg[gc_type] >= start) + sbi->next_victim_seg[gc_type] = NULL_SEGNO; + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); /* Move out cursegs from the target range */ for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++) @@ -1423,18 +1441,24 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, 
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - down_write(&sbi->gc_lock); do_garbage_collect(sbi, segno, &gc_list, FG_GC); - up_write(&sbi->gc_lock); put_gc_inode(&gc_list); - if (get_valid_blocks(sbi, segno, true)) - return -EAGAIN; + if (!gc_only && get_valid_blocks(sbi, segno, true)) { + err = -EAGAIN; + goto out; + } + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out; + } } + if (gc_only) + goto out; - err = f2fs_sync_fs(sbi->sb, 1); + err = f2fs_write_checkpoint(sbi, &cpc); if (err) - return err; + goto out; next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start); if (next_inuse <= end) { @@ -1442,6 +1466,8 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, next_inuse); f2fs_bug_on(sbi, 1); } +out: + MAIN_SECS(sbi) += secs; return err; } @@ -1487,6 +1513,7 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; + MAIN_SECS(sbi) += secs; FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); @@ -1508,8 +1535,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) { __u64 old_block_count, shrunk_blocks; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; unsigned int secs; - int gc_mode, gc_type; int err = 0; __u32 rem; @@ -1544,10 +1571,27 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) return -EINVAL; } - freeze_bdev(sbi->sb->s_bdev); - shrunk_blocks = old_block_count - block_count; secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); + + /* stop other GC */ + if (!down_write_trylock(&sbi->gc_lock)) + return -EAGAIN; + + /* stop CP to protect MAIN_SEC in free_segment_range */ + f2fs_lock_op(sbi); + err = free_segment_range(sbi, secs, 
true); + f2fs_unlock_op(sbi); + up_write(&sbi->gc_lock); + if (err) + return err; + + set_sbi_flag(sbi, SBI_IS_RESIZEFS); + + freeze_super(sbi->sb); + down_write(&sbi->gc_lock); + mutex_lock(&sbi->cp_mutex); + spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + sbi->current_reserved_blocks + sbi->unusable_block_count + @@ -1556,69 +1600,44 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) else sbi->user_block_count -= shrunk_blocks; spin_unlock(&sbi->stat_lock); - if (err) { - thaw_bdev(sbi->sb->s_bdev, sbi->sb); - return err; - } - - mutex_lock(&sbi->resize_mutex); - set_sbi_flag(sbi, SBI_IS_RESIZEFS); - - mutex_lock(&DIRTY_I(sbi)->seglist_lock); - - MAIN_SECS(sbi) -= secs; - - for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++) - if (SIT_I(sbi)->last_victim[gc_mode] >= - MAIN_SECS(sbi) * sbi->segs_per_sec) - SIT_I(sbi)->last_victim[gc_mode] = 0; - - for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++) - if (sbi->next_victim_seg[gc_type] >= - MAIN_SECS(sbi) * sbi->segs_per_sec) - sbi->next_victim_seg[gc_type] = NULL_SEGNO; - - mutex_unlock(&DIRTY_I(sbi)->seglist_lock); - - err = free_segment_range(sbi, MAIN_SECS(sbi) * sbi->segs_per_sec, - MAIN_SEGS(sbi) - 1); if (err) - goto out; + goto out_err; + + err = free_segment_range(sbi, secs, false); + if (err) + goto recover_out; update_sb_metadata(sbi, -secs); err = f2fs_commit_super(sbi, false); if (err) { update_sb_metadata(sbi, secs); - goto out; + goto recover_out; } - mutex_lock(&sbi->cp_mutex); update_fs_metadata(sbi, -secs); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); set_sbi_flag(sbi, SBI_IS_DIRTY); - mutex_unlock(&sbi->cp_mutex); - err = f2fs_sync_fs(sbi->sb, 1); + err = f2fs_write_checkpoint(sbi, &cpc); if (err) { - mutex_lock(&sbi->cp_mutex); update_fs_metadata(sbi, secs); - mutex_unlock(&sbi->cp_mutex); update_sb_metadata(sbi, secs); f2fs_commit_super(sbi, false); } -out: +recover_out: if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run 
fsck to repair!"); - MAIN_SECS(sbi) += secs; spin_lock(&sbi->stat_lock); sbi->user_block_count += shrunk_blocks; spin_unlock(&sbi->stat_lock); } +out_err: + mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->gc_lock); + thaw_super(sbi->sb); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); - mutex_unlock(&sbi->resize_mutex); - thaw_bdev(sbi->sb->s_bdev, sbi->sb); return err; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68836cada44f..f752711f3685 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3442,7 +3442,6 @@ try_onemore: init_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - mutex_init(&sbi->resize_mutex); init_rwsem(&sbi->node_write); init_rwsem(&sbi->node_change); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7daa8e5f6186..c299deb046c2 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -50,6 +50,7 @@ TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); +TRACE_DEFINE_ENUM(CP_RESIZE); #define show_block_type(type) \ __print_symbolic(type, \ @@ -136,7 +137,8 @@ TRACE_DEFINE_ENUM(CP_PAUSE); { CP_RECOVERY, "Recovery" }, \ { CP_DISCARD, "Discard" }, \ { CP_PAUSE, "Pause" }, \ - { CP_TRIMMED, "Trimmed" }) + { CP_TRIMMED, "Trimmed" }, \ + { CP_RESIZE, "Resize" }) #define show_fsync_cpreason(type) \ __print_symbolic(type, \ From 934f1fbed2f6fb3041d39bf6cc92ec36a25d3594 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Apr 2020 23:00:57 +0100 Subject: [PATCH 19/38] f2fs: remove redundant assignment to variable err The variable err is being assigned with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. 
Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 58db1de7ca94..63b34a161cf4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -564,7 +564,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; struct page *page; - int err = -ENOENT; + int err; trace_f2fs_unlink_enter(dir, dentry); From 54ab9909d4b0c7a7ad8c114a945df3fd1429bb7d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 21 Apr 2020 19:36:21 +0800 Subject: [PATCH 20/38] f2fs: compress: don't handle non-compressed data in workqueue If bio has no compressed data, we don't need to handle end_io work in workqueue, instead, it should just let interrupter handle it directly to speed up IO response. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6ea6d0355033..6ed7f359febd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -114,7 +114,8 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_DECRYPT, - STEP_DECOMPRESS, + STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */ + STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */ STEP_VERITY, }; @@ -990,7 +991,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS; + post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; if (f2fs_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; @@ -2189,6 +2190,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, for (i = 0; i < 
dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2225,6 +2227,11 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; + /* tag STEP_DECOMPRESS to handle IO in wq */ + ctx = bio->bi_private; + if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS))) + ctx->enabled_steps |= 1 << STEP_DECOMPRESS; + inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); ClearPageError(page); From a61d65d2b368b32a7382e2d5b4dc3743cbe0100d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Apr 2020 17:57:33 +0800 Subject: [PATCH 21/38] f2fs: fix potential use-after-free issue In error path of f2fs_read_multi_pages(), it should let last referrer release decompress io context memory, otherwise, other referrer will cause use-after-free issue. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6ed7f359febd..a773c2902bae 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2208,16 +2208,16 @@ submit_and_realloc: page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - bio = NULL; dic->failed = true; if (refcount_sub_and_test(dic->nr_cpages - i, - &dic->ref)) + &dic->ref)) { f2fs_decompress_end_io(dic->rpages, cc->cluster_size, true, false); - f2fs_free_dic(dic); + f2fs_free_dic(dic); + } f2fs_put_dnode(&dn); - *bio_ret = bio; + *bio_ret = NULL; return ret; } } From 6654faac9e94c8b3c29b19ef47c051f46c842436 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Apr 2020 18:03:06 +0800 Subject: [PATCH 22/38] f2fs: add compressed/gc data read IO stat in order to account data read IOs more accurately. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + fs/f2fs/f2fs.h | 2 ++ fs/f2fs/gc.c | 2 ++ fs/f2fs/sysfs.c | 7 +++++++ include/trace/events/f2fs.h | 11 ++++++++--- 5 files changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a773c2902bae..07bf03700310 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2234,6 +2234,7 @@ submit_and_realloc: inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = blkaddr; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3a604d067154..d7df622e5dca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1149,6 +1149,8 @@ enum iostat_type { APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ FS_DATA_READ_IO, /* data read IOs */ + FS_GDATA_READ_IO, /* data read IOs from background gc */ + FS_CDATA_READ_IO, /* compressed data read IOs */ FS_NODE_READ_IO, /* node read IOs */ FS_META_READ_IO, /* meta read IOs */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f3c45ec2a7e7..5b95d5a146eb 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -740,6 +740,7 @@ got_it: f2fs_put_page(page, 1); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); return 0; put_encrypted_page: @@ -846,6 +847,7 @@ static int move_data_block(struct inode *inode, block_t bidx, } f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); lock_page(mpage); if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0b20f498762b..8877d0de1bbc 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -801,6 +801,7 @@ static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, seq_printf(seq, "time: %-16llu\n", now); /* print app write IOs */ + seq_puts(seq, "[WRITE]\n"); 
seq_printf(seq, "app buffered: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_IO]); seq_printf(seq, "app direct: %-16llu\n", @@ -827,6 +828,7 @@ static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, sbi->rw_iostat[FS_CP_META_IO]); /* print app read IOs */ + seq_puts(seq, "[READ]\n"); seq_printf(seq, "app buffered: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_READ_IO]); seq_printf(seq, "app direct: %-16llu\n", @@ -837,12 +839,17 @@ static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, /* print fs read IOs */ seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_READ_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GDATA_READ_IO]); + seq_printf(seq, "fs compr_data: %-16llu\n", + sbi->rw_iostat[FS_CDATA_READ_IO]); seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_READ_IO]); seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_READ_IO]); /* print other IOs */ + seq_puts(seq, "[OTHER]\n"); seq_printf(seq, "fs discard: %-16llu\n", sbi->rw_iostat[FS_DISCARD]); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c299deb046c2..3c7c018a5740 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1849,6 +1849,8 @@ TRACE_EVENT(f2fs_iostat, __field(unsigned long long, app_rio) __field(unsigned long long, app_mrio) __field(unsigned long long, fs_drio) + __field(unsigned long long, fs_gdrio) + __field(unsigned long long, fs_cdrio) __field(unsigned long long, fs_nrio) __field(unsigned long long, fs_mrio) __field(unsigned long long, fs_discard) @@ -1873,6 +1875,8 @@ TRACE_EVENT(f2fs_iostat, __entry->app_rio = iostat[APP_READ_IO]; __entry->app_mrio = iostat[APP_MAPPED_READ_IO]; __entry->fs_drio = iostat[FS_DATA_READ_IO]; + __entry->fs_gdrio = iostat[FS_GDATA_READ_IO]; + __entry->fs_cdrio = iostat[FS_CDATA_READ_IO]; __entry->fs_nrio = iostat[FS_NODE_READ_IO]; __entry->fs_mrio = iostat[FS_META_READ_IO]; __entry->fs_discard = iostat[FS_DISCARD]; @@ -1884,15 +1888,16 @@ 
TRACE_EVENT(f2fs_iostat, "gc [data=%llu, node=%llu], " "cp [data=%llu, node=%llu, meta=%llu], " "app [read=%llu (direct=%llu, buffered=%llu), mapped=%llu], " - "fs [data=%llu, node=%llu, meta=%llu]", + "fs [data=%llu, (gc_data=%llu, compr_data=%llu), " + "node=%llu, meta=%llu]", show_dev(__entry->dev), __entry->app_wio, __entry->app_dio, __entry->app_bio, __entry->app_mio, __entry->fs_dio, __entry->fs_nio, __entry->fs_mio, __entry->fs_discard, __entry->fs_gc_dio, __entry->fs_gc_nio, __entry->fs_cp_dio, __entry->fs_cp_nio, __entry->fs_cp_mio, __entry->app_rio, __entry->app_drio, __entry->app_brio, - __entry->app_mrio, __entry->fs_drio, __entry->fs_nrio, - __entry->fs_mrio) + __entry->app_mrio, __entry->fs_drio, __entry->fs_gdrio, + __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); #endif /* _TRACE_F2FS_H */ From 190172325d695ad15e0d53d661dabf26e5d57b7d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 May 2020 09:16:03 +0800 Subject: [PATCH 23/38] f2fs: compress: fix zstd data corruption During zstd compression, ZSTD_endStream() may return a non-zero value because the destination buffer is full while compressed data still remains in the intermediate buffer. This means that the zstd algorithm cannot save at least one block of space, so let's just write back the raw data instead of the compressed one; this fixes data corruption when decompressing incompletely stored compressed data.
Fixes: 50cfa66f0de0 ("f2fs: compress: support zstd compress algorithm") Signed-off-by: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index fd7cdb814def..f4b06a5c3a8e 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -361,6 +361,13 @@ static int zstd_compress_pages(struct compress_ctx *cc) return -EIO; } + /* + * there is compressed data remained in intermediate buffer due to + * no more space in cbuf.cdata + */ + if (ret) + return -EAGAIN; + cc->clen = outbuf.pos; return 0; } From c1d16a3208b90b64eaaf1e9b0226e839969c1133 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 May 2020 18:00:33 -0700 Subject: [PATCH 24/38] f2fs: avoid infinite loop to wait for flushing node pages at cp_error Shutdown test sometimes hangs, since it keeps trying to flush dirty node pages in an infinite loop. Let's drop dirty pages at umount in that case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 79dc8b9e1816..f9823bbb0fc6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1520,8 +1520,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; + } goto redirty_out; + } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; From 762f39459ad31780a41aaf2f3a972e1f20095675 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 29 May 2020 15:05:22 +0200 Subject: [PATCH 25/38] writeback: Avoid skipping inode writeback Inode's i_io_list list head is used to attach inode to several different lists - wb->{b_dirty, b_dirty_time, b_io, b_more_io}.
When flush worker prepares a list of inodes to writeback e.g. for sync(2), it moves inodes to b_io list. Thus it is critical for sync(2) data integrity guarantees that inode is not requeued to any other writeback list when inode is queued for processing by flush worker. That's the reason why writeback_single_inode() does not touch i_io_list (unless the inode is completely clean) and why __mark_inode_dirty() does not touch i_io_list if I_SYNC flag is set. However there are two flaws in the current logic: 1) When inode has only I_DIRTY_TIME set but it is already queued in b_io list due to sync(2), concurrent __mark_inode_dirty(inode, I_DIRTY_SYNC) can still move inode back to b_dirty list resulting in skipping writeback of inode time stamps during sync(2). 2) When inode is on b_dirty_time list and writeback_single_inode() races with __mark_inode_dirty() like: writeback_single_inode() __mark_inode_dirty(inode, I_DIRTY_PAGES) inode->i_state |= I_SYNC __writeback_single_inode() inode->i_state |= I_DIRTY_PAGES; if (inode->i_state & I_SYNC) bail if (!(inode->i_state & I_DIRTY_ALL)) - not true so nothing done We end up with I_DIRTY_PAGES inode on b_dirty_time list and thus standard background writeback will not writeback this inode leading to possible dirty throttling stalls etc. (thanks to Martijn Coenen for this analysis). Fix these problems by tracking whether inode is queued in b_io or b_more_io lists in a new I_SYNC_QUEUED flag. When this flag is set, we know flush worker has queued inode and we should not touch i_io_list. On the other hand we also know that once flush worker is done with the inode it will requeue the inode to appropriate dirty list. When I_SYNC_QUEUED is not set, __mark_inode_dirty() can (and must) move inode to appropriate dirty list. 
Reported-by: Martijn Coenen Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/fs-writeback.c | 39 +++++++++++++++++++++++++++++---------- include/linux/fs.h | 8 ++++++-- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..36dda58f2846 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -160,7 +160,9 @@ static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -994,7 +996,9 @@ void inode_io_list_del(struct inode *inode) struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); inode_io_list_del_locked(inode, wb); + spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } @@ -1043,8 +1047,9 @@ void sb_clear_inode_writeback(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. 
*/ -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +static void __redirty_tail(struct inode *inode, struct bdi_writeback *wb) { + assert_spin_locked(&inode->i_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1053,6 +1058,14 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); + inode->i_state &= ~I_SYNC_QUEUED; +} + +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +{ + spin_lock(&inode->i_lock); + __redirty_tail(inode, wb); + spin_unlock(&inode->i_lock); } /* @@ -1121,8 +1134,11 @@ static int move_expired_inodes(struct list_head *delaying_queue, break; list_move(&inode->i_io_list, &tmp); moved++; + spin_lock(&inode->i_lock); if (flags & EXPIRE_DIRTY_ATIME) - set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); + inode->i_state |= I_DIRTY_TIME_EXPIRED; + inode->i_state |= I_SYNC_QUEUED; + spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -1265,7 +1281,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode, wb); + __redirty_tail(inode, wb); return; } @@ -1285,7 +1301,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode, wb); + __redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -1293,10 +1309,11 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * such as delayed allocation during submission or metadata * updates after data IO completion. 
*/ - redirty_tail(inode, wb); + __redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ inode_io_list_del_locked(inode, wb); @@ -1540,8 +1557,9 @@ static long writeback_sb_inodes(struct super_block *sb, */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + inode->i_state &= ~I_SYNC_QUEUED; + __redirty_tail(inode, wb); spin_unlock(&inode->i_lock); - redirty_tail(inode, wb); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { @@ -2160,11 +2178,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state |= flags; /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * If the inode is queued for writeback by flush worker, just + * update its dirty state. Once the flush worker is done with + * the inode it will place it on the appropriate superblock + * list, based upon its state. */ - if (inode->i_state & I_SYNC) + if (inode->i_state & I_SYNC_QUEUED) goto out_unlock_inode; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b8b7898623a..6675c2ee96d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2051,6 +2051,10 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * * I_CREATING New object's inode in the middle of setting up. * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@ -2068,11 +2072,11 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) -#define __I_DIRTY_TIME_EXPIRED 12 -#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) +#define I_DIRTY_TIME_EXPIRED (1 << 12) #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) +#define I_SYNC_QUEUED (1 << 16) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) From 4b6c30cb00acecbbbaa1c5b45d3bb3836bf327ee Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 26 May 2020 17:05:43 +0800 Subject: [PATCH 26/38] f2fs: code cleanup by removing ifdef macro surrounding Define f2fs_listxattr to NULL when CONFIG_F2FS_FS_XATTR is not enabled, so that we can remove many ugly ifdef macros in the code. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 -- fs/f2fs/namei.c | 8 -------- fs/f2fs/xattr.h | 6 +----- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9ce95f63422e..efe46b308b4b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -982,9 +982,7 @@ const struct inode_operations f2fs_file_inode_operations = { .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif .fiemap = f2fs_fiemap, }; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 63b34a161cf4..8f55201441f7 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1287,9 +1287,7 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif }; const struct inode_operations f2fs_dir_inode_operations = { @@ -1307,9 +1305,7 @@ const struct
inode_operations f2fs_dir_inode_operations = { .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif .fiemap = f2fs_fiemap, }; @@ -1317,9 +1313,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif }; const struct inode_operations f2fs_special_inode_operations = { @@ -1327,7 +1321,5 @@ const struct inode_operations f2fs_special_inode_operations = { .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif }; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 6a192e6c7a9e..416d652774a3 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -136,6 +136,7 @@ extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); #else #define f2fs_xattr_handlers NULL +#define f2fs_listxattr NULL static inline int f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *page, int flags) @@ -148,11 +149,6 @@ static inline int f2fs_getxattr(struct inode *inode, int index, { return -EOPNOTSUPP; } -static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, - size_t buffer_size) -{ - return -EOPNOTSUPP; -} static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { } #endif From 8213d843a57daa83101a911341c6cc5fde99730f Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 27 May 2020 13:02:31 +0900 Subject: [PATCH 27/38] f2fs: protect new segment allocation in expand_inode_data Found a new segment allocation without f2fs_lock_op() in expand_inode_data(). So, when we do fallocate() for a pinned file and trigger checkpoint very frequently and simultaneously,
F2FS gets stuck in the below code of do_checkpoint() forever. f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* Wait for all dirty meta pages to be submitted for IO */ <= if fallocate() here, f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); <= it'll wait forever. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index efe46b308b4b..61b3e4f9e97e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1661,7 +1661,11 @@ next_alloc: down_write(&sbi->pin_sem); map.m_seg_type = CURSEG_COLD_DATA_PINNED; + + f2fs_lock_op(sbi); f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA); + f2fs_unlock_op(sbi); + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); up_write(&sbi->pin_sem); From 1aaf4f7af711c9b1388dbe520c1287277876b560 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 May 2020 18:27:51 +0800 Subject: [PATCH 28/38] f2fs: fix wrong value of tracepoint parameter In f2fs_lookup(), we should set @err correctly before printing it in tracepoint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 8f55201441f7..e94e02c6580a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -504,6 +504,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, err = PTR_ERR(page); goto out; } + err = -ENOENT; goto out_splice; } @@ -549,7 +550,7 @@ out_splice: #endif new = d_splice_alias(inode, dentry); err = PTR_ERR_OR_ZERO(new); - trace_f2fs_lookup_end(dir, dentry, ino, err); + trace_f2fs_lookup_end(dir, dentry, ino, !new ? 
-ENOENT : err); return new; out_iput: iput(inode); From e8cda40e84505a1d2cfc21214ae4e4154f533845 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 May 2020 18:27:52 +0800 Subject: [PATCH 29/38] f2fs: remove unneeded return value of __insert_discard_tree() We never use return value of __insert_discard_tree(), so remove it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 021a1bb9eb43..0429bdce8177 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1221,7 +1221,7 @@ submit: return err; } -static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, +static void __insert_discard_tree(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len, struct rb_node **insert_p, @@ -1230,7 +1230,6 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node **p; struct rb_node *parent = NULL; - struct discard_cmd *dc = NULL; bool leftmost = true; if (insert_p && insert_parent) { @@ -1242,12 +1241,8 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart, &leftmost); do_insert: - dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, + __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p, leftmost); - if (!dc) - return NULL; - - return dc; } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, From f9bfcd403577c73defffc3959cd622b2318baac5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 26 May 2020 09:55:02 +0800 Subject: [PATCH 30/38] f2fs: compress: don't compress any datas after cp stop While compressed data writeback, we need to drop dirty pages like we did for non-compressed pages if cp stops, however it's not needed to compress any data in such case, so let's 
detect cp stop condition in cluster_may_compress() to avoid redundant compressing and let the following f2fs_write_raw_pages() drop dirty pages correctly. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index f4b06a5c3a8e..38762d13d19a 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -820,6 +820,8 @@ static bool cluster_may_compress(struct compress_ctx *cc) return false; if (!f2fs_cluster_is_full(cc)) return false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) + return false; return __cluster_may_compress(cc); } From 8b7de4d0084fea24576f39c4e4fcda7b42515761 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 May 2020 17:29:47 +0800 Subject: [PATCH 31/38] f2fs: fix wrong discard space Under heavy fsstress, we may trigger a panic while issuing discard, because __check_sit_bitmap() detects that a discard command may erase valid data blocks. The root cause is described by the race stack below: since we removed the lock when flushing quota data, quota data writeback may race with write_checkpoint(), which causes inconsistency between the cached discard entry and the segment bitmap. - f2fs_write_checkpoint - block_operations - set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH) - f2fs_flush_sit_entries - add_discard_addrs - __set_bit_le(i, (void *)de->discard_map); - f2fs_write_data_pages - f2fs_write_single_data_page : inode is quota one, cp_rwsem won't be locked - f2fs_do_write_data_page - f2fs_allocate_data_block - f2fs_wait_discard_bio : discard entry has not been added yet. - update_sit_entry - f2fs_clear_prefree_segments - f2fs_issue_discard : add discard entry In order to fix this, this patch uses node_write to serialize f2fs_allocate_data_block and checkpoint.
Fixes: 435cbab95e39 ("f2fs: fix quota_sync failure due to f2fs_lock_op") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0429bdce8177..e4898789c5c0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3106,6 +3106,14 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, type = CURSEG_COLD_DATA; } + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + if (IS_DATASEG(type)) + down_write(&sbi->node_write); + down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); @@ -3171,6 +3179,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, up_read(&SM_I(sbi)->curseg_lock); + if (IS_DATASEG(type)) + up_write(&sbi->node_write); + if (put_pin_sem) up_read(&sbi->pin_sem); } From 0c23000823ccfd33efddd57c809679edfa10de6f Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 2 Jun 2020 18:11:47 +0530 Subject: [PATCH 32/38] f2fs: fix retry logic in f2fs_write_cache_pages() In case a compressed file is getting overwritten, the current retry logic doesn't include the current page to be retried now as it sets the new start index as 0 and new end index as writeback_index - 1. This causes the corresponding cluster to be uncompressed and written as normal pages without compression. Fix this by allowing writeback to be retried for the current page as well (in case of compressed page getting retried due to index mismatch with cluster index). So that this cluster can be written compressed in case of overwrite. Also, align f2fs_write_cache_pages() according to the change - <64081362e8ff>("mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock"). 
Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 07bf03700310..31d3b815f884 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2861,7 +2861,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; int tag; int nwritten = 0; @@ -2879,17 +2878,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -3054,12 +3048,13 @@ next: } } #endif - if ((!cycled && !done) || retry) { - cycled = 1; + if (retry) { index = 0; - end = writeback_index - 1; + end = -1; goto retry; } + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; From d17c0c64eb1681a2c2967be80efdfc3b92037152 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Jun 2020 21:57:48 -0700 Subject: [PATCH 33/38] f2fs: don't return vmalloc() memory from f2fs_kmalloc() kmalloc() returns kmalloc'ed memory, and kvmalloc() returns either kmalloc'ed or vmalloc'ed memory. But the f2fs wrappers, f2fs_kmalloc() and f2fs_kvmalloc(), both return both kinds of memory. It's redundant to have two functions that do the same thing, and also breaking the standard naming convention is causing bugs since people assume it's safe to kfree() memory allocated by f2fs_kmalloc(). 
See e.g. the various allocations in fs/f2fs/compress.c. Fix this by making f2fs_kmalloc() just use kmalloc(). And to avoid re-introducing the allocation failures that the vmalloc fallback was intended to fix, convert the largest allocations to use f2fs_kvmalloc(). Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/f2fs.h | 8 +------- fs/f2fs/node.c | 8 ++++---- fs/f2fs/super.c | 2 +- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3dc3ac6fe143..236064930251 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -895,8 +895,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) int i; int err; - sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks), - GFP_KERNEL); + sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d7df622e5dca..70e1c5d7e091 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2996,18 +2996,12 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { - void *ret; - if (time_to_inject(sbi, FAULT_KMALLOC)) { f2fs_show_injection_info(sbi, FAULT_KMALLOC); return NULL; } - ret = kmalloc(size, flags); - if (ret) - return ret; - - return kvmalloc(size, flags); + return kmalloc(size, flags); } static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f9823bbb0fc6..bd9da9de9b17 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2993,7 +2993,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) return 0; nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); - nm_i->nat_bits = f2fs_kzalloc(sbi, + nm_i->nat_bits = f2fs_kvzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; @@ -3126,9 
+3126,9 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) int i; nm_i->free_nid_bitmap = - f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *), - nm_i->nat_blocks), - GFP_KERNEL); + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f752711f3685..b05de4256b5b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3028,7 +3028,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; - FDEV(devi).blkz_seq = f2fs_kzalloc(sbi, + FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, BITS_TO_LONGS(FDEV(devi).nr_blkz) * sizeof(unsigned long), GFP_KERNEL); From 475122997d54126bdc946b1e43a607084ddd853f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 1 Jun 2020 13:08:05 -0700 Subject: [PATCH 34/38] f2fs: avoid utf8_strncasecmp() with unstable name If the dentry name passed to ->d_compare() fits in dentry::d_iname, then it may be concurrently modified by a rename. This can cause undefined behavior (possibly out-of-bounds memory accesses or crashes) in utf8_strncasecmp(), since fs/unicode/ isn't written to handle strings that may be concurrently modified. Fix this by first copying the filename to a stack buffer if needed. This way we get a stable snapshot of the filename. 
Fixes: 2c2eb7a300cd ("f2fs: Support case-insensitive file name lookups") Cc: # v5.4+ Cc: Al Viro Cc: Daniel Rosenberg Cc: Gabriel Krisman Bertazi Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c5c6559e550a..a02ffa06d9df 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1114,11 +1114,27 @@ static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, const struct inode *dir = READ_ONCE(parent->d_inode); const struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); struct qstr entry = QSTR_INIT(str, len); + char strbuf[DNAME_INLINE_LEN]; int res; if (!dir || !IS_CASEFOLDED(dir)) goto fallback; + /* + * If the dentry name is stored in-line, then it may be concurrently + * modified by a rename. If this happens, the VFS will eventually retry + * the lookup, so it doesn't matter what ->d_compare() returns. + * However, it's unsafe to call utf8_strncasecmp() with an unstable + * string. Therefore, we have to copy the name into a temporary buffer. + */ + if (len <= DNAME_INLINE_LEN - 1) { + memcpy(strbuf, str, len); + strbuf[len] = 0; + entry.name = strbuf; + /* prevent compiler from optimizing out the temporary buffer */ + barrier(); + } + res = utf8_strncasecmp(sbi->s_encoding, name, &entry); if (res >= 0) return res; From 63dbcae6b44b890886f49b0860a3d0bf1242e4c9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 8 Jun 2020 20:03:16 +0800 Subject: [PATCH 35/38] f2fs: handle readonly filesystem in f2fs_ioc_shutdown() If the mountpoint is read-only, we should still allow shutting down the filesystem successfully; this fixes an issue found by the generic/599 test case of xfstests.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 61b3e4f9e97e..90f3945fd988 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2235,8 +2235,15 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (in != F2FS_GOING_DOWN_FULLSYNC) { ret = mnt_want_write_file(filp); - if (ret) + if (ret) { + if (ret == -EROFS) { + ret = 0; + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + trace_f2fs_shutdown(sbi, in, ret); + } return ret; + } } switch (in) { From 7457e0d51771554bf084b016a0f5c1926a8c8b4f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 8 Jun 2020 20:03:17 +0800 Subject: [PATCH 36/38] f2fs: remove unused parameter of f2fs_put_rpages_mapping() Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 38762d13d19a..e97263bf181b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -90,8 +90,7 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) f2fs_drop_rpages(cc, len, true); } -static void f2fs_put_rpages_mapping(struct compress_ctx *cc, - struct address_space *mapping, +static void f2fs_put_rpages_mapping(struct address_space *mapping, pgoff_t start, int len) { int i; @@ -913,7 +912,7 @@ retry: if (!PageUptodate(page)) { f2fs_unlock_rpages(cc, i + 1); - f2fs_put_rpages_mapping(cc, mapping, start_idx, + f2fs_put_rpages_mapping(mapping, start_idx, cc->cluster_size); f2fs_destroy_compress_ctx(cc); goto retry; @@ -948,7 +947,7 @@ retry: unlock_pages: f2fs_unlock_rpages(cc, i); release_pages: - f2fs_put_rpages_mapping(cc, mapping, start_idx, i); + f2fs_put_rpages_mapping(mapping, start_idx, i); f2fs_destroy_compress_ctx(cc); return ret; } From 9c48217ad11f316c42a40f259571cf38b213ad6a Mon Sep 17 00:00:00 2001 From: 
Jaegeuk Kim Date: Thu, 4 Jun 2020 11:49:43 -0700 Subject: [PATCH 37/38] f2fs: add node_io_flag for bio flags likewise data_io_flag This patch adds another way to attach bio flags to node writes. Description: Give a way to attach REQ_META|FUA to node writes given temperature-based bits. Now the bits indicate: * REQ_META | REQ_FUA | * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 +++++++++ fs/f2fs/data.c | 24 +++++++++++++++--------- fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 2 ++ 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 427f5b45c67f..4bb93a06d8ab 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -333,6 +333,15 @@ Description: Give a way to attach REQ_META|FUA to data writes * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | +What: /sys/fs/f2fs//node_io_flag +Date: June 2020 +Contact: "Jaegeuk Kim" +Description: Give a way to attach REQ_META|FUA to node writes + given temperature-based bits. 
Now the bits indicate: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + What: /sys/fs/f2fs//iostat_period_ms Date: April 2020 Contact: "Daeho Jeong" diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 31d3b815f884..1b5c43b85efe 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -514,22 +514,28 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi, __submit_bio(sbi, bio, type); } -static void __attach_data_io_flag(struct f2fs_io_info *fio) +static void __attach_io_flag(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int fua_flag = sbi->data_io_flag & temp_mask; - unsigned int meta_flag = (sbi->data_io_flag >> NR_TEMP_TYPE) & - temp_mask; + unsigned int io_flag, fua_flag, meta_flag; + + if (fio->type == DATA) + io_flag = sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = sbi->node_io_flag; + else + return; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + /* - * data io flag bits per temp: + * data/node io flag bits per temp: * REQ_META | REQ_FUA | * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | */ - if (fio->type != DATA) - return; - if ((1 << fio->temp) & meta_flag) fio->op_flags |= REQ_META; if ((1 << fio->temp) & fua_flag) @@ -543,7 +549,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - __attach_data_io_flag(fio); + __attach_io_flag(fio); bio_set_op_attrs(io->bio, fio->op, fio->op_flags); if (is_read_io(fio->op)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 70e1c5d7e091..72b9925a7a47 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1566,6 +1566,7 @@ struct f2fs_sb_info { /* to attach REQ_META|REQ_FUA flags */ unsigned int data_io_flag; + unsigned int node_io_flag; /* For sysfs suppport */ struct kobject s_kobj; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8877d0de1bbc..ec00a2e3d206 100644 --- a/fs/f2fs/sysfs.c +++ 
b/fs/f2fs/sysfs.c @@ -554,6 +554,7 @@ F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); @@ -635,6 +636,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(data_io_flag), + ATTR_LIST(node_io_flag), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(unusable), From 551162c9bf51fda41d2d5ff5e2c5f70a5800cd5f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 4 Jun 2020 16:45:30 -0700 Subject: [PATCH 38/38] f2fs: attach IO flags to the missing cases This attaches the IO flags in two more bio submission paths that were missing them: f2fs_submit_page_bio() and f2fs_merge_page_bio(). Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1b5c43b85efe..547c2c8f1a9d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -691,6 +691,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + __attach_io_flag(fio); bio_set_op_attrs(bio, fio->op, fio->op_flags); inc_page_count(fio->sbi, is_read_io(fio->op) ? @@ -877,6 +878,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_PAGES); + __attach_io_flag(fio); bio_set_op_attrs(bio, fio->op, fio->op_flags); add_bio_entry(fio->sbi, bio, page, fio->temp);