From 30b2257ff67152075e5237e585c56b8240985655 Mon Sep 17 00:00:00 2001 From: Douglas Raillard Date: Mon, 6 Mar 2023 12:25:49 +0000 Subject: [PATCH 01/54] f2fs: Fix f2fs_truncate_partial_nodes ftrace event Fix the nid_t field so that its size is correctly reported in the text format embedded in trace.dat files. As it stands, it is reported as being of size 4: field:nid_t nid[3]; offset:24; size:4; signed:0; Instead of 12: field:nid_t nid[3]; offset:24; size:12; signed:0; This also fixes the reported offset of subsequent fields so that they match with the actual struct layout. Signed-off-by: Douglas Raillard Reviewed-by: Mukesh Ojha Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index b5ab8149cc14..3c2919ea59f5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -512,7 +512,7 @@ TRACE_EVENT(f2fs_truncate_partial_nodes, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(nid_t, nid[3]) + __array(nid_t, nid, 3) __field(int, depth) __field(int, err) ), From 982eb225e4804bcab58fa11ae8d226791fee7e89 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 23 Feb 2023 01:25:13 +0000 Subject: [PATCH 02/54] f2fs: make f2fs_sync_inode_meta() static After commit 26b5a079197c ("f2fs: cleanup dirty pages if recover failed"), f2fs_sync_inode_meta() is only used in checkpoint.c, so f2fs_sync_inode_meta() should only be visible inside. Delete the declaration in the header file and change f2fs_sync_inode_meta() to static. Signed-off-by: Li Zetao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f108cf6d8763..f43121fe513b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1131,7 +1131,7 @@ retry: goto retry; } -int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +static int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) { struct list_head *head = &sbi->inode_list[DIRTY_META]; struct inode *inode; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f74ac7e67188..16bb6a6ea813 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3729,7 +3729,6 @@ void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); -int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_add_orphan_inode(struct inode *inode); From a6fe7522e931d79226cf6726b3745a9a80da1529 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 16 Feb 2023 22:09:35 +0800 Subject: [PATCH 03/54] f2fs: export compress_percent and compress_watermark entries This patch export below sysfs entries for better control cached compress page count. /sys/fs/f2fs//compress_watermark /sys/fs/f2fs//compress_percent Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 17 +++++++++++++++++ fs/f2fs/sysfs.c | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 94132745ecbe..c1314b7fe544 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -729,3 +729,20 @@ What: /sys/fs/f2fs//last_age_weight Date: January 2023 Contact: "Ping Xiong" Description: When DATA SEPARATION is on, it controls the weight of last data block age. + +What: /sys/fs/f2fs//compress_watermark +Date: February 2023 +Contact: "Yangtao Li" +Description: When compress cache is on, it controls free memory watermark + in order to limit caching compress page. If free memory is lower + than watermark, then deny caching compress page. The value should be in + range of (0, 100], by default it was initialized as 20(%). + +What: /sys/fs/f2fs//compress_percent +Date: February 2023 +Contact: "Yangtao Li" +Description: When compress cache is on, it controls cached page + percent(compress pages / free_ram) in order to limit caching compress page. + If cached page percent exceed threshold, then deny caching compress page. + The value should be in range of (0, 100], by default it was initialized + as 20(%). diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 4c72686ca36d..c7158a974b7e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -599,6 +599,20 @@ out: sbi->compr_new_inode = 0; return count; } + + if (!strcmp(a->attr.name, "compress_percent")) { + if (t == 0 || t > 100) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "compress_watermark")) { + if (t == 0 || t > 100) + return -EINVAL; + *ui = t; + return count; + } #endif if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { @@ -951,6 +965,8 @@ F2FS_FEATURE_RO_ATTR(compression); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_percent, compress_percent); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_watermark, compress_watermark); #endif F2FS_FEATURE_RO_ATTR(pin_file); @@ -1058,6 +1074,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(compr_written_block), ATTR_LIST(compr_saved_block), ATTR_LIST(compr_new_inode), + ATTR_LIST(compress_percent), + ATTR_LIST(compress_watermark), #endif /* For ATGC */ ATTR_LIST(atgc_candidate_ratio), From afd94b56a80d5b7fa133b5aa67ed533e352bd920 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 16 Feb 2023 21:53:24 +0800 Subject: [PATCH 04/54] f2fs: convert to use bitmap API Let's use BIT() and GENMASK() instead of open it. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/compress.c | 4 ++-- fs/f2fs/data.c | 12 ++++++------ fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 2 +- fs/f2fs/inode.c | 4 ++-- fs/f2fs/node.h | 20 +++++++++----------- fs/f2fs/super.c | 16 ++++++++-------- fs/f2fs/sysfs.c | 2 +- include/linux/f2fs_fs.h | 9 ++++----- 11 files changed, 48 insertions(+), 51 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f43121fe513b..24f6051bc284 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -980,7 +980,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); if (cur_page == cp2) - cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); for (i = 1; i < cp_blks; i++) { void *sit_bitmap_ptr; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a8413b68fb12..901191d6e391 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -669,7 +669,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); - if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) chksum = f2fs_crc32(F2FS_I_SB(cc->inode), cc->cbuf->cdata, cc->clen); cc->cbuf->chksum = cpu_to_le32(chksum); @@ -760,7 +760,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) ret = cops->decompress_pages(dic); - if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) { + if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { u32 provided = le32_to_cpu(dic->cbuf->chksum); u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 197a57b2e334..8e878f92dcb4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -94,17 +94,17 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { #ifdef CONFIG_FS_ENCRYPTION - STEP_DECRYPT = 1 << 0, + STEP_DECRYPT = BIT(0), #else STEP_DECRYPT = 0, /* compile out the decryption-related code */ #endif #ifdef CONFIG_F2FS_FS_COMPRESSION - STEP_DECOMPRESS = 1 << 1, + STEP_DECOMPRESS = BIT(1), #else STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ #endif #ifdef CONFIG_FS_VERITY - STEP_VERITY = 1 << 2, + STEP_VERITY = BIT(2), #else STEP_VERITY = 0, /* compile out the verity-related code */ #endif @@ -421,7 +421,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) { - unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; + unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); unsigned int fua_flag, meta_flag, io_flag; unsigned int op_flags = 0; @@ -443,9 +443,9 @@ static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | */ - if ((1 << fio->temp) & meta_flag) + if (BIT(fio->temp) & meta_flag) op_flags |= REQ_META; - if ((1 << fio->temp) & fua_flag) + if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; return op_flags; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7850eac3eda8..ce895a927c9f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -29,7 +29,7 @@ static unsigned long dir_blocks(struct inode *inode) static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) - return 1 << (level + dir_level); + return BIT(level + dir_level); else return MAX_DIR_BUCKETS; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16bb6a6ea813..d1002d2dfee8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -62,7 +62,7 @@ enum { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) +#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0)) struct f2fs_fault_info { atomic_t inject_ops; @@ -71,7 +71,7 @@ struct f2fs_fault_info { }; extern const char *f2fs_fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) #endif /* @@ -1433,7 +1433,7 @@ static inline void set_page_private_##name(struct page *page) \ static inline void clear_page_private_##name(struct page *page) \ { \ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ - if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { \ set_page_private(page, 0); \ if (PagePrivate(page)) { \ ClearPagePrivate(page); \ @@ -1480,8 +1480,8 @@ static inline void set_page_private_data(struct page *page, unsigned long data) static inline void clear_page_private_data(struct page *page) { - page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; - if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { + page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { set_page_private(page, 0); if (PagePrivate(page)) { ClearPagePrivate(page); @@ -2884,7 +2884,7 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); return mask & *addr; } @@ -2893,7 +2893,7 @@ static inline void f2fs_set_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr |= mask; } @@ -2902,7 +2902,7 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr &= ~mask; } @@ -2912,7 +2912,7 @@ static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr |= mask; return ret; @@ -2924,7 +2924,7 @@ static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr &= ~mask; return ret; @@ -2935,7 +2935,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr ^= mask; } @@ -4350,9 +4350,9 @@ static inline int set_compress_context(struct inode *inode) F2FS_OPTION(sbi).compress_log_size; F2FS_I(inode)->i_compress_flag = F2FS_OPTION(sbi).compress_chksum ? - 1 << COMPRESS_CHKSUM : 0; + BIT(COMPRESS_CHKSUM) : 0; F2FS_I(inode)->i_cluster_size = - 1 << F2FS_I(inode)->i_log_cluster_size; + BIT(F2FS_I(inode)->i_log_cluster_size); if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7e51c59d26ef..071aaad9836b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3952,7 +3952,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) F2FS_I(inode)->i_compress_algorithm = option.algorithm; F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; - F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); f2fs_mark_inode_dirty_sync(inode, true); if (!f2fs_is_compress_backend_ready(inode)) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e05b8b0fa5e6..efdc555dea96 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -454,8 +454,8 @@ static int do_read_inode(struct inode *inode) fi->i_compress_level = compress_flag >> COMPRESS_LEVEL_OFFSET; fi->i_compress_flag = compress_flag & - (BIT(COMPRESS_LEVEL_OFFSET) - 1); - fi->i_cluster_size = 1 << fi->i_log_cluster_size; + GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0); + fi->i_cluster_size = BIT(fi->i_log_cluster_size); set_inode_flag(inode, FI_COMPRESSED_FILE); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 99454d46a939..906fb67a99da 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -93,17 +93,15 @@ static inline void copy_node_info(struct node_info *dst, static inline void set_nat_flag(struct nat_entry *ne, unsigned int type, bool set) { - unsigned char mask = 0x01 << type; if (set) - ne->ni.flag |= mask; + ne->ni.flag |= BIT(type); else - ne->ni.flag &= ~mask; + ne->ni.flag &= ~BIT(type); } static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) { - unsigned char mask = 0x01 << type; - return ne->ni.flag & mask; + return ne->ni.flag & BIT(type); } static inline void nat_reset_flag(struct nat_entry *ne) @@ -225,7 +223,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - block_addr ^= 1 << sbi->log_blocks_per_seg; + block_addr ^= BIT(sbi->log_blocks_per_seg); return block_addr + nm_i->nat_blkaddr; } @@ -395,7 +393,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) static inline int is_node(struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); - return le32_to_cpu(rn->footer.flag) & (1 << type); + return le32_to_cpu(rn->footer.flag) & BIT(type); } #define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) @@ -408,9 +406,9 @@ static inline void set_cold_node(struct page *page, bool is_dir) unsigned int flag = le32_to_cpu(rn->footer.flag); if (is_dir) - flag &= ~(0x1 << COLD_BIT_SHIFT); + flag &= ~BIT(COLD_BIT_SHIFT); else - flag |= (0x1 << COLD_BIT_SHIFT); + flag |= BIT(COLD_BIT_SHIFT); rn->footer.flag = cpu_to_le32(flag); } @@ -419,9 +417,9 @@ static inline void set_mark(struct page *page, int mark, int type) struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) - flag |= (0x1 << type); + flag |= BIT(type); else - flag &= ~(0x1 << type); + flag &= ~BIT(type); rn->footer.flag = cpu_to_le32(flag); #ifdef CONFIG_F2FS_CHECK_FS diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 20830acec099..6414fd77f111 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -887,8 +887,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (args->from && match_int(args, &arg)) return -EINVAL; if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { - f2fs_warn(sbi, "Not support %d, larger than %d", - 1 << arg, BIO_MAX_VECS); + f2fs_warn(sbi, "Not support %ld, larger than %d", + BIT(arg), BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; @@ -1317,7 +1317,7 @@ default_check: #endif if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", + f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO", F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } @@ -3357,7 +3357,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, total_sections = le32_to_cpu(raw_super->section_count); /* blocks_per_seg should be 512, given the above check */ - blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); + blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg)); if (segment_count > F2FS_MAX_SEGMENT || segment_count < F2FS_MIN_SEGMENTS) { @@ -3626,9 +3626,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); - sbi->blocksize = 1 << sbi->log_blocksize; + sbi->blocksize = BIT(sbi->log_blocksize); sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); - sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; + sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg); sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); sbi->total_sections = le32_to_cpu(raw_super->section_count); @@ -3884,7 +3884,7 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) f2fs_down_write(&sbi->sb_lock); - if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) raw_super->s_stop_reason[reason]++; err = f2fs_commit_super(sbi, false); @@ -4034,7 +4034,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).start_blk, FDEV(i).end_blk); } f2fs_info(sbi, - "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); + "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi)); return 0; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c7158a974b7e..56e6451e5726 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -453,7 +453,7 @@ out: if (ret < 0) return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX)) return -EINVAL; if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 1701f25117ea..881eb9321967 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -40,9 +40,8 @@ #define F2FS_ENC_UTF8_12_1 1 -#define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ -#define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ -#define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits + 2) /* KB */ #define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */ #define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) #define F2FS_IO_ALIGNED(sbi) (F2FS_IO_SIZE(sbi) > 1) @@ -340,7 +339,7 @@ enum { OFFSET_BIT_SHIFT }; -#define OFFSET_BIT_MASK (0x07) /* (0x01 << OFFSET_BIT_SHIFT) - 1 */ +#define OFFSET_BIT_MASK GENMASK(OFFSET_BIT_SHIFT - 1, 0) struct node_footer { __le32 nid; /* node id */ @@ -545,7 +544,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_HASH_DEPTH 63 /* MAX buckets in one level of dir */ -#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) +#define MAX_DIR_BUCKETS BIT((MAX_DIR_HASH_DEPTH / 2) - 1) /* * space utilization of regular dentry and inline dentry (w/o extra reservation) From 2bdc6a5daa9b176d2728204f887d4631b753c749 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 21 Feb 2023 22:45:50 +0800 Subject: [PATCH 05/54] f2fs: handle dqget error in f2fs_transfer_project_quota() We should set the error code when dqget() failed. Fixes: 2c1d03056991 ("f2fs: support F2FS_IOC_FS{GET,SET}XATTR") Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 071aaad9836b..734fc8dae911 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2997,15 +2997,16 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) struct dquot *transfer_to[MAXQUOTAS] = {}; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; - int err = 0; + int err; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (!IS_ERR(transfer_to[PRJQUOTA])) { - err = __dquot_transfer(inode, transfer_to); - if (err) - set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - dqput(transfer_to[PRJQUOTA]); - } + if (IS_ERR(transfer_to[PRJQUOTA])) + return PTR_ERR(transfer_to[PRJQUOTA]); + + err = __dquot_transfer(inode, transfer_to); + if (err) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + dqput(transfer_to[PRJQUOTA]); return err; } From 7fabd7ebebf7bd95995fcc4a3f196b594b5acde4 Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Thu, 16 Feb 2023 16:13:50 +0900 Subject: [PATCH 06/54] f2fs: fix uninitialized skipped_gc_rwsem When f2fs skipped a gc round during victim migration, there was a bug which would skip all upcoming gc rounds unconditionally because skipped_gc_rwsem was not initialized. It fixes the bug by correctly initializing the skipped_gc_rwsem inside the gc loop. Fixes: 6f8d4455060d ("f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc") Signed-off-by: Yonggil Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 74205ec00c34..e7f8d7c5a5d0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1792,8 +1792,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); - sbi->skipped_gc_rwsem = 0; gc_more: + sbi->skipped_gc_rwsem = 0; if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; From 3e1d88e5ccd54d0572e611432d49c6823b870f28 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 10 Mar 2023 10:04:26 -0800 Subject: [PATCH 07/54] f2fs: factor out victim_entry usage from general rb_tree use Let's reduce the complexity of mixed use of rb_tree in victim_entry from extent_cache and discard_cmd. This should fix arm32 memory alignment issue caused by shared rb_entry. [struct victim_entry] [struct rb_entry] [0] struct rb_node rb_node; [0] struct rb_node rb_node; union { struct { unsigned int ofs; unsigned int len; }; [16] unsigned long long mtime; [12] unsigned long long key; } __packed; Cc: Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 36 +---------- fs/f2fs/f2fs.h | 15 +---- fs/f2fs/gc.c | 139 +++++++++++++++++++++++++---------------- fs/f2fs/gc.h | 14 +---- fs/f2fs/segment.c | 4 +- 5 files changed, 93 insertions(+), 115 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 28b12553f2b3..d1aa4609ca6b 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -204,29 +204,6 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, return re; } -struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned long long key, bool *leftmost) -{ - struct rb_node **p = &root->rb_root.rb_node; - struct rb_entry *re; - - while (*p) { - *parent = *p; - re = rb_entry(*parent, struct rb_entry, rb_node); - - if (key < re->key) { - p = &(*p)->rb_left; - } else { - p = &(*p)->rb_right; - *leftmost = false; - } - } - - return p; -} - struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -335,7 +312,7 @@ lookup_neighbors: } bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, bool check_key) + struct rb_root_cached *root) { #ifdef CONFIG_F2FS_CHECK_FS struct rb_node *cur = rb_first_cached(root), *next; @@ -352,23 +329,12 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, cur_re = rb_entry(cur, struct rb_entry, rb_node); next_re = rb_entry(next, struct rb_entry, rb_node); - if (check_key) { - if (cur_re->key > next_re->key) { - f2fs_info(sbi, "inconsistent rbtree, " - "cur(%llu) next(%llu)", - cur_re->key, next_re->key); - return false; - } - goto next; - } - if (cur_re->ofs + cur_re->len > next_re->ofs) { f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", cur_re->ofs, cur_re->len, next_re->ofs, next_re->len); return false; } -next: cur = next; } #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d1002d2dfee8..f8206501fa5c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -627,13 +627,8 @@ enum extent_type { struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - unsigned int ofs; /* start offset of the entry */ - unsigned int len; /* length of the entry */ - }; - unsigned long long key; /* 64-bits key */ - } __packed; + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ }; struct extent_info { @@ -4136,10 +4131,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); bool sanity_check_extent_cache(struct inode *inode); struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); -struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned long long key, bool *left_most); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -4150,7 +4141,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, bool check_key); + struct rb_root_cached *root); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e7f8d7c5a5d0..81b8ff81a16d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -390,40 +390,95 @@ static unsigned int count_bits(const unsigned long *addr, return sum; } -static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, - unsigned long long mtime, unsigned int segno, - struct rb_node *parent, struct rb_node **p, - bool left_most) +static bool f2fs_check_victim_tree(struct f2fs_sb_info *sbi, + struct rb_root_cached *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first_cached(root), *next; + struct victim_entry *cur_ve, *next_ve; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_ve = rb_entry(cur, struct victim_entry, rb_node); + next_ve = rb_entry(next, struct victim_entry, rb_node); + + if (cur_ve->mtime > next_ve->mtime) { + f2fs_info(sbi, "broken victim_rbtree, " + "cur_mtime(%llu) next_mtime(%llu)", + cur_ve->mtime, next_ve->mtime); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct victim_entry *__lookup_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *node = am->root.rb_root.rb_node; + struct victim_entry *ve = NULL; + + while (node) { + ve = rb_entry(node, struct victim_entry, rb_node); + + if (mtime < ve->mtime) + node = node->rb_left; + else + node = node->rb_right; + } + return ve; +} + +static struct victim_entry *__create_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; struct victim_entry *ve; - ve = f2fs_kmem_cache_alloc(victim_entry_slab, - GFP_NOFS, true, NULL); + ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS, true, NULL); ve->mtime = mtime; ve->segno = segno; - rb_link_node(&ve->rb_node, parent, p); - rb_insert_color_cached(&ve->rb_node, &am->root, left_most); - list_add_tail(&ve->list, &am->victim_list); - am->victim_count++; return ve; } -static void insert_victim_entry(struct f2fs_sb_info *sbi, +static void __insert_victim_entry(struct f2fs_sb_info *sbi, unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; - struct rb_node **p; + struct rb_root_cached *root = &am->root; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent = NULL; + struct victim_entry *ve; bool left_most = true; - p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most); - attach_victim_entry(sbi, mtime, segno, parent, p, left_most); + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + ve = rb_entry(parent, struct victim_entry, rb_node); + + if (mtime < ve->mtime) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + left_most = false; + } + } + + ve = __create_victim_entry(sbi, mtime, segno); + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, root, left_most); } static void add_victim_entry(struct f2fs_sb_info *sbi, @@ -459,19 +514,7 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, if (sit_i->dirty_max_mtime - mtime < p->age_threshold) return; - insert_victim_entry(sbi, mtime, segno); -} - -static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi, - struct victim_sel_policy *p) -{ - struct atgc_management *am = &sbi->am; - struct rb_node *parent = NULL; - bool left_most; - - f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most); - - return parent; + __insert_victim_entry(sbi, mtime, segno); } static void atgc_lookup_victim(struct f2fs_sb_info *sbi, @@ -481,7 +524,6 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, struct atgc_management *am = &sbi->am; struct rb_root_cached *root = &am->root; struct rb_node *node; - struct rb_entry *re; struct victim_entry *ve; unsigned long long total_time; unsigned long long age, u, accu; @@ -508,12 +550,10 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, node = rb_first_cached(root); next: - re = rb_entry_safe(node, struct rb_entry, rb_node); - if (!re) + ve = rb_entry_safe(node, struct victim_entry, rb_node); + if (!ve) return; - ve = (struct victim_entry *)re; - if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip; @@ -555,8 +595,6 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, { struct sit_info *sit_i = SIT_I(sbi); struct atgc_management *am = &sbi->am; - struct rb_node *node; - struct rb_entry *re; struct victim_entry *ve; unsigned long long age; unsigned long long max_mtime = sit_i->dirty_max_mtime; @@ -566,25 +604,22 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * am->victim_count / 100); - unsigned int cost; - unsigned int iter = 0; + unsigned int cost, iter; int stage = 0; if (max_mtime < min_mtime) return; max_mtime += 1; next_stage: - node = lookup_central_victim(sbi, p); + iter = 0; + ve = __lookup_victim_entry(sbi, p->age); next_node: - re = rb_entry_safe(node, struct rb_entry, rb_node); - if (!re) { - if (stage == 0) - goto skip_stage; + if (!ve) { + if (stage++ == 0) + goto next_stage; return; } - ve = (struct victim_entry *)re; - if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip_node; @@ -610,24 +645,20 @@ next_node: } skip_node: if (iter < dirty_threshold) { - if (stage == 0) - node = rb_prev(node); - else if (stage == 1) - node = rb_next(node); + ve = rb_entry(stage == 0 ? rb_prev(&ve->rb_node) : + rb_next(&ve->rb_node), + struct victim_entry, rb_node); goto next_node; } -skip_stage: - if (stage < 1) { - stage++; - iter = 0; + + if (stage++ == 0) goto next_stage; - } } + static void lookup_victim_by_age(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &sbi->am.root, true)); + f2fs_bug_on(sbi, !f2fs_check_victim_tree(sbi, &sbi->am.root)); if (p->gc_mode == GC_AT) atgc_lookup_victim(sbi, p); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 15bd1d680f67..5ad6ac63e13f 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -55,20 +55,10 @@ struct gc_inode_list { struct radix_tree_root iroot; }; -struct victim_info { - unsigned long long mtime; /* mtime of section */ - unsigned int segno; /* section No. */ -}; - struct victim_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - unsigned long long mtime; /* mtime of section */ - unsigned int segno; /* segment No. */ - }; - struct victim_info vi; /* victim info */ - }; + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ struct list_head list; }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 83342626a27f..8bd5888854f4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1479,7 +1479,7 @@ retry: goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root, false)); + &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -2966,7 +2966,7 @@ next: mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root, false)); + &dcc->root)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, From 3bcee3b97cc96bab9ddd912841dca28e1a1c49cd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 10 Mar 2023 11:12:35 -0800 Subject: [PATCH 08/54] f2fs: factor out discard_cmd usage from general rb_tree use This is a second part to remove the mixed use of rb_tree in discard_cmd from extent_cache. This should also fix arm32 memory alignment issue caused by shared rb_entry. [struct discard_cmd] [struct rb_entry] [0] struct rb_node rb_node; [0] struct rb_node rb_node; union { union { struct { struct { [16] block_t lstart; [12] unsigned int ofs; block_t len; unsigned int len; }; unsigned long long key; } __packed; Cc: Fixes: 004b68621897 ("f2fs: use rb-tree to track pending discard commands") Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 36 +----- fs/f2fs/f2fs.h | 23 +--- fs/f2fs/segment.c | 247 +++++++++++++++++++++++++++-------------- 3 files changed, 168 insertions(+), 138 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index d1aa4609ca6b..5c206f941aac 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -192,7 +192,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root, return NULL; } -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, +static struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -204,7 +204,7 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, return re; } -struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +static struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, unsigned int ofs, bool *leftmost) @@ -238,7 +238,7 @@ struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simplify the insertion after. * tree must stay unchanged between lookup and insertion. */ -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, +static struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, @@ -311,36 +311,6 @@ lookup_neighbors: return re; } -bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root) -{ -#ifdef CONFIG_F2FS_CHECK_FS - struct rb_node *cur = rb_first_cached(root), *next; - struct rb_entry *cur_re, *next_re; - - if (!cur) - return true; - - while (cur) { - next = rb_next(cur); - if (!next) - return true; - - cur_re = rb_entry(cur, struct rb_entry, rb_node); - next_re = rb_entry(next, struct rb_entry, rb_node); - - if (cur_re->ofs + cur_re->len > next_re->ofs) { - f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", - cur_re->ofs, cur_re->len, - next_re->ofs, next_re->len); - return false; - } - cur = next; - } -#endif - return true; -} - static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f8206501fa5c..ce4e50574d6c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -350,15 +350,7 @@ struct discard_info { struct discard_cmd { struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - block_t lstart; /* logical start address */ - block_t len; /* length */ - block_t start; /* actual start address in dev */ - }; - struct discard_info di; /* discard info */ - - }; + struct discard_info di; /* discard info */ struct list_head list; /* command list */ struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ @@ -4129,19 +4121,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); * extent_cache.c */ bool sanity_check_extent_cache(struct inode *inode); -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, - struct rb_entry *cached_re, unsigned int ofs); -struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned int ofs, bool *leftmost); -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, - struct rb_entry *cached_re, unsigned int ofs, - struct rb_entry **prev_entry, struct rb_entry **next_entry, - struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force, bool *leftmost); -bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8bd5888854f4..f3549555b1c2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -932,9 +932,9 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; - dc->lstart = lstart; - dc->start = start; - dc->len = len; + dc->di.lstart = lstart; + dc->di.start = start; + dc->di.len = len; dc->ref = 0; dc->state = D_PREP; dc->queued = 0; @@ -949,20 +949,108 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, return dc; } -static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t lstart, - block_t start, block_t len, - struct rb_node *parent, struct rb_node **p, - bool leftmost) +static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node *cur = rb_first_cached(&dcc->root), *next; + struct discard_cmd *cur_dc, *next_dc; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_dc = rb_entry(cur, struct discard_cmd, rb_node); + next_dc = rb_entry(next, struct discard_cmd, rb_node); + + if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) { + f2fs_info(sbi, "broken discard_rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_dc->di.lstart, cur_dc->di.len, + next_dc->di.lstart, next_dc->di.len); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi, + block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node *node = dcc->root.rb_root.rb_node; struct discard_cmd *dc; - dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + while (node) { + dc = rb_entry(node, struct discard_cmd, rb_node); - rb_link_node(&dc->rb_node, parent, p); - rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); + if (blkaddr < dc->di.lstart) + node = node->rb_left; + else if (blkaddr >= dc->di.lstart + dc->di.len) + node = node->rb_right; + else + return dc; + } + return NULL; +} +static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root, + block_t blkaddr, + struct discard_cmd **prev_entry, + struct discard_cmd **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &root->rb_root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct discard_cmd *dc; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(&root->rb_root)) + return NULL; + + while (*pnode) { + parent = *pnode; + dc = rb_entry(*pnode, struct discard_cmd, rb_node); + + if (blkaddr < dc->di.lstart) + pnode = &(*pnode)->rb_left; + else if (blkaddr >= dc->di.lstart + dc->di.len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + dc = rb_entry(parent, struct discard_cmd, rb_node); + tmp_node = parent; + if (parent && blkaddr > dc->di.lstart) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + + tmp_node = parent; + if (parent && blkaddr < dc->di.lstart) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + return NULL; + +lookup_neighbors: + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&dc->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&dc->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return dc; } @@ -974,7 +1062,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, list_del(&dc->list); rb_erase_cached(&dc->rb_node, &dcc->root); - dcc->undiscard_blks -= dc->len; + dcc->undiscard_blks -= dc->di.len; kmem_cache_free(discard_cmd_slab, dc); @@ -987,7 +1075,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned long flags; - trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len); spin_lock_irqsave(&dc->lock, flags); if (dc->bio_ref) { @@ -1005,7 +1093,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, printk_ratelimited( "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", KERN_INFO, sbi->sb->s_id, - dc->lstart, dc->start, dc->len, dc->error); + dc->di.lstart, dc->di.start, dc->di.len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -1122,14 +1210,14 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; - trace_f2fs_issue_discard(bdev, dc->start, dc->len); + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); - lstart = dc->lstart; - start = dc->start; - len = dc->len; + lstart = dc->di.lstart; + start = dc->di.start; + len = dc->di.len; total_len = len; - dc->len = 0; + dc->di.len = 0; while (total_len && *issued < dpolicy->max_requests && !err) { struct bio *bio = NULL; @@ -1145,7 +1233,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (*issued == dpolicy->max_requests) last = true; - dc->len += len; + dc->di.len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { err = -EIO; @@ -1207,34 +1295,41 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, return err; } -static void __insert_discard_tree(struct f2fs_sb_info *sbi, +static void __insert_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, - block_t start, block_t len, - struct rb_node **insert_p, - struct rb_node *insert_parent) + block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct rb_node **p; + struct rb_node **p = &dcc->root.rb_root.rb_node; struct rb_node *parent = NULL; + struct discard_cmd *dc; bool leftmost = true; - if (insert_p && insert_parent) { - parent = insert_parent; - p = insert_p; - goto do_insert; + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + dc = rb_entry(parent, struct discard_cmd, rb_node); + + if (lstart < dc->di.lstart) { + p = &(*p)->rb_left; + } else if (lstart >= dc->di.lstart + dc->di.len) { + p = &(*p)->rb_right; + leftmost = false; + } else { + f2fs_bug_on(sbi, 1); + } } - p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, - lstart, &leftmost); -do_insert: - __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, - p, leftmost); + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { - list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]); } static void __punch_discard_cmd(struct f2fs_sb_info *sbi, @@ -1244,7 +1339,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_info di = dc->di; bool modified = false; - if (dc->state == D_DONE || dc->len == 1) { + if (dc->state == D_DONE || dc->di.len == 1) { __remove_discard_cmd(sbi, dc); return; } @@ -1252,23 +1347,22 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dcc->undiscard_blks -= di.len; if (blkaddr > di.lstart) { - dc->len = blkaddr - dc->lstart; - dcc->undiscard_blks += dc->len; + dc->di.len = blkaddr - dc->di.lstart; + dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); modified = true; } if (blkaddr < di.lstart + di.len - 1) { if (modified) { - __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + __insert_discard_cmd(sbi, dc->bdev, blkaddr + 1, di.start + blkaddr + 1 - di.lstart, - di.lstart + di.len - 1 - blkaddr, - NULL, NULL); + di.lstart + di.len - 1 - blkaddr); } else { - dc->lstart++; - dc->len--; - dc->start++; - dcc->undiscard_blks += dc->len; + dc->di.lstart++; + dc->di.len--; + dc->di.start++; + dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); } } @@ -1288,17 +1382,14 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, SECTOR_TO_BLOCK(q->limits.max_discard_sectors); block_t end = lstart + len; - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, - NULL, lstart, - (struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true, NULL); + dc = __lookup_discard_cmd_ret(&dcc->root, lstart, + &prev_dc, &next_dc, &insert_p, &insert_parent); if (dc) prev_dc = dc; if (!prev_dc) { di.lstart = lstart; - di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = next_dc ? next_dc->di.lstart - lstart : len; di.len = min(di.len, len); di.start = start; } @@ -1309,16 +1400,16 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *tdc = NULL; if (prev_dc) { - di.lstart = prev_dc->lstart + prev_dc->len; + di.lstart = prev_dc->di.lstart + prev_dc->di.len; if (di.lstart < lstart) di.lstart = lstart; if (di.lstart >= end) break; - if (!next_dc || next_dc->lstart > end) + if (!next_dc || next_dc->di.lstart > end) di.len = end - di.lstart; else - di.len = next_dc->lstart - di.lstart; + di.len = next_dc->di.lstart - di.lstart; di.start = start + di.lstart - lstart; } @@ -1351,10 +1442,9 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, merged = true; } - if (!merged) { - __insert_discard_tree(sbi, bdev, di.lstart, di.start, - di.len, NULL, NULL); - } + if (!merged) + __insert_discard_cmd(sbi, bdev, + di.lstart, di.start, di.len); next: prev_dc = next_dc; if (!prev_dc) @@ -1393,15 +1483,11 @@ static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; - unsigned int pos = dcc->next_pos; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, - NULL, pos, - (struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true, NULL); + dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos, + &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; @@ -1419,7 +1505,7 @@ static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, break; } - dcc->next_pos = dc->lstart + dc->len; + dcc->next_pos = dc->di.lstart + dc->di.len; err = __submit_discard_cmd(sbi, dpolicy, dc, issued); if (*issued >= dpolicy->max_requests) @@ -1478,8 +1564,7 @@ retry: if (list_empty(pend_list)) goto next; if (unlikely(dcc->rbtree_check)) - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1557,7 +1642,7 @@ static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, dc->ref--; if (!dc->ref) { if (!dc->error) - len = dc->len; + len = dc->di.len; __remove_discard_cmd(sbi, dc); } mutex_unlock(&dcc->cmd_lock); @@ -1580,14 +1665,15 @@ next: mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(iter, tmp, wait_list, list) { - if (iter->lstart + iter->len <= start || end <= iter->lstart) + if (iter->di.lstart + iter->di.len <= start || + end <= iter->di.lstart) continue; - if (iter->len < dpolicy->granularity) + if (iter->di.len < dpolicy->granularity) continue; if (iter->state == D_DONE && !iter->ref) { wait_for_completion_io(&iter->wait); if (!iter->error) - trimmed += iter->len; + trimmed += iter->di.len; __remove_discard_cmd(sbi, iter); } else { iter->ref++; @@ -1631,8 +1717,7 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) bool need_wait = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root, - NULL, blkaddr); + dc = __lookup_discard_cmd(sbi, blkaddr); if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); @@ -2965,24 +3050,20 @@ next: mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, - NULL, start, - (struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true, NULL); + dc = __lookup_discard_cmd_ret(&dcc->root, start, + &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); - while (dc && dc->lstart <= end) { + while (dc && dc->di.lstart <= end) { struct rb_node *node; int err = 0; - if (dc->len < dpolicy->granularity) + if (dc->di.len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { @@ -2993,7 +3074,7 @@ next: err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { - start = dc->lstart + dc->len; + start = dc->di.lstart + dc->di.len; if (err) __remove_discard_cmd(sbi, dc); From e06222371ac35b1cffa14a85b72c5953f6784511 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 10 Mar 2023 11:49:57 -0800 Subject: [PATCH 09/54] f2fs: remove entire rb_entry sharing This is a last part to remove the memory sharing for rb_tree in extent_cache. This should also fix arm32 memory alignment issue. [struct extent_node] [struct rb_entry] [0] struct rb_node rb_node; [0] struct rb_node rb_node; union { union { struct { struct { [16] unsigned int fofs; [12] unsigned int ofs; unsigned int len; unsigned int len; }; unsigned long long key; } __packed; Cc: Fixes: 13054c548a1c ("f2fs: introduce infra macro and data structure of rb-tree extent cache") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 177 +++++++++++++++++------------------------ fs/f2fs/f2fs.h | 6 -- 2 files changed, 71 insertions(+), 112 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 5c206f941aac..9a8153895d20 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -161,95 +161,52 @@ static bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front, type); } -static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, - unsigned int ofs) -{ - if (cached_re) { - if (cached_re->ofs <= ofs && - cached_re->ofs + cached_re->len > ofs) { - return cached_re; - } - } - return NULL; -} - -static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root, - unsigned int ofs) +static struct extent_node *__lookup_extent_node(struct rb_root_cached *root, + struct extent_node *cached_en, unsigned int fofs) { struct rb_node *node = root->rb_root.rb_node; - struct rb_entry *re; + struct extent_node *en; + /* check a cached entry */ + if (cached_en && cached_en->ei.fofs <= fofs && + cached_en->ei.fofs + cached_en->ei.len > fofs) + return cached_en; + + /* check rb_tree */ while (node) { - re = rb_entry(node, struct rb_entry, rb_node); + en = rb_entry(node, struct extent_node, rb_node); - if (ofs < re->ofs) + if (fofs < en->ei.fofs) node = node->rb_left; - else if (ofs >= re->ofs + re->len) + else if (fofs >= en->ei.fofs + en->ei.len) node = node->rb_right; else - return re; + return en; } return NULL; } -static struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, - struct rb_entry *cached_re, unsigned int ofs) -{ - struct rb_entry *re; - - re = __lookup_rb_tree_fast(cached_re, ofs); - if (!re) - return __lookup_rb_tree_slow(root, ofs); - - return re; -} - -static struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned int ofs, bool *leftmost) -{ - struct rb_node **p = &root->rb_root.rb_node; - struct rb_entry *re; - - while (*p) { - *parent = *p; - re = rb_entry(*parent, struct rb_entry, rb_node); - - if (ofs < re->ofs) { - p = &(*p)->rb_left; - } else if (ofs >= re->ofs + re->len) { - p = &(*p)->rb_right; - *leftmost = false; - } else { - f2fs_bug_on(sbi, 1); - } - } - - return p; -} - /* - * lookup rb entry in position of @ofs in rb-tree, + * lookup rb entry in position of @fofs in rb-tree, * if hit, return the entry, otherwise, return NULL - * @prev_ex: extent before ofs - * @next_ex: extent after ofs - * @insert_p: insert point for new extent at ofs + * @prev_ex: extent before fofs + * @next_ex: extent after fofs + * @insert_p: insert point for new extent at fofs * in order to simplify the insertion after. * tree must stay unchanged between lookup and insertion. */ -static struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, - struct rb_entry *cached_re, - unsigned int ofs, - struct rb_entry **prev_entry, - struct rb_entry **next_entry, +static struct extent_node *__lookup_extent_node_ret(struct rb_root_cached *root, + struct extent_node *cached_en, + unsigned int fofs, + struct extent_node **prev_entry, + struct extent_node **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force, bool *leftmost) + bool *leftmost) { struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; - struct rb_entry *re = cached_re; + struct extent_node *en = cached_en; *insert_p = NULL; *insert_parent = NULL; @@ -259,24 +216,20 @@ static struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; - if (re) { - if (re->ofs <= ofs && re->ofs + re->len > ofs) - goto lookup_neighbors; - } + if (en && en->ei.fofs <= fofs && en->ei.fofs + en->ei.len > fofs) + goto lookup_neighbors; - if (leftmost) - *leftmost = true; + *leftmost = true; while (*pnode) { parent = *pnode; - re = rb_entry(*pnode, struct rb_entry, rb_node); + en = rb_entry(*pnode, struct extent_node, rb_node); - if (ofs < re->ofs) { + if (fofs < en->ei.fofs) { pnode = &(*pnode)->rb_left; - } else if (ofs >= re->ofs + re->len) { + } else if (fofs >= en->ei.fofs + en->ei.len) { pnode = &(*pnode)->rb_right; - if (leftmost) - *leftmost = false; + *leftmost = false; } else { goto lookup_neighbors; } @@ -285,30 +238,32 @@ static struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, *insert_p = pnode; *insert_parent = parent; - re = rb_entry(parent, struct rb_entry, rb_node); + en = rb_entry(parent, struct extent_node, rb_node); tmp_node = parent; - if (parent && ofs > re->ofs) + if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + *next_entry = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; - if (parent && ofs < re->ofs) + if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: - if (ofs == re->ofs || force) { + if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&re->rb_node); - *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + tmp_node = rb_prev(&en->rb_node); + *prev_entry = rb_entry_safe(tmp_node, + struct extent_node, rb_node); } - if (ofs == re->ofs + re->len - 1 || force) { + if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ - tmp_node = rb_next(&re->rb_node); - *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + tmp_node = rb_next(&en->rb_node); + *next_entry = rb_entry_safe(tmp_node, + struct extent_node, rb_node); } - return re; + return en; } static struct kmem_cache *extent_tree_slab; @@ -523,8 +478,7 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root, - (struct rb_entry *)et->cached_en, pgofs); + en = __lookup_extent_node(&et->root, et->cached_en, pgofs); if (!en) goto out; @@ -598,7 +552,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, bool leftmost) { struct extent_tree_info *eti = &sbi->extent_tree[et->type]; - struct rb_node **p; + struct rb_node **p = &et->root.rb_root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -610,8 +564,21 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, leftmost = true; - p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, - ei->fofs, &leftmost); + /* look up extent_node in the rb tree */ + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) { + p = &(*p)->rb_left; + } else if (ei->fofs >= en->ei.fofs + en->ei.len) { + p = &(*p)->rb_right; + leftmost = false; + } else { + f2fs_bug_on(sbi, 1); + } + } + do_insert: en = __attach_extent_node(sbi, et, ei, parent, p, leftmost); if (!en) @@ -670,11 +637,10 @@ static void __update_extent_tree_range(struct inode *inode, } /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, - (struct rb_entry *)et->cached_en, fofs, - (struct rb_entry **)&prev_en, - (struct rb_entry **)&next_en, - &insert_p, &insert_parent, false, + en = __lookup_extent_node_ret(&et->root, + et->cached_en, fofs, + &prev_en, &next_en, + &insert_p, &insert_parent, &leftmost); if (!en) en = next_en; @@ -812,12 +778,11 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, write_lock(&et->lock); - en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, - (struct rb_entry *)et->cached_en, fofs, - (struct rb_entry **)&prev_en, - (struct rb_entry **)&next_en, - &insert_p, &insert_parent, false, - &leftmost); + en = __lookup_extent_node_ret(&et->root, + et->cached_en, fofs, + &prev_en, &next_en, + &insert_p, &insert_parent, + &leftmost); if (en) goto unlock_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ce4e50574d6c..bbc881f1e104 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -617,12 +617,6 @@ enum extent_type { NR_EXTENT_CACHES, }; -struct rb_entry { - struct rb_node rb_node; /* rb node located in rb-tree */ - unsigned int ofs; /* start offset of the entry */ - unsigned int len; /* length of the entry */ -}; - struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ From dcb4c5ba2500793193bee1ee37bc373fe11e7706 Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Mon, 13 Mar 2023 18:48:25 +0900 Subject: [PATCH 10/54] f2fs: Fix discard bug on zoned block devices with 2MiB zone size When using f2fs on a zoned block device with 2MiB zone size, IO errors occurs because f2fs tries to write data to a zone that has not been reset. The cause is that f2fs tries to discard multiple zones at once. This is caused by a condition in f2fs_clear_prefree_segments that does not check for zoned block devices when setting the discard range. This leads to invalid reset commands and write pointer mismatches. This patch fixes the zoned block device with 2MiB zone size to reset one zone at a time. Signed-off-by: Yonggil Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f3549555b1c2..9bac32b74918 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2070,7 +2070,9 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, (end - 1) <= cpc->trim_end) continue; - if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) { + /* Should cover 2MB zoned device for zone-based reset */ + if (!f2fs_sb_has_blkzoned(sbi) && + (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; From ea45ca92661d26d9ab193cd7a8de72619560f3a4 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 8 Mar 2023 22:06:23 +0800 Subject: [PATCH 11/54] f2fs: convert to MAX_SBI_FLAG instead of 32 in stat_show() BIW reduce the s_flag array size and make s_flag constant. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 36 ++++++++++++++++++------------------ fs/f2fs/f2fs.h | 6 +++++- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 30a77936e3c5..99c7fc832ec7 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -336,22 +336,22 @@ get_cache: #endif } -static char *s_flag[] = { - [SBI_IS_DIRTY] = " fs_dirty", - [SBI_IS_CLOSE] = " closing", - [SBI_NEED_FSCK] = " need_fsck", - [SBI_POR_DOING] = " recovering", - [SBI_NEED_SB_WRITE] = " sb_dirty", - [SBI_NEED_CP] = " need_cp", - [SBI_IS_SHUTDOWN] = " shutdown", - [SBI_IS_RECOVERED] = " recovered", - [SBI_CP_DISABLED] = " cp_disabled", - [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick", - [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush", - [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", - [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", - [SBI_IS_RESIZEFS] = " resizefs", - [SBI_IS_FREEZING] = " freezefs", +static const char *s_flag[MAX_SBI_FLAG] = { + [SBI_IS_DIRTY] = "fs_dirty", + [SBI_IS_CLOSE] = "closing", + [SBI_NEED_FSCK] = "need_fsck", + [SBI_POR_DOING] = "recovering", + [SBI_NEED_SB_WRITE] = "sb_dirty", + [SBI_NEED_CP] = "need_cp", + [SBI_IS_SHUTDOWN] = "shutdown", + [SBI_IS_RECOVERED] = "recovered", + [SBI_CP_DISABLED] = "cp_disabled", + [SBI_CP_DISABLED_QUICK] = "cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = "quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = "quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = "quota_need_repair", + [SBI_IS_RESIZEFS] = "resizefs", + [SBI_IS_FREEZING] = "freezefs", }; static const char *ipu_mode_names[F2FS_IPU_MAX] = { @@ -384,8 +384,8 @@ static int stat_show(struct seq_file *s, void *v) "Disabled" : (f2fs_cp_error(sbi) ? "Error" : "Good")); if (sbi->s_flag) { seq_puts(s, "[SBI:"); - for_each_set_bit(j, &sbi->s_flag, 32) - seq_puts(s, s_flag[j]); + for_each_set_bit(j, &sbi->s_flag, MAX_SBI_FLAG) + seq_printf(s, " %s", s_flag[j]); seq_puts(s, "]\n"); } seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bbc881f1e104..621bc415e3cc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1271,7 +1271,10 @@ struct f2fs_gc_control { unsigned int nr_free_secs; /* # of free sections to do GC */ }; -/* For s_flag in struct f2fs_sb_info */ +/* + * For s_flag in struct f2fs_sb_info + * Modification on enum should be synchronized with s_flag array + */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ SBI_IS_CLOSE, /* specify unmounting */ @@ -1288,6 +1291,7 @@ enum { SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ SBI_IS_FREEZING, /* freezefs is in process */ + MAX_SBI_FLAG, }; enum { From 3f036f17d946876c2100a186248a8ac6e1af3d2e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 2 Mar 2023 17:55:09 +0800 Subject: [PATCH 12/54] f2fs: fix to handle filemap_fdatawrite() error in f2fs_ioc_decompress_file/f2fs_ioc_compress_file It seems inappropriate that the current logic does not handle filemap_fdatawrite() errors, so let's fix it. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 734fc8dae911..5fc5cfaa13bd 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4051,8 +4051,11 @@ static int f2fs_ioc_decompress_file(struct file *filp) if (ret < 0) break; - if (get_dirty_pages(inode) >= blk_per_seg) - filemap_fdatawrite(inode->i_mapping); + if (get_dirty_pages(inode) >= blk_per_seg) { + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + break; + } count -= len; page_idx += len; @@ -4122,8 +4125,11 @@ static int f2fs_ioc_compress_file(struct file *filp) if (ret < 0) break; - if (get_dirty_pages(inode) >= blk_per_seg) - filemap_fdatawrite(inode->i_mapping); + if (get_dirty_pages(inode) >= blk_per_seg) { + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + break; + } count -= len; page_idx += len; From b6b979879544e0b903da3e8c23bb5e81ec772889 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 21 Mar 2023 15:58:04 -0700 Subject: [PATCH 13/54] f2fs: apply zone capacity to all zone type If we manage the zone capacity per zone type, it'll break the GC assumption. And, the current logic complains valid block count mismatch. Let's apply zone capacity to all zone type, if specified. Fixes: de881df97768 ("f2fs: support zone capacity less than zone size") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 65 +++-------------------------------------------- fs/f2fs/segment.h | 3 +++ 2 files changed, 7 insertions(+), 61 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9bac32b74918..95d9d72a291d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -5004,48 +5004,6 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } -static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx, - unsigned int dev_idx) -{ - if (!bdev_is_zoned(FDEV(dev_idx).bdev)) - return true; - return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq); -} - -/* Return the zone index in the given device */ -static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, - int dev_idx) -{ - block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); - - return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >> - sbi->log_blocks_per_blkz; -} - -/* - * Return the usable segments in a section based on the zone's - * corresponding zone capacity. Zone is equal to a section. - */ -static inline unsigned int f2fs_usable_zone_segs_in_sec( - struct f2fs_sb_info *sbi, unsigned int segno) -{ - unsigned int dev_idx, zone_idx; - - dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); - zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx); - - /* Conventional zone's capacity is always equal to zone size */ - if (is_conv_zone(sbi, zone_idx, dev_idx)) - return sbi->segs_per_sec; - - if (!sbi->unusable_blocks_per_sec) - return sbi->segs_per_sec; - - /* Get the segment count beyond zone capacity block */ - return sbi->segs_per_sec - (sbi->unusable_blocks_per_sec >> - sbi->log_blocks_per_seg); -} - /* * Return the number of usable blocks in a segment. The number of blocks * returned is always equal to the number of blocks in a segment for @@ -5058,23 +5016,13 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( struct f2fs_sb_info *sbi, unsigned int segno) { block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; - unsigned int zone_idx, dev_idx, secno; - - secno = GET_SEC_FROM_SEG(sbi, segno); - seg_start = START_BLOCK(sbi, segno); - dev_idx = f2fs_target_device_index(sbi, seg_start); - zone_idx = get_zone_idx(sbi, secno, dev_idx); - - /* - * Conventional zone's capacity is always equal to zone size, - * so, blocks per segment is unchanged. - */ - if (is_conv_zone(sbi, zone_idx, dev_idx)) - return sbi->blocks_per_seg; + unsigned int secno; if (!sbi->unusable_blocks_per_sec) return sbi->blocks_per_seg; + secno = GET_SEC_FROM_SEG(sbi, segno); + seg_start = START_BLOCK(sbi, segno); sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); @@ -5108,11 +5056,6 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi return 0; } -static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - return 0; -} #endif unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) @@ -5127,7 +5070,7 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno) { if (f2fs_sb_has_blkzoned(sbi)) - return f2fs_usable_zone_segs_in_sec(sbi, segno); + return CAP_SEGS_PER_SEC(sbi); return sbi->segs_per_sec; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index efdb7fc3b797..babb29a1c034 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -104,6 +104,9 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define CAP_BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ (sbi)->unusable_blocks_per_sec) +#define CAP_SEGS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\ + (sbi)->log_blocks_per_seg)) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ From 256d5300ce5e6bf5bcda1554c69efa136d1b1c3b Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 21 Mar 2023 01:31:36 +0800 Subject: [PATCH 14/54] f2fs: remove else in f2fs_write_cache_pages() As Christoph Hellwig point out: Please avoid the else by doing the goto in the branch. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8e878f92dcb4..cbb4352d30ec 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3105,11 +3105,9 @@ continue_unlock: } if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, - DATA, true, true); - else + if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; + f2fs_wait_on_page_writeback(page, DATA, true, true); } if (!clear_page_dirty_for_io(page)) From 7ce655ef9fdbbc7cb832086864dd8361a722805f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 21 Mar 2023 01:22:18 +0800 Subject: [PATCH 15/54] f2fs: compress: fix to call f2fs_wait_on_page_writeback() in f2fs_write_raw_pages() BUG_ON() will be triggered when writing files concurrently, because the same page is writtenback multiple times. 1597 void folio_end_writeback(struct folio *folio) 1598 { ...... 1618 if (!__folio_end_writeback(folio)) 1619 BUG(); ...... 1625 } kernel BUG at mm/filemap.c:1619! Call Trace: f2fs_write_end_io+0x1a0/0x370 blk_update_request+0x6c/0x410 blk_mq_end_request+0x15/0x130 blk_complete_reqs+0x3c/0x50 __do_softirq+0xb8/0x29b ? sort_range+0x20/0x20 run_ksoftirqd+0x19/0x20 smpboot_thread_fn+0x10b/0x1d0 kthread+0xde/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Below is the concurrency scenario: [Process A] [Process B] [Process C] f2fs_write_raw_pages() - redirty_page_for_writepage() - unlock page() f2fs_do_write_data_page() - lock_page() - clear_page_dirty_for_io() - set_page_writeback() [1st writeback] ..... - unlock page() generic_perform_write() - f2fs_write_begin() - wait_for_stable_page() - f2fs_write_end() - set_page_dirty() - lock_page() - f2fs_do_write_data_page() - set_page_writeback() [2st writeback] This problem was introduced by the previous commit 7377e853967b ("f2fs: compress: fix potential deadlock of compress file"). All pagelocks were released in f2fs_write_raw_pages(), but whether the page was in the writeback state was ignored in the subsequent writing process. Let's fix it by waiting for the page to writeback before writing. Cc: Christoph Hellwig Fixes: 4c8ff7095bef ("f2fs: support data compression") Fixes: 7377e853967b ("f2fs: compress: fix potential deadlock of compress file") Signed-off-by: Qi Han Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 901191d6e391..4ca94026b5db 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1455,6 +1455,12 @@ continue_unlock: if (!PageDirty(cc->rpages[i])) goto continue_unlock; + if (PageWriteback(cc->rpages[i])) { + if (wbc->sync_mode == WB_SYNC_NONE) + goto continue_unlock; + f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true); + } + if (!clear_page_dirty_for_io(cc->rpages[i])) goto continue_unlock; From 82ab408329a9ad987f3bba843e7581e3235966e4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 20 Feb 2023 13:20:04 +0100 Subject: [PATCH 16/54] f2fs: preserve direct write semantics when buffering is forced In some cases, e.g. for zoned block devices, direct writes are forced into buffered writes that will populate the page cache and be written out just like buffered io. Direct reads, on the other hand, is supported for the zoned block device case. This has the effect that applications built for direct io will fill up the page cache with data that will never be read, and that is a waste of resources. If we agree that this is a problem, how do we fix it? A) Supporting proper direct writes for zoned block devices would be the best, but it is currently not supported (probably for a good but non-obvious reason). Would it be feasible to implement proper direct IO? B) Avoid the cost of keeping unwanted data by syncing and throwing out the cached pages for buffered O_DIRECT writes before completion. This patch implements B) by reusing the code for how partial block writes are flushed out on the "normal" direct write path. Note that this changes the performance characteristics of f2fs quite a bit. Direct IO performance for zoned block devices is lower for small writes after this patch, but this should be expected with direct IO and in line with how f2fs behaves on top of conventional block devices. Another open question is if the flushing should be done for all cases where buffered writes are forced. Signed-off-by: Hans Holmberg Reviewed-by: Yonggil Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5fc5cfaa13bd..5164f673efff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4529,6 +4529,19 @@ static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { .end_io = f2fs_dio_write_end_io, }; +static void f2fs_flush_buffered_write(struct address_space *mapping, + loff_t start_pos, loff_t end_pos) +{ + int ret; + + ret = filemap_write_and_wait_range(mapping, start_pos, end_pos); + if (ret < 0) + return; + invalidate_mapping_pages(mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); +} + static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, bool *may_need_sync) { @@ -4627,14 +4640,9 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret += ret2; - ret2 = filemap_write_and_wait_range(file->f_mapping, - bufio_start_pos, - bufio_end_pos); - if (ret2 < 0) - goto out; - invalidate_mapping_pages(file->f_mapping, - bufio_start_pos >> PAGE_SHIFT, - bufio_end_pos >> PAGE_SHIFT); + f2fs_flush_buffered_write(file->f_mapping, + bufio_start_pos, + bufio_end_pos); } } else { /* iomap_dio_rw() already handled the generic_write_sync(). */ @@ -4717,8 +4725,18 @@ out_unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + if (ret > 0 && may_need_sync) ret = generic_write_sync(iocb, ret); + + /* If buffered IO was forced, flush and drop the data from + * the page cache to preserve O_DIRECT semantics + */ + if (ret > 0 && !dio && (iocb->ki_flags & IOCB_DIRECT)) + f2fs_flush_buffered_write(iocb->ki_filp->f_mapping, + orig_pos, + orig_pos + ret - 1); + return ret; } From 80901998218e87be3cf15656800b99c300150231 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Mar 2023 15:37:54 -0700 Subject: [PATCH 17/54] f2fs: fix scheduling while atomic in decompression path [ 16.945668][ C0] Call trace: [ 16.945678][ C0] dump_backtrace+0x110/0x204 [ 16.945706][ C0] dump_stack_lvl+0x84/0xbc [ 16.945735][ C0] __schedule_bug+0xb8/0x1ac [ 16.945756][ C0] __schedule+0x724/0xbdc [ 16.945778][ C0] schedule+0x154/0x258 [ 16.945793][ C0] bit_wait_io+0x48/0xa4 [ 16.945808][ C0] out_of_line_wait_on_bit+0x114/0x198 [ 16.945824][ C0] __sync_dirty_buffer+0x1f8/0x2e8 [ 16.945853][ C0] __f2fs_commit_super+0x140/0x1f4 [ 16.945881][ C0] f2fs_commit_super+0x110/0x28c [ 16.945898][ C0] f2fs_handle_error+0x1f4/0x2f4 [ 16.945917][ C0] f2fs_decompress_cluster+0xc4/0x450 [ 16.945942][ C0] f2fs_end_read_compressed_page+0xc0/0xfc [ 16.945959][ C0] f2fs_handle_step_decompress+0x118/0x1cc [ 16.945978][ C0] f2fs_read_end_io+0x168/0x2b0 [ 16.945993][ C0] bio_endio+0x25c/0x2c8 [ 16.946015][ C0] dm_io_dec_pending+0x3e8/0x57c [ 16.946052][ C0] clone_endio+0x134/0x254 [ 16.946069][ C0] bio_endio+0x25c/0x2c8 [ 16.946084][ C0] blk_update_request+0x1d4/0x478 [ 16.946103][ C0] scsi_end_request+0x38/0x4cc [ 16.946129][ C0] scsi_io_completion+0x94/0x184 [ 16.946147][ C0] scsi_finish_command+0xe8/0x154 [ 16.946164][ C0] scsi_complete+0x90/0x1d8 [ 16.946181][ C0] blk_done_softirq+0xa4/0x11c [ 16.946198][ C0] _stext+0x184/0x614 [ 16.946214][ C0] __irq_exit_rcu+0x78/0x144 [ 16.946234][ C0] handle_domain_irq+0xd4/0x154 [ 16.946260][ C0] gic_handle_irq.33881+0x5c/0x27c [ 16.946281][ C0] call_on_irq_stack+0x40/0x70 [ 16.946298][ C0] do_interrupt_handler+0x48/0xa4 [ 16.946313][ C0] el1_interrupt+0x38/0x68 [ 16.946346][ C0] el1h_64_irq_handler+0x20/0x30 [ 16.946362][ C0] el1h_64_irq+0x78/0x7c [ 16.946377][ C0] finish_task_switch+0xc8/0x3d8 [ 16.946394][ C0] __schedule+0x600/0xbdc [ 16.946408][ C0] preempt_schedule_common+0x34/0x5c [ 16.946423][ C0] preempt_schedule+0x44/0x48 [ 16.946438][ C0] process_one_work+0x30c/0x550 [ 16.946456][ C0] worker_thread+0x414/0x8bc [ 16.946472][ C0] kthread+0x16c/0x1e0 [ 16.946486][ C0] ret_from_fork+0x10/0x20 Fixes: bff139b49d9f ("f2fs: handle decompress only post processing in softirq") Fixes: 95fa90c9e5a7 ("f2fs: support recording errors into superblock") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 7 ++++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4ca94026b5db..4cd10d99b56f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -754,7 +754,12 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); + + /* Avoid f2fs_commit_super in irq context */ + if (in_task) + f2fs_save_errors(sbi, ERROR_FAIL_DECOMPRESSION); + else + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 621bc415e3cc..3acfe10d4df0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3531,6 +3531,7 @@ int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6414fd77f111..1b3d717891ad 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3894,7 +3894,7 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) f2fs_up_write(&sbi->sb_lock); } -static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) { spin_lock(&sbi->error_lock); if (!test_bit(flag, (unsigned long *)sbi->errors)) { From 8278903a99d570504b93b536a2fb58bf4ee3694b Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 19 Mar 2023 15:58:22 +0800 Subject: [PATCH 18/54] f2fs: convert is_extension_exist() to return bool type is_extension_exist() only return two values, 0 or 1. So there is no need to use int type. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c318a7366cf8..93b6eba55401 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,7 +22,7 @@ #include "acl.h" #include -static inline int is_extension_exist(const unsigned char *s, const char *sub, +static inline bool is_extension_exist(const unsigned char *s, const char *sub, bool tmp_ext) { size_t slen = strlen(s); @@ -30,19 +30,19 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub, int i; if (sublen == 1 && *sub == '*') - return 1; + return true; /* * filename format of multimedia file should be defined as: * "filename + '.' + extension + (optional: '.' + temp extension)". */ if (slen < sublen + 2) - return 0; + return false; if (!tmp_ext) { /* file has no temp extension */ if (s[slen - sublen - 1] != '.') - return 0; + return false; return !strncasecmp(s + slen - sublen, sub, sublen); } @@ -50,10 +50,10 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub, if (s[i] != '.') continue; if (!strncasecmp(s + i + 1, sub, sublen)) - return 1; + return true; } - return 0; + return false; } int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, From 3c58b4a084f3deb74b0ca1ad316e0a0b50f1505d Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 19 Mar 2023 15:51:30 +0800 Subject: [PATCH 19/54] f2fs: add compression feature check for all compress mount opt Opt_compress_chksum, Opt_compress_mode and Opt_compress_cache lack the necessary check to see if the image supports compression, let's add it. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1b3d717891ad..dd91ec01f7f9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1186,9 +1186,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_compress_chksum: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } F2FS_OPTION(sbi).compress_chksum = true; break; case Opt_compress_mode: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } name = match_strdup(&args[0]); if (!name) return -ENOMEM; @@ -1203,6 +1211,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_compress_cache: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } set_opt(sbi, COMPRESS_CACHE); break; #else From b73ed3a21311995d49a7151093df05c4a4eeb5c7 Mon Sep 17 00:00:00 2001 From: Yohan Joung Date: Sun, 2 Apr 2023 12:12:59 +0900 Subject: [PATCH 20/54] f2fs: fix align check for npo2 Fix alignment check to be correct in npo2 as well Signed-off-by: Yohan Joung Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 95d9d72a291d..1b34c473d023 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1846,6 +1846,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, sector_t sector, nr_sects; block_t lblkstart = blkstart; int devi = 0; + u64 remainder = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); @@ -1861,9 +1862,9 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); + div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder); - if (sector & (bdev_zone_sectors(bdev) - 1) || - nr_sects != bdev_zone_sectors(bdev)) { + if (remainder || nr_sects != bdev_zone_sectors(bdev)) { f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path : "", blkstart, blklen); From a7867e5603c3a109af1681a2b0f68aedf22188ba Mon Sep 17 00:00:00 2001 From: Qilin Tan Date: Fri, 31 Mar 2023 17:26:56 +0800 Subject: [PATCH 21/54] f2fs: fix iostat lock protection Made iostat lock irq safe to avoid potentinal deadlock. Deadlock scenario: f2fs_attr_store -> f2fs_sbi_store -> _sbi_store -> spin_lock(sbi->iostat_lock) -> scsi_end_request -> bio_endio -> f2fs_dio_read_end_io -> f2fs_update_iostat -> spin_lock_irqsave(sbi->iostat_lock) ===> Dead lock here Fixes: 61803e984307 ("f2fs: fix iostat related lock protection") Fixes: a1e09b03e6f5 ("f2fs: use iomap for direct I/O") Signed-off-by: Qilin Tan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 56e6451e5726..d5b4c052e0e7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -576,9 +576,9 @@ out: if (!strcmp(a->attr.name, "iostat_period_ms")) { if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS) return -EINVAL; - spin_lock(&sbi->iostat_lock); + spin_lock_irq(&sbi->iostat_lock); sbi->iostat_period_ms = (unsigned int)t; - spin_unlock(&sbi->iostat_lock); + spin_unlock_irq(&sbi->iostat_lock); return count; } #endif From 676d6204f79ae8d2f7cd5d1fb72ab377fb36ed32 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Apr 2023 09:37:24 -0700 Subject: [PATCH 22/54] f2fs: fix null pointer panic in tracepoint in __replace_atomic_write_block We got a kernel panic if old_addr is NULL. https://bugzilla.kernel.org/show_bug.cgi?id=217266 BUG: kernel NULL pointer dereference, address: 0000000000000000 Call Trace: f2fs_commit_atomic_write+0x619/0x990 [f2fs a1b985b80f5babd6f3ea778384908880812bfa43] __f2fs_ioctl+0xd8e/0x4080 [f2fs a1b985b80f5babd6f3ea778384908880812bfa43] ? vfs_write+0x2ae/0x3f0 ? vfs_write+0x2ae/0x3f0 __x64_sys_ioctl+0x91/0xd0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f69095fe53f Fixes: 2f3a9ae990a7 ("f2fs: introduce trace_f2fs_replace_atomic_write_block") Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1b34c473d023..efcd55bd1490 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -256,7 +256,7 @@ retry: f2fs_put_dnode(&dn); trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, - index, *old_addr, new_addr, recover); + index, old_addr ? *old_addr : 0, new_addr, recover); return 0; } From 8904daeb6b3a1680ee177fe644eb4444a62c09a0 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 4 Apr 2023 12:00:51 +0800 Subject: [PATCH 23/54] f2fs: remove struct victim_selection default_v_ops There is only single instance of these ops, and Jaegeuk point out that: Originally this was intended to give a chance to provide other allocation option. Anyway, it seems quit hard to do it anymore. So remove the indirection and call f2fs_get_victim() directly. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/gc.c | 15 ++++----------- fs/f2fs/segment.c | 5 ++--- fs/f2fs/segment.h | 7 ------- 4 files changed, 10 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3acfe10d4df0..68f24f82c923 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3812,6 +3812,10 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); void f2fs_destroy_garbage_collection_cache(void); +/* victim selection function for cleaning and SSR */ +int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, + int gc_type, int type, char alloc_mode, + unsigned long long age); /* * recovery.c diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 81b8ff81a16d..308fc2726e9a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -741,9 +741,9 @@ static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, * When it is called from SSR segment selection, it finds a segment * which has minimum valid blocks and removes it from dirty seglist. */ -static int get_victim_by_default(struct f2fs_sb_info *sbi, - unsigned int *result, int gc_type, int type, - char alloc_mode, unsigned long long age) +int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, + int gc_type, int type, char alloc_mode, + unsigned long long age) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct sit_info *sm = SIT_I(sbi); @@ -937,10 +937,6 @@ out: return ret; } -static const struct victim_selection default_v_ops = { - .get_victim = get_victim_by_default, -}; - static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) { struct inode_entry *ie; @@ -1672,8 +1668,7 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, int ret; down_write(&sit_i->sentry_lock); - ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, - NO_CHECK_TYPE, LFS, 0); + ret = f2fs_get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS, 0); up_write(&sit_i->sentry_lock); return ret; } @@ -1964,8 +1959,6 @@ static void init_atgc_management(struct f2fs_sb_info *sbi) void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { - DIRTY_I(sbi)->v_ops = &default_v_ops; - sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index efcd55bd1490..7416d33fce27 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2876,7 +2876,6 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); - const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; unsigned segno = NULL_SEGNO; unsigned short seg_type = curseg->seg_type; int i, cnt; @@ -2885,7 +2884,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, sanity_check_seg_type(sbi, seg_type); /* f2fs_need_SSR() already forces to do this */ - if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { + if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { curseg->next_segno = segno; return 1; } @@ -2912,7 +2911,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, for (; cnt-- > 0; reversed ? i-- : i++) { if (i == seg_type) continue; - if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { + if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { curseg->next_segno = segno; return 1; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index babb29a1c034..99e34d32c5c6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -289,7 +289,6 @@ enum dirty_type { }; struct dirty_seglist_info { - const struct victim_selection *v_ops; /* victim selction operation */ unsigned long *dirty_segmap[NR_DIRTY_TYPE]; unsigned long *dirty_secmap; struct mutex seglist_lock; /* lock for segment bitmaps */ @@ -300,12 +299,6 @@ struct dirty_seglist_info { bool enable_pin_section; /* enable pinning section */ }; -/* victim selection function for cleaning and SSR */ -struct victim_selection { - int (*get_victim)(struct f2fs_sb_info *, unsigned int *, - int, int, char, unsigned long long); -}; - /* for active log information */ struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ From ae96beabad7ff8d429b7f6cd9ef2056d87d7bdcd Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Tue, 21 Mar 2023 09:12:51 +0900 Subject: [PATCH 24/54] f2fs: Fix system crash due to lack of free space in LFS When f2fs tries to checkpoint during foreground gc in LFS mode, system crash occurs due to lack of free space if the amount of dirty node and dentry pages generated by data migration exceeds free space. The reproduction sequence is as follows. - 20GiB capacity block device (null_blk) - format and mount with LFS mode - create a file and write 20,000MiB - 4k random write on full range of the file RIP: 0010:new_curseg+0x48a/0x510 [f2fs] Code: 55 e7 f5 89 c0 48 0f af c3 48 8b 5d c0 48 c1 e8 20 83 c0 01 89 43 6c 48 83 c4 28 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc <0f> 0b f0 41 80 4f 48 04 45 85 f6 0f 84 ba fd ff ff e9 ef fe ff ff RSP: 0018:ffff977bc397b218 EFLAGS: 00010246 RAX: 00000000000027b9 RBX: 0000000000000000 RCX: 00000000000027c0 RDX: 0000000000000000 RSI: 00000000000027b9 RDI: ffff8c25ab4e74f8 RBP: ffff977bc397b268 R08: 00000000000027b9 R09: ffff8c29e4a34b40 R10: 0000000000000001 R11: ffff977bc397b0d8 R12: 0000000000000000 R13: ffff8c25b4dd81a0 R14: 0000000000000000 R15: ffff8c2f667f9000 FS: 0000000000000000(0000) GS:ffff8c344ec80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c00055d000 CR3: 0000000e30810003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: allocate_segment_by_default+0x9c/0x110 [f2fs] f2fs_allocate_data_block+0x243/0xa30 [f2fs] ? __mod_lruvec_page_state+0xa0/0x150 do_write_page+0x80/0x160 [f2fs] f2fs_do_write_node_page+0x32/0x50 [f2fs] __write_node_page+0x339/0x730 [f2fs] f2fs_sync_node_pages+0x5a6/0x780 [f2fs] block_operations+0x257/0x340 [f2fs] f2fs_write_checkpoint+0x102/0x1050 [f2fs] f2fs_gc+0x27c/0x630 [f2fs] ? folio_mark_dirty+0x36/0x70 f2fs_balance_fs+0x16f/0x180 [f2fs] This patch adds checking whether free sections are enough before checkpoint during gc. Signed-off-by: Yonggil Song [Jaegeuk Kim: code clean-up] Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 ++++++++-- fs/f2fs/gc.h | 2 ++ fs/f2fs/segment.h | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 308fc2726e9a..6652bace8984 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1806,6 +1806,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned int skipped_round = 0, round = 0; + unsigned int upper_secs; trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, gc_control->nr_free_secs, @@ -1891,8 +1892,13 @@ retry: } } - /* Write checkpoint to reclaim prefree segments */ - if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + __get_secs_required(sbi, NULL, &upper_secs, NULL); + + /* + * Write checkpoint to reclaim prefree segments. + * We need more three extra sections for writer's data/node/dentry. + */ + if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS && prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 5ad6ac63e13f..28a00942802c 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -30,6 +30,8 @@ /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ +#define NR_GC_CHECKPOINT_SECS (3) /* data/node/dentry sections */ + struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 99e34d32c5c6..ac2e35170f2d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -595,8 +595,12 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, return true; } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, - int freed, int needed) +/* + * calculate needed sections for dirty node/dentry + * and call has_curseg_enough_space + */ +static inline void __get_secs_required(struct f2fs_sb_info *sbi, + unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) { unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + get_pages(sbi, F2FS_DIRTY_DENTS) + @@ -606,20 +610,37 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); - unsigned int free, need_lower, need_upper; + + if (lower_p) + *lower_p = node_secs + dent_secs; + if (upper_p) + *upper_p = node_secs + dent_secs + + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + if (curseg_p) + *curseg_p = has_curseg_enough_space(sbi, + node_blocks, dent_blocks); +} + +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) +{ + unsigned int free_secs, lower_secs, upper_secs; + bool curseg_space; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - free = free_sections(sbi) + freed; - need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; - need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); - if (free > need_upper) + free_secs = free_sections(sbi) + freed; + lower_secs += needed + reserved_sections(sbi); + upper_secs += needed + reserved_sections(sbi); + + if (free_secs > upper_secs) return false; - else if (free <= need_lower) + else if (free_secs <= lower_secs) return true; - return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); + return !curseg_space; } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) From 4b5a208981ead6f1ecfbd160b793f56e54c6b518 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 31 Mar 2023 00:56:48 +0800 Subject: [PATCH 25/54] f2fs: set default compress option only when sb_has_compression If the compress feature is not enabled, there is no need to set compress-related parameters. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dd91ec01f7f9..e8b2d1a42ea2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2080,10 +2080,12 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; - F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; - F2FS_OPTION(sbi).compress_ext_cnt = 0; - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + if (f2fs_sb_has_compression(sbi)) { + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; + F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; + F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + } F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; From f08d4d315c7cb21e3c30e115143afd5e37bdc36a Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 31 Mar 2023 19:33:05 +0800 Subject: [PATCH 26/54] f2fs: convert to use sysfs_emit Let's use sysfs_emit. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d5b4c052e0e7..9f67504710be 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -313,19 +313,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { struct ckpt_req_control *cprc = &sbi->cprc_info; - int len = 0; int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); - if (class == IOPRIO_CLASS_RT) - len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); - else if (class == IOPRIO_CLASS_BE) - len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); - else + if (class != IOPRIO_CLASS_RT && class != IOPRIO_CLASS_BE) return -EINVAL; - len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); - return len; + return sysfs_emit(buf, "%s,%d\n", + class == IOPRIO_CLASS_RT ? "rt" : "be", data); } #ifdef CONFIG_F2FS_FS_COMPRESSION From eba89e2e48419ce0e6c2eca4102ef783d0ef9bac Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 31 Mar 2023 00:49:47 +0800 Subject: [PATCH 27/54] f2fs: merge lz4hc_compress_pages() to lz4_compress_pages() Remove unnecessary lz4hc_compress_pages(). Signed-off-by: Yangtao Li Reviewed-by: Chao Yu [Jaegeuk Kim: clean up] Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4cd10d99b56f..159cf481bb25 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -263,35 +263,21 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } -#ifdef CONFIG_F2FS_FS_LZ4HC -static int lz4hc_compress_pages(struct compress_ctx *cc) -{ - unsigned char level = F2FS_I(cc->inode)->i_compress_level; - int len; - - if (level) - len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, - cc->clen, level, cc->private); - else - len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, - cc->clen, cc->private); - if (!len) - return -EAGAIN; - - cc->clen = len; - return 0; -} -#endif - static int lz4_compress_pages(struct compress_ctx *cc) { - int len; + int len = -EINVAL; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; -#ifdef CONFIG_F2FS_FS_LZ4HC - return lz4hc_compress_pages(cc); -#endif - len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + if (!level) + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); +#ifdef CONFIG_F2FS_FS_LZ4HC + else + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); +#endif + if (len < 0) + return len; if (!len) return -EAGAIN; From d760228a7a4ed46aa911b8d7fa87c907c3ecdd52 Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Tue, 4 Apr 2023 11:28:44 +0800 Subject: [PATCH 28/54] f2fs: use common implementation of file type Use common implementation of file type conversion helpers. Signed-off-by: Weizhao Ouyang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 39 +++------------------------------------ fs/f2fs/f2fs.h | 1 - fs/f2fs/inline.c | 2 +- include/linux/f2fs_fs.h | 15 --------------- 4 files changed, 4 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ce895a927c9f..1769e8b44ff3 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -42,39 +42,6 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { - [F2FS_FT_UNKNOWN] = DT_UNKNOWN, - [F2FS_FT_REG_FILE] = DT_REG, - [F2FS_FT_DIR] = DT_DIR, - [F2FS_FT_CHRDEV] = DT_CHR, - [F2FS_FT_BLKDEV] = DT_BLK, - [F2FS_FT_FIFO] = DT_FIFO, - [F2FS_FT_SOCK] = DT_SOCK, - [F2FS_FT_SYMLINK] = DT_LNK, -}; - -static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, - [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, -}; - -static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) -{ - de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; -} - -unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) -{ - if (de->file_type < F2FS_FT_MAX) - return f2fs_filetype_table[de->file_type]; - return DT_UNKNOWN; -} - /* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) @@ -485,7 +452,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode->i_mode); + de->file_type = fs_umode_to_ftype(inode->i_mode); set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); @@ -699,7 +666,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, de->name_len = cpu_to_le16(name->len); memcpy(d->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(ino); - set_de_type(de, mode); + de->file_type = fs_umode_to_ftype(mode); for (i = 0; i < slots; i++) { __set_bit_le(bit_pos + i, (void *)d->bitmap); /* avoid wrong garbage data for readdir */ @@ -1036,7 +1003,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - d_type = f2fs_get_de_type(de); + d_type = fs_ftype_to_dtype(de->file_type); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 68f24f82c923..ec2310c6ac47 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3466,7 +3466,6 @@ int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, /* * dir.c */ -unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname); int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 72269e7efd26..4638fee16a91 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -497,7 +497,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) fname.hash = de->hash_code; ino = le32_to_cpu(de->ino); - fake_mode = f2fs_get_de_type(de) << S_SHIFT; + fake_mode = fs_ftype_to_dtype(de->file_type) << S_DT_SHIFT; err = f2fs_add_regular_entry(dir, &fname, NULL, ino, fake_mode); if (err) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 881eb9321967..1d6402529d10 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -584,21 +584,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* file types used in inode_info->flags */ -enum { - F2FS_FT_UNKNOWN, - F2FS_FT_REG_FILE, - F2FS_FT_DIR, - F2FS_FT_CHRDEV, - F2FS_FT_BLKDEV, - F2FS_FT_FIFO, - F2FS_FT_SOCK, - F2FS_FT_SYMLINK, - F2FS_FT_MAX -}; - -#define S_SHIFT 12 - #define F2FS_DEF_PROJID 0 /* default project ID */ #endif /* _LINUX_F2FS_FS_H */ From 03b9ae463f59b96e5d600af9e8e9053bf357dc54 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Apr 2023 10:17:22 +0800 Subject: [PATCH 29/54] f2fs: use f2fs_hw_is_readonly() instead of bdev_read_only() f2fs has supported multi-device feature, to check devices' rw status, it should use f2fs_hw_is_readonly() rather than bdev_read_only(), fix it. Meanwhile, it removes f2fs_hw_is_readonly() check condition in: - f2fs_write_checkpoint() - f2fs_convert_inline_inode() As it has checked f2fs_readonly() condition, and if f2fs' devices were readonly, f2fs_readonly() must be true. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/super.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 24f6051bc284..e6e9a7b3d3af 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -713,7 +713,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; - if (bdev_read_only(sbi->sb->s_bdev)) { + if (f2fs_hw_is_readonly(sbi)) { f2fs_info(sbi, "write access unavailable, skipping orphan cleanup"); return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e8b2d1a42ea2..2976f774139d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3283,7 +3283,7 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, raw_super->segment_count = cpu_to_le32((main_end_blkaddr - segment0_blkaddr) >> log_blocks_per_seg); - if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + if (f2fs_readonly(sb) || f2fs_hw_is_readonly(sbi)) { set_sbi_flag(sbi, SBI_NEED_SB_WRITE); res = "internally"; } else { @@ -3859,7 +3859,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) int err; if ((recover && f2fs_readonly(sbi->sb)) || - bdev_read_only(sbi->sb->s_bdev)) { + f2fs_hw_is_readonly(sbi)) { set_sbi_flag(sbi, SBI_NEED_SB_WRITE); return -EROFS; } From 6fe7c2cf08a6d3c1de561d96508eea6b1d4b8e6b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Apr 2023 10:20:12 +0800 Subject: [PATCH 30/54] f2fs: remove unneeded in-memory i_crtime copy i_crtime will never change after inode creation, so we don't need to copy it into f2fs_inode_info.i_disk_time[3], and monitor its change to decide whether updating inode page, remove related stuff. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +---- fs/f2fs/inode.c | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec2310c6ac47..d96f5e9e2be1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -840,7 +840,7 @@ struct f2fs_inode_info { kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec64 i_crtime; /* inode creation time */ - struct timespec64 i_disk_time[4];/* inode disk times */ + struct timespec64 i_disk_time[3];/* inode disk times */ /* for file compress */ atomic_t i_compr_blocks; /* # of compressed blocks */ @@ -3284,9 +3284,6 @@ static inline bool f2fs_is_time_consistent(struct inode *inode) return false; if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, - &F2FS_I(inode)->i_crtime)) - return false; return true; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index efdc555dea96..7a13ab90e854 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -326,7 +326,6 @@ static void init_idisk_time(struct inode *inode) fi->i_disk_time[0] = inode->i_atime; fi->i_disk_time[1] = inode->i_ctime; fi->i_disk_time[2] = inode->i_mtime; - fi->i_disk_time[3] = fi->i_crtime; } static int do_read_inode(struct inode *inode) From 4aebaf7bfff9c178e9be72cc63f346aa36e625c6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Apr 2023 10:14:02 +0800 Subject: [PATCH 31/54] f2fs: fix to avoid use-after-free for cached IPU bio xfstest generic/019 reports a bug: kernel BUG at mm/filemap.c:1619! RIP: 0010:folio_end_writeback+0x8a/0x90 Call Trace: end_page_writeback+0x1c/0x60 f2fs_write_end_io+0x199/0x420 bio_endio+0x104/0x180 submit_bio_noacct+0xa5/0x510 submit_bio+0x48/0x80 f2fs_submit_write_bio+0x35/0x300 f2fs_submit_merged_ipu_write+0x2a0/0x2b0 f2fs_write_single_data_page+0x838/0x8b0 f2fs_write_cache_pages+0x379/0xa30 f2fs_write_data_pages+0x30c/0x340 do_writepages+0xd8/0x1b0 __writeback_single_inode+0x44/0x370 writeback_sb_inodes+0x233/0x4d0 __writeback_inodes_wb+0x56/0xf0 wb_writeback+0x1dd/0x2d0 wb_workfn+0x367/0x4a0 process_one_work+0x21d/0x430 worker_thread+0x4e/0x3c0 kthread+0x103/0x130 ret_from_fork+0x2c/0x50 The root cause is: after cp_error is set, f2fs_submit_merged_ipu_write() in f2fs_write_single_data_page() tries to flush IPU bio in cache, however f2fs_submit_merged_ipu_write() missed to check validity of @bio parameter, result in submitting random cached bio which belong to other IO context, then it will cause use-after-free issue, fix it by adding additional validity check. Fixes: 0b20fcec8651 ("f2fs: cache global IPU bio") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cbb4352d30ec..a5c28050066a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -875,6 +875,8 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, bool found = false; struct bio *target = bio ? *bio : NULL; + f2fs_bug_on(sbi, !target && !page); + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; @@ -2910,7 +2912,8 @@ out: if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_write(sbi, DATA); - f2fs_submit_merged_ipu_write(sbi, bio, NULL); + if (bio && *bio) + f2fs_submit_merged_ipu_write(sbi, bio, NULL); submitted = NULL; } From ac9998fd5a835464b9b3b87b79a7e0a2e0f39bda Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Apr 2023 10:12:22 +0800 Subject: [PATCH 32/54] f2fs: fix to drop all dirty pages during umount() if cp_error is set xfstest generic/361 reports a bug as below: f2fs_bug_on(sbi, sbi->fsync_node_num); kernel BUG at fs/f2fs/super.c:1627! RIP: 0010:f2fs_put_super+0x3a8/0x3b0 Call Trace: generic_shutdown_super+0x8c/0x1b0 kill_block_super+0x2b/0x60 kill_f2fs_super+0x87/0x110 deactivate_locked_super+0x39/0x80 deactivate_super+0x46/0x50 cleanup_mnt+0x109/0x170 __cleanup_mnt+0x16/0x20 task_work_run+0x65/0xa0 exit_to_user_mode_prepare+0x175/0x190 syscall_exit_to_user_mode+0x25/0x50 do_syscall_64+0x4c/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc During umount(), if cp_error is set, f2fs_wait_on_all_pages() should not stop waiting all F2FS_WB_CP_DATA pages to be writebacked, otherwise, fsync_node_num can be non-zero after f2fs_wait_on_all_pages() causing this bug. In this case, to avoid deadloop in f2fs_wait_on_all_pages(), it needs to drop all dirty pages rather than redirtying them. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 12 ++++++++++-- fs/f2fs/data.c | 3 ++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e6e9a7b3d3af..d79e189511e6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -325,8 +325,15 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; + } goto redirty_out; + } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) @@ -1304,7 +1311,8 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) if (!get_pages(sbi, type)) break; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE))) break; if (type == F2FS_DIRTY_META) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a5c28050066a..5cc9353900a0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2812,7 +2812,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, * don't drop any dirty dentry pages for keeping lastest * directory structure. */ - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) goto redirty_out; goto out; } From edb05b6e374092d38a3ea8d5bc510c02101368f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Apr 2023 10:11:40 +0800 Subject: [PATCH 33/54] f2fs: fix to keep consistent i_gc_rwsem lock order i_gc_rwsem[WRITE] and i_gc_rwsem[READ] lock order is reversed in gc_data_segment() and f2fs_dio_write_iter(), fix to keep consistent lock order as below: 1. lock i_gc_rwsem[WRITE] 2. lock i_gc_rwsem[READ] Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6652bace8984..a2abde99c7f3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1617,14 +1617,14 @@ next_step: int err; if (S_ISREG(inode->i_mode)) { - if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) { + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; continue; } if (!f2fs_down_write_trylock( - &fi->i_gc_rwsem[WRITE])) { + &fi->i_gc_rwsem[READ])) { sbi->skipped_gc_rwsem++; - f2fs_up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); continue; } locked = true; @@ -1647,8 +1647,8 @@ next_step: submitted++; if (locked) { - f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); } stat_inc_data_blk_count(sbi, 1, gc_type); From 0e43dfdc5d4fe6808daddc215287aaa4102e1006 Mon Sep 17 00:00:00 2001 From: Wang Han Date: Sun, 9 Apr 2023 13:29:51 +0800 Subject: [PATCH 34/54] docs: f2fs: Correct instruction to disable checkpoint This should be 'disable' rather than 'disabled'. Reported-by: LoveSy Signed-off-by: Wang Han Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 122d520e0a0c..0a9360b89eb6 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -264,7 +264,7 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl disabled, any unmounting or unexpected shutdowns will cause the filesystem contents to appear as they did when the filesystem was mounted with that option. - While mounting with checkpoint=disabled, the filesystem must + While mounting with checkpoint=disable, the filesystem must run garbage collection to ensure that all available space can be used. If this takes too much time, the mount may return EAGAIN. You may optionally add a value to indicate how much From b99e20f09df424c9cca466df711c071f8c6f8abe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Apr 2023 23:28:07 +0800 Subject: [PATCH 35/54] f2fs: fix to check readonly condition correctly With below case, it can mount multi-device image w/ rw option, however one of secondary device is set as ro, later update will cause panic, so let's introduce f2fs_dev_is_readonly(), and check multi-devices rw status in f2fs_remount() w/ it in order to avoid such inconsistent mount status. mkfs.f2fs -c /dev/zram1 /dev/zram0 -f blockdev --setro /dev/zram1 mount -t f2fs dev/zram0 /mnt/f2fs mount: /mnt/f2fs: WARNING: source write-protected, mounted read-only. mount -t f2fs -o remount,rw mnt/f2fs dd if=/dev/zero of=/mnt/f2fs/file bs=1M count=8192 kernel BUG at fs/f2fs/inline.c:258! RIP: 0010:f2fs_write_inline_data+0x23e/0x2d0 [f2fs] Call Trace: f2fs_write_single_data_page+0x26b/0x9f0 [f2fs] f2fs_write_cache_pages+0x389/0xa60 [f2fs] __f2fs_write_data_pages+0x26b/0x2d0 [f2fs] f2fs_write_data_pages+0x2e/0x40 [f2fs] do_writepages+0xd3/0x1b0 __writeback_single_inode+0x5b/0x420 writeback_sb_inodes+0x236/0x5a0 __writeback_inodes_wb+0x56/0xf0 wb_writeback+0x2a3/0x490 wb_do_writeback+0x2b2/0x330 wb_workfn+0x6a/0x260 process_one_work+0x270/0x5e0 worker_thread+0x52/0x3e0 kthread+0xf4/0x120 ret_from_fork+0x29/0x50 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d96f5e9e2be1..ebf16f538282 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4428,6 +4428,11 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_dev_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_hw_is_readonly(sbi); +} + static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2976f774139d..7e0a0c5f6e89 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2296,7 +2296,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; - if (f2fs_sb_has_readonly(sbi) && !(*flags & SB_RDONLY)) { + if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) { err = -EROFS; goto restore_opts; } From 931b9b0fa8d6d5d842bbc7ae961f5c5234765f26 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 2 Apr 2023 19:27:06 +0800 Subject: [PATCH 36/54] f2fs: fix to recover quota data correctly With -O quota mkfs option, xfstests generic/417 fails due to fsck detects data corruption on quota inodes. [ASSERT] (fsck_chk_quota_files:2051) --> Quota file is missing or invalid quota file content found. The root cause is there is a hole f2fs doesn't hold quota inodes, so all recovered quota data will be dropped due to SBI_POR_DOING flag was set. - f2fs_fill_super - f2fs_recover_orphan_inodes - f2fs_enable_quota_files - f2fs_quota_off_umount <--- quota inodes were dropped ---> - f2fs_recover_fsync_data - f2fs_enable_quota_files - f2fs_quota_off_umount This patch tries to eliminate the hole by holding quota inodes during entire recovery flow as below: - f2fs_fill_super - f2fs_recover_quota_begin - f2fs_recover_orphan_inodes - f2fs_recover_fsync_data - f2fs_recover_quota_end Then, recovered quota data can be persisted after SBI_POR_DOING is cleared. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 23 +----------------- fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 1 + fs/f2fs/recovery.c | 17 +------------ fs/f2fs/super.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 61 insertions(+), 38 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d79e189511e6..cb66a55b7629 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -711,11 +711,7 @@ err_out: int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - unsigned int s_flags = sbi->sb->s_flags; int err = 0; -#ifdef CONFIG_QUOTA - int quota_enabled; -#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; @@ -725,18 +721,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) return 0; } - if (s_flags & SB_RDONLY) { + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~SB_RDONLY; - } - -#ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * filesystem has quota feature, so that they are updated correctly. - */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); -#endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -770,13 +756,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) out: set_sbi_flag(sbi, SBI_IS_RECOVERED); -#ifdef CONFIG_QUOTA - /* Turn quotas off */ - if (quota_enabled) - f2fs_quota_off_umount(sbi->sb); -#endif - sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ - return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 99c7fc832ec7..61c35b59126e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -352,6 +352,7 @@ static const char *s_flag[MAX_SBI_FLAG] = { [SBI_QUOTA_NEED_REPAIR] = "quota_need_repair", [SBI_IS_RESIZEFS] = "resizefs", [SBI_IS_FREEZING] = "freezefs", + [SBI_IS_WRITABLE] = "writable", }; static const char *ipu_mode_names[F2FS_IPU_MAX] = { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ebf16f538282..bee60c2e03c2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1291,6 +1291,7 @@ enum { SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ SBI_IS_FREEZING, /* freezefs is in process */ + SBI_IS_WRITABLE, /* remove ro mountoption transiently */ MAX_SBI_FLAG, }; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9eb9ec9bddb2..4e37096cea9a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -825,19 +825,9 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; bool fix_curseg_write_pointer = false; -#ifdef CONFIG_QUOTA - int quota_enabled; -#endif - if (s_flags & SB_RDONLY) { + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); - sbi->sb->s_flags &= ~SB_RDONLY; - } - -#ifdef CONFIG_QUOTA - /* Turn on quotas so that they are updated correctly */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); -#endif INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&tmp_inode_list); @@ -909,11 +899,6 @@ skip: } } -#ifdef CONFIG_QUOTA - /* Turn quotas off */ - if (quota_enabled) - f2fs_quota_off_umount(sbi->sb); -#endif sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return ret ? ret : err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7e0a0c5f6e89..52d3226bf90b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2509,6 +2509,54 @@ restore_opts: } #ifdef CONFIG_QUOTA +static bool f2fs_need_recovery(struct f2fs_sb_info *sbi) +{ + /* need to recovery orphan */ + if (is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) + return true; + /* need to recovery data */ + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) + return false; + if (test_opt(sbi, NORECOVERY)) + return false; + return !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG); +} + +static bool f2fs_recover_quota_begin(struct f2fs_sb_info *sbi) +{ + bool readonly = f2fs_readonly(sbi->sb); + + if (!f2fs_need_recovery(sbi)) + return false; + + /* it doesn't need to check f2fs_sb_has_readonly() */ + if (f2fs_hw_is_readonly(sbi)) + return false; + + if (readonly) { + sbi->sb->s_flags &= ~SB_RDONLY; + set_sbi_flag(sbi, SBI_IS_WRITABLE); + } + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + return f2fs_enable_quota_files(sbi, readonly); +} + +static void f2fs_recover_quota_end(struct f2fs_sb_info *sbi, + bool quota_enabled) +{ + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); + + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) { + clear_sbi_flag(sbi, SBI_IS_WRITABLE); + sbi->sb->s_flags |= SB_RDONLY; + } +} + /* Read data from quotafile */ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) @@ -4118,6 +4166,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) int recovery, i, valid_super_block; struct curseg_info *seg_i; int retry_cnt = 1; +#ifdef CONFIG_QUOTA + bool quota_enabled = false; +#endif try_onemore: err = -EINVAL; @@ -4411,6 +4462,8 @@ try_onemore: if (err) f2fs_err(sbi, "Cannot turn on quotas: error %d", err); } + + quota_enabled = f2fs_recover_quota_begin(sbi); #endif /* if there are any orphan inodes, free them */ err = f2fs_recover_orphan_inodes(sbi); @@ -4468,6 +4521,10 @@ try_onemore: } } +#ifdef CONFIG_QUOTA + f2fs_recover_quota_end(sbi, quota_enabled); +#endif + /* * If the f2fs is not readonly and fsync data recovery succeeds, * check zoned block devices' write pointer consistency. From 475f660169d633373ee2f2a0e61a666ebb9ce9a4 Mon Sep 17 00:00:00 2001 From: Yohan Joung Date: Mon, 27 Mar 2023 22:58:00 +0900 Subject: [PATCH 37/54] f2fs: add radix_tree_preload_end in error case To prevent excessive increase in preemption count add radix_tree_preload_end in retry Signed-off-by: Yohan Joung Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cb66a55b7629..e13d47c04131 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -513,6 +513,7 @@ retry: if (!e) { if (!new) { spin_unlock(&im->ino_lock); + radix_tree_preload_end(); goto retry; } e = new; From f879f5e216a13501316806353b4d0aa5d5727fd7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Apr 2023 11:18:48 -0700 Subject: [PATCH 38/54] f2fs: fix potential corruption when moving a directory F2FS has the same issue in ext4_rename causing crash revealed by xfstests/generic/707. See also commit 0813299c586b ("ext4: Fix possible corruption when moving a directory") CC: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 93b6eba55401..342f18f6ebb1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -992,12 +992,20 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out; } + /* + * Copied from ext4_rename: we need to protect against old.inode + * directory getting converted from inline directory format into + * a normal one. + */ + if (S_ISDIR(old_inode->i_mode)) + inode_lock_nested(old_inode, I_MUTEX_NONDIR2); + err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) err = PTR_ERR(old_page); - goto out; + goto out_unlock_old; } if (S_ISDIR(old_inode->i_mode)) { @@ -1105,6 +1113,9 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, f2fs_unlock_op(sbi); + if (S_ISDIR(old_inode->i_mode)) + inode_unlock(old_inode); + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1119,6 +1130,9 @@ out_dir: f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); +out_unlock_old: + if (S_ISDIR(old_inode->i_mode)) + inode_unlock(old_inode); out: iput(whiteout); return err; From cba03c29a1e1c789d6ca6ca9697103b361d0a63b Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 6 Apr 2023 15:11:04 -0700 Subject: [PATCH 39/54] f2fs: fix passing relative address when discard zones We should not pass relative address in a zone to __f2fs_issue_discard_zone(). Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7416d33fce27..1494d335e81f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4943,9 +4943,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) "New zone for curseg[%d] is not yet discarded. " "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); - err = __f2fs_issue_discard_zone(sbi, zbd->bdev, - zone_sector >> log_sectors_per_block, - zone.len >> log_sectors_per_block); + err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, + zone.len >> log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); From 5a37d6f4943555ef57fc16016872f713818f0b2a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2023 22:44:53 +0800 Subject: [PATCH 40/54] f2fs: fix to check return value of f2fs_do_truncate_blocks() Otherwise, if truncation on cow_inode failed, remained data may pollute current transaction of atomic write. Cc: Daeho Jeong Fixes: a46bebd502fe ("f2fs: synchronize atomic write aborts") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5164f673efff..176d1b1c8ceb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2100,7 +2100,11 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); } else { /* Reuse the already created COW inode */ - f2fs_do_truncate_blocks(fi->cow_inode, 0, true); + ret = f2fs_do_truncate_blocks(fi->cow_inode, 0, true); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } } f2fs_write_inode(inode, NULL); From 9f45220756d973668d27ab2639d68c350ad5455b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2023 22:45:36 +0800 Subject: [PATCH 41/54] f2fs: fix to check return value of inc_valid_block_count() In __replace_atomic_write_block(), we missed to check return value of inc_valid_block_count(), for extreme testcase that f2fs image is run out of space, it may cause inconsistent status in between SIT table and total valid block count. Cc: Daeho Jeong Fixes: 3db1de0e582c ("f2fs: change the current atomic write way") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1494d335e81f..877096c3e6e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -245,10 +245,16 @@ retry: } else { blkcnt_t count = 1; + err = inc_valid_block_count(sbi, inode, &count); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + *old_addr = dn.data_blkaddr; f2fs_truncate_data_blocks_range(&dn, 1); dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); - inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, false); } From 6a328c646fd36494ce7c8040f7b487c3eafe1274 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 4 Apr 2023 22:38:31 +0800 Subject: [PATCH 42/54] f2fs: remove batched_trim_sections node description It's deprecated since commit 377224c47118 ("f2fs: don't split checkpoint in fstrim"). Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index c1314b7fe544..8140fc98f5ae 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -190,12 +190,6 @@ Description: Controls the memory footprint used by free nids and cached nat entries. By default, 1 is set, which indicates 10 MB / 1 GB RAM. -What: /sys/fs/f2fs//batched_trim_sections -Date: February 2015 -Contact: "Jaegeuk Kim" -Description: Controls the trimming rate in batch mode. - - What: /sys/fs/f2fs//cp_interval Date: October 2015 Contact: "Jaegeuk Kim" From 56326e3e0d6ddfec2bc0cf3f4db9eeb3eeb5b1aa Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Thu, 13 Apr 2023 18:17:11 +0800 Subject: [PATCH 43/54] f2fs: support iopoll method Wire up the iopoll method to the common implementation. As f2fs use common dio infrastructure: commit a1e09b03e6f5 ("f2fs: use iomap for direct I/O") Signed-off-by: Wu Bo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 176d1b1c8ceb..ac2d352b0811 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4895,6 +4895,7 @@ const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, .read_iter = f2fs_file_read_iter, .write_iter = f2fs_file_write_iter, + .iopoll = iomap_dio_iopoll, .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, From a1b01c5b02dd989383a69d9eba79a89b61314e48 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 7 Apr 2023 03:16:29 +0800 Subject: [PATCH 44/54] f2fs: remove bulk remove_proc_entry() and unnecessary kobject_del() Convert to use remove_proc_subtree() and kill kobject_del() directly. kobject_put() actually covers kobject removal automatically, which is single stage removal. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9f67504710be..6e23bec9bd20 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1463,25 +1463,14 @@ put_sb_kobj: void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - if (sbi->s_proc) { -#ifdef CONFIG_F2FS_IOSTAT - remove_proc_entry("iostat_info", sbi->s_proc); -#endif - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry("victim_bits", sbi->s_proc); - remove_proc_entry("discard_plist_info", sbi->s_proc); - remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); - } + if (sbi->s_proc) + remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); - kobject_del(&sbi->s_stat_kobj); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); - kobject_del(&sbi->s_feature_list_kobj); kobject_put(&sbi->s_feature_list_kobj); wait_for_completion(&sbi->s_feature_list_kobj_unregister); - kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } From b4ad76d4ca87473304f9b7387be8d97b48166c1d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Dec 2022 20:13:45 +0800 Subject: [PATCH 45/54] f2fs: fix to call clear_page_private_reference in .{release,invalid}_folio b763f3bedc2d ("f2fs: restructure f2fs page.private layout") missed to call clear_page_private_reference() in .{release,invalid}_folio, fix it, though it's not a big deal since folio_detach_private() was called to clear all privae info and reference count in the page. BTW, remove page_private_reference() definition as it never be used. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/f2fs.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5cc9353900a0..67b337088741 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3710,6 +3710,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } + clear_page_private_reference(page); clear_page_private_gcing(page); if (test_opt(sbi, COMPRESS_CACHE) && @@ -3733,6 +3734,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) clear_page_private_data(page); } + clear_page_private_reference(page); clear_page_private_gcing(page); detach_page_private(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bee60c2e03c2..988b4ff573d6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1429,7 +1429,6 @@ static inline void clear_page_private_##name(struct page *page) \ } PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); -PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); From aac0a045e507e2f830b99d27876fdbf36349cc39 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Apr 2023 10:24:17 +0800 Subject: [PATCH 46/54] f2fs: remove folio_detach_private() in .invalidate_folio and .release_folio We have maintain PagePrivate and page_private and page reference w/ {set,clear}_page_private_*, it doesn't need to call folio_detach_private() in the end of .invalidate_folio and .release_folio, remove it and use f2fs_bug_on instead. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 24 +------- fs/f2fs/dir.c | 6 +- fs/f2fs/f2fs.h | 154 ++++++++++++++++++++++++------------------------- 3 files changed, 77 insertions(+), 107 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 67b337088741..3ba33a244bd9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3709,16 +3709,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, f2fs_remove_dirty_inode(inode); } } - - clear_page_private_reference(page); - clear_page_private_gcing(page); - - if (test_opt(sbi, COMPRESS_CACHE) && - inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(page); - - detach_page_private(page); - set_page_private(page, 0); + clear_page_private_all(page); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -3727,18 +3718,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; - if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { - struct inode *inode = page->mapping->host; - - if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode))) - clear_page_private_data(page); - } - - clear_page_private_reference(page); - clear_page_private_gcing(page); - - detach_page_private(page); - set_page_private(page, 0); + clear_page_private_all(page); return 1; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1769e8b44ff3..30cb141abc64 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -905,14 +905,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); ClearPageUptodate(page); - - clear_page_private_gcing(page); + clear_page_private_all(page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); - - detach_page_private(page); - set_page_private(page, 0); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 988b4ff573d6..9400e7ff9257 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1395,86 +1395,6 @@ enum { PAGE_PRIVATE_MAX }; -#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ -static inline bool page_private_##name(struct page *page) \ -{ \ - return PagePrivate(page) && \ - test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ - test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ -} - -#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ -static inline void set_page_private_##name(struct page *page) \ -{ \ - if (!PagePrivate(page)) { \ - get_page(page); \ - SetPagePrivate(page); \ - set_page_private(page, 0); \ - } \ - set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ - set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ -} - -#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ -static inline void clear_page_private_##name(struct page *page) \ -{ \ - clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ - if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { \ - set_page_private(page, 0); \ - if (PagePrivate(page)) { \ - ClearPagePrivate(page); \ - put_page(page); \ - }\ - } \ -} - -PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); -PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); -PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); - -PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); -PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); -PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); - -PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); -PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); -PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); - -static inline unsigned long get_page_private_data(struct page *page) -{ - unsigned long data = page_private(page); - - if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) - return 0; - return data >> PAGE_PRIVATE_MAX; -} - -static inline void set_page_private_data(struct page *page, unsigned long data) -{ - if (!PagePrivate(page)) { - get_page(page); - SetPagePrivate(page); - set_page_private(page, 0); - } - set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); - page_private(page) |= data << PAGE_PRIVATE_MAX; -} - -static inline void clear_page_private_data(struct page *page) -{ - page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); - if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { - set_page_private(page, 0); - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - } - } -} - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, @@ -2369,6 +2289,80 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...); #define f2fs_debug(sbi, fmt, ...) \ f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__) +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) \ + attach_page_private(page, (void *)0); \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) \ + detach_page_private(page); \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); + +static inline unsigned long get_page_private_data(struct page *page) +{ + unsigned long data = page_private(page); + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; +} + +static inline void set_page_private_data(struct page *page, unsigned long data) +{ + if (!PagePrivate(page)) + attach_page_private(page, (void *)0); + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); + page_private(page) |= data << PAGE_PRIVATE_MAX; +} + +static inline void clear_page_private_data(struct page *page) +{ + page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) + detach_page_private(page); +} + +static inline void clear_page_private_all(struct page *page) +{ + clear_page_private_data(page); + clear_page_private_reference(page); + clear_page_private_gcing(page); + clear_page_private_inline(page); + + f2fs_bug_on(F2FS_P_SB(page), page_private(page)); +} + static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, block_t count) From 60ada758fa8952a4f1d25ccc81eb6907cbf6f7f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 10 Apr 2023 14:48:50 -0700 Subject: [PATCH 47/54] f2fs: refactor f2fs_gc to call checkpoint in urgent condition The major change is to call checkpoint, if there's not enough space while having some prefree segments in FG_GC case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a2abde99c7f3..6f9e29ac20d9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1830,7 +1830,10 @@ gc_more: goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { + /* Let's run FG_GC, if we don't have enough space. */ + if (has_not_enough_free_secs(sbi, 0, 0)) { + gc_type = FG_GC; + /* * For example, if there are many prefree_segments below given * threshold, we can make them free by checkpoint. Then, we @@ -1841,8 +1844,6 @@ gc_more: if (ret) goto stop; } - if (has_not_enough_free_secs(sbi, 0, 0)) - gc_type = FG_GC; } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ @@ -1869,19 +1870,15 @@ retry: if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; - if (gc_type == FG_GC) + if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; - if (gc_control->init_gc_type == FG_GC || - !has_not_enough_free_secs(sbi, - (gc_type == FG_GC) ? sec_freed : 0, 0)) { - if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) - goto go_gc_more; - goto stop; - } - - /* FG_GC stops GC by skip_count */ - if (gc_type == FG_GC) { + if (!has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (!gc_control->no_bg_gc && + sec_freed < gc_control->nr_free_secs) + goto go_gc_more; + goto stop; + } if (sbi->skipped_gc_rwsem) skipped_round++; round++; @@ -1890,6 +1887,8 @@ retry: ret = f2fs_write_checkpoint(sbi, &cpc); goto stop; } + } else if (!has_not_enough_free_secs(sbi, 0, 0)) { + goto stop; } __get_secs_required(sbi, NULL, &upper_secs, NULL); From 4d199b7d832e89d39a63787f293c7d912ac097f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Apr 2023 16:56:20 -0700 Subject: [PATCH 48/54] f2fs: relax sanity check if checkpoint is corrupted 1. extent_cache - let's drop the largest extent_cache 2. invalidate_block - don't show the warnings Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++++++++ fs/f2fs/data.c | 4 ++++ fs/f2fs/extent_cache.c | 22 +++++++++++++++------- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e13d47c04131..cdde115c542e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -152,6 +152,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + + /* skip data, if we already have an error in checkpoint. */ + if (unlikely(f2fs_cp_error(sbi))) + return exist; + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); @@ -202,6 +207,11 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { + + /* Skip to emit an error message. */ + if (unlikely(f2fs_cp_error(sbi))) + return false; + f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3ba33a244bd9..d39496383656 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2250,6 +2250,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (ret) goto out; + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out_put_dnode; + } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 9a8153895d20..bea6ab9d846a 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -23,18 +23,26 @@ bool sanity_check_extent_cache(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); + struct extent_tree *et = fi->extent_tree[EX_READ]; struct extent_info *ei; - if (!fi->extent_tree[EX_READ]) + if (!et) return true; - ei = &fi->extent_tree[EX_READ]->largest; + ei = &et->largest; + if (!ei->len) + return true; - if (ei->len && - (!f2fs_is_valid_blkaddr(sbi, ei->blk, - DATA_GENERIC_ENHANCE) || - !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, - DATA_GENERIC_ENHANCE))) { + /* Let's drop, if checkpoint got corrupted. */ + if (is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) { + ei->len = 0; + et->largest_updated = true; + return true; + } + + if (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC_ENHANCE)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", __func__, inode->i_ino, From 3421338a094476fc469a936cde17b7ff8fec770c Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 14 Apr 2023 00:59:51 +0800 Subject: [PATCH 49/54] f2fs: add has_enough_free_secs() Replace !has_not_enough_free_secs w/ has_enough_free_secs. BTW avoid nested 'if' statements in f2fs_balance_fs(). Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++-- fs/f2fs/segment.c | 41 +++++++++++++++++++++-------------------- fs/f2fs/segment.h | 8 +++++++- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6f9e29ac20d9..ca5f2294a136 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1873,7 +1873,7 @@ retry: if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; - if (!has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (has_enough_free_secs(sbi, sec_freed, 0)) { if (!gc_control->no_bg_gc && sec_freed < gc_control->nr_free_secs) goto go_gc_more; @@ -1887,7 +1887,7 @@ retry: ret = f2fs_write_checkpoint(sbi, &cpc); goto stop; } - } else if (!has_not_enough_free_secs(sbi, 0, 0)) { + } else if (has_enough_free_secs(sbi, 0, 0)) { goto stop; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 877096c3e6e3..1b9a16f36dfb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -411,27 +411,28 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi, 0, 0)) { - if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && - sbi->gc_thread->f2fs_gc_task) { - DEFINE_WAIT(wait); + if (has_enough_free_secs(sbi, 0, 0)) + return; - prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, - TASK_UNINTERRUPTIBLE); - wake_up(&sbi->gc_thread->gc_wait_queue_head); - io_schedule(); - finish_wait(&sbi->gc_thread->fggc_wq, &wait); - } else { - struct f2fs_gc_control gc_control = { - .victim_segno = NULL_SEGNO, - .init_gc_type = BG_GC, - .no_bg_gc = true, - .should_migrate_blocks = false, - .err_gc_skipped = false, - .nr_free_secs = 1 }; - f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, &gc_control); - } + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); + f2fs_gc(sbi, &gc_control); } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ac2e35170f2d..2ca8fb5d0dc4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -643,11 +643,17 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, return !curseg_space; } +static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) +{ + return !has_not_enough_free_secs(sbi, freed, needed); +} + static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; - if (likely(!has_not_enough_free_secs(sbi, 0, 0))) + if (likely(has_enough_free_secs(sbi, 0, 0))) return true; return false; } From 5d4d9fa907f0be4fdbb64456836a60837431c1df Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Fri, 14 Apr 2023 18:43:08 +0800 Subject: [PATCH 50/54] f2fs: allocate trace path buffer from names_cache It would be better to use the dedicated slab to store path. Signed-off-by: Wu Bo Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 +++++++++++++ fs/f2fs/file.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9400e7ff9257..c031c62b8d2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3338,6 +3338,19 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } +static inline void *f2fs_getname(struct f2fs_sb_info *sbi) +{ + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; + + return __getname(); +} + +static inline void f2fs_putname(char *buf) +{ + __putname(buf); +} + static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ac2d352b0811..97ea95bbec7d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4360,7 +4360,7 @@ static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw) struct inode *inode = file_inode(iocb->ki_filp); char *buf, *path; - buf = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + buf = f2fs_getname(F2FS_I_SB(inode)); if (!buf) return; path = dentry_path_raw(file_dentry(iocb->ki_filp), buf, PATH_MAX); @@ -4373,7 +4373,7 @@ static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw) trace_f2fs_dataread_start(inode, iocb->ki_pos, count, current->pid, path, current->comm); free_buf: - kfree(buf); + f2fs_putname(buf); } static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) From b8cd743c9d9b2266a37ecd82fe34e1a6dc708552 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Apr 2023 17:12:52 -0700 Subject: [PATCH 51/54] f2fs: remove power-of-two limitation of zoned device In f2fs, there's no reason to force po2. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/gc.c | 4 ++-- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 8 ++------ 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c031c62b8d2c..cba9113b99a2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1520,7 +1520,6 @@ struct f2fs_sb_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ - unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ #endif /* for node-related operations */ @@ -4386,7 +4385,7 @@ F2FS_FEATURE_FUNCS(readonly, RO); static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) { - unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + unsigned int zno = blkaddr / sbi->blocks_per_blkz; return test_bit(zno, FDEV(devi).blkz_seq); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ca5f2294a136..2e4f48beed46 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2094,8 +2094,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) FDEV(last_dev).end_blk = (long long)FDEV(last_dev).end_blk + blks; #ifdef CONFIG_BLK_DEV_ZONED - FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz + - (int)(blks >> sbi->log_blocks_per_blkz); + FDEV(last_dev).nr_blkz = FDEV(last_dev).nr_blkz + + div_u64(blks, sbi->blocks_per_blkz); #endif } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1b9a16f36dfb..1130cbb59a6a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2076,7 +2076,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, if (force && start >= cpc->trim_start && (end - 1) <= cpc->trim_end) - continue; + continue; /* Should cover 2MB zoned device for zone-based reset */ if (!f2fs_sb_has_blkzoned(sbi) && diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 52d3226bf90b..5900a4fb4256 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3815,12 +3815,8 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); - if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != - __ilog2_u32(sbi->blocks_per_blkz)) - return -EINVAL; - sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = div_u64(SECTOR_TO_BLOCK(nr_sectors), + sbi->blocks_per_blkz); if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; From a13c8e91f9c28459f8cc5c03b3c4b225f7173826 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 18 Apr 2023 10:52:06 -0700 Subject: [PATCH 52/54] f2fs: use cow inode data when updating atomic write Need to use cow inode data content instead of the one in the original inode, when we try to write the already updated atomic write files. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d39496383656..c4133826b8db 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3472,7 +3472,7 @@ unlock_out: static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, struct page *page, loff_t pos, unsigned int len, - block_t *blk_addr, bool *node_changed) + block_t *blk_addr, bool *node_changed, bool *use_cow) { struct inode *inode = page->mapping->host; struct inode *cow_inode = F2FS_I(inode)->cow_inode; @@ -3486,10 +3486,12 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, /* Look for the block in COW inode first */ err = __find_data_block(cow_inode, index, blk_addr); - if (err) + if (err) { return err; - else if (*blk_addr != NULL_ADDR) + } else if (*blk_addr != NULL_ADDR) { + *use_cow = true; return 0; + } if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) goto reserve_block; @@ -3520,6 +3522,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; bool need_balance = false; + bool use_cow = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3579,7 +3582,7 @@ repeat: if (f2fs_is_atomic_file(inode)) err = prepare_atomic_write_begin(sbi, page, pos, len, - &blkaddr, &need_balance); + &blkaddr, &need_balance, &use_cow); else err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); @@ -3619,7 +3622,9 @@ repeat: f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } - err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); + err = f2fs_submit_page_read(use_cow ? + F2FS_I(inode)->cow_inode : inode, page, + blkaddr, 0, true); if (err) goto fail; From 4bfa7f8c9f4561f707366ad1a1b56a2f7d3645aa Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 18 Apr 2023 10:42:01 -0700 Subject: [PATCH 53/54] f2fs: allocate node blocks for atomic write block replacement When a node block is missing for atomic write block replacement, we need to allocate it in advance of the replacement. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1130cbb59a6a..ccfe19e48d1a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -216,7 +216,7 @@ static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); From e773d28b76ec8e1db82af1b9112cb16ff1cc7f7f Mon Sep 17 00:00:00 2001 From: Qi Han Date: Tue, 18 Apr 2023 14:09:54 +0800 Subject: [PATCH 54/54] f2fs: remove unnessary comment in __may_age_extent_tree This comment make no sense and is in the wrong place, so let's remove it. Signed-off-by: Qi Han Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index bea6ab9d846a..0e2d49140c07 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -94,7 +94,6 @@ static bool __may_age_extent_tree(struct inode *inode) if (!test_opt(sbi, AGE_EXTENT_CACHE)) return false; - /* don't cache block age info for cold file */ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) return false; if (file_is_cold(inode))