From fa33f94b9954bab030f58021167e538360bcff9c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Apr 2022 11:30:15 -0700 Subject: [PATCH 01/57] FROMGIT: scsi: sd: sd_zbc: Improve source code documentation Add several kernel-doc headers. Declare input arrays const. Specify the array size in function declarations. Link: https://lore.kernel.org/r/20220421183023.3462291-2-bvanassche@acm.org Reviewed-by: Damien Le Moal Reviewed-by: Himanshu Madhani Acked-by: Douglas Gilbert Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Bug: 230616396 (cherry picked from commit aa96bfb4caff59c93f0637092efe3a714cab0fe6 mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: I51e73bc54328388adf8822d2db1049fc2d1198bf --- drivers/scsi/sd.h | 5 ++-- drivers/scsi/sd_zbc.c | 55 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index b59136c4125b..33cc01f98c6a 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -217,7 +217,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED void sd_zbc_release_disk(struct scsi_disk *sdkp); -int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); +int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); @@ -233,8 +233,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, static inline void sd_zbc_release_disk(struct scsi_disk *sdkp) {} -static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, - unsigned char *buf) +static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) { return 0; } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ed06798983f8..ddd752e728ac 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -20,6 +20,12 @@ #include "sd.h" 
+/** + * sd_zbc_get_zone_wp_offset - Get zone write pointer offset. + * @zone: Zone for which to return the write pointer offset. + * + * Return: offset of the write pointer from the start of the zone. + */ static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) { if (zone->type == ZBC_ZONE_TYPE_CONV) @@ -44,7 +50,21 @@ static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) } } -static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, +/** + * sd_zbc_parse_report - Parse a SCSI zone descriptor + * @sdkp: SCSI disk pointer. + * @buf: SCSI zone descriptor. + * @idx: Index of the zone relative to the first zone reported by the current + * sd_zbc_report_zones() call. + * @cb: Callback function pointer. + * @data: Second argument passed to @cb. + * + * Return: Value returned by @cb. + * + * Convert a SCSI zone descriptor into struct blk_zone format. Additionally, + * call @cb(blk_zone, @data). + */ +static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], unsigned int idx, report_zones_cb cb, void *data) { struct scsi_device *sdp = sdkp->device; @@ -189,6 +209,17 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) return logical_to_sectors(sdkp->device, sdkp->zone_blocks); } +/** + * sd_zbc_report_zones - SCSI .report_zones() callback. + * @disk: Disk to report zones for. + * @sector: Start sector. + * @nr_zones: Maximum number of zones to report. + * @cb: Callback function called to report zone information. + * @data: Second argument passed to @cb. + * + * Called by the block layer to iterate over zone information. See also the + * disk->fops->report_zones() calls in block/blk-zoned.c. + */ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { @@ -276,6 +307,10 @@ static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx, return 0; } +/* + * An attempt to append a zone triggered an invalid write pointer error. 
+ * Reread the write pointer of the zone(s) in which the append failed. + */ static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) { struct scsi_disk *sdkp; @@ -587,7 +622,7 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, * sd_zbc_check_capacity - Check the device capacity * @sdkp: Target disk * @buf: command buffer - * @zblocks: zone size in number of blocks + * @zblocks: zone size in logical blocks * * Get the device zone size and check that the device capacity as reported * by READ CAPACITY matches the max_lba value (plus one) of the report zones @@ -698,6 +733,11 @@ static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset); } +/* + * Call blk_revalidate_disk_zones() if any of the zoned disk properties have + * changed that make it necessary to call that function. Called by + * sd_revalidate_disk() after the gendisk capacity has been set. + */ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) { struct gendisk *disk = sdkp->disk; @@ -776,7 +816,16 @@ unlock: return ret; } -int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) +/** + * sd_zbc_read_zones - Read zone information and update the request queue + * @sdkp: SCSI disk pointer. + * @buf: 512 byte buffer used for storing SCSI command output. + * + * Read zone information and update the request queue zone characteristics and + * also the zoned device information in *sdkp. Called by sd_revalidate_disk() + * before the gendisk capacity has been set. 
+ */ +int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) { struct gendisk *disk = sdkp->disk; struct request_queue *q = disk->queue; From 90f56ddc367201faaac396b85542ef6db471baf6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Apr 2022 11:30:16 -0700 Subject: [PATCH 02/57] FROMGIT: scsi: sd: sd_zbc: Verify that the zone size is a power of two The following check in sd_zbc_cmnd_checks() can only work correctly if the zone size is a power of two: if (sector & (sd_zbc_zone_sectors(sdkp) - 1)) /* Unaligned request */ return BLK_STS_IOERR; Hence this patch that verifies that the zone size is a power of two. Link: https://lore.kernel.org/r/20220421183023.3462291-3-bvanassche@acm.org Reviewed-by: Damien Le Moal Reviewed-by: Himanshu Madhani Acked-by: Douglas Gilbert Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Bug: 230616396 (cherry picked from commit 9a93b9c9d38aee1f729f3cea72971a2616dca936 mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: I99b06e3dbc4f99d3696edde0af60e8fb876745b7 --- drivers/scsi/sd_zbc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ddd752e728ac..bc2bd6d64340 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -666,6 +666,13 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf, return -EFBIG; } + if (!is_power_of_2(zone_blocks)) { + sd_printk(KERN_ERR, sdkp, + "Zone size %llu is not a power of two.\n", + zone_blocks); + return -EINVAL; + } + *zblocks = zone_blocks; return 0; From b58cf8995b1ab298b5f06f5c550b580b881d29b2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Apr 2022 11:30:17 -0700 Subject: [PATCH 03/57] FROMGIT: scsi: sd: sd_zbc: Use logical blocks as unit when querying zones When querying zones, track the position in logical blocks instead of in sectors. This change slightly simplifies sd_zbc_report_zones(). 
Link: https://lore.kernel.org/r/20220421183023.3462291-4-bvanassche@acm.org Reviewed-by: Himanshu Madhani Acked-by: Douglas Gilbert Signed-off-by: Damien Le Moal [ bvanassche: extracted this change from a larger patch ] Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Bug: 230616396 (cherry picked from commit 43af5da09efb8abe450ec859d3063adeb7d1eb54 mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: I0f4e9c441c3c77c13ee0f715bb2cf59ee5798bdf --- drivers/scsi/sd_zbc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index bc2bd6d64340..f40633ac8f4b 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -224,7 +224,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct scsi_disk *sdkp = scsi_disk(disk); - sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity); + sector_t lba = sectors_to_logical(sdkp->device, sector); unsigned int nr, i; unsigned char *buf; size_t offset, buflen = 0; @@ -235,7 +235,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, /* Not a zoned device */ return -EOPNOTSUPP; - if (!capacity) + if (!sdkp->capacity) /* Device gone or invalid */ return -ENODEV; @@ -243,9 +243,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, if (!buf) return -ENOMEM; - while (zone_idx < nr_zones && sector < capacity) { - ret = sd_zbc_do_report_zones(sdkp, buf, buflen, - sectors_to_logical(sdkp->device, sector), true); + while (zone_idx < nr_zones && lba < sdkp->capacity) { + ret = sd_zbc_do_report_zones(sdkp, buf, buflen, lba, true); if (ret) goto out; @@ -263,7 +262,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, zone_idx++; } - sector += sd_zbc_zone_sectors(sdkp) * i; + lba += sdkp->zone_blocks * i; } ret = zone_idx; From 368693d3e2fb53a3a0d1b83ded05eaa736500570 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: 
Thu, 21 Apr 2022 11:30:18 -0700 Subject: [PATCH 04/57] FROMGIT: scsi: sd: sd_zbc: Introduce struct zoned_disk_info Deriving the meaning of the nr_zones, rev_nr_zones, zone_blocks and rev_zone_blocks member variables requires careful analysis of the source code. Make the meaning of these member variables easier to understand by introducing struct zoned_disk_info. Link: https://lore.kernel.org/r/20220421183023.3462291-5-bvanassche@acm.org Reviewed-by: Damien Le Moal Reviewed-by: Himanshu Madhani Acked-by: Douglas Gilbert Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Bug: 230616396 (cherry picked from commit 628617be8968e575ca0a0637fb227f8a990cb2f7 mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: Ic846769a2562fcab60511873afc1586a8e2a02ab --- drivers/scsi/sd.h | 22 +++++++++++++++---- drivers/scsi/sd_zbc.c | 49 ++++++++++++++++++++----------------------- 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 33cc01f98c6a..6e93800d6e76 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -67,6 +67,20 @@ enum { SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ }; +/** + * struct zoned_disk_info - Specific properties of a ZBC SCSI device. + * @nr_zones: number of zones. + * @zone_blocks: number of logical blocks per zone. + * + * This data structure holds the ZBC SCSI device properties that are retrieved + * twice: a first time before the gendisk capacity is known and a second time + * after the gendisk capacity is known. + */ +struct zoned_disk_info { + u32 nr_zones; + u32 zone_blocks; +}; + struct scsi_disk { struct scsi_driver *driver; /* always &sd_template */ struct scsi_device *device; @@ -74,10 +88,10 @@ struct scsi_disk { struct gendisk *disk; struct opal_dev *opal_dev; #ifdef CONFIG_BLK_DEV_ZONED - u32 nr_zones; - u32 rev_nr_zones; - u32 zone_blocks; - u32 rev_zone_blocks; + /* Updated during revalidation before the gendisk capacity is known. 
*/ + struct zoned_disk_info early_zone_info; + /* Updated during revalidation after the gendisk capacity is known. */ + struct zoned_disk_info zone_info; u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index f40633ac8f4b..1b31e623ba31 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -181,7 +181,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, * sure that the allocated buffer can always be mapped by limiting the * number of pages allocated to the HBA max segments limit. */ - nr_zones = min(nr_zones, sdkp->nr_zones); + nr_zones = min(nr_zones, sdkp->zone_info.nr_zones); bufsize = roundup((nr_zones + 1) * 64, SECTOR_SIZE); bufsize = min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); @@ -206,7 +206,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, */ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) { - return logical_to_sectors(sdkp->device, sdkp->zone_blocks); + return logical_to_sectors(sdkp->device, sdkp->zone_info.zone_blocks); } /** @@ -262,7 +262,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, zone_idx++; } - lba += sdkp->zone_blocks * i; + lba += sdkp->zone_info.zone_blocks * i; } ret = zone_idx; @@ -320,14 +320,14 @@ static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work); spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - for (zno = 0; zno < sdkp->nr_zones; zno++) { + for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) { if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) continue; spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf, SD_BUF_SIZE, - zno * sdkp->zone_blocks, true); + zno * sdkp->zone_info.zone_blocks, true); spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); if (!ret) sd_zbc_parse_report(sdkp, 
sdkp->zone_wp_update_buf + 64, @@ -394,7 +394,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, break; default: wp_offset = sectors_to_logical(sdkp->device, wp_offset); - if (wp_offset + nr_blocks > sdkp->zone_blocks) { + if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) { ret = BLK_STS_IOERR; break; } @@ -525,7 +525,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, break; case REQ_OP_ZONE_RESET_ALL: memset(sdkp->zones_wp_offset, 0, - sdkp->nr_zones * sizeof(unsigned int)); + sdkp->zone_info.nr_zones * sizeof(unsigned int)); break; default: break; @@ -682,16 +682,16 @@ static void sd_zbc_print_zones(struct scsi_disk *sdkp) if (!sd_is_zoned(sdkp) || !sdkp->capacity) return; - if (sdkp->capacity & (sdkp->zone_blocks - 1)) + if (sdkp->capacity & (sdkp->zone_info.zone_blocks - 1)) sd_printk(KERN_NOTICE, sdkp, "%u zones of %u logical blocks + 1 runt zone\n", - sdkp->nr_zones - 1, - sdkp->zone_blocks); + sdkp->zone_info.nr_zones - 1, + sdkp->zone_info.zone_blocks); else sd_printk(KERN_NOTICE, sdkp, "%u zones of %u logical blocks\n", - sdkp->nr_zones, - sdkp->zone_blocks); + sdkp->zone_info.nr_zones, + sdkp->zone_info.zone_blocks); } static int sd_zbc_init_disk(struct scsi_disk *sdkp) @@ -718,10 +718,8 @@ static void sd_zbc_clear_zone_info(struct scsi_disk *sdkp) kfree(sdkp->zone_wp_update_buf); sdkp->zone_wp_update_buf = NULL; - sdkp->nr_zones = 0; - sdkp->rev_nr_zones = 0; - sdkp->zone_blocks = 0; - sdkp->rev_zone_blocks = 0; + sdkp->early_zone_info = (struct zoned_disk_info){ }; + sdkp->zone_info = (struct zoned_disk_info){ }; mutex_unlock(&sdkp->rev_mutex); } @@ -748,8 +746,8 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) { struct gendisk *disk = sdkp->disk; struct request_queue *q = disk->queue; - u32 zone_blocks = sdkp->rev_zone_blocks; - unsigned int nr_zones = sdkp->rev_nr_zones; + u32 zone_blocks = sdkp->early_zone_info.zone_blocks; + unsigned int nr_zones = sdkp->early_zone_info.nr_zones; u32 
max_append; int ret = 0; unsigned int flags; @@ -780,14 +778,14 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) */ mutex_lock(&sdkp->rev_mutex); - if (sdkp->zone_blocks == zone_blocks && - sdkp->nr_zones == nr_zones && + if (sdkp->zone_info.zone_blocks == zone_blocks && + sdkp->zone_info.nr_zones == nr_zones && disk->queue->nr_zones == nr_zones) goto unlock; flags = memalloc_noio_save(); - sdkp->zone_blocks = zone_blocks; - sdkp->nr_zones = nr_zones; + sdkp->zone_info.zone_blocks = zone_blocks; + sdkp->zone_info.nr_zones = nr_zones; sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL); if (!sdkp->rev_wp_offset) { ret = -ENOMEM; @@ -802,8 +800,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) sdkp->rev_wp_offset = NULL; if (ret) { - sdkp->zone_blocks = 0; - sdkp->nr_zones = 0; + sdkp->zone_info = (struct zoned_disk_info){ }; sdkp->capacity = 0; goto unlock; } @@ -889,8 +886,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) if (blk_queue_zoned_model(q) == BLK_ZONED_HM) blk_queue_zone_write_granularity(q, sdkp->physical_block_size); - sdkp->rev_nr_zones = nr_zones; - sdkp->rev_zone_blocks = zone_blocks; + sdkp->early_zone_info.nr_zones = nr_zones; + sdkp->early_zone_info.zone_blocks = zone_blocks; return 0; From 2f6918f3a3102092d49c169cd55b991130ff2a30 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Apr 2022 11:30:19 -0700 Subject: [PATCH 05/57] FROMGIT: scsi: sd: sd_zbc: Return early in sd_zbc_check_zoned_characteristics() Return early in sd_zbc_check_zoned_characteristics() for host-aware disks. This patch does not change any functionality but makes a later patch easier to read. Link: https://lore.kernel.org/r/20220421183023.3462291-6-bvanassche@acm.org Reviewed-by: Himanshu Madhani Acked-by: Douglas Gilbert Signed-off-by: Damien Le Moal [ bvanassche: extracted this change from a larger patch ] Signed-off-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen Bug: 230616396 (cherry picked from commit 60caf3758103b8edc90724ba781ff119f739162a mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: Ib9b5e6cbcefdae3cfe1e677cb9e7e86c6f442b08 --- drivers/scsi/sd_zbc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 1b31e623ba31..592b279b1d88 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -594,14 +594,15 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, sdkp->zones_optimal_open = get_unaligned_be32(&buf[8]); sdkp->zones_optimal_nonseq = get_unaligned_be32(&buf[12]); sdkp->zones_max_open = 0; - } else { - /* Host-managed */ - sdkp->urswrz = buf[4] & 1; - sdkp->zones_optimal_open = 0; - sdkp->zones_optimal_nonseq = 0; - sdkp->zones_max_open = get_unaligned_be32(&buf[16]); + return 0; } + /* Host-managed */ + sdkp->urswrz = buf[4] & 1; + sdkp->zones_optimal_open = 0; + sdkp->zones_optimal_nonseq = 0; + sdkp->zones_max_open = get_unaligned_be32(&buf[16]); + /* * Check for unconstrained reads: host-managed devices with * constrained reads (drives failing read after write pointer) From 7eae3febdf0124a034cbbc754faa9abc33e2d570 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Apr 2022 11:30:20 -0700 Subject: [PATCH 06/57] FROMGIT: scsi: sd: sd_zbc: Hide gap zones ZBC-2 allows host-managed disks to report gap zones. This allows zoned disks to report an offset between data zone starts that is a power of two even if the number of logical blocks with data per zone is not a power of two. Another new feature in ZBC-2 is support for constant zone starting LBA offsets. For zoned disks that report a constant zone starting LBA offset, hide the gap zones from the block layer. Report the offset between data zone starts as zone size and report the number of logical blocks with data per zone as the zone capacity. 
Link: https://lore.kernel.org/r/20220421183023.3462291-7-bvanassche@acm.org Acked-by: Douglas Gilbert Signed-off-by: Damien Le Moal [ bvanassche: Reworked this patch ] Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Bug: 230616396 (cherry picked from commit c976e588b34e4ff2fdd2922edab4b983244a17e6 mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: Iae809a10943a805d21bd4fdffc20ee9d6960d6e4 --- drivers/scsi/sd.h | 5 ++ drivers/scsi/sd_zbc.c | 111 ++++++++++++++++++++++++++++++++------ include/scsi/scsi_proto.h | 9 +++- 3 files changed, 108 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 6e93800d6e76..f1cc648a453a 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -95,6 +95,11 @@ struct scsi_disk { u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; + /* + * Either zero or a power of two. If not zero it means that the offset + * between zone starting LBAs is constant. + */ + u32 zone_starting_lba_gran; u32 *zones_wp_offset; spinlock_t zones_wp_offset_lock; u32 *rev_wp_offset; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 592b279b1d88..164f41fcf9eb 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -50,6 +50,12 @@ static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) } } +/* Whether or not a SCSI zone descriptor describes a gap zone. */ +static bool sd_zbc_is_gap_zone(const u8 buf[64]) +{ + return (buf[0] & 0xf) == ZBC_ZONE_TYPE_GAP; +} + /** * sd_zbc_parse_report - Parse a SCSI zone descriptor * @sdkp: SCSI disk pointer. 
@@ -69,8 +75,12 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], { struct scsi_device *sdp = sdkp->device; struct blk_zone zone = { 0 }; + sector_t start_lba, gran; int ret; + if (WARN_ON_ONCE(sd_zbc_is_gap_zone(buf))) + return -EINVAL; + zone.type = buf[0] & 0x0f; zone.cond = (buf[1] >> 4) & 0xf; if (buf[1] & 0x01) @@ -78,13 +88,31 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], if (buf[1] & 0x02) zone.non_seq = 1; - zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); - zone.capacity = zone.len; - zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16])); - zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); - if (zone.type != ZBC_ZONE_TYPE_CONV && - zone.cond == ZBC_ZONE_COND_FULL) + start_lba = get_unaligned_be64(&buf[16]); + zone.start = logical_to_sectors(sdp, start_lba); + zone.capacity = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); + zone.len = zone.capacity; + if (sdkp->zone_starting_lba_gran) { + gran = logical_to_sectors(sdp, sdkp->zone_starting_lba_gran); + if (zone.len > gran) { + sd_printk(KERN_ERR, sdkp, + "Invalid zone at LBA %llu with capacity %llu and length %llu; granularity = %llu\n", + start_lba, + sectors_to_logical(sdp, zone.capacity), + sectors_to_logical(sdp, zone.len), + sectors_to_logical(sdp, gran)); + return -EINVAL; + } + /* + * Use the starting LBA granularity instead of the zone length + * obtained from the REPORT ZONES command. 
+ */ + zone.len = gran; + } + if (zone.cond == ZBC_ZONE_COND_FULL) zone.wp = zone.start + zone.len; + else + zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); ret = cb(&zone, idx, data); if (ret) @@ -227,6 +255,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, sector_t lba = sectors_to_logical(sdkp->device, sector); unsigned int nr, i; unsigned char *buf; + u64 zone_length, start_lba; size_t offset, buflen = 0; int zone_idx = 0; int ret; @@ -255,14 +284,36 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, for (i = 0; i < nr && zone_idx < nr_zones; i++) { offset += 64; + start_lba = get_unaligned_be64(&buf[offset + 16]); + zone_length = get_unaligned_be64(&buf[offset + 8]); + if ((zone_idx == 0 && + (lba < start_lba || + lba >= start_lba + zone_length)) || + (zone_idx > 0 && start_lba != lba) || + start_lba + zone_length < start_lba) { + sd_printk(KERN_ERR, sdkp, + "Zone %d at LBA %llu is invalid: %llu + %llu\n", + zone_idx, lba, start_lba, zone_length); + ret = -EINVAL; + goto out; + } + lba = start_lba + zone_length; + if (sd_zbc_is_gap_zone(&buf[offset])) { + if (sdkp->zone_starting_lba_gran) + continue; + sd_printk(KERN_ERR, sdkp, + "Gap zone without constant LBA offsets\n"); + ret = -EINVAL; + goto out; + } + ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx, cb, data); if (ret) goto out; + zone_idx++; } - - lba += sdkp->zone_info.zone_blocks * i; } ret = zone_idx; @@ -581,6 +632,7 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, unsigned char *buf) { + u64 zone_starting_lba_gran; if (scsi_get_vpd_page(sdkp->device, 0xb6, buf, 64)) { sd_printk(KERN_NOTICE, sdkp, @@ -602,6 +654,29 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, sdkp->zones_optimal_open = 0; sdkp->zones_optimal_nonseq = 0; sdkp->zones_max_open = get_unaligned_be32(&buf[16]); + /* Check zone alignment method */ + switch 
(buf[23] & 0xf) { + case 0: + case ZBC_CONSTANT_ZONE_LENGTH: + /* Use zone length */ + break; + case ZBC_CONSTANT_ZONE_START_OFFSET: + zone_starting_lba_gran = get_unaligned_be64(&buf[24]); + if (zone_starting_lba_gran == 0 || + !is_power_of_2(zone_starting_lba_gran) || + logical_to_sectors(sdkp->device, zone_starting_lba_gran) > + UINT_MAX) { + sd_printk(KERN_ERR, sdkp, + "Invalid zone starting LBA granularity %llu\n", + zone_starting_lba_gran); + return -ENODEV; + } + sdkp->zone_starting_lba_gran = zone_starting_lba_gran; + break; + default: + sd_printk(KERN_ERR, sdkp, "Invalid zone alignment method\n"); + return -ENODEV; + } /* * Check for unconstrained reads: host-managed devices with @@ -656,14 +731,18 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf, } } - /* Get the size of the first reported zone */ - rec = buf + 64; - zone_blocks = get_unaligned_be64(&rec[8]); - if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) { - if (sdkp->first_scan) - sd_printk(KERN_NOTICE, sdkp, - "Zone size too large\n"); - return -EFBIG; + if (sdkp->zone_starting_lba_gran == 0) { + /* Get the size of the first reported zone */ + rec = buf + 64; + zone_blocks = get_unaligned_be64(&rec[8]); + if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) { + if (sdkp->first_scan) + sd_printk(KERN_NOTICE, sdkp, + "Zone size too large\n"); + return -EFBIG; + } + } else { + zone_blocks = sdkp->zone_starting_lba_gran; } if (!is_power_of_2(zone_blocks)) { diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h index f017843a8124..c03e35fc382c 100644 --- a/include/scsi/scsi_proto.h +++ b/include/scsi/scsi_proto.h @@ -307,7 +307,9 @@ enum zbc_zone_type { ZBC_ZONE_TYPE_CONV = 0x1, ZBC_ZONE_TYPE_SEQWRITE_REQ = 0x2, ZBC_ZONE_TYPE_SEQWRITE_PREF = 0x3, - /* 0x4 to 0xf are reserved */ + ZBC_ZONE_TYPE_SEQ_OR_BEFORE_REQ = 0x4, + ZBC_ZONE_TYPE_GAP = 0x5, + /* 0x6 to 0xf are reserved */ }; /* Zone conditions of REPORT ZONES zone descriptors */ @@ 
-323,6 +325,11 @@ enum zbc_zone_cond { ZBC_ZONE_COND_OFFLINE = 0xf, }; +enum zbc_zone_alignment_method { + ZBC_CONSTANT_ZONE_LENGTH = 0x1, + ZBC_CONSTANT_ZONE_START_OFFSET = 0x8, +}; + /* Version descriptor values for INQUIRY */ enum scsi_version_descriptor { SCSI_VERSION_DESCRIPTOR_FCP4 = 0x0a40, From b27960f1089cafe59ed69fcdaf4b580c382e59f1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Apr 2022 11:30:21 -0700 Subject: [PATCH 07/57] FROMGIT: scsi: scsi_debug: Fix a typo Change a single occurrence of "nad" into "and". Link: https://lore.kernel.org/r/20220421183023.3462291-8-bvanassche@acm.org Cc: Douglas Gilbert Reviewed-by: Damien Le Moal Reviewed-by: Himanshu Madhani Acked-by: Douglas Gilbert Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Bug: 230616396 (cherry picked from commit 897284e8a04894537b58a079a2729a70731c229d mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: I6181e5a9a3a1d20e6d71c99aef6c1a2e9d3ac053 --- drivers/scsi/scsi_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 66f507469a31..73b410bf8f48 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -4300,7 +4300,7 @@ cleanup: #define RZONES_DESC_HD 64 -/* Report zones depending on start LBA nad reporting options */ +/* Report zones depending on start LBA and reporting options */ static int resp_report_zones(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { From c8622f49e4c90c9597b3f41f2fe4d94e28325cc3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Apr 2022 11:30:22 -0700 Subject: [PATCH 08/57] FROMGIT: scsi: scsi_debug: Rename zone type constants Rename the scsi_debug zone type constants to prevent a conflict with the ZBC_ZONE_TYPE_GAP constant from include/scsi/scsi_proto.h. 
Link: https://lore.kernel.org/r/20220421183023.3462291-9-bvanassche@acm.org Cc: Douglas Gilbert Acked-by: Douglas Gilbert Signed-off-by: Damien Le Moal [ bvanassche: Extracted these changes from a larger patch ] Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Bug: 230616396 (cherry picked from commit 35dbe2b9a7b0c92777c855c6a2cca8390f4c166b mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: Ibd71fb99c22839188b66649c948afe1a600f8c03 --- drivers/scsi/scsi_debug.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 73b410bf8f48..27a7cf6661c0 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -250,9 +250,9 @@ static const char *sdebug_version_date = "20200710"; /* Zone types (zbcr05 table 25) */ enum sdebug_z_type { - ZBC_ZONE_TYPE_CNV = 0x1, - ZBC_ZONE_TYPE_SWR = 0x2, - ZBC_ZONE_TYPE_SWP = 0x3, + ZBC_ZTYPE_CNV = 0x1, + ZBC_ZTYPE_SWR = 0x2, + ZBC_ZTYPE_SWP = 0x3, }; /* enumeration names taken from table 26, zbcr05 */ @@ -2670,7 +2670,7 @@ static struct sdeb_zone_state *zbc_zone(struct sdebug_dev_info *devip, static inline bool zbc_zone_is_conv(struct sdeb_zone_state *zsp) { - return zsp->z_type == ZBC_ZONE_TYPE_CNV; + return zsp->z_type == ZBC_ZTYPE_CNV; } static void zbc_close_zone(struct sdebug_dev_info *devip, @@ -2751,7 +2751,7 @@ static void zbc_inc_wp(struct sdebug_dev_info *devip, if (zbc_zone_is_conv(zsp)) return; - if (zsp->z_type == ZBC_ZONE_TYPE_SWR) { + if (zsp->z_type == ZBC_ZTYPE_SWR) { zsp->z_wp += num; if (zsp->z_wp >= zend) zsp->z_cond = ZC5_FULL; @@ -2818,7 +2818,7 @@ static int check_zbc_access_params(struct scsi_cmnd *scp, return 0; } - if (zsp->z_type == ZBC_ZONE_TYPE_SWR) { + if (zsp->z_type == ZBC_ZTYPE_SWR) { /* Writes cannot cross sequential zone boundaries */ if (zsp_end != zsp) { mk_sense_buffer(scp, ILLEGAL_REQUEST, @@ -4895,14 +4895,14 @@ static int sdebug_device_create_zones(struct sdebug_dev_info 
*devip) zsp->z_start = zstart; if (i < devip->nr_conv_zones) { - zsp->z_type = ZBC_ZONE_TYPE_CNV; + zsp->z_type = ZBC_ZTYPE_CNV; zsp->z_cond = ZBC_NOT_WRITE_POINTER; zsp->z_wp = (sector_t)-1; } else { if (devip->zmodel == BLK_ZONED_HM) - zsp->z_type = ZBC_ZONE_TYPE_SWR; + zsp->z_type = ZBC_ZTYPE_SWR; else - zsp->z_type = ZBC_ZONE_TYPE_SWP; + zsp->z_type = ZBC_ZTYPE_SWP; zsp->z_cond = ZC1_EMPTY; zsp->z_wp = zsp->z_start; } From 423088083d83d996ce855274cfcf415381703b9e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Apr 2022 11:30:23 -0700 Subject: [PATCH 09/57] FROMGIT: scsi: scsi_debug: Add gap zone support Add the 'zone_cap_mb' kernel module parameter. This parameter defines the zone capacity. The zone capacity must be less than or equal to the zone size. Report that sequential write zones and gap zones are paired in the Zoned Block Device Characteristics VPD page (page B6h). This patch has been tested as follows: modprobe scsi_debug delay=0 sector_size=512 dev_size_mb=128 zbc=host-managed zone_nr_conv=16 zone_size_mb=4 zone_cap_mb=3 modprobe brd rd_nr=1 rd_size=$((1<<20)) mkfs.f2fs -m /dev/ram0 -c /dev/${scsi_debug_dev} mount /dev/ram0 /mnt # Run a fio job that uses /mnt Link: https://lore.kernel.org/r/20220421183023.3462291-10-bvanassche@acm.org Cc: Douglas Gilbert Acked-by: Douglas Gilbert Signed-off-by: Damien Le Moal [ bvanassche: Switched to reporting a constant zone starting LBA granularity ] Signed-off-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen Bug: 230616396 (cherry picked from commit 4a5fc1c6d75261f2f014a961d39ab10aae66a0f9 mkp-scsi/staging) Signed-off-by: Bart Van Assche Change-Id: Ie8010a82a0070123cd56fcd0bf70ffa81d397d66 --- drivers/scsi/scsi_debug.c | 129 ++++++++++++++++++++++++++++++-------- 1 file changed, 104 insertions(+), 25 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 27a7cf6661c0..22dd136bddb9 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -16,7 +16,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include - +#include #include #include #include @@ -98,6 +98,7 @@ static const char *sdebug_version_date = "20200710"; #define WRITE_BOUNDARY_ASCQ 0x5 #define READ_INVDATA_ASCQ 0x6 #define READ_BOUNDARY_ASCQ 0x7 +#define ATTEMPT_ACCESS_GAP 0x9 #define INSUFF_ZONE_ASCQ 0xe /* Additional Sense Code Qualifier (ASCQ) */ @@ -253,6 +254,8 @@ enum sdebug_z_type { ZBC_ZTYPE_CNV = 0x1, ZBC_ZTYPE_SWR = 0x2, ZBC_ZTYPE_SWP = 0x3, + /* ZBC_ZTYPE_SOBR = 0x4, */ + ZBC_ZTYPE_GAP = 0x5, }; /* enumeration names taken from table 26, zbcr05 */ @@ -290,10 +293,12 @@ struct sdebug_dev_info { /* For ZBC devices */ enum blk_zoned_model zmodel; + unsigned int zcap; unsigned int zsize; unsigned int zsize_shift; unsigned int nr_zones; unsigned int nr_conv_zones; + unsigned int nr_seq_zones; unsigned int nr_imp_open; unsigned int nr_exp_open; unsigned int nr_closed; @@ -827,6 +832,7 @@ static int dif_errors; /* ZBC global data */ static bool sdeb_zbc_in_use; /* true for host-aware and host-managed disks */ +static int sdeb_zbc_zone_cap_mb; static int sdeb_zbc_zone_size_mb; static int sdeb_zbc_max_open = DEF_ZBC_MAX_OPEN_ZONES; static int sdeb_zbc_nr_conv = DEF_ZBC_NR_CONV_ZONES; @@ -1551,6 +1557,12 @@ static int inquiry_vpd_b6(struct sdebug_dev_info *devip, unsigned char *arr) put_unaligned_be32(devip->max_open, &arr[12]); else put_unaligned_be32(0xffffffff, &arr[12]); + if (devip->zcap < devip->zsize) { + arr[19] = 
ZBC_CONSTANT_ZONE_START_OFFSET; + put_unaligned_be64(devip->zsize, &arr[20]); + } else { + arr[19] = 0; + } return 0x3c; } @@ -2665,7 +2677,23 @@ static inline bool sdebug_dev_is_zoned(struct sdebug_dev_info *devip) static struct sdeb_zone_state *zbc_zone(struct sdebug_dev_info *devip, unsigned long long lba) { - return &devip->zstate[lba >> devip->zsize_shift]; + u32 zno = lba >> devip->zsize_shift; + struct sdeb_zone_state *zsp; + + if (devip->zcap == devip->zsize || zno < devip->nr_conv_zones) + return &devip->zstate[zno]; + + /* + * If the zone capacity is less than the zone size, adjust for gap + * zones. + */ + zno = 2 * zno - devip->nr_conv_zones; + WARN_ONCE(zno >= devip->nr_zones, "%u > %u\n", zno, devip->nr_zones); + zsp = &devip->zstate[zno]; + if (lba >= zsp->z_start + zsp->z_size) + zsp++; + WARN_ON_ONCE(lba >= zsp->z_start + zsp->z_size); + return zsp; } static inline bool zbc_zone_is_conv(struct sdeb_zone_state *zsp) @@ -2673,12 +2701,22 @@ static inline bool zbc_zone_is_conv(struct sdeb_zone_state *zsp) return zsp->z_type == ZBC_ZTYPE_CNV; } +static inline bool zbc_zone_is_gap(struct sdeb_zone_state *zsp) +{ + return zsp->z_type == ZBC_ZTYPE_GAP; +} + +static inline bool zbc_zone_is_seq(struct sdeb_zone_state *zsp) +{ + return !zbc_zone_is_conv(zsp) && !zbc_zone_is_gap(zsp); +} + static void zbc_close_zone(struct sdebug_dev_info *devip, struct sdeb_zone_state *zsp) { enum sdebug_z_cond zc; - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; zc = zsp->z_cond; @@ -2716,7 +2754,7 @@ static void zbc_open_zone(struct sdebug_dev_info *devip, { enum sdebug_z_cond zc; - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; zc = zsp->z_cond; @@ -2748,7 +2786,7 @@ static void zbc_inc_wp(struct sdebug_dev_info *devip, struct sdeb_zone_state *zsp = zbc_zone(devip, lba); unsigned long long n, end, zend = zsp->z_start + zsp->z_size; - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; if (zsp->z_type == ZBC_ZTYPE_SWR) { @@ 
-2796,9 +2834,7 @@ static int check_zbc_access_params(struct scsi_cmnd *scp, if (devip->zmodel == BLK_ZONED_HA) return 0; /* For host-managed, reads cannot cross zone types boundaries */ - if (zsp_end != zsp && - zbc_zone_is_conv(zsp) && - !zbc_zone_is_conv(zsp_end)) { + if (zsp->z_type != zsp_end->z_type) { mk_sense_buffer(scp, ILLEGAL_REQUEST, LBA_OUT_OF_RANGE, READ_INVDATA_ASCQ); @@ -2807,6 +2843,13 @@ static int check_zbc_access_params(struct scsi_cmnd *scp, return 0; } + /* Writing into a gap zone is not allowed */ + if (zbc_zone_is_gap(zsp)) { + mk_sense_buffer(scp, ILLEGAL_REQUEST, LBA_OUT_OF_RANGE, + ATTEMPT_ACCESS_GAP); + return check_condition_result; + } + /* No restrictions for writes within conventional zones */ if (zbc_zone_is_conv(zsp)) { if (!zbc_zone_is_conv(zsp_end)) { @@ -4304,14 +4347,14 @@ cleanup: static int resp_report_zones(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { - unsigned int i, max_zones, rep_max_zones, nrz = 0; + unsigned int rep_max_zones, nrz = 0; int ret = 0; u32 alloc_len, rep_opts, rep_len; bool partial; u64 lba, zs_lba; u8 *arr = NULL, *desc; u8 *cmd = scp->cmnd; - struct sdeb_zone_state *zsp; + struct sdeb_zone_state *zsp = NULL; struct sdeb_store_info *sip = devip2sip(devip, false); rwlock_t *macc_lckp = sip ? 
&sip->macc_lck : &sdeb_fake_rw_lck; @@ -4329,9 +4372,7 @@ static int resp_report_zones(struct scsi_cmnd *scp, return check_condition_result; } - max_zones = devip->nr_zones - (zs_lba >> devip->zsize_shift); - rep_max_zones = min((alloc_len - 64) >> ilog2(RZONES_DESC_HD), - max_zones); + rep_max_zones = (alloc_len - 64) >> ilog2(RZONES_DESC_HD); arr = kcalloc(RZONES_DESC_HD, alloc_len, GFP_ATOMIC); if (!arr) { @@ -4343,9 +4384,9 @@ static int resp_report_zones(struct scsi_cmnd *scp, read_lock(macc_lckp); desc = arr + 64; - for (i = 0; i < max_zones; i++) { - lba = zs_lba + devip->zsize * i; - if (lba > sdebug_capacity) + for (lba = zs_lba; lba < sdebug_capacity; + lba = zsp->z_start + zsp->z_size) { + if (WARN_ONCE(zbc_zone(devip, lba) == zsp, "lba = %llu\n", lba)) break; zsp = zbc_zone(devip, lba); switch (rep_opts) { @@ -4390,9 +4431,14 @@ static int resp_report_zones(struct scsi_cmnd *scp, if (!zsp->z_non_seq_resource) continue; break; + case 0x3e: + /* All zones except gap zones. */ + if (zbc_zone_is_gap(zsp)) + continue; + break; case 0x3f: /* Not write pointer (conventional) zones */ - if (!zbc_zone_is_conv(zsp)) + if (zbc_zone_is_seq(zsp)) continue; break; default: @@ -4421,8 +4467,13 @@ static int resp_report_zones(struct scsi_cmnd *scp, } /* Report header */ + /* Zone list length. */ put_unaligned_be32(nrz * RZONES_DESC_HD, arr + 0); + /* Maximum LBA */ put_unaligned_be64(sdebug_capacity - 1, arr + 8); + /* Zone starting LBA granularity. 
*/ + if (devip->zcap < devip->zsize) + put_unaligned_be64(devip->zsize, arr + 16); rep_len = (unsigned long)desc - (unsigned long)arr; ret = fill_from_dev_buffer(scp, arr, min_t(int, alloc_len, rep_len)); @@ -4649,7 +4700,7 @@ static void zbc_rwp_zone(struct sdebug_dev_info *devip, { enum sdebug_z_cond zc; - if (zbc_zone_is_conv(zsp)) + if (!zbc_zone_is_seq(zsp)) return; zc = zsp->z_cond; @@ -4836,6 +4887,7 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip) { struct sdeb_zone_state *zsp; sector_t capacity = get_sdebug_capacity(); + sector_t conv_capacity; sector_t zstart = 0; unsigned int i; @@ -4870,11 +4922,30 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip) devip->zsize_shift = ilog2(devip->zsize); devip->nr_zones = (capacity + devip->zsize - 1) >> devip->zsize_shift; - if (sdeb_zbc_nr_conv >= devip->nr_zones) { + if (sdeb_zbc_zone_cap_mb == 0) { + devip->zcap = devip->zsize; + } else { + devip->zcap = (sdeb_zbc_zone_cap_mb * SZ_1M) >> + ilog2(sdebug_sector_size); + if (devip->zcap > devip->zsize) { + pr_err("Zone capacity too large\n"); + return -EINVAL; + } + } + + conv_capacity = (sector_t)sdeb_zbc_nr_conv << devip->zsize_shift; + if (conv_capacity >= capacity) { pr_err("Number of conventional zones too large\n"); return -EINVAL; } devip->nr_conv_zones = sdeb_zbc_nr_conv; + devip->nr_seq_zones = ALIGN(capacity - conv_capacity, devip->zsize) >> + devip->zsize_shift; + devip->nr_zones = devip->nr_conv_zones + devip->nr_seq_zones; + + /* Add gap zones if zone capacity is smaller than the zone size */ + if (devip->zcap < devip->zsize) + devip->nr_zones += devip->nr_seq_zones; if (devip->zmodel == BLK_ZONED_HM) { /* zbc_max_open_zones can be 0, meaning "not reported" */ @@ -4898,20 +4969,26 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip) zsp->z_type = ZBC_ZTYPE_CNV; zsp->z_cond = ZBC_NOT_WRITE_POINTER; zsp->z_wp = (sector_t)-1; - } else { + zsp->z_size = + min_t(u64, devip->zsize, capacity - 
zstart); + } else if ((zstart & (devip->zsize - 1)) == 0) { if (devip->zmodel == BLK_ZONED_HM) zsp->z_type = ZBC_ZTYPE_SWR; else zsp->z_type = ZBC_ZTYPE_SWP; zsp->z_cond = ZC1_EMPTY; zsp->z_wp = zsp->z_start; + zsp->z_size = + min_t(u64, devip->zcap, capacity - zstart); + } else { + zsp->z_type = ZBC_ZTYPE_GAP; + zsp->z_cond = ZBC_NOT_WRITE_POINTER; + zsp->z_wp = (sector_t)-1; + zsp->z_size = min_t(u64, devip->zsize - devip->zcap, + capacity - zstart); } - if (zsp->z_start + devip->zsize < capacity) - zsp->z_size = devip->zsize; - else - zsp->z_size = capacity - zsp->z_start; - + WARN_ON_ONCE((int)zsp->z_size <= 0); zstart += zsp->z_size; } @@ -5679,6 +5756,7 @@ module_param_named(wp, sdebug_wp, bool, S_IRUGO | S_IWUSR); module_param_named(write_same_length, sdebug_write_same_length, int, S_IRUGO | S_IWUSR); module_param_named(zbc, sdeb_zbc_model_s, charp, S_IRUGO); +module_param_named(zone_cap_mb, sdeb_zbc_zone_cap_mb, int, S_IRUGO); module_param_named(zone_max_open, sdeb_zbc_max_open, int, S_IRUGO); module_param_named(zone_nr_conv, sdeb_zbc_nr_conv, int, S_IRUGO); module_param_named(zone_size_mb, sdeb_zbc_zone_size_mb, int, S_IRUGO); @@ -5749,6 +5827,7 @@ MODULE_PARM_DESC(vpd_use_hostno, "0 -> dev ids ignore hostno (def=1 -> unique de MODULE_PARM_DESC(wp, "Write Protect (def=0)"); MODULE_PARM_DESC(write_same_length, "Maximum blocks per WRITE SAME cmd (def=0xffff)"); MODULE_PARM_DESC(zbc, "'none' [0]; 'aware' [1]; 'managed' [2] (def=0). 
Can have 'host-' prefix"); +MODULE_PARM_DESC(zone_cap_mb, "Zone capacity in MiB (def=zone size)"); MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones; [0] for no limit (def=auto)"); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones (def=1)"); MODULE_PARM_DESC(zone_size_mb, "Zone size in MiB (def=auto)"); From e643f0f8520dd4c50c141a09c34673b78e3ac725 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:39:13 -0700 Subject: [PATCH 10/57] f2fs: replace congestion_wait() calls with io_schedule_timeout() As congestion is no longer tracked, congestion_wait() is effectively equivalent to io_schedule_timeout(). So introduce f2fs_io_schedule_timeout() which sets TASK_UNINTERRUPTIBLE and call that instead. Link: https://lkml.kernel.org/r/164549983744.9187.6425865370954230902.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/compress.c | 4 +--- fs/f2fs/data.c | 3 +-- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/segment.c | 8 +++----- fs/f2fs/super.c | 6 ++---- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 750573d1feaa..e98a0ef71941 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1503,9 +1503,7 @@ continue_unlock: if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f9b28c59df86..35aea4be3906 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3066,8 +3066,7 @@ result: } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - 
congestion_wait(BLK_RW_ASYNC, + f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5d761e1b0a2a..097608a45913 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4536,6 +4536,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } +static inline void f2fs_io_schedule_timeout(long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(timeout); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 14abe83616a8..34a4e0d70d55 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -314,8 +314,7 @@ next: skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - cond_resched(); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); if (gc_failure) { if (++looped >= count) return; @@ -806,8 +805,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { @@ -3140,7 +3138,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c6d7e5bf4a02..f3bf121ecad0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2152,8 +2152,7 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ do { sync_inodes_sb(sbi->sb); - cond_resched(); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if 
(unlikely(retry < 0)) @@ -2522,8 +2521,7 @@ retry: &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); From e8e3f1a12d05d0aef2c819664890b540dfb055af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 14:41:22 +0200 Subject: [PATCH 11/57] f2fs: don't pass a bio to f2fs_target_device Set the bdev at bio allocation time by changing the f2fs_target_device calling conventions, so that no bio needs to be passed in. Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20220228124123.856027-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 23 +++++++++++++---------- fs/f2fs/f2fs.h | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 35aea4be3906..43ce83c2f2fe 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -355,7 +355,7 @@ static void f2fs_write_end_io(struct bio *bio) } struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) + block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; @@ -370,10 +370,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } } - if (bio) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); - } + + if (sector) + *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } @@ -393,11 +392,14 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; + struct block_device *bdev; + sector_t sector; struct bio *bio; + bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector); bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); - - f2fs_target_device(sbi, fio->new_blkaddr, bio); + bio_set_dev(bio, bdev); + 
bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; @@ -985,15 +987,16 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio *bio; struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; + sector_t sector; + struct block_device *bdev = f2fs_target_device(sbi, blkaddr, &sector); bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, bio_max_segs(nr_pages), &f2fs_bioset); + bio_set_dev(bio, bdev); if (!bio) return ERR_PTR(-ENOMEM); - + bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); - - f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio_set_op_attrs(bio, REQ_OP_READ, op_flag); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 097608a45913..3568762b4d32 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3737,7 +3737,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio); + block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); From 9fb1cfcd65adb25602acfd2a25c4cf047e532f84 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 14:41:23 +0200 Subject: [PATCH 12/57] f2fs: pass the bio operation to bio_alloc_bioset Refactor block I/O code so that the bio operation and known flags are set at bio allocation time. Only the later updated flags are updated on the fly. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20220228124123.856027-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 67 ++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 43ce83c2f2fe..e563fc28a902 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -389,6 +389,24 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } +static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +{ + unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; + unsigned int fua_flag = io_flag & temp_mask; + unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + + /* + * data/node io flag bits per temp: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ + if ((1 << fio->temp) & meta_flag) + fio->op_flags |= REQ_META; + if ((1 << fio->temp) & fua_flag) + fio->op_flags |= REQ_FUA; +} + static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; @@ -396,9 +414,15 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) sector_t sector; struct bio *bio; + if (fio->type == DATA) + __attach_io_flag(fio, sbi->data_io_flag); + else if (fio->type == NODE) + __attach_io_flag(fio, sbi->node_io_flag); + bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector); bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, fio->op, fio->op_flags); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; @@ -503,34 +527,6 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi, __submit_bio(sbi, bio, type); } -static void __attach_io_flag(struct f2fs_io_info *fio) -{ - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int io_flag, fua_flag, meta_flag; - - if (fio->type 
== DATA) - io_flag = sbi->data_io_flag; - else if (fio->type == NODE) - io_flag = sbi->node_io_flag; - else - return; - - fua_flag = io_flag & temp_mask; - meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; - - /* - * data/node io flag bits per temp: - * REQ_META | REQ_FUA | - * 5 | 4 | 3 | 2 | 1 | 0 | - * Cold | Warm | Hot | Cold | Warm | Hot | - */ - if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; - if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; -} - static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -538,9 +534,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - __attach_io_flag(fio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); - if (is_read_io(fio->op)) trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); else @@ -598,10 +591,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; + io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); f2fs_up_write(&io->io_rwsem); @@ -682,9 +674,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); - __attach_io_flag(fio); - bio_set_op_attrs(bio, fio->op, fio->op_flags); - inc_page_count(fio->sbi, is_read_io(fio->op) ? 
__read_io_type(page): WB_DATA_TYPE(fio->page)); @@ -878,10 +867,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); - __attach_io_flag(fio); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); - bio_set_op_attrs(bio, fio->op, fio->op_flags); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -993,12 +980,12 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, bio_max_segs(nr_pages), &f2fs_bioset); bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (!bio) return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= STEP_DECRYPT; From fdf493a2d35cb01f0472ef5cd5a9dc7a4f745c76 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2022 14:45:50 -0700 Subject: [PATCH 13/57] f2fs: remove obsolete whint_mode This patch removes obsolete whint_mode. Fixes: 41d36a9f3e53 ("fs: remove kiocb.ki_hint") Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 70 ---------------------- fs/f2fs/data.c | 2 - fs/f2fs/f2fs.h | 9 --- fs/f2fs/file.c | 7 --- fs/f2fs/segment.c | 95 ------------------------------ fs/f2fs/super.c | 32 +--------- 6 files changed, 1 insertion(+), 214 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 5fda320354a6..d294d04c06d0 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -235,12 +235,6 @@ offgrpjquota Turn off group journalled quota. offprjjquota Turn off project journalled quota. quota Enable plain user disk quota accounting. noquota Disable all plain disk quota option. 
-whint_mode=%s Control which write hints are passed down to block - layer. This supports "off", "user-based", and - "fs-based". In "off" mode (default), f2fs does not pass - down hints. In "user-based" mode, f2fs tries to pass - down hints given by users. And in "fs-based" mode, f2fs - passes down hints with its policy. alloc_mode=%s Adjust block allocation policy, which supports "reuse" and "default". fsync_mode=%s Control the policy of fsync. Currently supports "posix", @@ -751,70 +745,6 @@ In order to identify whether the data in the victim segment are valid or not, F2FS manages a bitmap. Each bit represents the validity of a block, and the bitmap is composed of a bit stream covering whole blocks in main area. -Write-hint Policy ------------------ - -1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - -2) whint_mode=user-based. F2FS tries to pass down hints given by -users. - -===================== ======================== =================== -User F2FS Block -===================== ======================== =================== -N/A META WRITE_LIFE_NOT_SET -N/A HOT_NODE " -N/A WARM_NODE " -N/A COLD_NODE " -ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG -===================== ======================== =================== - -3) whint_mode=fs-based. F2FS passes down hints with its policy. 
- -===================== ======================== =================== -User F2FS Block -===================== ======================== =================== -N/A META WRITE_LIFE_MEDIUM; -N/A HOT_NODE WRITE_LIFE_NOT_SET -N/A WARM_NODE " -N/A COLD_NODE WRITE_LIFE_NONE -ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG -===================== ======================== =================== - Fallocate(2) Policy ------------------- diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e563fc28a902..98db194cc85c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -430,8 +430,6 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, - fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3568762b4d32..eecbd56bbca8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -152,7 +152,6 @@ struct f2fs_mount_info { int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ - int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ @@ -1331,12 +1330,6 @@ enum { FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; -enum { - WHINT_MODE_OFF, /* not pass down write hints */ - WHINT_MODE_USER, /* try to pass down hints given by users */ - WHINT_MODE_FS, /* pass down 
hints with F2FS policy */ -}; - enum { ALLOC_MODE_DEFAULT, /* stay default */ ALLOC_MODE_REUSE, /* reuse segments as much as possible */ @@ -3655,8 +3648,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2fa33b7b46c3..fbd68d687dc8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4482,10 +4482,8 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); const bool do_opu = f2fs_lfs_mode(sbi); - const int whint_mode = F2FS_OPTION(sbi).whint_mode; const loff_t pos = iocb->ki_pos; const ssize_t count = iov_iter_count(from); - const enum rw_hint hint = iocb->ki_hint; unsigned int dio_flags; struct iomap_dio *dio; ssize_t ret; @@ -4518,9 +4516,6 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, if (do_opu) f2fs_down_read(&fi->i_gc_rwsem[READ]); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; - /* * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of * the higher-level function iomap_dio_rw() in order to ensure that the @@ -4542,8 +4537,6 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret = iomap_dio_complete(dio); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; if (do_opu) f2fs_up_read(&fi->i_gc_rwsem[READ]); f2fs_up_read(&fi->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 34a4e0d70d55..e2212b380fcd 100644 --- a/fs/f2fs/segment.c +++ 
b/fs/f2fs/segment.c @@ -3246,101 +3246,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } -/* This returns write hints for each segment type. This hints will be - * passed down to block layer. There are mapping tables which depend on - * the mount option 'whint_mode'. - * - * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - * - * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_NOT_SET - * HOT_NODE " - * WARM_NODE " - * COLD_NODE " - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - * - * 3) whint_mode=fs-based. F2FS passes down hints with its policy. 
- * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_MEDIUM; - * HOT_NODE WRITE_LIFE_NOT_SET - * WARM_NODE " - * COLD_NODE WRITE_LIFE_NONE - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - */ - -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp) -{ - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_NOT_SET; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else { - return WRITE_LIFE_NOT_SET; - } - } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_LONG; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else if (type == NODE) { - if (temp == WARM || temp == HOT) - return WRITE_LIFE_NOT_SET; - else if (temp == COLD) - return WRITE_LIFE_NONE; - } else if (type == META) { - return WRITE_LIFE_MEDIUM; - } - } - return WRITE_LIFE_NOT_SET; -} - static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f3bf121ecad0..9c8f26c00dee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -138,7 +138,6 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, - Opt_whint, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -214,7 +213,6 @@ 
static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, @@ -982,22 +980,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "quota operations not supported"); break; #endif - case Opt_whint: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "user-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (!strcmp(name, "off")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (!strcmp(name, "fs-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; case Opt_alloc: name = match_strdup(&args[0]); if (!name) @@ -1335,12 +1317,6 @@ default_check: return -EINVAL; } - /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_PERSIST_TYPE. 
- */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1981,10 +1957,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) - seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) - seq_printf(seq, ",whint_mode=%s", "fs-based"); fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); @@ -2036,7 +2008,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); @@ -2317,8 +2288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY || - F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { + if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); From f983ae971a917699dcde0c8036d62739b0b9264d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2022 15:01:58 -0700 Subject: [PATCH 14/57] f2fs: keep io_flags to avoid IO split due to different op_flags in two fio holders Let's attach io_flags to bio only, so that we can merge IOs given original io_flags only. 
Fixes: 64bf0eef0171 ("f2fs: pass the bio operation to bio_alloc_bioset") Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 98db194cc85c..7b39665da532 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -389,11 +389,23 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int fua_flag = io_flag & temp_mask; - unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + unsigned int fua_flag, meta_flag, io_flag; + unsigned int op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: @@ -402,9 +414,10 @@ static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) * Cold | Warm | Hot | Cold | Warm | Hot | */ if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; + op_flags |= REQ_META; if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; + op_flags |= REQ_FUA; + return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) @@ -414,15 +427,10 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) sector_t sector; struct bio *bio; - if (fio->type == DATA) - __attach_io_flag(fio, sbi->data_io_flag); - else if (fio->type == NODE) - __attach_io_flag(fio, sbi->node_io_flag); - bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, fio->op, fio->op_flags); + 
bio_set_op_attrs(bio, fio->op, fio->op_flags | f2fs_io_flags(fio)); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; From 91e1b07af1bf34a999a7e08e4966f1b50d4887a2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 Mar 2022 11:28:07 -0700 Subject: [PATCH 15/57] f2fs: fix wrong condition check when failing metapage read This patch fixes wrong initialization. Fixes: 50c63009f6ab ("f2fs: avoid an infinite loop in f2fs_sync_dirty_inodes") Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index aba1b8a1ce66..ed61ac99a3cf 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,9 +98,9 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs && - sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) { - set_ckpt_flags(sbi, CP_ERROR_FLAG); + if (page->index == sbi->metapage_eio_ofs) { + if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); } else { sbi->metapage_eio_ofs = page->index; sbi->metapage_eio_cnt = 0; From 0084ddbc302370f50cd7afc49b1fe92e0c0bcccd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 21 Apr 2022 16:47:02 -0700 Subject: [PATCH 16/57] f2fs: should not truncate blocks during roll-forward recovery If the file preallocated blocks and fsync'ed, we should not truncate them during roll-forward recovery which will recover i_size correctly back. 
Fixes: d4dd19ec1ea0 ("f2fs: do not expose unwritten blocks to user by DIO") Cc: # 5.17+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ad5fde39d3bd..5fffc2268964 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -550,7 +550,8 @@ make_now: } f2fs_set_inode_flags(inode); - if (file_should_truncate(inode)) { + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { ret = f2fs_truncate(inode); if (ret) goto bad_inode; From 1e3f2b54b00d0d9a82d44728bd8959b8a7958804 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 20 Mar 2022 23:11:17 +0800 Subject: [PATCH 17/57] f2fs: check pinfile in gc_data_segment() in advance This allows skipping, in advance, the migration of a section which contains data of a pinned file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 79dc38eacb19..e2c9555f8bd6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1481,6 +1481,13 @@ next_step: special_file(inode->i_mode)) continue; + if (is_inode_flag_set(inode, FI_PIN_FILE) && + gc_type == FG_GC) { + f2fs_pin_file_control(inode, true); + iput(inode); + return submitted; + } + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); From e1fc90b5f5770d82d1a4f9341f9450a3fce01952 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 20 Mar 2022 23:11:18 +0800 Subject: [PATCH 18/57] f2fs: don't set GC_FAILURE_PIN for background GC This reduces the possibility that a file is forcibly unpinned by foreground GC because .i_gc_failures[GC_FAILURE_PIN] exceeds the threshold.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e2c9555f8bd6..79ea591f176c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1202,7 +1202,8 @@ static int move_data_block(struct inode *inode, block_t bidx, } if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); + if (gc_type == FG_GC) + f2fs_pin_file_control(inode, true); err = -EAGAIN; goto out; } From 9424a9fdc163182ff119a82a6caa322628006e5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Mar 2022 13:30:04 -0700 Subject: [PATCH 19/57] f2fs: remove unnecessary f2fs_lock_op in f2fs_new_inode This can be removed, since f2fs_alloc_nid() actually doesn't require to block checkpoint and __f2fs_build_free_nids() is covered by nm_i->nat_tree_lock. Suggested-by: Linus Torvalds Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 906e9e301ac8..f5420eb3cbb8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,13 +37,10 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (!inode) return ERR_PTR(-ENOMEM); - f2fs_lock_op(sbi); if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - f2fs_unlock_op(sbi); nid_free = true; From 4722ee2c4c199d46d485ba6ea72a655d506d87cc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 21 Mar 2022 15:13:06 -0700 Subject: [PATCH 20/57] f2fs: introduce data read/write showing path info This was used in Android for a long time. Let's upstream it. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 58 ++++++++++++++++++++--- include/trace/events/f2fs.h | 94 +++++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fbd68d687dc8..09a1350a1259 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4329,17 +4329,39 @@ out: static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - if (f2fs_should_use_dio(inode, iocb, to)) - return f2fs_dio_read_iter(iocb, to); + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; - ret = filemap_read(iocb, to, 0); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } + + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4630,14 +4652,36 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. 
*/ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from, dio); - if (preallocated < 0) + if (preallocated < 0) { ret = preallocated; - else + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: /* Do the actual write. */ ret = dio ? f2fs_dio_write_iter(iocb, from, &may_need_sync): f2fs_buffered_write_iter(iocb, from); + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + /* Don't leave any preallocated blocks around past i_size. */ if (preallocated && i_size_read(inode) < target_size) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index f701bb23f83c..11f6b7147be2 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -2068,6 +2068,100 @@ TRACE_EVENT(f2fs_fiemap, __entry->ret) ); +DECLARE_EVENT_CLASS(f2fs__rw_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command), + + TP_STRUCT__entry( + __string(pathbuf, pathname) + __field(loff_t, offset) + __field(int, bytes) + __field(loff_t, i_size) + __string(cmdline, command) + __field(pid_t, pid) + __field(ino_t, ino) + ), + + TP_fast_assign( + /* + * Replace the spaces in filenames and cmdlines + * because this screws up the tooling that parses + * the traces. 
+ */ + __assign_str(pathbuf, pathname); + (void)strreplace(__get_str(pathbuf), ' ', '_'); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + (void)strreplace(__get_str(cmdline), ' ', '_'); + __entry->pid = pid; + __entry->ino = inode->i_ino; + ), + + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + __get_str(pathbuf), __entry->offset, __entry->bytes, + __get_str(cmdline), __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(f2fs__rw_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes), + + TP_STRUCT__entry( + __field(ino_t, ino) + __field(loff_t, offset) + __field(int, bytes) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + ), + + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_dataread_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_dataread_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_datawrite_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From 0b8c2cb3f899246bde95b53405dfe8a260abab48 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 29 Mar 2022 00:02:53 +0800 Subject: [PATCH 21/57] f2fs: fix to 
do sanity check on inline_dots inode As Wenqing reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215765 It will cause a kernel panic with steps: - mkdir mnt - mount tmp40.img mnt - ls mnt folio_mark_dirty+0x33/0x50 f2fs_add_regular_entry+0x541/0xad0 [f2fs] f2fs_add_dentry+0x6c/0xb0 [f2fs] f2fs_do_add_link+0x182/0x230 [f2fs] __recover_dot_dentries+0x2d6/0x470 [f2fs] f2fs_lookup+0x5af/0x6a0 [f2fs] __lookup_slow+0xac/0x200 lookup_slow+0x45/0x70 walk_component+0x16c/0x250 path_lookupat+0x8b/0x1f0 filename_lookup+0xef/0x250 user_path_at_empty+0x46/0x70 vfs_statx+0x98/0x190 __do_sys_newlstat+0x41/0x90 __x64_sys_newlstat+0x1a/0x30 do_syscall_64+0x37/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is for special file: e.g. character, block, fifo or socket file, f2fs doesn't assign address space operations pointer array for mapping->a_ops field, so, in a fuzzed image, if inline_dots flag was tagged in special file, during lookup(), when f2fs runs into __recover_dot_dentries(), it will cause NULL pointer access once f2fs_add_regular_entry() calls a_ops->set_dirty_page(). 
Fixes: 510022a85839 ("f2fs: add F2FS_INLINE_DOTS to recover missing dot dentries") Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f5420eb3cbb8..7c554e921dd5 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -458,6 +458,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + err = f2fs_dquot_initialize(dir); if (err) return err; From 5d724a393a437ec34c1368c9a38f4a3244e9601b Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Fri, 1 Apr 2022 00:34:14 +0200 Subject: [PATCH 22/57] f2fs: fix dereference of stale list iterator after loop body The list iterator variable will be a bogus pointer if no break was hit. Dereferencing it (cur->page in this case) could load an out-of-bounds/undefined value making it unsafe to use that in the comparison to determine if the specific element was found. Since 'cur->page' *can* be out-of-bounds it cannot be guaranteed that by chance (or intention of an attacker) it matches the value of 'page' even though the correct element was not found. This is fixed by using a separate list iterator variable for the loop and only setting the original variable if a suitable element was found. Then determining if the element was found is simply checking if the variable is set.
Fixes: 8c242db9b8c0 ("f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer") Signed-off-by: Jakob Koschel Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e2212b380fcd..a1460d3841b6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -356,16 +356,19 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct list_head *head = &fi->inmem_pages; struct inmem_pages *cur = NULL; + struct inmem_pages *tmp; f2fs_bug_on(sbi, !page_private_atomic(page)); mutex_lock(&fi->inmem_lock); - list_for_each_entry(cur, head, list) { - if (cur->page == page) + list_for_each_entry(tmp, head, list) { + if (tmp->page == page) { + cur = tmp; break; + } } - f2fs_bug_on(sbi, list_empty(head) || cur->page != page); + f2fs_bug_on(sbi, !cur); list_del(&cur->list); mutex_unlock(&fi->inmem_lock); From 436b60432a59dc7b4cb8b0f579a954e5b7808162 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Tue, 12 Apr 2022 14:20:39 +0200 Subject: [PATCH 23/57] f2fs: Remove usage of list iterator pas the loop for list_move_tail() In preparation to limit the scope of a list iterator to the list traversal loop, the usage of the list iterator variable 'next' should be avoided past the loop body [1]. Instead of calling list_move_tail() on 'next' after the loop, it is called within the loop if the correct location was found. After the loop it covers the case if no location was found and it should be inserted based on the 'head' of the list. 
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a1460d3841b6..3552fce0ddcb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4092,10 +4092,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if (ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) From 4c22a6785d79ee57fd8b38f4210aaaece163fc80 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Tue, 12 Apr 2022 14:20:40 +0200 Subject: [PATCH 24/57] f2fs: replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need to use a found variable and simply checking if the variable was set, can determine if the break/goto was hit. 
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3552fce0ddcb..e9059572f6bb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1672,33 +1672,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } From f763cdd4f63d4b82ce276ee9974ef1b287ece764 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 15 Apr 2022 21:19:02 +0800 Subject: [PATCH 25/57] f2fs: remove WARN_ON in f2fs_is_valid_blkaddr Syzbot triggers two WARNs in f2fs_is_valid_blkaddr and __is_bitmap_valid. 
For example, in f2fs_is_valid_blkaddr, if type is DATA_GENERIC_ENHANCE or DATA_GENERIC_ENHANCE_READ, it invokes WARN_ON if blkaddr is not in the right range. The call trace is as follows: f2fs_get_node_info+0x45f/0x1070 read_node_page+0x577/0x1190 __get_node_page.part.0+0x9e/0x10e0 __get_node_page f2fs_get_node_page+0x109/0x180 do_read_inode f2fs_iget+0x2a5/0x58b0 f2fs_fill_super+0x3b39/0x7ca0 Fix these two WARNs by replacing WARN_ON with dump_stack. Reported-by: syzbot+763ae12a2ede1d99d4dc@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ed61ac99a3cf..1ba5deff5e98 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -158,7 +158,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); } return exist; } @@ -196,7 +196,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); From 50cf7ef92fe740e8b49d0402c8a2e7b6feac3323 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 Apr 2022 16:57:44 -0700 Subject: [PATCH 26/57] f2fs: use flush command instead of FUA for zoned device The block layer for zoned disk can reorder the FUA'ed IOs. Let's use flush command to keep the write order. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- fs/f2fs/node.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 09a1350a1259..6f4721eebdc1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -372,7 +372,8 @@ sync_nodes: f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ba0766a56dfa..dbe871861d3d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1631,7 +1631,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ From 23710ade20183246da80445dacb7f9eb59a4ea88 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 Mar 2022 16:25:54 -0700 Subject: [PATCH 27/57] f2fs: avoid infinite loop to flush node pages xfstests/generic/475 can give EIO all the time which give an infinite loop to flush node page like below. Let's avoid it. [16418.518551] Call Trace: [16418.518553] ? dm_submit_bio+0x48/0x400 [16418.518574] ? submit_bio_checks+0x1ac/0x5a0 [16418.525207] __submit_bio+0x1a9/0x230 [16418.525210] ? kmem_cache_alloc+0x29e/0x3c0 [16418.525223] submit_bio_noacct+0xa8/0x2b0 [16418.525226] submit_bio+0x4d/0x130 [16418.525238] __submit_bio+0x49/0x310 [f2fs] [16418.525339] ? 
bio_add_page+0x6a/0x90 [16418.525344] f2fs_submit_page_bio+0x134/0x1f0 [f2fs] [16418.525365] read_node_page+0x125/0x1b0 [f2fs] [16418.525388] __get_node_page.part.0+0x58/0x3f0 [f2fs] [16418.525409] __get_node_page+0x2f/0x60 [f2fs] [16418.525431] f2fs_get_dnode_of_data+0x423/0x860 [f2fs] [16418.525452] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [16418.525458] ? __mod_memcg_state.part.0+0x2a/0x30 [16418.525465] ? __mod_memcg_lruvec_state+0x27/0x40 [16418.525467] ? __xa_set_mark+0x57/0x70 [16418.525472] f2fs_do_write_data_page+0x10e/0x7b0 [f2fs] [16418.525493] f2fs_write_single_data_page+0x555/0x830 [f2fs] [16418.525514] ? sysvec_apic_timer_interrupt+0x4e/0x90 [16418.525518] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [16418.525523] f2fs_write_cache_pages+0x303/0x880 [f2fs] [16418.525545] ? blk_flush_plug_list+0x47/0x100 [16418.525548] f2fs_write_data_pages+0xfd/0x320 [f2fs] [16418.525569] do_writepages+0xd5/0x210 [16418.525648] filemap_fdatawrite_wbc+0x7d/0xc0 [16418.525655] filemap_fdatawrite+0x50/0x70 [16418.525658] f2fs_sync_dirty_inodes+0xa4/0x230 [f2fs] [16418.525679] f2fs_write_checkpoint+0x16d/0x1720 [f2fs] [16418.525699] ? ttwu_do_wakeup+0x1c/0x160 [16418.525709] ? ttwu_do_activate+0x6d/0xd0 [16418.525711] ? 
__wait_for_common+0x11d/0x150 [16418.525715] kill_f2fs_super+0xca/0x100 [f2fs] [16418.525733] deactivate_locked_super+0x3b/0xb0 [16418.525739] deactivate_super+0x40/0x50 [16418.525741] cleanup_mnt+0x139/0x190 [16418.525747] __cleanup_mnt+0x12/0x20 [16418.525749] task_work_run+0x6d/0xa0 [16418.525765] exit_to_user_mode_prepare+0x1ad/0x1b0 [16418.525771] syscall_exit_to_user_mode+0x27/0x50 [16418.525774] do_syscall_64+0x48/0xc0 [16418.525776] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +------- fs/f2fs/f2fs.h | 23 +++++++++++++++++++---- fs/f2fs/node.c | 23 ++++++++++++----------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1ba5deff5e98..ca2b2a69fddc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,13 +98,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs) { - if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) - set_ckpt_flags(sbi, CP_ERROR_FLAG); - } else { - sbi->metapage_eio_ofs = page->index; - sbi->metapage_eio_cnt = 0; - } + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eecbd56bbca8..b23614e7028a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -576,8 +576,8 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 -/* maximum retry of EIO'ed meta page */ -#define MAX_RETRY_META_PAGE_EIO 100 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ @@ -1612,8 +1612,8 @@ struct f2fs_sb_info { /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ - pgoff_t metapage_eio_ofs; /* EIO page offset */ - int metapage_eio_cnt; /* EIO count */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int 
page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -4533,6 +4533,21 @@ static inline void f2fs_io_schedule_timeout(long timeout) io_schedule_timeout(timeout); } +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dbe871861d3d..b32e13a0a6ac 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1416,8 +1416,7 @@ repeat: err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1443,19 +1442,21 @@ repeat: goto out_err; } page_hit: - if (unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) From 
17aed49ed16aef9b9c4e769113361a39ff421818 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Fri, 22 Apr 2022 20:05:04 +0200 Subject: [PATCH 28/57] f2fs: extend stat_lock to avoid potential race in statfs There are multiple calculations and reads of fields of sbi that should be protected by stat_lock. As stat_lock is not used to read these values in statfs, this can lead to inconsistent results. Extend the locking to prevent this issue. Commit c9c8ed50d94c ("f2fs: fix to avoid potential race on sbi->unusable_block_count access/update") already added the use of sbi->stat_lock in statfs in order to make the calculation of multiple, different fields atomic so that results are consistent. This is similar to that patch regarding the change in statfs. Signed-off-by: Niels Dossche Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9c8f26c00dee..f91fb3f12d11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1710,18 +1710,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1734,14 +1739,12 @@ static int 
f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } From 18e09a823ff33c414757045cd89d7c6a2b9855fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 27 Apr 2022 18:02:53 +0200 Subject: [PATCH 29/57] f2fs: call bdev_zone_sectors() only once on init_blkz_info() Instead of calling bdev_zone_sectors() multiple times, call it once and cache the value locally. This will make the subsequent change easier to read. Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f91fb3f12d11..81c4e2593cf8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3655,22 +3655,25 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; + zone_sectors = bdev_zone_sectors(bdev); + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & 
(bdev_zone_sectors(bdev) - 1)) + if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, From f634322eac286b806b0613809d67eb470f208ca8 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 27 Apr 2022 18:02:54 +0200 Subject: [PATCH 30/57] f2fs: ensure only power of 2 zone sizes are allowed F2FS zoned support has power of 2 zone size assumption in many places such as in __f2fs_issue_discard_zone, init_blkz_info. As the power of 2 requirement has been removed from the block layer, explicitly add a condition in f2fs to allow only power of 2 zone size devices. This condition will be relaxed once those calculation based on power of 2 is made generic. Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 81c4e2593cf8..668c896b4825 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3662,6 +3662,10 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); + return -EINVAL; + } if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != SECTOR_TO_BLOCK(zone_sectors)) From 6f974a68fc86e9f287ded649a5479b6136954e01 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 30 Apr 2022 21:19:24 +0800 Subject: [PATCH 31/57] f2fs: fix to clear dirty inode in f2fs_evict_inode() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215904 The kernel message is shown below: kernel BUG at fs/f2fs/inode.c:825! 
Call Trace: evict+0x282/0x4e0 __dentry_kill+0x2b2/0x4d0 shrink_dentry_list+0x17c/0x4f0 shrink_dcache_parent+0x143/0x1e0 do_one_tree+0x9/0x30 shrink_dcache_for_umount+0x51/0x120 generic_shutdown_super+0x5c/0x3a0 kill_block_super+0x90/0xd0 kill_f2fs_super+0x225/0x310 deactivate_locked_super+0x78/0xc0 cleanup_mnt+0x2b7/0x480 task_work_run+0xc8/0x150 exit_to_user_mode_prepare+0x14a/0x150 syscall_exit_to_user_mode+0x1d/0x40 do_syscall_64+0x48/0x90 The root cause is: inode node and dnode node share the same nid, so during f2fs_evict_inode(), dnode node truncation will invalidate its NAT entry, so when truncating inode node, it fails due to invalid NAT entry, result in inode is still marked as dirty, fix this issue by clearing dirty for inode and setting SBI_NEED_FSCK flag in filesystem. output from dump.f2fs: [print_node_info: 354] Node ID [0xf:15] is inode i_nid[0] [0x f : 15] Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5fffc2268964..6effb70298a2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -796,8 +796,22 @@ retry: f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); - if (err == -ENOENT) + if (err == -ENOENT) { err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. 
+ */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } } /* give more chances, if ENOMEM case */ From b2593de2475c8346d53ec05ab344585756dfb040 Mon Sep 17 00:00:00 2001 From: Byungki Lee Date: Fri, 29 Apr 2022 13:29:53 -0700 Subject: [PATCH 32/57] f2fs: write checkpoint during FG_GC If there's not enough free sections each of which consists of large segments, we can hit no free section for upcoming section allocation. Let's reclaim some prefree segments by writing checkpoints. Signed-off-by: Byungki Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 79ea591f176c..97efa11beb27 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1791,23 +1791,31 @@ gc_more: if (sync) goto stop; - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; - } + if (!has_not_enough_free_secs(sbi, sec_freed, 0)) + goto stop; - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) { + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + segno = NULL_SEGNO; + goto gc_more; } + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + 
goto gc_more; + } + if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + ret = f2fs_write_checkpoint(sbi, &cpc); stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; From 3511949cefb93ae240445a6a952797d3a84e747d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2022 01:06:02 +0800 Subject: [PATCH 33/57] f2fs: fix to avoid f2fs_bug_on() in dec_valid_node_count() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215897 I have encountered a bug in F2FS file system in kernel v5.17. The kernel should enable CONFIG_KASAN=y and CONFIG_KASAN_INLINE=y. You can reproduce the bug by running the following commands: The kernel message is shown below: kernel BUG at fs/f2fs/f2fs.h:2511! Call Trace: f2fs_remove_inode_page+0x2a2/0x830 f2fs_evict_inode+0x9b7/0x1510 evict+0x282/0x4e0 do_unlinkat+0x33a/0x540 __x64_sys_unlinkat+0x8e/0xd0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is: .total_valid_block_count or .total_valid_node_count could fuzzed to zero, then once dec_valid_node_count() was called, it will cause BUG_ON(), this patch fixes to print warning info and set SBI_NEED_FSCK into CP instead of panic. 
Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b23614e7028a..29feb1cf513e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2603,11 +2603,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_block_count); - f2fs_bug_on(sbi, !sbi->total_valid_node_count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - sbi->total_valid_node_count--; - sbi->total_valid_block_count--; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; From 354c050902c634c278f405b0c4e421aefdec5731 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2022 17:51:40 +0800 Subject: [PATCH 34/57] f2fs: fix to do sanity check on block address in f2fs_do_zero_range() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215894 I have encountered a bug in F2FS file system in kernel v5.17. I have uploaded the system call sequence as case.c, and a fuzzed image can be found in google net disk The kernel should enable CONFIG_KASAN=y and CONFIG_KASAN_INLINE=y. You can reproduce the bug by running the following commands: kernel BUG at fs/f2fs/segment.c:2291! 
Call Trace: f2fs_invalidate_blocks+0x193/0x2d0 f2fs_fallocate+0x2593/0x4a70 vfs_fallocate+0x2a5/0xac0 ksys_fallocate+0x35/0x70 __x64_sys_fallocate+0x8e/0xf0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is, after image was fuzzed, block mapping info in inode will be inconsistent with SIT table, so in f2fs_fallocate(), it will cause panic when updating SIT with invalid blkaddr. Let's fix the issue by adding sanity check on block address before updating SIT table with it. Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6f4721eebdc1..32e22f84a4ce 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1438,11 +1438,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, ret = -ENOSPC; break; } - if (dn->data_blkaddr != NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + break; } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); } f2fs_update_extent_cache_range(dn, start, 0, index - start); From 329e9031091a0e880a0deddb68de7bd31af37086 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 May 2022 14:09:22 +0800 Subject: [PATCH 35/57] f2fs: fix deadloop in foreground GC As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215914 The root cause is: in a very small sized image, it's very easy to exceed threshold of foreground GC, if we calculate free space and dirty data based on section granularity, in corner case, has_not_enough_free_secs() will always return true, result in deadloop in f2fs_gc(). 
So this patch refactors has_not_enough_free_secs() as below to fix this issue: 1. calculate needed space based on block granularity, and separate all blocks to two parts, section part, and block part, comparing section part to free section, and comparing block part to free space in opened log. 2. account F2FS_DIRTY_NODES, F2FS_DIRTY_IMETA and F2FS_DIRTY_DENTS as node block consumer; 3. account F2FS_DIRTY_DENTS as data block consumer; Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5c94caf0c0a1..66153c632a5d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -572,11 +572,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) { - unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + - get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; int i; @@ -602,19 +601,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); + unsigned int 
dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && - has_curseg_enough_space(sbi)) + free = free_sections(sbi) + freed; + need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; + need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + + if (free > need_upper) return false; - return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + needed); + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) From 8e285631dff46cdf3f672ae904b518915130488a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 May 2022 09:33:06 +0800 Subject: [PATCH 36/57] f2fs: fix to do sanity check on total_data_blocks As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215916 The kernel message is shown below: kernel BUG at fs/f2fs/segment.c:2560! 
Call Trace: allocate_segment_by_default+0x228/0x440 f2fs_allocate_data_block+0x13d1/0x31f0 do_write_page+0x18d/0x710 f2fs_outplace_write_data+0x151/0x250 f2fs_do_write_data_page+0xef9/0x1980 move_data_page+0x6af/0xbc0 do_garbage_collect+0x312f/0x46f0 f2fs_gc+0x6b0/0x3bc0 f2fs_balance_fs+0x921/0x2260 f2fs_write_single_data_page+0x16be/0x2370 f2fs_write_cache_pages+0x428/0xd00 f2fs_write_data_pages+0x96e/0xd50 do_writepages+0x168/0x550 __writeback_single_inode+0x9f/0x870 writeback_sb_inodes+0x47d/0xb20 __writeback_inodes_wb+0xb2/0x200 wb_writeback+0x4bd/0x660 wb_workfn+0x5f3/0xab0 process_one_work+0x79f/0x13e0 worker_thread+0x89/0xf60 kthread+0x26a/0x300 ret_from_fork+0x22/0x30 RIP: 0010:new_curseg+0xe8d/0x15f0 The root cause is: ckpt.valid_block_count is inconsistent with SIT table, stat info indicates filesystem has free blocks, but SIT table indicates filesystem has no free segment. So that during garbage collection, it triggers panic when LFS allocator fails to find free segment. This patch tries to fix this issue by checking consistency in between ckpt.valid_block_count and block accounted from SIT. Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/segment.c | 33 ++++++++++++++++++++++----------- fs/f2fs/segment.h | 1 + 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 29feb1cf513e..550a909e20e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1115,8 +1115,8 @@ enum count_type { */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? 
META : (type)) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e9059572f6bb..77cba030dfb6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4464,7 +4464,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; - block_t total_node_blocks = 0; + block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, @@ -4489,8 +4489,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { /* build discard map only one time */ @@ -4530,15 +4530,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; - if (IS_NODESEG(se->type)) - total_node_blocks -= old_valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -4560,13 +4560,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } up_read(&curseg->journal_rwsem); - if (!err && total_node_blocks != valid_node_count(sbi)) { + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", - total_node_blocks, valid_node_count(sbi)); - err = -EFSCORRUPTED; + sit_valid_blocks[NODE], valid_node_count(sbi)); + return -EFSCORRUPTED; } - return err; + if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > + 
valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + return -EFSCORRUPTED; + } + + return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 66153c632a5d..1fa26a9603cb 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -24,6 +24,7 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, unsigned short seg_type) From bf123c9ddf34b3b6d0bb9b46f64478daf01b4f1f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 May 2022 18:30:31 +0800 Subject: [PATCH 37/57] f2fs: give priority to select unpinned section for foreground GC Previously, during foreground GC, if victims contain data of pinned file, it will fail migration of the data, and meanwhile i_gc_failures of that pinned file may increase, and when it exceeds threshold, GC will unpin the file, result in breaking pinfile's semantics. In order to mitigate such condition, let's record and skip section which has pinned file's data and give priority to select unpinned one. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 85 +++++++++++++++++++++++++++++++++++++++-------- fs/f2fs/segment.c | 8 +++++ fs/f2fs/segment.h | 3 ++ 3 files changed, 82 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 97efa11beb27..4d1a4df09634 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -646,6 +646,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi) f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. 
@@ -787,6 +835,9 @@ retry: if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + if (is_atgc) { add_victim_entry(sbi, &p, segno); goto next; @@ -1201,12 +1252,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -1351,12 +1399,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, err = -EAGAIN; goto out; } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -1477,14 +1522,15 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode) || special_file(inode->i_mode)) continue; - if (is_inode_flag_set(inode, FI_PIN_FILE) && - gc_type == FG_GC) { - f2fs_pin_file_control(inode, true); + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { iput(inode); return submitted; } @@ -1767,9 +1813,17 @@ gc_more: ret = -EINVAL; goto stop; } +retry: ret = __get_victim(sbi, &segno, gc_type); - if (ret) + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; + } seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); if (gc_type == FG_GC && @@ -1820,6 +1874,9 @@ stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; 
SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 77cba030dfb6..4ce8f63f6767 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4657,6 +4657,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -5245,6 +5252,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 1fa26a9603cb..8fbc9f6afa55 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -295,6 +295,9 @@ struct dirty_seglist_info { struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* victim selection function for cleaning and SSR */ From f37569f8d67002308a1b9a661367cc4ec92c7639 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Sat, 7 May 2022 00:28:14 +0800 Subject: [PATCH 38/57] f2fs: skip GC if possible when checkpoint disabling If the number of unusable blocks is not larger than unusable capacity, we can skip GC when checkpoint disabling. 
Signed-off-by: Weichao Guo Signed-off-by: Chao Yu [Jaegeuk Kim: Fix missing gc_mode assignment] Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 668c896b4825..234372fc329e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2061,7 +2061,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - unsigned int gc_mode; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -2072,9 +2072,13 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + f2fs_update_time(sbi, DISABLE_TIME); - gc_mode = sbi->gc_mode; sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { @@ -2100,6 +2104,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } +skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); From d8a5f0ec1110c98a55ad449fe0f25f6ed39b98c2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 14:17:32 -0700 Subject: [PATCH 39/57] f2fs: stop allocating pinned sections if EAGAIN happens EAGAIN doesn't guarantee to have a free section. Let's report it. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 32e22f84a4ce..5e402ac93e7f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1685,7 +1685,7 @@ next_alloc: GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + if (err && err != -ENODATA) goto out_err; } From 2d86f7572df449196671794db0587c44235691d1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 May 2022 17:40:25 -0700 Subject: [PATCH 40/57] f2fs: don't need inode lock for system hidden quota Let's avoid false-alarmed lockdep warning. [ 58.914674] [T1501146] -> #2 (&sb->s_type->i_mutex_key#20){+.+.}-{3:3}: [ 58.915975] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.916738] [T1501146] system_server: f2fs_quota_sync+0x60/0x1a8 [ 58.917563] [T1501146] system_server: block_operations+0x16c/0x43c [ 58.918410] [T1501146] system_server: f2fs_write_checkpoint+0x114/0x318 [ 58.919312] [T1501146] system_server: f2fs_issue_checkpoint+0x178/0x21c [ 58.920214] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.920999] [T1501146] system_server: f2fs_do_sync_file+0x334/0x738 [ 58.921862] [T1501146] system_server: f2fs_sync_file+0x30/0x48 [ 58.922667] [T1501146] system_server: __arm64_sys_fsync+0x84/0xf8 [ 58.923506] [T1501146] system_server: el0_svc_common.llvm.12821150825140585682+0xd8/0x20c [ 58.924604] [T1501146] system_server: do_el0_svc+0x28/0xa0 [ 58.925366] [T1501146] system_server: el0_svc+0x24/0x38 [ 58.926094] [T1501146] system_server: el0_sync_handler+0x88/0xec [ 58.926920] [T1501146] system_server: el0_sync+0x1b4/0x1c0 [ 58.927681] [T1501146] -> #1 (&sbi->cp_global_sem){+.+.}-{3:3}: [ 58.928889] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.929650] [T1501146] system_server: f2fs_write_checkpoint+0xbc/0x318 [ 58.930541] [T1501146] system_server: 
f2fs_issue_checkpoint+0x178/0x21c [ 58.931443] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.932226] [T1501146] system_server: sync_filesystem+0xac/0x130 [ 58.933053] [T1501146] system_server: generic_shutdown_super+0x38/0x150 [ 58.933958] [T1501146] system_server: kill_block_super+0x24/0x58 [ 58.934791] [T1501146] system_server: kill_f2fs_super+0xcc/0x124 [ 58.935618] [T1501146] system_server: deactivate_locked_super+0x90/0x120 [ 58.936529] [T1501146] system_server: deactivate_super+0x74/0xac [ 58.937356] [T1501146] system_server: cleanup_mnt+0x128/0x168 [ 58.938150] [T1501146] system_server: __cleanup_mnt+0x18/0x28 [ 58.938944] [T1501146] system_server: task_work_run+0xb8/0x14c [ 58.939749] [T1501146] system_server: do_notify_resume+0x114/0x1e8 [ 58.940595] [T1501146] system_server: work_pending+0xc/0x5f0 [ 58.941375] [T1501146] -> #0 (&sbi->gc_lock){+.+.}-{3:3}: [ 58.942519] [T1501146] system_server: __lock_acquire+0x1270/0x2868 [ 58.943366] [T1501146] system_server: lock_acquire+0x114/0x294 [ 58.944169] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.944930] [T1501146] system_server: f2fs_issue_checkpoint+0x13c/0x21c [ 58.945831] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.946614] [T1501146] system_server: f2fs_do_sync_file+0x334/0x738 [ 58.947472] [T1501146] system_server: f2fs_ioc_commit_atomic_write+0xc8/0x14c [ 58.948439] [T1501146] system_server: __f2fs_ioctl+0x674/0x154c [ 58.949253] [T1501146] system_server: f2fs_ioctl+0x54/0x88 [ 58.950018] [T1501146] system_server: __arm64_sys_ioctl+0xa8/0x110 [ 58.950865] [T1501146] system_server: el0_svc_common.llvm.12821150825140585682+0xd8/0x20c [ 58.951965] [T1501146] system_server: do_el0_svc+0x28/0xa0 [ 58.952727] [T1501146] system_server: el0_svc+0x24/0x38 [ 58.953454] [T1501146] system_server: el0_sync_handler+0x88/0xec [ 58.954279] [T1501146] system_server: el0_sync+0x1b4/0x1c0 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 234372fc329e..7b0a31dceb4c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2696,7 +2696,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; - inode_lock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); /* * do_quotactl @@ -2715,7 +2716,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); - inode_unlock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); if (ret) break; From 77c0800191374e45230484de20a7a3b6953e7134 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 28 Apr 2022 11:18:09 -0700 Subject: [PATCH 41/57] f2fs: change the current atomic write way Current atomic write has three major issues like below. - keeps the updates in non-reclaimable memory space and they are even hard to be migrated, which is not good for contiguous memory allocation. - disk spaces used for atomic files cannot be garbage collected, so this makes it difficult for the filesystem to be defragmented. - If atomic write operations hit the threshold of either memory usage or garbage collection failure count, All the atomic write operations will fail immediately. To resolve the issues, I will keep a COW inode internally for all the updates to be flushed from memory, when we need to flush them out in a situation like high memory pressure. These COW inodes will be tagged as orphan inodes to be reclaimed in case of sudden power-cut or system failure during atomic writes. 
Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 180 +++++++++------ fs/f2fs/debug.c | 12 +- fs/f2fs/f2fs.h | 33 +-- fs/f2fs/file.c | 49 +++-- fs/f2fs/gc.c | 27 +-- fs/f2fs/inode.c | 3 +- fs/f2fs/namei.c | 28 ++- fs/f2fs/node.c | 4 - fs/f2fs/node.h | 1 - fs/f2fs/segment.c | 424 +++++++++++++----------------------- fs/f2fs/segment.h | 4 +- fs/f2fs/super.c | 6 +- include/trace/events/f2fs.h | 22 -- 13 files changed, 323 insertions(+), 470 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7b39665da532..3ec97088d00f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -70,8 +70,7 @@ static bool __is_cp_guaranteed(struct page *page) if (f2fs_is_compressed_page(page)) return false; - if ((S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; @@ -2577,7 +2576,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; @@ -2614,6 +2618,7 @@ got_it: err = -EFSCORRUPTED; goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
@@ -3327,6 +3332,100 @@ unlock_out: return err; } +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + return __reserve_data_block(cow_inode, index, blk_addr, + node_changed); + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, 
index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -3335,7 +3434,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3346,14 +3445,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: @@ -3401,7 +3492,11 @@ repeat: *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3457,8 +3552,6 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -3502,8 +3595,12 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -3537,9 +3634,6 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, inode->i_ino == F2FS_COMPRESS_INO(sbi)) clear_page_private_data(page); - if (page_private_atomic(page)) - return f2fs_drop_inmem_page(inode, page); - detach_page_private(page); set_page_private(page, 0); } @@ -3550,10 +3644,6 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; - /* This is atomic written page, keep Private */ - if (page_private_atomic(page)) - return 0; - if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { struct inode *inode = page->mapping->host; @@ -3579,18 +3669,6 @@ static int f2fs_set_data_page_dirty(struct page *page) if (PageSwapCache(page)) return __set_page_dirty_nobuffers(page); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(page)) { - f2fs_register_inmem_page(inode, page); - return 1; - } - /* - * Previously, this page has been registered, we just - * return here. 
- */ - return 0; - } - if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); f2fs_update_dirty_page(inode, page); @@ -3670,42 +3748,14 @@ out: int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = page_private_atomic(page); + int rc, extra_count = 0; BUG_ON(PageWriteback(page)); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } - - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); + if (rc != MIGRATEPAGE_SUCCESS) return rc; - } - - if (atomic_written) { - struct inmem_pages *cur; - - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; - } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } /* guarantee to start from no stale private field */ set_page_private(newpage, 0); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fcdf253cd211..65f0bcf498bb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,7 +91,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = sbi->atomic_files; si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); @@ -167,8 +166,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc 
= sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -296,7 +293,6 @@ get_cache: sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * @@ -491,10 +487,6 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); @@ -519,9 +511,9 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + seq_printf(s, " - atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. 
%4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->aw_cnt, si->max_aw_cnt, si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 550a909e20e0..68a0d1e06d77 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -714,7 +714,6 @@ enum { enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; @@ -736,7 +735,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -750,7 +748,6 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ @@ -792,11 +789,9 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ struct f2fs_rwsem 
i_gc_rwsem[2]; @@ -1090,7 +1085,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1120,11 +1114,7 @@ enum page_type { META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. */ OPU, }; @@ -1716,7 +1706,6 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -3200,11 +3189,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); @@ -3442,6 +3426,8 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); /* * dir.c @@ -3577,11 +3563,8 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, 
bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); @@ -3814,7 +3797,6 @@ struct f2fs_stat_info { int ext_tree, zombie_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3844,7 +3826,6 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5e402ac93e7f..cc9ade535fc9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1813,9 +1813,8 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); if (f2fs_is_volatile_file(inode)) { set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); @@ -1837,8 +1836,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * before dropping file lock, it needs to do in ->flush. 
*/ if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } @@ -2001,6 +2000,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2023,11 +2023,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (f2fs_is_atomic_file(inode)) goto out; - } ret = f2fs_convert_inline_inode(inode); if (ret) @@ -2048,19 +2045,33 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + goto out; + } + f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + F2FS_I(inode)->atomic_write_task = current; 
stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2091,21 +2102,17 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); + ret = f2fs_commit_atomic_write(inode); if (ret) goto err_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, false); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2193,15 +2200,13 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) inode_lock(inode); if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4d1a4df09634..1ea3fb916bec 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1245,13 +1245,6 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err) goto out; @@ -1393,12 +1386,6 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err) goto out; @@ 
-1766,8 +1753,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, @@ -1781,7 +1766,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1832,10 +1816,8 @@ retry: total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) + if (sbi->skipped_gc_rwsem) skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; } @@ -1861,13 +1843,6 @@ retry: segno = NULL_SEGNO; goto gc_more; } - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) ret = f2fs_write_checkpoint(sbi, &cpc); stop: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6effb70298a2..4816b739c84f 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -745,9 +745,8 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7c554e921dd5..343a259194d9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -840,8 +840,8 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, - struct inode 
**whiteout) + struct dentry *dentry, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -855,7 +855,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -880,21 +880,25 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); - - *whiteout = inode; } else { - d_tmpfile(dentry, inode); + if (dentry) + d_tmpfile(dentry, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -915,7 +919,7 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL); + return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, @@ -925,7 +929,13 @@ static int f2fs_create_whiteout(struct user_namespace *mnt_userns, return -EIO; return __f2fs_tmpfile(mnt_userns, dir, NULL, - S_IFCHR | WHITEOUT_MODE, whiteout); + S_IFCHR | WHITEOUT_MODE, true, whiteout); +} + +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); } static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, diff --git a/fs/f2fs/node.c 
b/fs/f2fs/node.c index b32e13a0a6ac..5cd48cac0a03 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4c1d34bfea78..3c09cae058b0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,7 +147,6 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ - INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4ce8f63f6767..74e4bf67ad64 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -29,7 +29,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -184,308 +184,180 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) -{ - struct inmem_pages *new; - - set_page_private_atomic(page); - - new = f2fs_kmem_cache_alloc(inmem_entry_slab, - GFP_NOFS, true, NULL); - - /* add atomic page indices to the list */ - new->page = page; - 
INIT_LIST_HEAD(&new->list); - - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); - - trace_f2fs_register_inmem_page(page, INMEM); -} - -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (f2fs_is_atomic_file(inode)) { + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + clear_inode_flag(inode, FI_ATOMIC_FILE); + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + sbi->atomic_files--; + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } +} + +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct node_info ni; + int err; + +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; + } + return err; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); + } + } else { + 
blkcnt_t count = 1; + + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } + + f2fs_put_dnode(&dn); + return 0; +} + +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) +{ + struct revoke_entry *cur, *tmp; list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. - */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; - - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); -retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni, false); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_page_private_gcing(page); - } - detach_page_private(page); - set_page_private(page, 0); - f2fs_put_page(page, 1); - + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); list_del(&cur->list); - 
kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + kmem_cache_free(revoke_entry_slab, cur); } - return err; } -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; - } - fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; - } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - - do { - mutex_lock(&fi->inmem_lock); - if (list_empty(&fi->inmem_pages)) { - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - mutex_unlock(&fi->inmem_lock); - break; - } - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); - } while (1); -} - -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - struct inmem_pages *tmp; - - f2fs_bug_on(sbi, !page_private_atomic(page)); - - mutex_lock(&fi->inmem_lock); - list_for_each_entry(tmp, head, list) { - if (tmp->page == page) { - cur = tmp; - break; - } - } - - f2fs_bug_on(sbi, !cur); - list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - clear_page_private_atomic(page); - f2fs_put_page(page, 0); - - detach_page_private(page); - set_page_private(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); -} - -static int __f2fs_commit_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); - } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); 
- if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); - goto retry; - } - unlock_page(page); - break; - } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); + + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + goto out; + } + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + if (!new) { + f2fs_put_dnode(&dn); + ret = -ENOMEM; + goto out; + } + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); +out: + __complete_revoke_list(inode, &revoke_list, ret ? true : false); - if (err) { - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last transaction. For other - * error number, revoking was done by filesystem itself. 
- */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); - - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); - } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); - } - - return err; + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -5363,9 +5235,9 @@ int __init f2fs_create_segment_manager_caches(void) if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", - sizeof(struct inmem_pages)); - if (!inmem_entry_slab) + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; @@ -5384,5 +5256,5 @@ void f2fs_destroy_segment_manager_caches(void) kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(inmem_entry_slab); + kmem_cache_destroy(revoke_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 8fbc9f6afa55..3f277dfcb131 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -225,10 +225,10 @@ struct segment_allocation { #define MAX_SKIP_GC_COUNT 16 -struct inmem_pages { +struct revoke_entry { struct list_head list; - struct page *page; block_t old_addr; /* 
for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7b0a31dceb4c..e12405ee4533 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1342,9 +1342,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1385,9 +1382,8 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 11f6b7147be2..eb33eb18ae69 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,10 +15,6 @@ TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); -TRACE_DEFINE_ENUM(INMEM); -TRACE_DEFINE_ENUM(INMEM_DROP); -TRACE_DEFINE_ENUM(INMEM_INVALIDATE); -TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(HOT); @@ -59,10 +55,6 @@ TRACE_DEFINE_ENUM(CP_RESIZE); { DATA, "DATA" }, \ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }, \ - { INMEM, "INMEM" }, \ - { INMEM_DROP, "INMEM_DROP" }, \ - { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ - { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -1290,20 +1282,6 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); -DEFINE_EVENT(f2fs__page, f2fs_register_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, 
type) -); - -DEFINE_EVENT(f2fs__page, f2fs_commit_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - TRACE_EVENT(f2fs_filemap_fault, TP_PROTO(struct inode *inode, pgoff_t index, unsigned long ret), From 87a38cac9eb1e8ea28aa015713ae5d179e023057 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 May 2022 17:49:18 -0700 Subject: [PATCH 42/57] f2fs: kill volatile write support There's no user, since all can use atomic writes simply. Let's kill it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +- fs/f2fs/data.c | 5 -- fs/f2fs/debug.c | 10 +--- fs/f2fs/f2fs.h | 27 +--------- fs/f2fs/file.c | 116 ++----------------------------------------- fs/f2fs/segment.c | 3 +- fs/f2fs/verity.c | 2 +- 7 files changed, 10 insertions(+), 157 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ca2b2a69fddc..af9fd6cafa2b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1003,9 +1003,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3ec97088d00f..54715a18e9e8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2755,11 +2755,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 65f0bcf498bb..c92625ef16d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ 
-92,9 +92,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -511,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); @@ -615,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 68a0d1e06d77..4fa697ccec6c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -735,7 +735,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ @@ -1736,9 +1735,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of 
compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -3189,11 +3186,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); -} - static inline bool f2fs_is_first_block_written(struct inode *inode) { return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); @@ -3814,7 +3806,7 @@ struct f2fs_stat_info { int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -3925,17 +3917,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -3997,9 +3978,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define 
stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -4402,8 +4380,7 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cc9ade535fc9..945aa1ac574c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1815,13 +1815,6 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) if (f2fs_is_atomic_file(inode)) f2fs_abort_atomic_write(inode, true); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } return 0; } @@ -2096,15 +2089,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); if (ret) - goto err_out; + goto unlock_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) @@ -2112,108 +2100,12 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: +unlock_out: inode_unlock(inode); 
mnt_drop_write_file(filp); return ret; } -static int f2fs_ioc_start_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_volatile_file(inode)) - goto out; - - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_abort_atomic_write(inode, true); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - 
} - - inode_unlock(inode); - - mnt_drop_write_file(filp); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return ret; -} - static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4151,11 +4043,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 74e4bf67ad64..5c66f98073c9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3165,8 +3165,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 3d793202cc9f..5ac7e756a1bb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -128,7 +128,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* From 8f08a1a4cedc85fb6c5d7da5b69eba68e1893c1c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Apr 2022 22:08:52 -0700 Subject: [PATCH 43/57] f2fs: reject test_dummy_encryption when !CONFIG_FS_ENCRYPTION There is no good reason to allow this mount option when the kernel isn't configured with encryption support. 
Since this option is only for testing, we can just fix this; we don't really need to worry about breaking anyone who might be counting on this option being ignored. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e12405ee4533..16361edc6327 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -532,10 +532,11 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return -EINVAL; } f2fs_warn(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_warn(sbi, "Test dummy encryption mount option ignored"); -#endif return 0; +#else + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; +#endif } #ifdef CONFIG_F2FS_FS_COMPRESSION From 42368edd88b9943f501a2c3ebffa2274131a5bca Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 11:40:33 -0700 Subject: [PATCH 44/57] f2fs: introduce f2fs_gc_control to consolidate f2fs_gc parameters No functional change. 
- remove checkpoint=disable check for f2fs_write_checkpoint - get sec_freed all the time Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++++- fs/f2fs/file.c | 30 ++++++++++++--- fs/f2fs/gc.c | 74 ++++++++++++++++++++----------------- fs/f2fs/segment.c | 8 +++- fs/f2fs/super.c | 8 +++- include/trace/events/f2fs.h | 18 ++++----- 6 files changed, 98 insertions(+), 51 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4fa697ccec6c..989ec7f36f98 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1263,6 +1263,14 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -3760,8 +3768,7 @@ extern const struct iomap_ops f2fs_iomap_ops; int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 945aa1ac574c..0c5ea5a45933 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1647,6 +1647,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; + struct 
f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1684,7 +1688,7 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err && err != -ENODATA) goto out_err; } @@ -2344,6 +2348,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false }; __u32 sync; int ret; @@ -2369,7 +2376,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; @@ -2378,6 +2387,11 @@ out: static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync }; u64 end; int ret; @@ -2405,8 +2419,8 @@ do_more: f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, false, - GET_SEGNO(sbi, range->start)); + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2820,6 +2834,10 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2863,7 +2881,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1ea3fb916bec..9af3190cfdf7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,6 +35,9 @@ static int gc_thread_func(void *data) wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false }; wait_ms = gc_th->min_sleep_time; @@ -141,8 +144,12 @@ do_gc: if (foreground) sync_mode = false; + gc_control.init_gc_type = sync_mode ? 
FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.err_gc_skipped = sync_mode; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) + if (f2fs_gc(sbi, &gc_control)) wait_ms = gc_th->no_gc_sleep_time; if (foreground) @@ -1741,21 +1748,20 @@ skip: return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, bool force, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1782,8 +1788,7 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1793,7 +1798,7 @@ gc_more: } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } @@ -1809,45 +1814,48 @@ retry: goto stop; } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); - if (gc_type == FG_GC && - seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_gc_rwsem) - skipped_round++; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC) goto stop; - if (!has_not_enough_free_secs(sbi, sec_freed, 0)) + if (!has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? sec_freed : 0, 0)) goto stop; - if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) { - - /* Write checkpoint to reclaim prefree segments */ - if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && - prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { ret = f2fs_write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + goto stop; } - segno = NULL_SEGNO; - goto gc_more; } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; if (gc_type == FG_GC) f2fs_unpin_all_sections(sbi, true); @@ -1865,7 
+1873,7 @@ stop: put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5c66f98073c9..a6f473dff42e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -398,8 +398,14 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false }; f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + f2fs_gc(sbi, &gc_control); } } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 16361edc6327..15c0847a0ae1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2079,8 +2079,14 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true }; + f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index eb33eb18ae69..1de200beea50 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -644,19 +644,19 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, - TP_PROTO(struct super_block *sb, bool sync, bool background, + TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, 
dirty_nodes, dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) - __field(bool, sync) - __field(bool, background) + __field(int, gc_type) + __field(bool, no_bg_gc) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -668,8 +668,8 @@ TRACE_EVENT(f2fs_gc_begin, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->sync = sync; - __entry->background = background; + __entry->gc_type = gc_type; + __entry->no_bg_gc = no_bg_gc; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -679,12 +679,12 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nodes = %lld, " "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), - __entry->sync, - __entry->background, + show_gc_type(__entry->gc_type), + (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, From b86a2a1564d2685ec1c70810d9eb85af714d839f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 14:23:27 -0700 Subject: [PATCH 45/57] f2fs: keep wait_ms if EAGAIN happens In f2fs_gc thread, let's keep wait_ms when sec_freed was zero. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9af3190cfdf7..8715d4d15b79 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -37,7 +37,8 @@ static int gc_thread_func(void *data) unsigned int wait_ms; struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, - .should_migrate_blocks = false }; + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -146,7 +147,6 @@ do_gc: gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; gc_control.no_bg_gc = foreground; - gc_control.err_gc_skipped = sync_mode; /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, &gc_control)) From 1fb82674e063a4d46f56f089a738a495a9ac21a1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 13:34:41 -0700 Subject: [PATCH 46/57] f2fs: do not stop GC when requiring a free section The f2fs_gc uses a bitmap to indicate pinned sections, but when disabling checkpoint, we call f2fs_gc() with NULL_SEGNO which selects the same dirty segment as a victim all the time, resulting in checkpoint=disable failure, for example. Let's pick another one, if we fail to collect it.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 12 ++++++++---- fs/f2fs/gc.c | 14 +++++++++----- fs/f2fs/segment.c | 3 ++- fs/f2fs/super.c | 3 ++- include/trace/events/f2fs.h | 11 ++++++++--- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 989ec7f36f98..426006a70849 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1269,6 +1269,7 @@ struct f2fs_gc_control { bool no_bg_gc; /* check the space and stop bg_gc */ bool should_migrate_blocks; /* should migrate blocks */ bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ }; /* For s_flag in struct f2fs_sb_info */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0c5ea5a45933..7939caf8d69b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1650,7 +1650,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = FG_GC, .should_migrate_blocks = false, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -2350,7 +2351,8 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .no_bg_gc = false, - .should_migrate_blocks = false }; + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2391,7 +2393,8 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) .init_gc_type = range->sync ? 
FG_GC : BG_GC, .no_bg_gc = false, .should_migrate_blocks = false, - .err_gc_skipped = range->sync }; + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; @@ -2837,7 +2840,8 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) struct f2fs_gc_control gc_control = { .init_gc_type = FG_GC, .should_migrate_blocks = true, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8715d4d15b79..818612e421bb 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -147,6 +147,7 @@ do_gc: gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 1 : 0; /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, &gc_control)) @@ -1762,6 +1763,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1824,12 +1826,13 @@ retry: if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (gc_control->init_gc_type == FG_GC) - goto stop; - - if (!has_not_enough_free_secs(sbi, - (gc_type == FG_GC) ? sec_freed : 0, 0)) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? 
sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; + } /* FG_GC stops GC by skip_count */ if (gc_type == FG_GC) { @@ -1850,6 +1853,7 @@ retry: if (ret) goto stop; } +go_gc_more: segno = NULL_SEGNO; goto gc_more; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a6f473dff42e..b2c0d3a6037d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -403,7 +403,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) .init_gc_type = BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, - .err_gc_skipped = false }; + .err_gc_skipped = false, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); f2fs_gc(sbi, &gc_control); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 15c0847a0ae1..0a15ed236ee5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2083,7 +2083,8 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .victim_segno = NULL_SEGNO, .init_gc_type = FG_GC, .should_migrate_blocks = false, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, &gc_control); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1de200beea50..0edcd4e11638 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -645,18 +645,21 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, + unsigned int nr_free_secs, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, gc_type, no_bg_gc, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, nr_free_secs, dirty_nodes, + dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, gc_type) __field(bool, no_bg_gc) + __field(unsigned int, nr_free_secs) __field(long 
long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -670,6 +673,7 @@ TRACE_EVENT(f2fs_gc_begin, __entry->dev = sb->s_dev; __entry->gc_type = gc_type; __entry->no_bg_gc = no_bg_gc; + __entry->nr_free_secs = nr_free_secs; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -679,12 +683,13 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nodes = %lld, " - "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nr_free_secs = %u, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), show_gc_type(__entry->gc_type), (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, + __entry->nr_free_secs, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, From 017e64bde295dd28eb3aafad27b1bc4790fc7d61 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 May 2022 10:59:29 -0700 Subject: [PATCH 47/57] f2fs: don't use casefolded comparison for "." and ".." Trying to rename a directory that has all of the following properties fails with EINVAL and triggers the 'WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))' in f2fs_match_ci_name(): - The directory is casefolded - The directory is encrypted - The directory's encryption key is not yet set up - The parent directory is *not* encrypted The problem is incorrect handling of the lookup of ".." to get the parent reference to update. fscrypt_setup_filename() treats ".." (and ".") specially, as it's never encrypted. It's passed through as-is, and setting up the directory's key is not attempted. As the name isn't a no-key name, f2fs treats it as a "normal" name and attempts a casefolded comparison. 
That breaks the assumption of the WARN_ON_ONCE() in f2fs_match_ci_name() which assumes that for encrypted directories, casefolded comparisons only happen when the directory's key is set up. We could just remove this WARN_ON_ONCE(). However, since casefolding is always a no-op on "." and ".." anyway, let's instead just not casefold these names. This results in the standard bytewise comparison. Fixes: 7ad08a58bf67 ("f2fs: Handle casefolding with Encryption") Cc: # v5.11+ Signed-off-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/hash.c | 11 ++++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 011df7058c42..0a3b6303363d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, #ifdef CONFIG_UNICODE struct super_block *sb = dir->i_sb; - if (IS_CASEFOLDED(dir)) { + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 426006a70849..d2b1b3f33b33 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -506,11 +506,11 @@ struct f2fs_filename { #ifdef CONFIG_UNICODE /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode, if the directory is both - * casefolded and encrypted and its encryption key is unavailable, or if - * the filesystem is doing an internal operation where usr_fname is also - * NULL. In all these cases we fall back to treating the name as an - * opaque byte sequence. + * if the original name is not valid Unicode, if the original name is + * "." 
or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. */ struct fscrypt_str cf_name; #endif diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index e3beac546c63..2788ceeaf5c2 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -91,7 +91,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len) /* * Compute @fname->hash. For all directories, @fname->disk_name must be set. * For casefolded directories, @fname->usr_fname must be set, and also - * @fname->cf_name if the filename is valid Unicode. + * @fname->cf_name if the filename is valid Unicode and is not "." or "..". */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { @@ -110,10 +110,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) /* * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that - * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. Note - * that to handle encrypted directories, the fallback must use - * usr_fname (plaintext) rather than disk_name (ciphertext). + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). 
*/ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { From 41df0435a8e522b33e91ec4ab96f096e381ea4cd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 May 2022 11:37:23 +0800 Subject: [PATCH 48/57] f2fs: fix fallocate to use file_modified to update permissions consistently This patch tries to fix permission consistency issue as all other mainline filesystems. Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Cc: stable@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7939caf8d69b..63d23e44ff3c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1780,6 +1780,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + ret = file_modified(file); + if (ret) + goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) goto out; From 3f134b1b1b53474fe09f96bc7ebac879cb8a8c7a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 May 2022 20:28:41 +0800 Subject: [PATCH 49/57] f2fs: fix to do sanity check for inline inode Yanming reported a kernel bug in Bugzilla kernel [1], which can be reproduced. The bug message is: The kernel message is shown below: kernel BUG at fs/inode.c:611! 
Call Trace: evict+0x282/0x4e0 __dentry_kill+0x2b2/0x4d0 dput+0x2dd/0x720 do_renameat2+0x596/0x970 __x64_sys_rename+0x78/0x90 do_syscall_64+0x3b/0x90 [1] https://bugzilla.kernel.org/show_bug.cgi?id=215895 The bug is due to fuzzed inode has both inline_data and encrypted flags. During f2fs_evict_inode(), as the inode was deleted by rename(), it will cause inline data conversion due to conflicting flags. The page cache will be polluted and the panic will be triggered in clear_inode(). Try fixing the bug by doing more sanity checks for inline data inode in sanity_check_inode(). Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/inline.c | 29 ++++++++++++++++++++++++----- fs/f2fs/inode.c | 3 +-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d2b1b3f33b33..c2ad8aeb0cb5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4018,6 +4018,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct page *page, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a578bf83b803..bf46a7dfbea2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -14,21 +14,40 @@ #include "node.h" #include -bool f2fs_may_inline_data(struct inode *inode) +static bool support_inline_data(struct inode *inode) { if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; + return true; +} - if (f2fs_post_read_required(inode)) +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) return false; - return true; + return 
!f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. + */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); } bool f2fs_may_inline_dentry(struct inode *inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4816b739c84f..c5ea1f7fea26 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_has_inline_data(inode) && - (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); From 0671eb7794e062d4a9056c2019af04faec4aeac4 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 19 May 2022 18:40:10 +0800 Subject: [PATCH 50/57] f2fs: make f2fs_read_inline_data() more readable In f2fs_read_inline_data(), it is confused with checking of inline_data flag, as we checked it before calling. So this patch add some comments for f2fs_has_inline_data(). Signed-off-by: Chao Liu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c2ad8aeb0cb5..3ea4a641bf3e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3165,6 +3165,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). 
+ */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); From e5b8b3371beba1e3207f7de7c250a324ae94a612 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:43 +0200 Subject: [PATCH 51/57] f2fs: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ea4a641bf3e..2445d55d28bc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1755,7 +1755,7 @@ struct f2fs_sb_info { unsigned int data_io_flag; unsigned int node_io_flag; - /* For sysfs suppport */ + /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; From 77142c3cf71f09dcf3fb2ef43926a29e2cfd2ae3 Mon Sep 17 00:00:00 2001 From: Sungjong Seo Date: Tue, 24 May 2022 10:29:11 +0900 Subject: [PATCH 52/57] f2fs: allow compression for mmap files in compress_mode=user Since commit e3c548323d32 ("f2fs: let's allow compression for mmap files"), it has been allowed to compress mmap files. However, in compress_mode=user, it is not allowed yet. To keep the same concept in both compress_modes, f2fs_ioc_(de)compress_file() should also allow it. Let's remove checking mmap files in f2fs_ioc_(de)compress_file() so that the compression for mmap files is also allowed in compress_mode=user. 
Signed-off-by: Sungjong Seo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 63d23e44ff3c..f4061f3bd5e4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3945,11 +3945,6 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4017,11 +4012,6 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; From f9a80689503d13961c9ca5ccad010b01586da430 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 May 2022 09:56:34 +0800 Subject: [PATCH 53/57] f2fs: avoid unneeded error handling for revoke_entry_slab allocation In __f2fs_commit_atomic_write(), we will guarantee success of revoke_entry_slab allocation, so let's avoid unneeded error handling. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b2c0d3a6037d..22619b8fde75 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -314,11 +314,6 @@ static int __f2fs_commit_atomic_write(struct inode *inode) new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, true, NULL); - if (!new) { - f2fs_put_dnode(&dn); - ret = -ENOMEM; - goto out; - } ret = __replace_atomic_write_block(inode, index, blkaddr, &new->old_addr, false); From d4affd1238e1794cc72ccef5096e976f5f428198 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 25 May 2022 17:43:36 +0800 Subject: [PATCH 54/57] f2fs: add f2fs_init_write_merge_io function Almost all other initialization of variables in f2fs_fill_super is extracted to a single function. 
Also do it for write_io[], which makes the code cleaner. This patch just refactors the code; there's no functional change. Signed-off-by: Yufen Yu Reviewed-by: Chao Yu [Jaegeuk Kim: clean up] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 28 ++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 27 +++------------------------ 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 54715a18e9e8..d09d2bc9a9b6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -585,6 +585,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2445d55d28bc..4650bb67bfc9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3711,6 +3711,7 @@ int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, diff --git a/fs/f2fs/super.c 
b/fs/f2fs/super.c index 0a15ed236ee5..b19f4dcf2b9b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4088,30 +4088,9 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1 : NR_TEMP_TYPE; - int j; - - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); From 9fd22b27c8ee58594c05ebd8188c0edaac50b894 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Thu, 26 May 2022 10:21:06 +0800 Subject: [PATCH 55/57] f2fs: replace F2FS_I(inode) and sbi by the local variable We have already defined 'fi' at the beginning of these functions, so just use it rather than calling F2FS_I(inode) again. 
Signed-off-by: Yufen Yu Reviewed-by: Chao Yu [Jaegeuk Kim: replace sbi] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 22 +++++++++++----------- fs/f2fs/inode.c | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4650bb67bfc9..d3d0b08ae1bc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4402,8 +4402,8 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int diff = F2FS_I(inode)->i_cluster_size - blocks; struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f4061f3bd5e4..0f6e1b07cec8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2032,25 +2032,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file. 
*/ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } /* Create a COW inode for atomic write */ pinode = f2fs_iget(inode->i_sb, fi->i_pino); if (IS_ERR(pinode)) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); ret = PTR_ERR(pinode); goto out; } @@ -2058,7 +2058,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); iput(pinode); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); @@ -2070,10 +2070,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->atomic_write_task = current; + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2952,7 +2952,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -2972,7 +2972,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); 
f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -3922,7 +3922,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; unsigned int blk_per_seg = sbi->blocks_per_seg; - int cluster_size = F2FS_I(inode)->i_cluster_size; + int cluster_size = fi->i_cluster_size; int count, ret; if (!f2fs_sb_has_compression(sbi) || diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c5ea1f7fea26..02a0005aac77 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree) { + struct extent_info *ei = &fi->extent_tree->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -465,10 +465,10 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); From 9d03ec1e540d7638e75492d01016428728b06d4f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 May 2022 12:13:30 +0800 Subject: [PATCH 56/57] f2fs: fix to tag gcing flag on page during file defragment In order to guarantee that migrated data is persisted during checkpoint, tag the gcing flag on the page during file defragment; otherwise, out-of-order persistency between data and node may cause data corruption after SPOR. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0f6e1b07cec8..eaa45875bd1f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2602,6 +2602,7 @@ do_map: } set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); idx++; From 2c7e69438d7f6a786193aa49908fb71ff7bc86c9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 31 May 2022 18:27:09 -0700 Subject: [PATCH 57/57] f2fs: attach inline_data after setting compression This fixes the below corruption. [345393.335389] F2FS-fs (vdb): sanity_check_inode: inode (ino=6d0, mode=33206) should not have inline_data, run fsck to fix Fixes: 677a82b44ebf ("f2fs: fix to do sanity check for inline inode") Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 343a259194d9..aa7f68d66c61 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -89,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); @@ -107,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_init_extent_tree(inode, NULL); - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -127,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, set_compress_context(inode); } + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + 
stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -325,6 +327,8 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, if (!is_extension_exist(name, ext[i], false)) continue; + /* Do not use inline_data with compression */ + clear_inode_flag(inode, FI_INLINE_DATA); set_compress_context(inode); return; }