From 6dffd7067d84c5c646e11e21638aaebf6fde2ecd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2023 15:57:40 -0700 Subject: [PATCH 1/6] iomap: convert iomap_unshare_iter to use large folios [ Upstream commit a5f31a5028d1e88e97c3b6cdc3e3bf2da085e232 ] Convert iomap_unshare_iter to create large folios if possible, since the write and zeroing paths already do that. I think this got missed in the conversion of the write paths that landed in 6.6-rc1. Cc: ritesh.list@gmail.com, willy@infradead.org Signed-off-by: Darrick J. Wong Reviewed-by: Ritesh Harjani (IBM) Stable-dep-of: 50793801fc7f ("fsdax: dax_unshare_iter needs to copy entire blocks") Signed-off-by: Sasha Levin --- fs/iomap/buffered-io.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1833608f3931..674ac79bdb45 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1090,7 +1090,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); - long status = 0; loff_t written = 0; /* don't bother with blocks that are not shared to start with */ @@ -1101,28 +1100,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; do { - unsigned long offset = offset_in_page(pos); - unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct folio *folio; + int status; + size_t offset; + size_t bytes = min_t(u64, SIZE_MAX, length); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; - if (iter->iomap.flags & IOMAP_F_STALE) + if (iomap->flags & IOMAP_F_STALE) break; - status = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(status == 0)) + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + bytes = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(bytes == 0)) return -EIO; cond_resched(); - pos += status; - written += status; - length -= status; + pos += bytes; + written += bytes; + length -= bytes; balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length); + } while (length > 0); return written; } From 50c2cffdcf21944a38241495629164e6e715b92b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Sep 2024 07:39:04 +0300 Subject: [PATCH 2/6] iomap: improve shared block detection in iomap_unshare_iter [ Upstream commit b53fdb215d13f8e9c29541434bf2d14dac8bcbdc ] Currently iomap_unshare_iter relies on the IOMAP_F_SHARED flag to detect blocks to unshare. This is reasonable, but IOMAP_F_SHARED is also useful for the file system to do internal book keeping for out of place writes. XFS used to that, until it got removed in commit 72a048c1056a ("xfs: only set IOMAP_F_SHARED when providing a srcmap to a write") because unshare for incorrectly unshare such blocks. Add an extra safeguard by checking the explicitly provided srcmap instead of the fallback to the iomap for valid data, as that catches the case where we'd just copy from the same place we'd write to easily, allowing to reinstate setting IOMAP_F_SHARED for all XFS writes that go to the COW fork. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240910043949.3481298-3-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner Stable-dep-of: 50793801fc7f ("fsdax: dax_unshare_iter needs to copy entire blocks") Signed-off-by: Sasha Levin --- fs/iomap/buffered-io.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 674ac79bdb45..527d3bcfc69a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1087,16 +1087,25 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); loff_t written = 0; - /* don't bother with blocks that are not shared to start with */ + /* Don't bother with blocks that are not shared to start with. */ if (!(iomap->flags & IOMAP_F_SHARED)) return length; - /* don't bother with holes or unwritten extents */ - if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + + /* + * Don't bother with holes or unwritten extents. + * + * Note that we use srcmap directly instead of iomap_iter_srcmap as + * unsharing requires providing a separate source map, and the presence + * of one is a good indicator that unsharing is needed, unlike + * IOMAP_F_SHARED which can be set for any data that goes into the COW + * fork for XFS. + */ + if (iter->srcmap.type == IOMAP_HOLE || + iter->srcmap.type == IOMAP_UNWRITTEN) return length; do { From eb4c6fe20a3034a2c2e6157d5b98a9e67f388af1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 2 Oct 2024 08:00:40 -0700 Subject: [PATCH 3/6] iomap: don't bother unsharing delalloc extents [ Upstream commit f7a4874d977bf4202ad575031222e78809a36292 ] If unshare encounters a delalloc reservation in the srcmap, that means that the file range isn't shared because delalloc reservations cannot be reflinked. Therefore, don't try to unshare them. Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20241002150040.GB21853@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Christian Brauner Stable-dep-of: 50793801fc7f ("fsdax: dax_unshare_iter needs to copy entire blocks") Signed-off-by: Sasha Levin --- fs/iomap/buffered-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 527d3bcfc69a..b1af9001e6db 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1096,7 +1096,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; /* - * Don't bother with holes or unwritten extents. + * Don't bother with delalloc reservations, holes or unwritten extents. * * Note that we use srcmap directly instead of iomap_iter_srcmap as * unsharing requires providing a separate source map, and the presence @@ -1105,6 +1105,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) * fork for XFS. */ if (iter->srcmap.type == IOMAP_HOLE || + iter->srcmap.type == IOMAP_DELALLOC || iter->srcmap.type == IOMAP_UNWRITTEN) return length; From aa56ea3b7defffbcebd01f8c913fe03c12539dc1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Oct 2024 08:09:16 -0700 Subject: [PATCH 4/6] iomap: share iomap_unshare_iter predicate code with fsdax [ Upstream commit 6ef6a0e821d3dad6bf8a5d5508762dba9042c84b ] The predicate code that iomap_unshare_iter uses to decide if it's really needs to unshare a file range mapping should be shared with the fsdax version, because right now they're opencoded and inconsistent. Note that we simplify the predicate logic a bit -- we no longer allow unsharing of inline data mappings, but there aren't any filesystems that allow shared inline data currently. This is a fix in the sense that it should have been ported to fsdax. Fixes: b53fdb215d13 ("iomap: improve shared block detection in iomap_unshare_iter") Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/172796813294.1131942.15762084021076932620.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner Stable-dep-of: 50793801fc7f ("fsdax: dax_unshare_iter needs to copy entire blocks") Signed-off-by: Sasha Levin --- fs/dax.c | 3 +-- fs/iomap/buffered-io.c | 30 ++++++++++++++++-------------- include/linux/iomap.h | 1 + 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 72a437892b4a..74f9a14565f5 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1231,8 +1231,7 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) s64 ret = 0; void *daddr = NULL, *saddr = NULL; - /* don't bother with blocks that are not shared to start with */ - if (!(iomap->flags & IOMAP_F_SHARED)) + if (!iomap_want_unshare_iter(iter)) return length; id = dax_read_lock(); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index b1af9001e6db..876273db711d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1084,19 +1084,12 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +bool iomap_want_unshare_iter(const struct iomap_iter *iter) { - struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; - - /* Don't bother with blocks that are not shared to start with. */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; - /* - * Don't bother with delalloc reservations, holes or unwritten extents. + * Don't bother with blocks that are not shared to start with; or + * mappings that cannot be shared, such as inline data, delalloc + * reservations, holes or unwritten extents. * * Note that we use srcmap directly instead of iomap_iter_srcmap as * unsharing requires providing a separate source map, and the presence @@ -1104,9 +1097,18 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) * IOMAP_F_SHARED which can be set for any data that goes into the COW * fork for XFS. */ - if (iter->srcmap.type == IOMAP_HOLE || - iter->srcmap.type == IOMAP_DELALLOC || - iter->srcmap.type == IOMAP_UNWRITTEN) + return (iter->iomap.flags & IOMAP_F_SHARED) && + iter->srcmap.type == IOMAP_MAPPED; +} + +static loff_t iomap_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + loff_t written = 0; + + if (!iomap_want_unshare_iter(iter)) return length; do { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0983dfc9a203..0a37b54e2492 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -264,6 +264,7 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); +bool iomap_want_unshare_iter(const struct iomap_iter *iter); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, From a01987cd90fa20b970fc729850c1366420eac0b4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Oct 2024 08:09:32 -0700 Subject: [PATCH 5/6] fsdax: remove zeroing code from dax_unshare_iter [ Upstream commit 95472274b6fed8f2d30fbdda304e12174b3d4099 ] Remove the code in dax_unshare_iter that zeroes the destination memory because it's not necessary. If srcmap is unwritten, we don't have to do anything because that unwritten extent came from the regular file mapping, and unwritten extents cannot be shared. The same applies to holes. Furthermore, zeroing to unshare a mapping is just plain wrong because unsharing means copy on write, and we should be copying data. This is effectively a revert of commit 13dd4e04625f ("fsdax: unshare: zero destination if srcmap is HOLE or UNWRITTEN") Cc: ruansy.fnst@fujitsu.com Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/172796813311.1131942.16033376284752798632.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner Stable-dep-of: 50793801fc7f ("fsdax: dax_unshare_iter needs to copy entire blocks") Signed-off-by: Sasha Levin --- fs/dax.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 74f9a14565f5..fa5a82b27c2f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1239,14 +1239,6 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - /* zero the distance if srcmap is HOLE or UNWRITTEN */ - if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) { - memset(daddr, 0, length); - dax_flush(iomap->dax_dev, daddr, length); - ret = length; - goto out_unlock; - } - ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); if (ret < 0) goto out_unlock; From bdbc96c23197d773a7d1bf03e4f11de593b0ff28 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Oct 2024 08:09:48 -0700 Subject: [PATCH 6/6] fsdax: dax_unshare_iter needs to copy entire blocks [ Upstream commit 50793801fc7f6d08def48754fb0f0706b0cfc394 ] The code that copies data from srcmap to iomap in dax_unshare_iter is very very broken, which bfoster's recent fsx changes have exposed. If the pos and len passed to dax_file_unshare are not aligned to an fsblock boundary, the iter pos and length in the _iter function will reflect this unalignment. dax_iomap_direct_access always returns a pointer to the start of the kmapped fsdax page, even if its pos argument is in the middle of that page. This is catastrophic for data integrity when iter->pos is not aligned to a page, because daddr/saddr do not point to the same byte in the file as iter->pos. Hence we corrupt user data by copying it to the wrong place. If iter->pos + iomap_length() in the _iter function not aligned to a page, then we fail to copy a full block, and only partially populate the destination block. This is catastrophic for data confidentiality because we expose stale pmem contents. Fix both of these issues by aligning copy_pos/copy_len to a page boundary (remember, this is fsdax so 1 fsblock == 1 base page) so that we always copy full blocks. We're not done yet -- there's no call to invalidate_inode_pages2_range, so programs that have the file range mmap'd will continue accessing the old memory mapping after the file metadata updates have completed. Be careful with the return value -- if the unshare succeeds, we still need to return the number of bytes that the iomap iter thinks we're operating on. Cc: ruansy.fnst@fujitsu.com Fixes: d984648e428b ("fsdax,xfs: port unshare to fsdax") Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/172796813328.1131942.16777025316348797355.stgit@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/dax.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index fa5a82b27c2f..ca7138bb1d54 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1225,26 +1225,46 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); + loff_t copy_pos = iter->pos; + u64 copy_len = iomap_length(iter); + u32 mod; int id = 0; s64 ret = 0; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) - return length; + return iomap_length(iter); + + /* + * Extend the file range to be aligned to fsblock/pagesize, because + * we need to copy entire blocks, not just the byte range specified. + * Invalidate the mapping because we're about to CoW. + */ + mod = offset_in_page(copy_pos); + if (mod) { + copy_len += mod; + copy_pos -= mod; + } + + mod = offset_in_page(copy_pos + copy_len); + if (mod) + copy_len += PAGE_SIZE - mod; + + invalidate_inode_pages2_range(iter->inode->i_mapping, + copy_pos >> PAGE_SHIFT, + (copy_pos + copy_len - 1) >> PAGE_SHIFT); id = dax_read_lock(); - ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL); if (ret < 0) goto out_unlock; - ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL); if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, length) == 0) - ret = length; + if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) + ret = iomap_length(iter); else ret = -EIO;