From 312e98342f842933c3ea7fe38903677d537dfad6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2022 14:25:21 +0200 Subject: [PATCH 1/4] fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY [ Upstream commit 4f8d37020e1fd0bf6ee9381ba918135ef3712efd ] Add a flag to entry expiration that lets the filesystem expire a dentry without kicking it out from the cache immediately. This makes a difference for overmounted dentries, where plain invalidation would detach all submounts before dropping the dentry from the cache. If only expiry is set on the dentry, then any overmounts are left alone and until ->d_revalidate() is called. Note: ->d_revalidate() is not called for the case of following a submount, so invalidation will only be triggered for the non-overmounted case. The dentry could also be mounted in a different mount instance, in which case any submounts will still be detached. Suggested-by: Jakob Blomer Signed-off-by: Miklos Szeredi Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open") Signed-off-by: Sasha Levin --- fs/fuse/dev.c | 4 ++-- fs/fuse/dir.c | 6 ++++-- fs/fuse/fuse_i.h | 2 +- include/uapi/linux/fuse.h | 13 +++++++++++-- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 96a717f73ce3..61bef919c042 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); kfree(buf); return err; @@ -1546,7 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); kfree(buf); return err; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 936a24b646ce..8474003aa54d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1174,7 +1174,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name) + u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; @@ -1201,7 +1201,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, goto unlock; fuse_dir_changed(parent); - fuse_invalidate_entry(entry); + if (!(flags & FUSE_EXPIRE_ONLY)) + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 66c2a9999468..cb464e5b171a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1235,7 +1235,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, * then the dentry is unhashed (d_delete()). */ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name); + u64 child_nodeid, struct qstr *name, u32 flags); int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 76ee8f9e024a..39cfb343faa8 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -197,6 +197,9 @@ * * 7.37 * - add FUSE_TMPFILE + * + * 7.38 + * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry */ #ifndef _LINUX_FUSE_H @@ -232,7 +235,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 37 +#define FUSE_KERNEL_MINOR_VERSION 38 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -491,6 +494,12 @@ struct fuse_file_lock { */ #define FUSE_SETXATTR_ACL_KILL_SGID (1 << 0) +/** + * notify_inval_entry flags + * FUSE_EXPIRE_ONLY + */ +#define FUSE_EXPIRE_ONLY (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ @@ -919,7 +928,7 @@ struct fuse_notify_inval_inode_out { struct fuse_notify_inval_entry_out { uint64_t parent; uint32_t namelen; - uint32_t padding; + uint32_t flags; }; struct fuse_notify_delete_out { From aa97ab65932367f67188f5f954ef6fd8bc30b75d Mon Sep 17 00:00:00 2001 From: Dharmendra Singh Date: Fri, 17 Jun 2022 12:40:27 +0530 Subject: [PATCH 2/4] fuse: allow non-extending parallel direct writes on the same file [ Upstream commit 153524053bbb0d27bb2e0be36d1b46862e9ce74c ] In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open") Signed-off-by: Sasha Levin --- fs/fuse/file.c | 43 ++++++++++++++++++++++++++++++++++++--- include/uapi/linux/fuse.h | 3 +++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e6ec4338a9c5..0df1311afb87 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; + bool exclusive_lock = + !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + iocb->ki_flags & IOCB_APPEND || + fuse_direct_write_extending_i_size(iocb, from); + + /* + * Take exclusive lock if + * - Parallel direct writes are disabled - a user space decision + * - Parallel direct writes are enabled and i_size is being extended. + * This might not be needed at all, but needs further investigation. + */ + if (exclusive_lock) + inode_lock(inode); + else { + inode_lock_shared(inode); + + /* A race with truncate might have come up as the decision for + * the lock type was done without holding the lock, check again. + */ + if (fuse_direct_write_extending_i_size(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + exclusive_lock = true; + } + } - /* Don't allow parallel writes to the same file */ - inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - inode_unlock(inode); + if (exclusive_lock) + inode_unlock(inode); + else + inode_unlock_shared(inode); return res; } @@ -2937,6 +2973,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { fuse_write_update_attr(inode, pos, ret); + /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 39cfb343faa8..e3c54109bae9 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -200,6 +200,7 @@ * * 7.38 * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry + * - add FOPEN_PARALLEL_DIRECT_WRITES */ #ifndef _LINUX_FUSE_H @@ -307,6 +308,7 @@ struct fuse_file_lock { * FOPEN_CACHE_DIR: allow caching this directory * FOPEN_STREAM: the file is stream-like (no file position at all) * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) + * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) @@ -314,6 +316,7 @@ struct fuse_file_lock { #define FOPEN_CACHE_DIR (1 << 3) #define FOPEN_STREAM (1 << 4) #define FOPEN_NOFLUSH (1 << 5) +#define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) /** * INIT request/reply flags From 4250dddafd4b2daaf2b8ac38c62237bf93299cc7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Nov 2022 15:46:33 +0100 Subject: [PATCH 3/4] fuse: add request extension [ Upstream commit 15d937d7ca8c55d2b0ce9116e20c780fdd0b67cc ] Will need to add supplementary groups to create messages, so add the general concept of a request extension. A request extension is appended to the end of the main request. It has a header indicating the size and type of the extension. The create security context (fuse_secctx_*) is similar to the generic request extension, so include that as well in a backward compatible manner. Add the total extension length to the request header. The offset of the extension block within the request can be calculated by: inh->len - inh->total_extlen * 8 Signed-off-by: Miklos Szeredi Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open") Signed-off-by: Sasha Levin --- fs/fuse/dev.c | 2 ++ fs/fuse/dir.c | 66 ++++++++++++++++++++++----------------- fs/fuse/fuse_i.h | 6 ++-- include/uapi/linux/fuse.h | 28 ++++++++++++++++- 4 files changed, 71 insertions(+), 31 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 61bef919c042..7e0d4f08a0cf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -476,6 +476,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; req->args = args; + if (args->is_ext) + req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8; if (args->end) __set_bit(FR_ASYNC, &req->flags); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8474003aa54d..3b7887312ac0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -470,7 +470,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, } static int get_security_context(struct dentry *entry, umode_t mode, - void **security_ctx, u32 *security_ctxlen) + struct fuse_in_arg *ext) { struct fuse_secctx *fctx; struct fuse_secctx_header *header; @@ -517,14 +517,42 @@ static int get_security_context(struct dentry *entry, umode_t mode, memcpy(ptr, ctx, ctxlen); } - *security_ctxlen = total_len; - *security_ctx = header; + ext->size = total_len; + ext->value = header; err = 0; out_err: kfree(ctx); return err; } +static int get_create_ext(struct fuse_args *args, struct dentry *dentry, + umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); + struct fuse_in_arg ext = { .size = 0, .value = NULL }; + int err = 0; + + if (fc->init_security) + err = get_security_context(dentry, mode, &ext); + + if (!err && ext.size) { + WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); + args->is_ext = true; + args->ext_idx = args->in_numargs++; + args->in_args[args->ext_idx] = ext; + } else { + kfree(ext.value); + } + + return err; +} + +static void free_ext_value(struct fuse_args *args) +{ + if (args->is_ext) + kfree(args->in_args[args->ext_idx].value); +} + /* * Atomic create+open operation * @@ -545,8 +573,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; - void *security_ctx = NULL; - u32 security_ctxlen; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ @@ -590,19 +616,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; - if (fm->fc->init_security) { - err = get_security_context(entry, mode, &security_ctx, - &security_ctxlen); - if (err) - goto out_put_forget_req; - - args.in_numargs = 3; - args.in_args[2].size = security_ctxlen; - args.in_args[2].value = security_ctx; - } + err = get_create_ext(&args, entry, mode); + if (err) + goto out_put_forget_req; err = fuse_simple_request(fm, &args); - kfree(security_ctx); + free_ext_value(&args); if (err) goto out_free_ff; @@ -709,8 +728,6 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct dentry *d; int err; struct fuse_forget_link *forget; - void *security_ctx = NULL; - u32 security_ctxlen; if (fuse_is_bad(dir)) return -EIO; @@ -725,21 +742,14 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; - if (fm->fc->init_security && args->opcode != FUSE_LINK) { - err = get_security_context(entry, mode, &security_ctx, - &security_ctxlen); + if (args->opcode != FUSE_LINK) { + err = get_create_ext(args, entry, mode); if (err) goto out_put_forget_req; - - BUG_ON(args->in_numargs != 2); - - args->in_numargs = 3; - args->in_args[2].size = security_ctxlen; - args->in_args[2].value = security_ctx; } err = fuse_simple_request(fm, args); - kfree(security_ctx); + free_ext_value(args); if (err) goto out_put_forget_req; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cb464e5b171a..6c3ec70c1b70 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -264,8 +264,9 @@ struct fuse_page_desc { struct fuse_args { uint64_t nodeid; uint32_t opcode; - unsigned short in_numargs; - unsigned short out_numargs; + uint8_t in_numargs; + uint8_t out_numargs; + uint8_t ext_idx; bool force:1; bool noreply:1; bool nocreds:1; @@ -276,6 +277,7 @@ struct fuse_args { bool page_zeroing:1; bool page_replace:1; bool may_block:1; + bool is_ext:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index e3c54109bae9..c71f12429e3d 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -201,6 +201,9 @@ * 7.38 * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry * - add FOPEN_PARALLEL_DIRECT_WRITES + * - add total_extlen to fuse_in_header + * - add FUSE_MAX_NR_SECCTX + * - add extension header */ #ifndef _LINUX_FUSE_H @@ -503,6 +506,15 @@ struct fuse_file_lock { */ #define FUSE_EXPIRE_ONLY (1 << 0) +/** + * extension type + * FUSE_MAX_NR_SECCTX: maximum value of &fuse_secctx_header.nr_secctx + */ +enum fuse_ext_type { + /* Types 0..31 are reserved for fuse_secctx_header */ + FUSE_MAX_NR_SECCTX = 31, +}; + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ @@ -886,7 +898,8 @@ struct fuse_in_header { uint32_t uid; uint32_t gid; uint32_t pid; - uint32_t padding; + uint16_t total_extlen; /* length of extensions in 8byte units */ + uint16_t padding; }; struct fuse_out_header { @@ -1047,4 +1060,17 @@ struct fuse_secctx_header { uint32_t nr_secctx; }; +/** + * struct fuse_ext_header - extension header + * @size: total size of this extension including this header + * @type: type of extension + * + * This is made compatible with fuse_secctx_header by using type values > + * FUSE_MAX_NR_SECCTX + */ +struct fuse_ext_header { + uint32_t size; + uint32_t type; +}; + #endif /* _LINUX_FUSE_H */ From 5e20208dfe4e21b9e02a0a2037da2e29e19796bc Mon Sep 17 00:00:00 2001 From: yangyun Date: Fri, 23 Aug 2024 16:51:46 +0800 Subject: [PATCH 4/4] fuse: fix memory leak in fuse_create_open [ Upstream commit 3002240d16494d798add0575e8ba1f284258ab34 ] The memory of struct fuse_file is allocated but not freed when get_create_ext return error. Fixes: 3e2b6fdbdc9a ("fuse: send security context of inode on file") Cc: stable@vger.kernel.org # v5.17 Signed-off-by: yangyun Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3b7887312ac0..aa2be4c1ea8f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -618,7 +618,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, err = get_create_ext(&args, entry, mode); if (err) - goto out_put_forget_req; + goto out_free_ff; err = fuse_simple_request(fm, &args); free_ext_value(&args);