From 613d8368e39f4cd67f2ad629ebb870d3e3b4bc6e Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Wed, 2 Aug 2023 12:23:44 -0700 Subject: [PATCH 01/78] ANDROID: fuse-bpf: Follow mounts in lookups Bug: 292925770 Test: fuse_test run. The following steps on Android also now pass: Create /data/123 and /data/media/0/Android/data/45 directories Mount /data/123 directory to /data/media/0/Android/data/45 directory Create 1.txt under the /data/123 directory File 1.txt should appear in /storage/emulated/0/Android/data/45 Change-Id: I1fe27d743ca2981e624a9aa87d9ab6deb313aadc Signed-off-by: Paul Lawrence --- fs/fuse/backing.c | 15 ++++--- .../selftests/filesystems/fuse/bpf_loader.c | 28 +++++++++++- .../selftests/filesystems/fuse/fuse_test.c | 45 +++++++++++++++++++ .../selftests/filesystems/fuse/test_fuse.h | 3 ++ 4 files changed, 85 insertions(+), 6 deletions(-) diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c index e16457c75944..6ca74987f7da 100644 --- a/fs/fuse/backing.c +++ b/fs/fuse/backing.c @@ -1117,7 +1117,6 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, struct kstat stat; int err; - /* TODO this will not handle lookups over mount points */ inode_lock_nested(dir_backing_inode, I_MUTEX_PARENT); backing_entry = lookup_one_len(entry->d_name.name, dir_backing_entry, strlen(entry->d_name.name)); @@ -1136,16 +1135,22 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, return 0; } + err = follow_down(&fuse_entry->backing_path); + if (err) + goto err_out; + err = vfs_getattr(&fuse_entry->backing_path, &stat, STATX_BASIC_STATS, 0); - if (err) { - path_put_init(&fuse_entry->backing_path); - return err; - } + if (err) + goto err_out; fuse_stat_to_attr(get_fuse_conn(dir), backing_entry->d_inode, &stat, &feo->attr); return 0; + +err_out: + path_put_init(&fuse_entry->backing_path); + return err; } int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode, diff --git a/tools/testing/selftests/filesystems/fuse/bpf_loader.c b/tools/testing/selftests/filesystems/fuse/bpf_loader.c index 5bf26eadd421..94f884c64d29 100644 --- a/tools/testing/selftests/filesystems/fuse/bpf_loader.c +++ b/tools/testing/selftests/filesystems/fuse/bpf_loader.c @@ -394,6 +394,29 @@ int s_rename(struct s oldpathname, struct s newpathname) return res; } +int s_mount(struct s source, struct s target, struct s filesystem, + unsigned long mountflags, struct s data) +{ + int res; + + res = mount(source.s, target.s, filesystem.s, mountflags, data.s); + free(source.s); + free(target.s); + free(filesystem.s); + free(data.s); + + return res; +} + +int s_umount(struct s target) +{ + int res; + + res = umount(target.s); + free(target.s); + return res; +} + int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out) { @@ -574,7 +597,10 @@ static int mount_fuse_maybe_init(const char *mount_dir, int bpf_fd, int dir_fd, })); } - *fuse_dev_ptr = fuse_dev; + if (fuse_dev_ptr) + *fuse_dev_ptr = fuse_dev; + else + TESTSYSCALL(close(fuse_dev)); fuse_dev = -1; result = TEST_SUCCESS; out: diff --git a/tools/testing/selftests/filesystems/fuse/fuse_test.c b/tools/testing/selftests/filesystems/fuse/fuse_test.c index 528595a8e82f..ad24ed48853e 100644 --- a/tools/testing/selftests/filesystems/fuse/fuse_test.c +++ b/tools/testing/selftests/filesystems/fuse/fuse_test.c @@ -2114,6 +2114,50 @@ out: return result; } +/** + * Test that fuse passthrough correctly traverses a mount point on the lower fs + */ +static int bpf_test_follow_mounts(const char *mount_dir) +{ + const char *bind_src = "bind_src"; + const char *bind_dst = "bind_dst"; + const char *file = "file"; + int fd = -1; + int src_fd = -1; + int result = TEST_FAILURE; + + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_src)), 0777)); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_dst)), 0777)); + TEST(fd = s_creat(s_pathn(3, s(ft_src), s(bind_src), s(file)), 0777), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(s_mount(s_path(s(ft_src), s(bind_src)), + s_path(s(ft_src), s(bind_dst)), + s(NULL), MS_BIND, s(NULL))); + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, src_fd, NULL), 0); + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_src), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_dst), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + + result = TEST_SUCCESS; +out: + umount(mount_dir); + close(src_fd); + s_umount(s_path(s(ft_src), s(bind_dst))); + close(fd); + return result; +} + static void parse_range(const char *ranges, bool *run_test, size_t tests) { size_t i; @@ -2244,6 +2288,7 @@ int main(int argc, char *argv[]) MAKE_TEST(bpf_test_create_and_remove_bpf), MAKE_TEST(bpf_test_mkdir_and_remove_bpf), MAKE_TEST(bpf_test_readahead), + MAKE_TEST(bpf_test_follow_mounts), }; #undef MAKE_TEST diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse.h b/tools/testing/selftests/filesystems/fuse/test_fuse.h index 69dadc9c7e45..e62e2ee07713 100644 --- a/tools/testing/selftests/filesystems/fuse/test_fuse.h +++ b/tools/testing/selftests/filesystems/fuse/test_fuse.h @@ -64,6 +64,9 @@ int s_setxattr(struct s pathname, const char name[], const void *value, size_t size, int flags); int s_removexattr(struct s pathname, const char name[]); int s_rename(struct s oldpathname, struct s newpathname); +int s_mount(struct s source, struct s target, struct s filesystem, + unsigned long mountflags, struct s data); +int s_umount(struct s target); struct s tracing_folder(void); int tracing_on(void); From 44702d8fa1b046b52595a03ac7fedc7abc414c42 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 14 Dec 2023 04:58:41 +0000 Subject: [PATCH 02/78] FROMLIST: mm: migrate high-order folios in swap cache correctly Large folios occupy N consecutive entries in the swap cache instead of using multi-index entries like the page cache. However, if a large folio is re-added to the LRU list, it can be migrated. The migration code was not aware of the difference between the swap cache and the page cache and assumed that a single xas_store() would be sufficient. This leaves potentially many stale pointers to the now-migrated folio in the swap cache, which can lead to almost arbitrary data corruption in the future. This can also manifest as infinite loops with the RCU read lock held. Bug: 315281107 Change-Id: I455f964a9f21c13089890073777388236b6669d7 [willy@infradead.org: modifications to the changelog & tweaked the fix] Fixes: 3417013e0d18 ("mm/migrate: Add folio_migrate_mapping()") Link: https://lkml.kernel.org/r/20231214045841.961776-1-willy@infradead.org Link: https://lore.kernel.org/linux-mm/20231214045841.961776-1-willy@infradead.org/ Signed-off-by: Charan Teja Kalla Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Charan Teja Kalla Closes: https://lkml.kernel.org/r/1700569840-17327-1-git-send-email-quic_charante@quicinc.com Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Naoya Horiguchi Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton Signed-off-by: Charan Teja Kalla --- mm/migrate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index ef490976c98e..9f5f52deed0c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -393,6 +393,7 @@ int folio_migrate_mapping(struct address_space *mapping, int dirty; int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); + long entries, i; if (!mapping) { /* Anonymous page without mapping */ @@ -430,8 +431,10 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); } + entries = nr; } else { VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + entries = 1; } /* Move dirty while page refs frozen and newpage not yet exposed */ @@ -441,7 +444,11 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - xas_store(&xas, newfolio); + /* Swap cache still stores N entries instead of a high-order entry */ + for (i = 0; i < entries; i++) { + xas_store(&xas, newfolio); + xas_next(&xas); + } /* * Drop cache reference from old page by unfreezing From 30bca9e2785b3c7cce113308b16b40132293ca34 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Dec 2023 15:47:13 +0100 Subject: [PATCH 03/78] UPSTREAM: netfilter: nft_set_pipapo: skip inactive elements during set walk commit 317eb9685095678f2c9f5a8189de698c5354316a upstream. Otherwise set elements can be deactivated twice which will cause a crash. Bug: 316310313 Reported-by: Xingyuan Mo Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 189c2a82933c67ad360c421258d5449f6647544a) Signed-off-by: Lee Jones Change-Id: I27fb6ee806642e23ca02700763a387341dd463e6 --- net/netfilter/nft_set_pipapo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index deea6196d992..4e1cc31729b8 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2042,6 +2042,9 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; + if (!nft_set_elem_active(&e->ext, iter->genmask)) + goto cont; + elem.priv = e; iter->err = iter->fn(ctx, set, iter, &elem); From 401a2769d99066952f3bfb73a8ce6b0269bc04d7 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Tue, 21 Nov 2023 20:51:50 -0700 Subject: [PATCH 04/78] UPSTREAM: dm verity: don't perform FEC for failed readahead IO We found an issue under Android OTA scenario that many BIOs have to do FEC where the data under dm-verity is 100% complete and no corruption. Android OTA has many dm-block layers, from upper to lower: dm-verity dm-snapshot dm-origin & dm-cow dm-linear ufs DM tables have to change 2 times during Android OTA merging process. When doing table change, the dm-snapshot will be suspended for a while. During this interval, many readahead IOs are submitted to dm_verity from filesystem. Then the kverity works are busy doing FEC process which cost too much time to finish dm-verity IO. This causes needless delay which feels like system is hung. After adding debugging it was found that each readahead IO needed around 10s to finish when this situation occurred. This is due to IO amplification: dm-snapshot suspend erofs_readahead // 300+ io is submitted dm_submit_bio (dm_verity) dm_submit_bio (dm_snapshot) bio return EIO bio got nothing, it's empty verity_end_io verity_verify_io forloop range(0, io->n_blocks) // each io->nblocks ~= 20 verity_fec_decode fec_decode_rsb fec_read_bufs forloop range(0, v->fec->rsn) // v->fec->rsn = 253 new_read submit_bio (dm_snapshot) end loop end loop dm-snapshot resume Readahead BIOs get nothing while dm-snapshot is suspended, so all of them will cause verity's FEC. Each readahead BIO needs to verify ~20 (io->nblocks) blocks. Each block needs to do FEC, and every block needs to do 253 (v->fec->rsn) reads. So during the suspend interval(~200ms), 300 readahead BIOs trigger ~1518000 (300*20*253) IOs to dm-snapshot. As readahead IO is not required by userspace, and to fix this issue, it is best to pass readahead errors to upper layer to handle it. Cc: stable@vger.kernel.org Fixes: a739ff3f543a ("dm verity: add support for forward error correction") Bug: 316972624 Link: https://lore.kernel.org/dm-devel/b84fb49-bf63-3442-8c99-d565e134f2@redhat.com Signed-off-by: Wu Bo Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Akilesh Kailash (cherry picked from commit 0193e3966ceeeef69e235975918b287ab093082b) Change-Id: I73560e5660cebdc1997e1f9926cbb8888789eb46 --- drivers/md/dm-verity-target.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index fd68500ae2ad..1112a0a694e0 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -656,7 +656,9 @@ static void verity_end_io(struct bio *bio) struct dm_verity_io *io = bio->bi_private; if (bio->bi_status && - (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { + (!verity_fec_is_enabled(io->v) || + verity_is_system_shutting_down() || + (bio->bi_opf & REQ_RAHEAD))) { verity_finish_io(io, bio->bi_status); return; } From bc97d5019afdfa2a26cd39d6556ace0e55269919 Mon Sep 17 00:00:00 2001 From: xieliujie Date: Mon, 25 Dec 2023 14:28:51 +0800 Subject: [PATCH 05/78] ANDROID: vendor_hooks: Add hooks for rt_mutex steal Add hooks at rt_mutex_steal function so that oems can decide whether tasks with the same priority steal the rt_mutex or not. We did experiments and found that rt_mutex throughput can benefit a lot when threads with the same priority can steal the rt_mutex lock. Bug: 317670024 Change-Id: Id60a7a41c6c77a67808982d3667946cabe4acc8f Signed-off-by: xeiliujie --- drivers/android/vendor_hooks.c | 1 + include/trace/hooks/dtask.h | 3 +++ kernel/locking/rtmutex.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 43eb748c1103..ee68032b1918 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -95,6 +95,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_task_blocks_on_rtmutex); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_waiter_prio); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rt_mutex_steal); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner); diff --git a/include/trace/hooks/dtask.h b/include/trace/hooks/dtask.h index a63a2868e626..b51147089b2d 100644 --- a/include/trace/hooks/dtask.h +++ b/include/trace/hooks/dtask.h @@ -42,6 +42,9 @@ DECLARE_HOOK(android_vh_rtmutex_wait_start, DECLARE_HOOK(android_vh_rtmutex_wait_finish, TP_PROTO(struct rt_mutex_base *lock), TP_ARGS(lock)); +DECLARE_HOOK(android_vh_rt_mutex_steal, + TP_PROTO(int waiter_prio, int top_waiter_prio, bool *ret), + TP_ARGS(waiter_prio, top_waiter_prio, ret)); DECLARE_HOOK(android_vh_rwsem_read_wait_start, TP_PROTO(struct rw_semaphore *sem), diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 351716fe9138..8207fdade4a8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -391,9 +391,15 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *top_waiter) { + bool ret = false; + if (rt_mutex_waiter_less(waiter, top_waiter)) return true; + trace_android_vh_rt_mutex_steal(waiter->prio, top_waiter->prio, &ret); + if (ret) + return true; + #ifdef RT_MUTEX_BUILD_SPINLOCKS /* * Note that RT tasks are excluded from same priority (lateral) From d3006fb9449d46b3c7733d5baa30c7a0e944cc5e Mon Sep 17 00:00:00 2001 From: xieliujie Date: Mon, 25 Dec 2023 15:06:38 +0800 Subject: [PATCH 06/78] ANDROID: ABI: Update oplus symbol list 1 function symbol(s) added 'int __traceiter_android_vh_rt_mutex_steal(void*, int, int, bool*)' 1 variable symbol(s) added 'struct tracepoint __tracepoint_android_vh_rt_mutex_steal' Bug: 317670024 Change-Id: I28f0379adaec041400e49cbd1e497b2f8c5c893d Signed-off-by: xeiliujie --- android/abi_gki_aarch64.stg | 28 ++++++++++++++++++++++++++++ android/abi_gki_aarch64_oplus | 2 ++ 2 files changed, 30 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index e570ab134873..f2b7c07a7716 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -315510,6 +315510,14 @@ function { parameter_id: 0x6720d32f parameter_id: 0x3c2755a3 } +function { + id: 0x9a2ab624 + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x6720d32f + parameter_id: 0x6720d32f + parameter_id: 0x11cfee5a +} function { id: 0x9a2abc7b return_type_id: 0x6720d32f @@ -337636,6 +337644,15 @@ elf_symbol { type_id: 0x9b08a261 full_name: "__traceiter_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xd56fbf76 + name: "__traceiter_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: FUNCTION + crc: 0xf0a6d2df + type_id: 0x9a2ab624 + full_name: "__traceiter_android_vh_rt_mutex_steal" +} elf_symbol { id: 0x3ef508a2 name: "__traceiter_android_vh_rtmutex_wait_finish" @@ -341632,6 +341649,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xed43b088 + name: "__tracepoint_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: OBJECT + crc: 0xdc6b8d43 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_rt_mutex_steal" +} elf_symbol { id: 0xa3915d70 name: "__tracepoint_android_vh_rtmutex_wait_finish" @@ -399133,6 +399159,7 @@ interface { symbol_id: 0x8d62858f symbol_id: 0xcef5d79f symbol_id: 0x91384eff + symbol_id: 0xd56fbf76 symbol_id: 0x3ef508a2 symbol_id: 0xfb1b8d64 symbol_id: 0xc56d7179 @@ -399577,6 +399604,7 @@ interface { symbol_id: 0x04365139 symbol_id: 0xd94bc301 symbol_id: 0x3fc5ffc9 + symbol_id: 0xed43b088 symbol_id: 0xa3915d70 symbol_id: 0xf01f02ea symbol_id: 0xeaebbadf diff --git a/android/abi_gki_aarch64_oplus b/android/abi_gki_aarch64_oplus index a374dcf65636..8c9846b3a27d 100644 --- a/android/abi_gki_aarch64_oplus +++ b/android/abi_gki_aarch64_oplus @@ -158,6 +158,7 @@ __traceiter_android_vh_dm_bufio_shrink_scan_bypass __traceiter_android_vh_mutex_unlock_slowpath __traceiter_android_vh_rtmutex_waiter_prio + __traceiter_android_vh_rt_mutex_steal __traceiter_android_vh_rwsem_can_spin_on_owner __traceiter_android_vh_rwsem_opt_spin_finish __traceiter_android_vh_rwsem_opt_spin_start @@ -258,6 +259,7 @@ __tracepoint_android_vh_record_rtmutex_lock_starttime __tracepoint_android_vh_record_rwsem_lock_starttime __tracepoint_android_vh_rtmutex_waiter_prio + __tracepoint_android_vh_rt_mutex_steal __tracepoint_android_vh_rwsem_can_spin_on_owner __tracepoint_android_vh_rwsem_opt_spin_finish __tracepoint_android_vh_rwsem_opt_spin_start From d16a15fde575b740e248e5bbd6ffd11a90c40baa Mon Sep 17 00:00:00 2001 From: Roy Luo Date: Tue, 28 Nov 2023 22:17:56 +0000 Subject: [PATCH 07/78] UPSTREAM: USB: gadget: core: adjust uevent timing on gadget unbind The KOBJ_CHANGE uevent is sent before gadget unbind is actually executed, resulting in inaccurate uevent emitted at incorrect timing (the uevent would have USB_UDC_DRIVER variable set while it would soon be removed). Move the KOBJ_CHANGE uevent to the end of the unbind function so that uevent is sent only after the change has been made. Fixes: 2ccea03a8f7e ("usb: gadget: introduce UDC Class") Cc: stable@vger.kernel.org Signed-off-by: Roy Luo Link: https://lore.kernel.org/r/20231128221756.2591158-1-royluo@google.com Signed-off-by: Greg Kroah-Hartman Bug: 312543856 Change-Id: Ida7fa7e1cfae3d1b3f3348512a67fe91065f25af (cherry picked from commit 73ea73affe8622bdf292de898da869d441da6a9d) Signed-off-by: Roy Luo --- drivers/usb/gadget/udc/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d7e992ee7743..1f56b770465e 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1619,8 +1619,6 @@ static void gadget_unbind_driver(struct device *dev) dev_dbg(&udc->dev, "unbinding gadget driver [%s]\n", driver->function); - kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); - udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); mutex_lock(&udc->connect_lock); @@ -1640,6 +1638,8 @@ static void gadget_unbind_driver(struct device *dev) driver->is_bound = false; udc->driver = NULL; mutex_unlock(&udc_lock); + + kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); } /* ------------------------------------------------------------------------- */ From 2c085909e7a7d2bb39881bc33ab6a45713c7f379 Mon Sep 17 00:00:00 2001 From: leonardian Date: Wed, 13 Dec 2023 07:00:25 +0000 Subject: [PATCH 08/78] ANDROID: Update the ABI symbol list Adding the following symbols: - _dev_alert Bug: 311337219 Change-Id: Iaf6710842c45921ccfbacd1361e0b57401cf65d9 Signed-off-by: leonardian --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index b7d8f1baa215..af22721e20b8 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -340,6 +340,7 @@ desc_to_gpio destroy_workqueue dev_addr_mod + _dev_alert dev_alloc_name __dev_change_net_namespace dev_close From 88a19395047b78359c6c352491c2028dc730878c Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Wed, 30 Nov 2022 14:04:55 +0800 Subject: [PATCH 09/78] UPSTREAM: erofs: enable large folios for iomap mode Enable large folios for iomap mode. Then the readahead routine will pass down large folios containing multiple pages. Let's enable this for non-compressed format for now, until the compression part supports large folios later. When large folios supported, the iomap routine will allocate iomap_page for each large folio and thus we need iomap_release_folio() and iomap_invalidate_folio() to free iomap_page when these folios get reclaimed or invalidated. Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221130060455.44532-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 Change-Id: Iedbb9a2daf132399b7a1b5ea6905977ba123ba3c (cherry picked from commit ce529cc25b184e93397b94a8a322128fc0095cbb) Signed-off-by: Sandeep Dhavale --- fs/erofs/data.c | 2 ++ fs/erofs/inode.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 83532525282e..bbfbaf25ee59 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -404,6 +404,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 8fc41fd1620c..187ca02bbc2d 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -299,6 +299,8 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + if (!erofs_is_fscache_mode(inode->i_sb)) + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; From 66595bb17c147d49721669fe822817dea377b9ce Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 26 Apr 2023 16:44:49 +0800 Subject: [PATCH 10/78] UPSTREAM: erofs: fold in z_erofs_decompress() No need this helper since it's just a simple wrapper for decompress method and only one caller. So, let's fold in directly instead. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230426084449.12781-1-zbestahu@gmail.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 597e2953ae9b4a391e883c1f1a4cda5878e2dbed) Change-Id: I849360f088016cf97542858e8a5a9cee671a2f61 Signed-off-by: Sandeep Dhavale --- fs/erofs/compress.h | 3 +-- fs/erofs/decompressor.c | 8 +------- fs/erofs/zdata.c | 4 +++- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..b1b846504027 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7021e2cf6146..2a29943fa5cc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -363,7 +363,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, return 0; } -static struct z_erofs_decompressor decompressors[] = { +const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" @@ -383,9 +383,3 @@ static struct z_erofs_decompressor decompressors[] = { }, #endif }; - -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) -{ - return decompressors[rq->alg].decompress(rq, pagepool); -} diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 451b9a6cba68..f63c2422c9f9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1256,6 +1256,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; unsigned int i, inputsize; int err2; struct page *page; @@ -1299,7 +1301,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, else inputsize = pclusterpages * PAGE_SIZE; - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, From 5e861fa97e24f31aeda455870177da40c9eccd76 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 24 May 2023 14:39:44 +0800 Subject: [PATCH 11/78] UPSTREAM: erofs: remove the member readahead from struct z_erofs_decompress_frontend The struct member is only used to add REQ_RAHEAD during I/O submission. So it is cleaner to pass it as a parameter than keep it in the struct. Also, rename function z_erofs_get_sync_decompress_policy() to z_erofs_is_sync_decompress() for better clarity and conciseness. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230524063944.1655-1-zbestahu@gmail.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit ef4b4b46c6aaf8edeea9a79320627fe10993f153) Change-Id: I59cc13e7499968a1e93e13df1cb43a5123d510d9 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f63c2422c9f9..50f803a48f96 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -534,7 +534,6 @@ struct z_erofs_decompress_frontend { z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -1076,7 +1075,7 @@ out: return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ @@ -1637,7 +1636,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); @@ -1723,7 +1722,7 @@ submit_bio_retry: bio->bi_iter.bi_sector = (sector_t)cur << (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; - if (f->readahead) + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } @@ -1759,13 +1758,13 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) + struct page **pagepool, bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1863,8 +1862,8 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + z_erofs_runqueue(&f, &pagepool, z_erofs_is_sync_decompress(sbi, 0), + false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1882,7 +1881,6 @@ static void z_erofs_readahead(struct readahead_control *rac) struct page *pagepool = NULL, *head = NULL, *page; unsigned int nr_pages; - f.readahead = true; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, f.headoffset + @@ -1913,7 +1911,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f); z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); } From bed20ed1d348355f99a40e7e61729b25e21356f9 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Thu, 25 May 2023 15:26:05 +0800 Subject: [PATCH 12/78] UPSTREAM: erofs: clean up z_erofs_pcluster_readmore() `end` parameter is no needed since it's pointless for !backmost, we can handle it with backmost internally. And we only expand the trailing edge, so the newstart can be replaced with ->headoffset. Also, remove linux/prefetch.h inclusion since that is not used anymore after commit 386292919c25 ("erofs: introduce readmore decompression strategy"). Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230525072605.17857-1-zbestahu@gmail.com [ Gao Xiang: update commit description. ] Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 796e9149a2fcdba5543e247abd8d911a399bb9a6) Change-Id: I9412c4111800077c876a43c4256ce9760a7d902e Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 50f803a48f96..b7bf435eea82 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,7 +5,6 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" -#include #include #include #include @@ -1785,28 +1784,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) + struct page **pagepool, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); @@ -1854,10 +1853,9 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); + z_erofs_pcluster_readmore(&f, NULL, &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); + z_erofs_pcluster_readmore(&f, NULL, &pagepool, false); (void)z_erofs_collector_end(&f); @@ -1883,8 +1881,7 @@ static void z_erofs_readahead(struct readahead_control *rac) f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, &pagepool, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1907,7 +1904,7 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, &pagepool, false); (void)z_erofs_collector_end(&f); z_erofs_runqueue(&f, &pagepool, From 5c1827383ac1df6e796ae44533a913d39425a78e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:54 +0800 Subject: [PATCH 13/78] UPSTREAM: erofs: allocate extra bvec pages directly instead of retrying If non-bootstrap bvecs cannot be kept in place (very rarely), an extra short-lived page is allocated. Let's just allocate it immediately rather than do unnecessary -EAGAIN return first and retry as a cleanup. Also it's unnecessary to use __GFP_NOFAIL here since we could gracefully fail out this case instead. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-2-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit 05b63d2beb8b0f752d1f5cdd051c8bdbf532cedd) Change-Id: I2ac45a943060406bcbb741c5f7aa1094f783f906 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b7bf435eea82..2e945803c457 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -240,12 +240,17 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, struct page **candidate_bvpage) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + if (!nextpage) { + nextpage = alloc_page(GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -872,10 +877,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } /* * if all pending pages are added, don't hold its reference @@ -1023,24 +1026,13 @@ hitted: if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); -retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + if (err) goto out; - } z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ From 3d9318266165302aae965416e8ebfed980088b8c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:55 +0800 Subject: [PATCH 14/78] UPSTREAM: erofs: avoid on-stack pagepool directly passed by arguments On-stack pagepool is used so that short-lived temporary pages could be shared within a single I/O request (e.g. among multiple pclusters). Moving the remaining frontend-related uses into z_erofs_decompress_frontend to avoid too many arguments. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-3-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit 6ab5eed6002edc5a29b683285e90459a7df6ce2b) Change-Id: I57d3ba6087904bb40c55b780aca50c16bfba2c0f Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 64 +++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2e945803c457..2abc2398708a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -238,13 +238,14 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { if (iter->cur >= iter->nr) { struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = alloc_page(GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_NOFS); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -533,6 +534,7 @@ struct z_erofs_decompress_frontend { struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; @@ -567,8 +569,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -609,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * succeeds or fallback to in-place I/O instead * to avoid any direct reclaim. */ - newpage = erofs_allocpage(pagepool, gfp); + newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); @@ -622,7 +623,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) put_page(page); else if (newpage) - erofs_pagepool_add(pagepool, newpage); + erofs_pagepool_add(&fe->pagepool, newpage); } /* @@ -720,7 +721,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } @@ -925,7 +927,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -985,7 +987,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); + z_erofs_bind_cache(fe); } hitted: /* @@ -1625,7 +1627,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { @@ -1683,8 +1684,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct page *page; - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); + page = pickup_page_for_submission(pcl, i++, + &f->pagepool, mc); if (!page) continue; @@ -1749,16 +1750,16 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg, bool ra) + bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg, ra); + z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; @@ -1767,7 +1768,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* @@ -1775,8 +1776,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * approximate readmore strategies as a start. */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - struct page **pagepool, bool backmost) + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; @@ -1818,7 +1818,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) { unlock_page(page); } else { - err = z_erofs_do_read_page(f, page, pagepool); + err = z_erofs_do_read_page(f, page); if (err) erofs_err(inode->i_sb, "readmore error at page %lu @ nid %llu", @@ -1839,27 +1839,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) struct inode *const inode = page->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; int err; trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, &pagepool, false); - + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, page); + z_erofs_pcluster_readmore(&f, NULL, false); (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, z_erofs_is_sync_decompress(sbi, 0), - false); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } @@ -1868,12 +1865,12 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; + struct page *head = NULL, *page; unsigned int nr_pages; f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1889,20 +1886,19 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_is_sync_decompress(sbi, nr_pages), true); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { From 187d034575bcaf95e145627406a5b26108fba447 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:57 +0800 Subject: [PATCH 15/78] BACKPORT: erofs: adapt managed inode operations into folios This patch gets rid of erofs_try_to_free_cached_page() and fold it into .release_folio(). It also moves managed inode operations into zdata.c, which simplifies the code a bit. No logic changes. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-5-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I5cb1e44769f68edce788cb4f8084bb3d45b594b3 (cherry picked from commit 7b4e372c36fcd33c74ba3cbd65fa534b9c558184) [dhavale: changes to internal.h applied manually] Signed-off-by: Sandeep Dhavale --- fs/erofs/internal.h | 3 ++- fs/erofs/super.c | 62 --------------------------------------------- fs/erofs/zdata.c | 59 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 71 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c03daf83a68..23151da13a23 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -544,7 +544,7 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); +int erofs_init_managed_cache(struct super_block *sb); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); @@ -565,6 +565,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b073b38c1c77..19af9bbcb8f1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -597,68 +597,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) -{ - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} - -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. - */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); - - if (!inode) - return -ENOMEM; - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif - static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2abc2398708a..8085e712314e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -668,29 +668,72 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + bool ret; + int i; + + if (!folio_test_private(folio)) + return true; if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + return false; - ret = 0; + ret = false; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { + if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; + ret = true; break; } } erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) - detach_page_private(page); + folio_detach_private(folio); return ret; } +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. + */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { From 365ca16da250c80d27a7f7b3499acfa2db9718a7 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:40 +0800 Subject: [PATCH 16/78] UPSTREAM: erofs: simplify z_erofs_transform_plain() Use memcpy_to_page() instead of open-coding them. In addition, add a missing flush_dcache_page() even though almost all modern architectures clear `PG_dcache_clean` flag for new file cache pages so that it doesn't change anything in practice. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-2-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit c5539762f32e97c5e16215fa1336e32095b8b0fd) Change-Id: I4cb665b592936502ca95e2aee20e1c3a56103ff5 Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2a29943fa5cc..b9dd685a16fc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, const unsigned int lefthalf = rq->outputsize - righthalf; const unsigned int interlaced_offset = rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - unsigned char *src, *dst; + u8 *src; if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); @@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, } src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); - } + if (rq->out[0]) + memcpy_to_page(rq->out[0], rq->pageofs_out, + src + interlaced_offset, righthalf); if (outpages > inpages) { DBG_BUGON(!rq->out[outpages - 1]); if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); + memcpy_to_page(rq->out[outpages - 1], 0, src + + (interlaced_offset ? 0 : righthalf), + lefthalf); } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); + flush_dcache_page(rq->in[inpages - 1]); } } kunmap_local(src); From 4067dd99694a3722116ed26bfcb361201ea97a9b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:39 +0800 Subject: [PATCH 17/78] UPSTREAM: erofs: get rid of the remaining kmap_atomic() It's unnecessary to use kmap_atomic() compared with kmap_local_page(). In addition, kmap_atomic() is deprecated now. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 123ec246ebe323d468c5ca996700ea4739d20ddf) Change-Id: I7efee861bb4f079fe6b79123d554be2e1867d13b Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index b9dd685a16fc..cfad1eac7fd9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, *maptype = 0; return inpage; } - kunmap_atomic(inpage); + kunmap_local(inpage); might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) @@ -162,7 +162,7 @@ docopy: src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } @@ -173,9 +173,9 @@ docopy: min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); + inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; @@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { @@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { - kunmap_atomic(headpage); + kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & @@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { @@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } @@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; From d0dbf747924547655f733242ade325bd2426a7e0 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:06 +0800 Subject: [PATCH 18/78] BACKPORT: erofs: simplify z_erofs_read_fragment() A trivial cleanup to make the fragment handling logic more clear. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I50c09c65b7d3da5022cfc2ede27aa31a1b331d29 (cherry picked from commit 8b00be163f7b57cbf957b3d27b5a7ca1e2495cfa) [dhavale: resolved conflict around erofs_bread() in zdata.c] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8085e712314e..f7d0d71359e8 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -936,22 +936,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, packed_inode, erofs_blknr(sb, pos), EROFS_KMAP); @@ -959,11 +956,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; @@ -976,7 +969,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -1049,17 +1042,11 @@ hitted: goto next_part; } if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; + erofs_off_t fpos = offset + cur - map->m_la; - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; ++spiltted; From 7751567a719dc3cd78125ac4b200596bab4c501a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:07 +0800 Subject: [PATCH 19/78] BACKPORT: erofs: avoid obsolete {collector,collection} terms {collector,collection} were once reserved in order to indicate different runtime logical extent instance of multi-reference pclusters. However, de-duplicated decompression has been landed in a more flexable way, thus `struct z_erofs_collection` was formally removed in commit 87ca34a7065d ("erofs: get rid of `struct z_erofs_collection'"). Let's handle the remaining leftovers, for example: `z_erofs_collector_begin` => `z_erofs_pcluster_begin` `z_erofs_collector_end` => `z_erofs_pcluster_end` as well as some comments. No logic changes. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-2-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I61b812b5ae3dd564e52012d082415b1fc198383d (cherry picked from commit dcba1b232e26ebadbd215728199455d38a59253e) [dhavale: fixed minor conflict zdata.c in z_erofs_do_read_page()] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f7d0d71359e8..ad921985883f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -512,19 +512,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O. ] + * [ (*) the page above can be used as inplace I/O. ] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -855,7 +853,7 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct erofs_workgroup *grp = NULL; @@ -912,12 +910,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); @@ -933,7 +931,7 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; + fe->backmost = false; } static int z_erofs_read_fragment(struct super_block *sb, struct page *page, @@ -984,8 +982,7 @@ repeat: offset + cur >= map->m_la + map->m_llen) { erofs_dbg("out-of-range map @ pos %llu", offset + cur); - if (z_erofs_collector_end(fe)) - fe->backmost = false; + z_erofs_pcluster_end(fe); map->m_la = offset + cur; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); @@ -1001,7 +998,7 @@ repeat: map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; - err = z_erofs_collector_begin(fe); + err = z_erofs_pcluster_begin(fe); if (err) goto out; @@ -1877,7 +1874,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_do_read_page(&f, page); z_erofs_pcluster_readmore(&f, NULL, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); @@ -1924,7 +1921,7 @@ static void z_erofs_readahead(struct readahead_control *rac) put_page(page); } z_erofs_pcluster_readmore(&f, rac, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); From dc94c3cc6b3fae027bc45b89ecb826bd4b6e9a2f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:08 +0800 Subject: [PATCH 20/78] UPSTREAM: erofs: move preparation logic into z_erofs_pcluster_begin() Some preparation logic should be part of z_erofs_pcluster_begin() instead of z_erofs_do_read_page(). Let's move now. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-3-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit aeebae9d77217709f8ae3edb0cd7858ec8c7a9d6) Change-Id: I4bf438d719742a18a6f3065a78bf027de5dae293 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 60 ++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ad921985883f..87e7d9f843bd 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -856,6 +856,8 @@ err_out: static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -865,8 +867,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -885,9 +886,26 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -988,39 +1006,15 @@ repeat: err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ + } else if (fe->pcl) { + goto hitted; } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); + if ((map->m_flags & EROFS_MAP_MAPPED) && + !(map->m_flags & EROFS_MAP_FRAGMENT)) { + err = z_erofs_pcluster_begin(fe); + if (err) goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe); } hitted: /* From 0d329bbe5cac1bea2bf637e00846eee6982cba53 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:09 +0800 Subject: [PATCH 21/78] BACKPORT: erofs: tidy up z_erofs_do_read_page() - Fix a typo: spiltted => split; - Move !EROFS_MAP_MAPPED and EROFS_MAP_FRAGMENT upwards; - Increase `split` in advance to avoid unnecessary repeats. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-4-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I465fd33c7cbbe91d5da4b4ee2343a7b319534148 (cherry picked from commit e4c1cf523d820730a86cae2c6d55924833b6f7ac) [dhavale: resolved small conflict in zdata.c in z_erofs_do_read_page()] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 60 +++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 87e7d9f843bd..18fb6556cfd2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -985,53 +985,35 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, len, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - erofs_dbg("out-of-range map @ pos %llu", offset + cur); - + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + end - 1); z_erofs_pcluster_end(fe); - map->m_la = offset + cur; + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else if (fe->pcl) { - goto hitted; } - if ((map->m_flags & EROFS_MAP_MAPPED) && - !(map->m_flags & EROFS_MAP_FRAGMENT)) { - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); - ++spiltted; tight = false; goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; @@ -1040,12 +1022,24 @@ hitted: EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; - ++spiltted; tight = false; goto next_part; } - exclusive = (!cur && (!spiltted || tight)); + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) + */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1058,8 +1052,6 @@ hitted: goto out; z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1084,8 +1076,8 @@ out: z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); - erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); + erofs_dbg("%s, finish page: %pK split: %u map->m_llen %llu", + __func__, page, split, map->m_llen); return err; } From bdc5d268ba5ebf809f44e12bb31cce23703bb659 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 29 Nov 2023 02:04:31 +0800 Subject: [PATCH 22/78] FROMGIT: erofs: fix memory leak on short-lived bounced pages Both MicroLZMA and DEFLATE algorithms can use short-lived pages on demand for the overlapped inplace I/O decompression. However, those short-lived pages are actually added to `be->compressed_pages`. Thus, it should be checked instead of `pcl->compressed_bvecs`. The LZ4 algorithm doesn't work like this, so it won't be impacted. Fixes: 67139e36d970 ("erofs: introduce `z_erofs_parse_in_bvecs'") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231128180431.4116991-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ia1f602e9944b884022a3e20db12af568304fd80c (cherry picked from commit 93d6fda7f926451a0fa1121b9558d75ca47e861e https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 18fb6556cfd2..51ae6de18cba 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1328,12 +1328,11 @@ out: put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_bvecs[i].page; + /* consider shortlived pages added when decompressing */ + page = be->compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) continue; - - /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } From 8a49ea94416049bab4c6074c6c533181eb419167 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 12:55:34 +0800 Subject: [PATCH 23/78] FROMGIT: erofs: fix lz4 inplace decompression Currently EROFS can map another compressed buffer for inplace decompression, that was used to handle the cases that some pages of compressed data are actually not in-place I/O. However, like most simple LZ77 algorithms, LZ4 expects the compressed data is arranged at the end of the decompressed buffer and it explicitly uses memmove() to handle overlapping: __________________________________________________________ |_ direction of decompression --> ____ |_ compressed data _| Although EROFS arranges compressed data like this, it typically maps two individual virtual buffers so the relative order is uncertain. Previously, it was hardly observed since LZ4 only uses memmove() for short overlapped literals and x86/arm64 memmove implementations seem to completely cover it up and they don't have this issue. Juhyung reported that EROFS data corruption can be found on a new Intel x86 processor. After some analysis, it seems that recent x86 processors with the new FSRM feature expose this issue with "rep movsb". Let's strictly use the decompressed buffer for lz4 inplace decompression for now. Later, as an useful improvement, we could try to tie up these two buffers together in the correct order. Reported-and-tested-by: Juhyung Park Closes: https://lore.kernel.org/r/CAD14+f2AVKf8Fa2OO1aAUdDNTDsVzzR6ctU_oJSmTyd6zSYR2Q@mail.gmail.com Fixes: 0ffd71bcc3a0 ("staging: erofs: introduce LZ4 decompression inplace") Fixes: 598162d05080 ("erofs: support decompress big pcluster for lz4 backend") Cc: stable # 5.4+ Tested-by: Yifan Zhao Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206045534.3920847-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ifd2981320f9f79b27bc7484d8906501a2fa05359 (cherry picked from commit 3c12466b6b7bf1e56f9b32c366a3d83d87afb4de https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cfad1eac7fd9..024e0b4733e8 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -122,11 +122,11 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, - void *inpage, unsigned int *inputmargin, int *maptype, - bool may_inplace) + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i, j; + unsigned int omargin, total, i; struct page **in; void *src, *tmp; @@ -136,12 +136,13 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) { - DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) - if (rq->out[j] == rq->in[i]) - goto docopy; - } + for (i = 0; i < ctx->inpages; ++i) + if (rq->out[ctx->outpages - ctx->inpages + i] != + rq->in[i]) + goto docopy; + kunmap_local(inpage); + *maptype = 3; + return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { @@ -149,7 +150,6 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, return inpage; } kunmap_local(inpage); - might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); @@ -205,12 +205,12 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *out) + u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; - u8 *headpage, *src; + u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); @@ -231,11 +231,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); + out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. */ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, @@ -266,7 +267,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); - } else { + } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } @@ -309,7 +310,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) From 9d259220acdf19d29e0970740d10d2ec973c5d78 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:53 +0800 Subject: [PATCH 24/78] FROMGIT: erofs: support I/O submission for sub-page compressed blocks Add a basic I/O submission path first to support sub-page blocks: - Temporary short-lived pages will be used entirely; - In-place I/O pages can be used partially, but compressed pages need to be able to be mapped in contiguous virtual memory. As a start, currently cache decompression is explicitly disabled for sub-page blocks, which will be supported in the future. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-2-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ib2cb6120805ab479a450580fc8774af131271791 (cherry picked from commit 192351616a9dde686492bcb9d1e4895a1411a527 https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 156 ++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 82 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 51ae6de18cba..bc8e6cdf3ae3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1452,86 +1452,85 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_decompress_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; struct address_space *mapping; - struct page *oldpage, *page; - int justfound; + struct page *page, *oldpage; + int justfound, bs = i_blocksize(f->inode); + /* Except for inplace pages, the entire page can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) + oldpage = READ_ONCE(zbv->page); + if (!oldpage) goto out_allocpage; - justfound = (unsigned long)page & 1UL; - page = (struct page *)((unsigned long)page & ~1UL); + justfound = (unsigned long)oldpage & 1UL; + page = (struct page *)((unsigned long)oldpage & ~1UL); + bvec->bv_page = page; + DBG_BUGON(z_erofs_is_shortlived_page(page)); /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. + * Handle preallocated cached pages. We tried to allocate such pages + * without triggering direct reclaim. If allocation failed, inplace + * file-backed pages will be used instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); + WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } + mapping = READ_ONCE(page->mapping); - /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed pages for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; + if (mapping && mapping != mc) { + if (zbv->offset < 0) + bvec->bv_offset = round_up(-zbv->offset, bs); + bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + return; + } lock_page(page); - /* only true if page reclaim goes wrong, should never happen */ DBG_BUGON(justfound && PagePrivate(page)); - /* the page is still in manage cache */ + /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - + WRITE_ONCE(zbv->page, page); + /* + * The cached page is still available but without a valid + * `->private` pcluster hint. Let's reconnect them. + */ if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ DBG_BUGON(!justfound); - - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + /* compressed_bvecs[] already takes a ref */ + attach_page_private(page, pcl); + put_page(page); } - /* no need to submit io if it is already up-to-date */ + /* no need to submit if it is already up-to-date */ if (PageUptodate(page)) { unlock_page(page); - page = NULL; + bvec->bv_page = NULL; } - goto out; + return; } /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. + * It has been truncated, so it's unsafe to reuse this one. Let's + * allocate a new page for compressed data. */ DBG_BUGON(page->mapping); DBG_BUGON(!justfound); @@ -1540,25 +1539,23 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + erofs_pagepool_add(&f->pagepool, page); cond_resched(); goto repeat; } + bvec->bv_page = page; out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ + if (!tocache || bs != PAGE_SIZE || + add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived page (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + return; } attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ + /* drop a refcount added by allocpage (then 2 refs in total here) */ put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1613,7 +1610,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_submissionqueue_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; @@ -1625,7 +1622,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); @@ -1648,17 +1644,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; + erofs_off_t last_pa; struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ + /* No need to read from device for pclusters in the bypass queue. */ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); @@ -1671,7 +1664,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; @@ -1690,18 +1684,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(sb, mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = cur + (pcl->pclusterpages << PAGE_SHIFT); do { - struct page *page; - - page = pickup_page_for_submission(pcl, i++, - &f->pagepool, mc); - if (!page) + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) continue; - if (bio && (cur != last_index + 1 || + if (bio && (cur != last_pa || last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); @@ -1712,7 +1702,8 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } @@ -1720,23 +1711,24 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - (sb->s_blocksize_bits - 9); + bio->bi_end_io = z_erofs_submissionqueue_endio; + bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; + last_bdev = mdev.m_bdev; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) goto submit_bio_retry; - last_index = cur; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; From d7bb85f1cb1950bfa12b4c6c09f8346d32710d49 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:54 +0800 Subject: [PATCH 25/78] FROMGIT: erofs: record `pclustersize` in bytes instead of pages Currently, compressed sizes are recorded in pages using `pclusterpages`, However, for tailpacking pclusters, `tailpacking_size` is used instead. This approach doesn't work when dealing with sub-page blocks. To address this, let's switch them to the unified `pclustersize` in bytes. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-3-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ia8c50a7b4adcf6cd161b1d6f8bfc5a7fd3371079 (cherry picked from commit 54ed3fdd66055d073cb1cd2c6c65bbc0683c40cf https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 64 ++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc8e6cdf3ae3..471129a41335 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -57,6 +57,9 @@ struct z_erofs_pcluster { /* L: total number of bvecs */ unsigned int vcnt; + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; @@ -71,14 +74,6 @@ struct z_erofs_pcluster { struct rcu_head rcu; }; - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - /* I: compression algorithm format */ unsigned char algorithmformat; @@ -118,9 +113,7 @@ static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; + return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } /* @@ -306,12 +299,12 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) @@ -320,7 +313,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; + pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -571,6 +564,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* @@ -584,10 +578,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; + for (i = 0; i < pclusterpages; ++i) { + struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) @@ -597,6 +590,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (page) { t = (void *)((unsigned long)page | 1); + newpage = NULL; } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -604,9 +598,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) continue; /* - * try to use cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. + * Try cached I/O if allocation succeeds or fallback to + * in-place I/O instead to avoid any direct reclaim. */ newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) @@ -638,6 +631,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); @@ -645,7 +639,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. */ - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { struct page *page = pcl->compressed_bvecs[i].page; if (!page) @@ -669,6 +663,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool ret; int i; @@ -680,7 +675,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) ret = false; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); ret = true; @@ -789,20 +784,20 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 : - map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); @@ -826,9 +821,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); - pcl->tailpacking_size = map->m_plen; } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; + pcl->obj.index = erofs_blknr(sb, map->m_pa); grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { @@ -1263,8 +1257,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, unsigned int pclusterpages = z_erofs_pclusterpages(pcl); const struct z_erofs_decompressor *decompressor = &erofs_decompressors[pcl->algorithmformat]; - unsigned int i, inputsize; - int err2; + int i, err2; struct page *page; bool overlapped; @@ -1301,18 +1294,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (err) goto out; - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -1685,7 +1673,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; - end = cur + (pcl->pclusterpages << PAGE_SHIFT); + end = cur + pcl->pclustersize; do { z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); if (!bvec.bv_page) From 0c6a18c75b54e9045df5284d1cd5d65afcdfe34d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:55 +0800 Subject: [PATCH 26/78] BACKPORT: FROMGIT: erofs: fix up compacted indexes for block size < 4096 Previously, the block size always equaled to PAGE_SIZE, therefore `lclusterbits` couldn't be less than 12. Since sub-page compressed blocks are now considered, `lobits` for a lcluster in each pack cannot always be `lclusterbits` as before. Otherwise, there is no enough room for the special value `Z_EROFS_VLE_DI_D0_CBLKCNT`. To support smaller block sizes, `lobits` for each compacted lcluster is now calculated as: lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1) Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-4-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Iacd89e2b33ddf39ea40b90e88a2bf99bb5a83b31 (cherry picked from commit 8d2517aaeea3ab8651bb517bca8f3c8664d318ea https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) [dhavale: resolved conflicts in zmap.c due to older naming of constants and updated commit message also to use the older names] Signed-off-by: Sandeep Dhavale --- fs/erofs/zmap.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 8973ccad707d..f5d3ba39dd42 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -101,29 +101,26 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, } static unsigned int decode_compactedbits(unsigned int lobits, - unsigned int lomask, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); - const unsigned int lo = v & lomask; + const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } -static int get_compacted_la_distance(unsigned int lclusterbits, +static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { - const unsigned int lomask = (1 << lclusterbits) - 1; unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) return d1; @@ -142,15 +139,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk, eofs; + unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; - else if (1 << amortizedshift == 2 && lclusterbits == 12) + else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; @@ -159,6 +155,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); @@ -166,15 +163,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, i = (eofs - base) >> amortizedshift; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) - m->delta[1] = get_compacted_la_distance(lclusterbits, + m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { if (!big_pcluster) { @@ -193,8 +189,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * (i - 1), &type); + lo = decode_compactedbits(lobits, in, + encodebits * (i - 1), &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) lo = 0; else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) @@ -209,8 +205,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 1; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) i -= lo; @@ -221,8 +217,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 0; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { --i; From a18efa4e4aa95a7348469db998411322ac8193be Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 15 Dec 2023 00:13:37 +0800 Subject: [PATCH 27/78] FROMGIT: erofs: fix ztailpacking for subpage compressed blocks `pageofs_in` should be the compressed data offset of the page rather than of the block. Acked-by: Chao Yu Reviewed-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231214161337.753049-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I0997a69b22b0f42c327c810359f55f5fa6a76275 (cherry picked from commit e5aba911dee5e20fa82efbe13e0af8f38ea459e7 https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 471129a41335..ffb5ed1b07b0 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -820,7 +820,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); } else { pcl->obj.index = erofs_blknr(sb, map->m_pa); @@ -897,6 +896,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } get_page(map->buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* file-backed inplace I/O pages are traversed in reverse order */ From f466d5216404d611d6f9559bc97547edb7afa714 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:56 +0800 Subject: [PATCH 28/78] FROMGIT: erofs: refine z_erofs_transform_plain() for sub-page block support Sub-page block support is still unusable even with previous commits if interlaced PLAIN pclusters exist. Such pclusters can be found if the fragment feature is enabled. This commit tries to handle "the head part" of interlaced PLAIN pclusters first: it was once explained in commit fdffc091e6f9 ("erofs: support interlaced uncompressed data for compressed files"). It uses a unique way for both shifted and interlaced PLAIN pclusters. As an added bonus, PLAIN pclusters larger than the block size is also supported now for the upcoming large lclusters. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-5-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I3d50132664f8754f56d62744420060108ed0da4f (cherry picked from commit 192351616a9dde686492bcb9d1e4895a1411a527 https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 82 ++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 024e0b4733e8..38c7f9c96c68 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -321,43 +321,59 @@ dstmap_out: static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = + const unsigned int nrpages_in = + PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; + const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - u8 *src; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; - } - - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) - memcpy_to_page(rq->out[0], rq->pageofs_out, - src + interlaced_offset, righthalf); - - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - memcpy_to_page(rq->out[outpages - 1], 0, src + - (interlaced_offset ? 0 : righthalf), - lefthalf); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); - flush_dcache_page(rq->in[inpages - 1]); + DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) { + memmove(kin + rq->pageofs_out, kin + pi, cur); + flush_dcache_page(rq->out[0]); + } else { + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + } + kunmap_local(kin); } + rq->outputsize -= cur; } - kunmap_local(src); + + for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + insz = min_t(unsigned int, PAGE_SIZE - rq->pageofs_in, + rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min_t(unsigned int, insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) { + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + flush_dcache_page(rq->out[no]); + } else if (rq->out[no]) { + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + } + pi += cnt; + } while (pi < insz); + kunmap_local(kin); + } + DBG_BUGON(ni > nrpages_in); return 0; } From 37e0a5b868093b50e698e43ac2412cd9a883f395 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:57 +0800 Subject: [PATCH 29/78] BACKPORT: FROMGIT: erofs: enable sub-page compressed block support Let's just disable cached decompression and inplace I/Os for partial pages as the first step in order to enable sub-page block initial support. In other words, currently it works primarily based on temporary short-lived pages. Don't expect too much in terms of performance. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-6-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I00238aa437f20c46d015bbe5ab7b706b80b8cfd7 (cherry picked from commit 0ee3a0d59e007320167a2e9f4b8bf1304ada7771 https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) [dhavale: resolved conflicts in inode.c in erofs_fill_inode()] Signed-off-by: Sandeep Dhavale --- fs/erofs/inode.c | 7 +++++-- fs/erofs/zdata.c | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 187ca02bbc2d..190bc3ad5622 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -291,9 +291,12 @@ static int erofs_fill_inode(struct inode *inode) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - if (!erofs_is_fscache_mode(inode->i_sb) && - inode->i_sb->s_blocksize_bits == PAGE_SHIFT) + if (!erofs_is_fscache_mode(inode->i_sb)) { + DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); err = z_erofs_fill_inode(inode); + } else err = -EOPNOTSUPP; goto out_unlock; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ffb5ed1b07b0..0b1b6ca804b3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -575,6 +575,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; + if (i_blocksize(fe->inode) != PAGE_SIZE) + return; if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; @@ -978,12 +980,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); + const unsigned int bs = i_blocksize(inode); bool tight = true, exclusive; unsigned int cur, end, len, split; int err = 0; z_erofs_onlinepage_init(page); - split = 0; end = PAGE_SIZE; repeat: @@ -1033,7 +1035,7 @@ repeat: * for inplace I/O or bvpage (should be processed in a strict order.) */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - exclusive = (!cur && ((split <= 1) || tight)); + exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE))); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); From 7d50253c2790f5c8a2f43c4c737241ab2458a77f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 16 Mar 2020 16:39:31 -0700 Subject: [PATCH 30/78] ANDROID: Export functions to be used with dma_map_ops in modules For modules to reuse default dma_map_ops implementations they need to be exported. Export the following functions: dma_direct_alloc dma_direct_free dma_common_mmap dma_common_get_sgtable dma_direct_get_required_mask Bug: 151050914 Signed-off-by: Suren Baghdasaryan Change-Id: Ia77b797fcd909fce01da7431bfbde282dc70b3b3 (cherry picked from commit fd31496dae939c7bf2ef874e08d4bf8c6ab738b3) Signed-off-by: Qian-Hao Huang (cherry picked from commit cdc9f6ef946f3c17b048b3ee9e36aa0bbc12d712) --- kernel/dma/direct.c | 3 +++ kernel/dma/ops_helpers.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 71bb2e3440e2..09ca202ac480 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,7 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } +EXPORT_SYMBOL_GPL(dma_direct_get_required_mask); static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) @@ -320,6 +321,7 @@ out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; } +EXPORT_SYMBOL_GPL(dma_direct_alloc); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) @@ -365,6 +367,7 @@ void dma_direct_free(struct device *dev, size_t size, __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } +EXPORT_SYMBOL_GPL(dma_direct_free); struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index af4a6ef48ce0..e28e1e17eaf5 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -27,6 +27,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } +EXPORT_SYMBOL_GPL(dma_common_get_sgtable); /* * Create userspace mapping for the DMA-coherent memory. @@ -57,6 +58,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; #endif /* CONFIG_MMU */ } +EXPORT_SYMBOL_GPL(dma_common_mmap); struct page *dma_common_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) From 9c4bc457ab31d2a121da0b5884a6b7e927b1e1f9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 12 Aug 2023 16:56:25 +0100 Subject: [PATCH 31/78] UPSTREAM: mm/memory.c: fix mismerge Fix a build issue. Link: https://lkml.kernel.org/r/ZNerqcNS4EBJA/2v@casper.infradead.org Fixes: 4aaa60dad4d1 ("mm: allow per-VMA locks on file-backed VMAs") Signed-off-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308121909.XNYBtqNI-lkp@intel.com/ Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 08dff2810e8feb3096bf5c8242ab1649d1e8b1a4) Bug: 293665307 Change-Id: I07ce19f29c44831cdcf709fe1ce122d1963f0be2 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 56057f97afaf..24463fd7de64 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5514,7 +5514,7 @@ retry: * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. */ - if (unlikely(!vma->anon_vma)) + if (vma_is_anonymous(vma) && !vma->anon_vma) goto inval_end_read; /* Check since vm_start/vm_end might change before we lock the VMA */ From b43b26b4cd1ff080ac1577a558de2962c8f03d07 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:13 +0100 Subject: [PATCH 32/78] UPSTREAM: mm: make lock_folio_maybe_drop_mmap() VMA lock aware Patch series "Handle more faults under the VMA lock", v2. At this point, we're handling the majority of file-backed page faults under the VMA lock, using the ->map_pages entry point. This patch set attempts to expand that for the following siutations: - We have to do a read. This could be because we've hit the point in the readahead window where we need to kick off the next readahead, or because the page is simply not present in cache. - We're handling a write fault. Most applications don't do I/O by writes to shared mmaps for very good reasons, but some do, and it'd be nice to not make that slow unnecessarily. - We're doing a COW of a private mapping (both PTE already present and PTE not-present). These are two different codepaths and I handle both of them in this patch set. There is no support in this patch set for drivers to mark themselves as being VMA lock friendly; they could implement the ->map_pages vm_operation, but if they do, they would be the first. This is probably something we want to change at some point in the future, and I've marked where to make that change in the code. There is very little performance change in the benchmarks we've run; mostly because the vast majority of page faults are handled through the other paths. I still think this patch series is useful for workloads that may take these paths more often, and just for cleaning up the fault path in general (it's now clearer why we have to retry in these cases). This patch (of 6): Drop the VMA lock instead of the mmap_lock if that's the one which is held. Link: https://lkml.kernel.org/r/20231006195318.4087158-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231006195318.4087158-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 5d74b2ab2c15d596c470bae6626f345d5575a9d0) Bug: 293665307 Change-Id: Ife2d11ab12fb428868cd44751784cf731fbffe62 Signed-off-by: Suren Baghdasaryan --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 63a846a1c1a3..c38dc43bfc8c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2967,7 +2967,7 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, /* * NOTE! This will make us return with VM_FAULT_RETRY, but with - * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT + * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) @@ -2977,13 +2977,14 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* - * We didn't have the right flags to drop the mmap_lock, - * but all fault_handlers only check for fatal signals - * if we return VM_FAULT_RETRY, so we need to drop the - * mmap_lock here and return 0 if we don't have a fpin. + * We didn't have the right flags to drop the + * fault lock, but all fault_handlers only check + * for fatal signals if we return VM_FAULT_RETRY, + * so we need to drop the fault lock here and + * return 0 if we don't have a fpin. */ if (*fpin == NULL) - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); return 0; } } else From 95af8a80bb7ba4ad46de6f743ec91a882f4cd624 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:14 +0100 Subject: [PATCH 33/78] BACKPORT: mm: call wp_page_copy() under the VMA lock It is usually safe to call wp_page_copy() under the VMA lock. The only unsafe situation is when no anon_vma has been allocated for this VMA, and we have to look at adjacent VMAs to determine if their anon_vma can be shared. Since this happens only for the first COW of a page in this VMA, the majority of calls to wp_page_copy() do not need to fall back to the mmap_sem. Add vmf_anon_prepare() as an alternative to anon_vma_prepare() which will return RETRY if we currently hold the VMA lock and need to allocate an anon_vma. This lets us drop the check in do_wp_page(). Link: https://lkml.kernel.org/r/20231006195318.4087158-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 164b06f238b986317131e6b61b2f22aabcbc2cc0) [surenb: resolved merge conflicts due to folio/page differences] Bug: 293665307 Change-Id: I39bdc247b375bd3dae8078b52c60fd4ce12e1850 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 24463fd7de64..3cccb73b0a13 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,6 +3099,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (likely(vma->anon_vma)) + return 0; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + return 0; +} + /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. @@ -3126,12 +3141,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; - int ret; + vm_fault_t ret; delayacct_wpcopy_start(); - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, @@ -3139,13 +3155,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; } else { + int err; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) goto oom; - ret = __wp_page_copy_user(new_page, old_page, vmf); - if (ret) { + err = __wp_page_copy_user(new_page, old_page, vmf); + if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3158,7 +3175,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) put_page(old_page); delayacct_wpcopy_end(); - return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; + return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(new_page, old_page); } @@ -3271,11 +3288,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) oom_free_new: put_page(new_page); oom: + ret = VM_FAULT_OOM; +out: if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return VM_FAULT_OOM; + return ret; } /** @@ -3510,12 +3529,6 @@ reuse: return wp_page_shared(vmf); } copy: - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Ok, we need to copy. Oh, well.. */ From c7fa581a793af13559143914227a42dcc83fa3e7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:15 +0100 Subject: [PATCH 34/78] UPSTREAM: mm: handle shared faults under the VMA lock There are many implementations of ->fault and some of them depend on mmap_lock being held. All vm_ops that implement ->map_pages() end up calling filemap_fault(), which I have audited to be sure it does not rely on mmap_lock. So (for now) key off ->map_pages existing as a flag to indicate that it's safe to call ->fault while only holding the vma lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4ed4379881aa62588aba6442a9f362a8cf7624e6) Bug: 293665307 Change-Id: Ifb5ab3df5d05fb182d0cb52820fa24e28e2d6496 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3cccb73b0a13..0db26276bed7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,6 +3099,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +/* + * We could add a bitflag somewhere, but for now, we know that all + * vm_ops that have a ->map_pages have been audited and don't need + * the mmap_lock to be held. + */ +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) + return 0; + vma_end_read(vma); + return VM_FAULT_RETRY; +} + static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4701,10 +4716,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) From 6541fffd92e5584e6297b14b0ef9f5e09cf79e5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:16 +0100 Subject: [PATCH 35/78] UPSTREAM: mm: handle COW faults under the VMA lock If the page is not currently present in the page tables, we need to call the page fault handler to find out which page we're supposed to COW, so we need to both check that there is already an anon_vma and that the fault handler doesn't need the mmap_lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4de8c93a4751e10737b6af65db42c743228c67a6) Bug: 293665307 Change-Id: If749a6f8fcf69d83bbf872c1d45865d1b1b77ea0 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0db26276bed7..cc0a95fc14b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4672,13 +4672,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_can_call_fault(vmf); + if (!ret) + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) From c1da94fa44e60c93e3896b087680e905116d69b3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:17 +0100 Subject: [PATCH 36/78] UPSTREAM: mm: handle read faults under the VMA lock Most file-backed faults are already handled through ->map_pages(), but if we need to do I/O we'll come this way. Since filemap_fault() is now safe to be called under the VMA lock, we can handle these faults under the VMA lock now. Link: https://lkml.kernel.org/r/20231006195318.4087158-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 12214eba1992642eee5813a9cc9f626e5b2d1815) Bug: 293665307 Change-Id: Iee48af98b866d88d88ec01143eb26389ab373b6b Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index cc0a95fc14b2..ddebfa1457f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4651,10 +4651,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) From 4a518d86339bb74f1edb918b0808bc755273258a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:18 +0100 Subject: [PATCH 37/78] UPSTREAM: mm: handle write faults to RO pages under the VMA lock I think this is a pretty rare occurrence, but for consistency handle faults with the VMA lock held the same way that we handle other faults with the VMA lock held. Link: https://lkml.kernel.org/r/20231006195318.4087158-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4a68fef16df9d88d528094116f8bbd2dbfa62089) Bug: 293665307 Change-Id: I69cec218c8a1fe14df3268722e6b1be6dffe7978 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ddebfa1457f4..7f29d9394bdf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3358,10 +3358,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); @@ -3385,10 +3384,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + tmp = vmf_can_call_fault(vmf); + if (tmp) { put_page(vmf->page); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; + return tmp; } tmp = do_page_mkwrite(vmf); From 6c8f7108578a1f4ed013c3bec63784e7f25cd6bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 26 Dec 2023 13:46:10 -0800 Subject: [PATCH 38/78] FROMGIT: arch/mm/fault: fix major fault accounting when retrying under per-VMA lock A test [1] in Android test suite started failing after [2] was merged. It turns out that after handling a major fault under per-VMA lock, the process major fault counter does not register that fault as major. Before [2] read faults would be done under mmap_lock, in which case FAULT_FLAG_TRIED flag is set before retrying. That in turn causes mm_account_fault() to account the fault as major once retry completes. With per-VMA locks we often retry because a fault can't be handled without locking the whole mm using mmap_lock. Therefore such retries do not set FAULT_FLAG_TRIED flag. This logic does not work after [2] because we can now handle read major faults under per-VMA lock and upon retry the fact there was a major fault gets lost. Fix this by setting FAULT_FLAG_TRIED after retrying under per-VMA lock if VM_FAULT_MAJOR was returned. Ideally we would use an additional VM_FAULT bit to indicate the reason for the retry (could not handle under per-VMA lock vs other reason) but this simpler solution seems to work, so keeping it simple. [1] https://cs.android.com/android/platform/superproject/+/master:test/vts-testcase/kernel/api/drop_caches_prop/drop_caches_test.cpp [2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/ Link: https://lkml.kernel.org/r/20231226214610.109282-1-surenb@google.com Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock") Signed-off-by: Suren Baghdasaryan Cc: Matthew Wilcox Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 46e714c729c8d1d8110bc0545d7ffe8a759c9dc0 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-hotfixes-stable) Bug: 317385399 Change-Id: Ic7e97bf610dcabb7d3ac2306b2f1213be0ddd269 Signed-off-by: Suren Baghdasaryan --- arch/arm64/mm/fault.c | 2 ++ arch/powerpc/mm/fault.c | 2 ++ arch/riscv/mm/fault.c | 2 ++ arch/s390/mm/fault.c | 3 +++ arch/x86/mm/fault.c | 2 ++ 5 files changed, 11 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9970a4785819..7d522b037d9a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -619,6 +619,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + mm_flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b1723094d464..ec23164ad768 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -496,6 +496,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 34a44febae86..d710bb834a2a 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -310,6 +310,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 0843adb266d1..60fed3c88332 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -420,6 +420,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 97599581ec6b..bcb5678b5b91 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1369,6 +1369,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { From 4d99e41ce174d65ef1d710922acb7a2c67bbaa0b Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Sat, 25 Nov 2023 02:41:58 +0530 Subject: [PATCH 39/78] FROMGIT: PM / devfreq: Synchronize devfreq_monitor_[start/stop] There is a chance if a frequent switch of the governor done in a loop result in timer list corruption where timer cancel being done from two place one from cancel_delayed_work_sync() and followed by expire_timers() can be seen from the traces[1]. while true do echo "simple_ondemand" > /sys/class/devfreq/1d84000.ufshc/governor echo "performance" > /sys/class/devfreq/1d84000.ufshc/governor done It looks to be issue with devfreq driver where device_monitor_[start/stop] need to synchronized so that delayed work should get corrupted while it is either being queued or running or being cancelled. Let's use polling flag and devfreq lock to synchronize the queueing the timer instance twice and work data being corrupted. [1] ... .. -0 [003] 9436.209662: timer_cancel timer=0xffffff80444f0428 -0 [003] 9436.209664: timer_expire_entry timer=0xffffff80444f0428 now=0x10022da1c function=__typeid__ZTSFvP10timer_listE_global_addr baseclk=0x10022da1c -0 [003] 9436.209718: timer_expire_exit timer=0xffffff80444f0428 kworker/u16:6-14217 [003] 9436.209863: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2b now=0x10022da1c flags=182452227 vendor.xxxyyy.ha-1593 [004] 9436.209888: timer_cancel timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216390: timer_init timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216392: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2c now=0x10022da1d flags=186646532 vendor.xxxyyy.ha-1593 [005] 9436.220992: timer_cancel timer=0xffffff80444f0428 xxxyyyTraceManag-7795 [004] 9436.261641: timer_cancel timer=0xffffff80444f0428 [2] 9436.261653][ C4] Unable to handle kernel paging request at virtual address dead00000000012a [ 9436.261664][ C4] Mem abort info: [ 9436.261666][ C4] ESR = 0x96000044 [ 9436.261669][ C4] EC = 0x25: DABT (current EL), IL = 32 bits [ 9436.261671][ C4] SET = 0, FnV = 0 [ 9436.261673][ C4] EA = 0, S1PTW = 0 [ 9436.261675][ C4] Data abort info: [ 9436.261677][ C4] ISV = 0, ISS = 0x00000044 [ 9436.261680][ C4] CM = 0, WnR = 1 [ 9436.261682][ C4] [dead00000000012a] address between user and kernel address ranges [ 9436.261685][ C4] Internal error: Oops: 96000044 [#1] PREEMPT SMP [ 9436.261701][ C4] Skip md ftrace buffer dump for: 0x3a982d0 ... [ 9436.262138][ C4] CPU: 4 PID: 7795 Comm: TraceManag Tainted: G S W O 5.10.149-android12-9-o-g17f915d29d0c #1 [ 9436.262141][ C4] Hardware name: Qualcomm Technologies, Inc. (DT) [ 9436.262144][ C4] pstate: 22400085 (nzCv daIf +PAN -UAO +TCO BTYPE=--) [ 9436.262161][ C4] pc : expire_timers+0x9c/0x438 [ 9436.262164][ C4] lr : expire_timers+0x2a4/0x438 [ 9436.262168][ C4] sp : ffffffc010023dd0 [ 9436.262171][ C4] x29: ffffffc010023df0 x28: ffffffd0636fdc18 [ 9436.262178][ C4] x27: ffffffd063569dd0 x26: ffffffd063536008 [ 9436.262182][ C4] x25: 0000000000000001 x24: ffffff88f7c69280 [ 9436.262185][ C4] x23: 00000000000000e0 x22: dead000000000122 [ 9436.262188][ C4] x21: 000000010022da29 x20: ffffff8af72b4e80 [ 9436.262191][ C4] x19: ffffffc010023e50 x18: ffffffc010025038 [ 9436.262195][ C4] x17: 0000000000000240 x16: 0000000000000201 [ 9436.262199][ C4] x15: ffffffffffffffff x14: ffffff889f3c3100 [ 9436.262203][ C4] x13: ffffff889f3c3100 x12: 00000000049f56b8 [ 9436.262207][ C4] x11: 00000000049f56b8 x10: 00000000ffffffff [ 9436.262212][ C4] x9 : ffffffc010023e50 x8 : dead000000000122 [ 9436.262216][ C4] x7 : ffffffffffffffff x6 : ffffffc0100239d8 [ 9436.262220][ C4] x5 : 0000000000000000 x4 : 0000000000000101 [ 9436.262223][ C4] x3 : 0000000000000080 x2 : ffffff889edc155c [ 9436.262227][ C4] x1 : ffffff8001005200 x0 : ffffff80444f0428 [ 9436.262232][ C4] Call trace: [ 9436.262236][ C4] expire_timers+0x9c/0x438 [ 9436.262240][ C4] __run_timers+0x1f0/0x330 [ 9436.262245][ C4] run_timer_softirq+0x28/0x58 [ 9436.262255][ C4] efi_header_end+0x168/0x5ec [ 9436.262265][ C4] __irq_exit_rcu+0x108/0x124 [ 9436.262274][ C4] __handle_domain_irq+0x118/0x1e4 [ 9436.262282][ C4] gic_handle_irq.30369+0x6c/0x2bc [ 9436.262286][ C4] el0_irq_naked+0x60/0x6c Bug: 317188938 Change-Id: I9a22325f6abbf28217c8f37b093cf77509b0139a Link: https://lore.kernel.org/all/1700860318-4025-1-git-send-email-quic_mojha@quicinc.com/ Reported-by: Joyyoung Huang Acked-by: MyungJoo Ham Signed-off-by: Mukesh Ojha Signed-off-by: Chanwoo Choi (cherry picked from commit aed5ed595960c6d301dcd4ed31aeaa7a8054c0c6 https://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git devfreq-next) Signed-off-by: Srinivasarao Pathipati --- drivers/devfreq/devfreq.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index fe6644f99887..8e9ba701a643 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -461,10 +461,14 @@ static void devfreq_monitor(struct work_struct *work) if (err) dev_err(&devfreq->dev, "dvfs failed with (%d) error\n", err); + if (devfreq->stop_polling) + goto out; + queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); - mutex_unlock(&devfreq->lock); +out: + mutex_unlock(&devfreq->lock); trace_devfreq_monitor(devfreq); } @@ -482,6 +486,10 @@ void devfreq_monitor_start(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (delayed_work_pending(&devfreq->work)) + goto out; + switch (devfreq->profile->timer) { case DEVFREQ_TIMER_DEFERRABLE: INIT_DEFERRABLE_WORK(&devfreq->work, devfreq_monitor); @@ -490,12 +498,16 @@ void devfreq_monitor_start(struct devfreq *devfreq) INIT_DELAYED_WORK(&devfreq->work, devfreq_monitor); break; default: - return; + goto out; } if (devfreq->profile->polling_ms) queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); + +out: + devfreq->stop_polling = false; + mutex_unlock(&devfreq->lock); } EXPORT_SYMBOL(devfreq_monitor_start); @@ -512,6 +524,14 @@ void devfreq_monitor_stop(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (devfreq->stop_polling) { + mutex_unlock(&devfreq->lock); + return; + } + + devfreq->stop_polling = true; + mutex_unlock(&devfreq->lock); cancel_delayed_work_sync(&devfreq->work); } EXPORT_SYMBOL(devfreq_monitor_stop); From 99288e911ad553ffee55b3da942f6e722fc0573b Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 20 Dec 2023 13:35:00 +0000 Subject: [PATCH 40/78] ANDROID: KVM: arm64: Document module_change_host_prot_range When this pKVM module ops has been introduced, the documentation has been omitted. Bug: 308373293 Change-Id: I9e471414e72a1ee04c132de4ed95d77e815ae8c9 Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_pkvm_module.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index bf68d862b7d8..b8e5f8064358 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -72,6 +72,11 @@ enum pkvm_psci_notification { * @register_host_perm_fault_handler), otherwise * pKVM will be unable to handle this fault and the * CPU will be stuck in an infinite loop. + * @host_stage2_mod_prot_range: Similar to @host_stage2_mod_prot, but takes a + * range as an argument (@nr_pages). This + * considerably speeds up the process for a + * contiguous memory region, compared to the + * per-page @host_stage2_mod_prot. * @host_stage2_get_leaf: Query the host's stage2 page-table entry for * the page @phys. * @register_host_smc_handler: @cb is called whenever the host issues an SMC From 8fc25d7862c9c669025dd534d3eefd20d071ea0d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2023 15:51:08 -0700 Subject: [PATCH 41/78] FROMGIT: f2fs: do not return EFSCORRUPTED, but try to run online repair If we return the error, there's no way to recover the status as of now, since fsck does not fix the xattr boundary issue. Bug: 305658663 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim (cherry picked from commit 50a472bbc79ff9d5a88be8019a60e936cadf9f13 https://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git dev) Change-Id: I55060a4eede3f5f85066aba22a6ab7155517e5c4 (cherry picked from commit 70113b9d489050d3e7a6f28e0cd6e43f104fc132) (cherry picked from commit 2c1f3789d609bd549f14c019b6c7b311bfd2fa64) --- fs/f2fs/node.c | 4 +++- fs/f2fs/xattr.c | 20 +++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5010a33acb8a..60708d47aaa8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2734,7 +2734,9 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + if (page) + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), + VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index db3b641f2158..adaad16468d8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -363,10 +363,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - err = -EFSCORRUPTED; + err = -ENODATA; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); goto out; @@ -583,13 +583,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - error = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); - goto cleanup; + break; } if (!handler || (handler->list && !handler->list(dentry))) @@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - +retry: error = read_all_xattrs(inode, ipage, &base_addr); if (error) return error; @@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index, /* find entry with wanted name. */ here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + if (!F2FS_I(inode)->i_xattr_nid) { + f2fs_notice(F2FS_I_SB(inode), + "recover xattr in inode (%lu)", inode->i_ino); + f2fs_recover_xattr_data(inode, NULL); + kfree(base_addr); + goto retry; + } + f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; From 717d1f8f91abc780615bd85ac778f702aff6fde4 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 19 Dec 2023 16:40:19 +0000 Subject: [PATCH 42/78] ANDROID: KVM: arm64: Fix host_smc print typo From pKVM point of view, unknown SMCs are simply forwarded, we can't consider them invalid or not. This was probably a typo following a copy of the host_hcall event. Bug: 299430621 Change-Id: Ieb53f985a5187a8b5a9feb4a95982b15cdc1b04a Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_hypevents.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h index 8a2dd41b8569..c507d8978444 100644 --- a/arch/arm64/include/asm/kvm_hypevents.h +++ b/arch/arm64/include/asm/kvm_hypevents.h @@ -53,7 +53,7 @@ HYP_EVENT(host_smc, __entry->id = id; __entry->forwarded = forwarded; ), - HE_PRINTK("id=%llu invalid=%u", + HE_PRINTK("id=%llu forwarded=%u", __entry->id, __entry->forwarded) ); From 15a93de4641d882e26cf1de50af69679c1a20aee Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 20 Dec 2023 16:18:05 +0000 Subject: [PATCH 43/78] ANDROID: KVM: arm64: Fix hyp event alignment The structures that define hyp events must be packed so they match their format definitions in the tracefs file hyp/events/hyp//format. Bug: 299430621 Change-Id: Ia7e1a686744d5c9c3f8a21881f03228c8acecade Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_hypevents_defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hypevents_defs.h b/arch/arm64/include/asm/kvm_hypevents_defs.h index e228d894a898..606f3477ecd3 100644 --- a/arch/arm64/include/asm/kvm_hypevents_defs.h +++ b/arch/arm64/include/asm/kvm_hypevents_defs.h @@ -15,10 +15,10 @@ struct hyp_entry_hdr { /* * Hyp events definitions common to the hyp and the host */ -#define HYP_EVENT_FORMAT(__name, __struct) \ - struct trace_hyp_format_##__name { \ - struct hyp_entry_hdr hdr; \ - __struct \ +#define HYP_EVENT_FORMAT(__name, __struct) \ + struct __packed trace_hyp_format_##__name { \ + struct hyp_entry_hdr hdr; \ + __struct \ } #define HE_PROTO(args...) args From e74417834ea0485b4a1e9ba9d8d7f2fed143eddd Mon Sep 17 00:00:00 2001 From: Ken Huang Date: Thu, 4 Jan 2024 23:18:14 +0800 Subject: [PATCH 44/78] ANDROID: Update the ABI symbol list Adding the following symbols: - __drmm_crtc_alloc_with_planes Bug: 275278929 Change-Id: I41b6069612d44214f474ed82ee2a4b07ca739302 Signed-off-by: Ken Huang --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index af22721e20b8..a01a49721a1a 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -739,6 +739,7 @@ drm_kms_helper_poll_fini drm_kms_helper_poll_init drm_match_cea_mode + __drmm_crtc_alloc_with_planes drmm_kmalloc drmm_mode_config_init drm_mode_config_reset From 066d57de875d52d8fd4fbb9248f42e3c8a1066cc Mon Sep 17 00:00:00 2001 From: Kever Yang Date: Wed, 3 Jan 2024 22:02:54 +0800 Subject: [PATCH 45/78] ANDROID: GKI: Enable symbols for v4l2 in async and fwnode INFO: 14 function symbol(s) added 'struct v4l2_async_subdev* __v4l2_async_nf_add_fwnode(struct v4l2_async_notifier*, struct fwnode_handle*, unsigned int)' 'struct v4l2_async_subdev* __v4l2_async_nf_add_fwnode_remote(struct v4l2_async_notifier*, struct fwnode_handle*, unsigned int)' 'void v4l2_async_nf_cleanup(struct v4l2_async_notifier*)' 'void v4l2_async_nf_init(struct v4l2_async_notifier*)' 'int v4l2_async_nf_parse_fwnode_endpoints(struct device*, struct v4l2_async_notifier*, size_t, parse_endpoint_func)' 'int v4l2_async_nf_register(struct v4l2_device*, struct v4l2_async_notifier*)' 'void v4l2_async_nf_unregister(struct v4l2_async_notifier*)' 'int v4l2_async_register_subdev(struct v4l2_subdev*)' 'int v4l2_async_register_subdev_sensor(struct v4l2_subdev*)' 'int v4l2_async_subdev_nf_register(struct v4l2_subdev*, struct v4l2_async_notifier*)' 'void v4l2_async_unregister_subdev(struct v4l2_subdev*)' 'int v4l2_fwnode_endpoint_alloc_parse(struct fwnode_handle*, struct v4l2_fwnode_endpoint*)' 'void v4l2_fwnode_endpoint_free(struct v4l2_fwnode_endpoint*)' 'int v4l2_fwnode_endpoint_parse(struct fwnode_handle*, struct v4l2_fwnode_endpoint*)' Bug: 300024866 Change-Id: I7e4c2faac5c8341a19ea3fed694190d38679dc5b Signed-off-by: Kever Yang --- android/abi_gki_aarch64.stg | 269 +++++++++++++++++++++++++++++++ android/abi_gki_aarch64_rockchip | 14 ++ 2 files changed, 283 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index f2b7c07a7716..4a190c90a757 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -9018,6 +9018,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x72d62916 } +pointer_reference { + id: 0x1625e208 + kind: POINTER + pointee_type_id: 0x72d76ebd +} pointer_reference { id: 0x162c7a70 kind: POINTER @@ -18183,6 +18188,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x9d41cc1a } +pointer_reference { + id: 0x2dc069c5 + kind: POINTER + pointee_type_id: 0x9d414188 +} pointer_reference { id: 0x2dc1540f kind: POINTER @@ -30713,6 +30723,11 @@ typedef { name: "p4d_t" referred_type_id: 0x148546d4 } +typedef { + id: 0xbad82a2c + name: "parse_endpoint_func" + referred_type_id: 0x2dc069c5 +} typedef { id: 0x8ef19fe7 name: "pci_bus_flags_t" @@ -52189,6 +52204,11 @@ member { name: "base" type_id: 0x180f82e8 } +member { + id: 0x85d2e2e4 + name: "base" + type_id: 0x080c6fc2 +} member { id: 0x85d6188a name: "base" @@ -56419,6 +56439,12 @@ member { name: "bus" type_id: 0x2309ad3e } +member { + id: 0xdaf846cc + name: "bus" + type_id: 0x286a95aa + offset: 160 +} member { id: 0x1639ef00 name: "bus_cleanup" @@ -56648,6 +56674,12 @@ member { type_id: 0x945e7ef6 offset: 448 } +member { + id: 0x2c928e64 + name: "bus_type" + type_id: 0x3c57148f + offset: 128 +} member { id: 0xb43c45b4 name: "bus_width" @@ -116596,6 +116628,12 @@ member { name: "link_fd" type_id: 0xe62ebf07 } +member { + id: 0x6075ccdc + name: "link_frequencies" + type_id: 0x2e18f543 + offset: 512 +} member { id: 0x178cf8a4 name: "link_gen" @@ -126808,6 +126846,18 @@ member { name: "mipi_csi1" type_id: 0xe49bfc8b } +member { + id: 0xa7e5d7c1 + name: "mipi_csi1" + type_id: 0xe49bfc8b + offset: 64 +} +member { + id: 0xeda56411 + name: "mipi_csi2" + type_id: 0xe72f0de6 + offset: 128 +} member { id: 0xeda56dd3 name: "mipi_csi2" @@ -136120,6 +136170,12 @@ member { type_id: 0xe62ebf07 offset: 672 } +member { + id: 0x4519d21b + name: "nr_of_link_frequencies" + type_id: 0x4585663f + offset: 576 +} member { id: 0x9c6b34f7 name: "nr_off" @@ -211032,6 +211088,16 @@ struct_union { member_id: 0x9683f73d } } +struct_union { + id: 0x286a95aa + kind: STRUCT + definition { + bytesize: 40 + member_id: 0xc0bc4db7 + member_id: 0xa7e5d7c1 + member_id: 0xeda56411 + } +} struct_union { id: 0x2880e524 kind: STRUCT @@ -265808,6 +265874,19 @@ struct_union { member_id: 0x465224ed } } +struct_union { + id: 0x72d76ebd + kind: STRUCT + name: "v4l2_fwnode_endpoint" + definition { + bytesize: 80 + member_id: 0x85d2e2e4 + member_id: 0x2c928e64 + member_id: 0xdaf846cc + member_id: 0x6075ccdc + member_id: 0x4519d21b + } +} struct_union { id: 0xccd4dc1a kind: STRUCT @@ -287051,6 +287130,13 @@ enumeration { } } } +function { + id: 0x003279c7 + return_type_id: 0x3c2dd1ca + parameter_id: 0x3cfe7778 + parameter_id: 0x0490bb4a + parameter_id: 0x4585663f +} function { id: 0x004cf563 return_type_id: 0x48b5725f @@ -291664,6 +291750,11 @@ function { parameter_id: 0x14528516 parameter_id: 0x2712b6f9 } +function { + id: 0x15112911 + return_type_id: 0x48b5725f + parameter_id: 0x1625e208 +} function { id: 0x151457b1 return_type_id: 0xd5cc9c9a @@ -298655,6 +298746,11 @@ function { parameter_id: 0x3c2755a3 parameter_id: 0x0cbf60eb } +function { + id: 0x1fa7cc4d + return_type_id: 0x48b5725f + parameter_id: 0x3cfe7778 +} function { id: 0x1fa8b2bc return_type_id: 0x48b5725f @@ -321574,6 +321670,12 @@ function { parameter_id: 0x04b193cc parameter_id: 0x0335a07f } +function { + id: 0x9ca0dc77 + return_type_id: 0x6720d32f + parameter_id: 0x074f1a14 + parameter_id: 0x3cfe7778 +} function { id: 0x9ca1921c return_type_id: 0x6720d32f @@ -322038,6 +322140,12 @@ function { parameter_id: 0x054f691a parameter_id: 0x0aa1f0ee } +function { + id: 0x9cfc5a75 + return_type_id: 0x6720d32f + parameter_id: 0x0490bb4a + parameter_id: 0x1625e208 +} function { id: 0x9cfd713b return_type_id: 0x6720d32f @@ -322060,6 +322168,12 @@ function { parameter_id: 0x02ed0755 parameter_id: 0x0e68dab6 } +function { + id: 0x9d027320 + return_type_id: 0x6720d32f + parameter_id: 0x01c5a749 + parameter_id: 0x3cfe7778 +} function { id: 0x9d038726 return_type_id: 0x6720d32f @@ -322608,6 +322722,13 @@ function { parameter_id: 0x0258f96e parameter_id: 0x15f20052 } +function { + id: 0x9d414188 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x1625e208 + parameter_id: 0x3c2dd1ca +} function { id: 0x9d419277 return_type_id: 0x6720d32f @@ -323728,6 +323849,14 @@ function { parameter_id: 0x33756485 parameter_id: 0x064d6086 } +function { + id: 0x9ddac293 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x3cfe7778 + parameter_id: 0xf435685e + parameter_id: 0xbad82a2c +} function { id: 0x9ddaf106 return_type_id: 0x6720d32f @@ -343026,6 +343155,24 @@ elf_symbol { type_id: 0x20cd94dc full_name: "__usecs_to_jiffies" } +elf_symbol { + id: 0xf51d746f + name: "__v4l2_async_nf_add_fwnode" + is_defined: true + symbol_type: FUNCTION + crc: 0x03599cac + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode" +} +elf_symbol { + id: 0xe13e16ca + name: "__v4l2_async_nf_add_fwnode_remote" + is_defined: true + symbol_type: FUNCTION + crc: 0x82966749 + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode_remote" +} elf_symbol { id: 0x4c0a941a name: "__v4l2_ctrl_handler_setup" @@ -394585,6 +394732,87 @@ elf_symbol { type_id: 0x927d452a full_name: "uuid_parse" } +elf_symbol { + id: 0x4e2f55da + name: "v4l2_async_nf_cleanup" + is_defined: true + symbol_type: FUNCTION + crc: 0xdad12cba + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_cleanup" +} +elf_symbol { + id: 0x04aadf7f + name: "v4l2_async_nf_init" + is_defined: true + symbol_type: FUNCTION + crc: 0xc88abf32 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_init" +} +elf_symbol { + id: 0x7920fabe + name: "v4l2_async_nf_parse_fwnode_endpoints" + is_defined: true + symbol_type: FUNCTION + crc: 0xde590e4b + type_id: 0x9ddac293 + full_name: "v4l2_async_nf_parse_fwnode_endpoints" +} +elf_symbol { + id: 0x48e55006 + name: "v4l2_async_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x8be566ca + type_id: 0x9ca0dc77 + full_name: "v4l2_async_nf_register" +} +elf_symbol { + id: 0x65ffd1d0 + name: "v4l2_async_nf_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0xc74894f9 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_unregister" +} +elf_symbol { + id: 0x507a9ef5 + name: "v4l2_async_register_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x64ab86bc + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev" +} +elf_symbol { + id: 0x050dd932 + name: "v4l2_async_register_subdev_sensor" + is_defined: true + symbol_type: FUNCTION + crc: 0x61c8f608 + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev_sensor" +} +elf_symbol { + id: 0x0664687c + name: "v4l2_async_subdev_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x4d890f4b + type_id: 0x9d027320 + full_name: "v4l2_async_subdev_nf_register" +} +elf_symbol { + id: 0xf440f7f1 + name: "v4l2_async_unregister_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x2592ea78 + type_id: 0x10e93841 + full_name: "v4l2_async_unregister_subdev" +} elf_symbol { id: 0xf39bae65 name: "v4l2_compat_ioctl32" @@ -394990,6 +395218,33 @@ elf_symbol { type_id: 0x209ae488 full_name: "v4l2_format_info" } +elf_symbol { + id: 0x7ba36329 + name: "v4l2_fwnode_endpoint_alloc_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x05930b06 + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_alloc_parse" +} +elf_symbol { + id: 0x2643c2c9 + name: "v4l2_fwnode_endpoint_free" + is_defined: true + symbol_type: FUNCTION + crc: 0xf01d6f06 + type_id: 0x15112911 + full_name: "v4l2_fwnode_endpoint_free" +} +elf_symbol { + id: 0xcb8b4f14 + name: "v4l2_fwnode_endpoint_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x9dcd6cfe + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_parse" +} elf_symbol { id: 0x58330374 name: "v4l2_g_parm_cap" @@ -399757,6 +400012,8 @@ interface { symbol_id: 0x7c261545 symbol_id: 0xf497de36 symbol_id: 0xf44f6a18 + symbol_id: 0xf51d746f + symbol_id: 0xe13e16ca symbol_id: 0x4c0a941a symbol_id: 0xfc85c168 symbol_id: 0xb6af2644 @@ -405485,6 +405742,15 @@ interface { symbol_id: 0xb0c1eaf9 symbol_id: 0xe7b3f166 symbol_id: 0xb21b47da + symbol_id: 0x4e2f55da + symbol_id: 0x04aadf7f + symbol_id: 0x7920fabe + symbol_id: 0x48e55006 + symbol_id: 0x65ffd1d0 + symbol_id: 0x507a9ef5 + symbol_id: 0x050dd932 + symbol_id: 0x0664687c + symbol_id: 0xf440f7f1 symbol_id: 0xf39bae65 symbol_id: 0xfd78bf45 symbol_id: 0x218d39b6 @@ -405530,6 +405796,9 @@ interface { symbol_id: 0xe66642fe symbol_id: 0x538ad5cc symbol_id: 0x2244c8f0 + symbol_id: 0x7ba36329 + symbol_id: 0x2643c2c9 + symbol_id: 0xcb8b4f14 symbol_id: 0x58330374 symbol_id: 0xdb18c924 symbol_id: 0x5e36dba6 diff --git a/android/abi_gki_aarch64_rockchip b/android/abi_gki_aarch64_rockchip index 0010cf2300b6..a051b3843047 100644 --- a/android/abi_gki_aarch64_rockchip +++ b/android/abi_gki_aarch64_rockchip @@ -1268,6 +1268,15 @@ usb_submit_urb __usecs_to_jiffies usleep_range_state + __v4l2_async_nf_add_fwnode_remote + v4l2_async_nf_cleanup + v4l2_async_nf_init + v4l2_async_nf_parse_fwnode_endpoints + v4l2_async_nf_register + v4l2_async_register_subdev + v4l2_async_register_subdev_sensor + v4l2_async_subdev_nf_register + v4l2_async_unregister_subdev v4l2_ctrl_find v4l2_ctrl_g_ctrl v4l2_ctrl_g_ctrl_int64 @@ -1295,6 +1304,9 @@ v4l2_event_subscribe v4l2_event_unsubscribe v4l2_fh_open + v4l2_fwnode_endpoint_alloc_parse + v4l2_fwnode_endpoint_free + v4l2_fwnode_endpoint_parse v4l2_i2c_subdev_init v4l2_match_dv_timings v4l2_pipeline_link_notify @@ -2871,9 +2883,11 @@ # required by video_rkcif.ko media_entity_setup_link + __v4l2_async_nf_add_fwnode # required by video_rkisp.ko param_ops_ullong + v4l2_async_nf_unregister v4l2_ctrl_poll # required by videobuf2-cma-sg.ko From c52d48818b621678f7ffbeddf29ee1a3437a1363 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:47 -0400 Subject: [PATCH 46/78] UPSTREAM: maple_tree: introduce __mas_set_range() mas_set_range() resets the node to MAS_START, which will cause a re-walk of the tree to the range. This is unnecessary when the maple state is already at the correct location of the write. Add a function that only sets the range to avoid unnecessary re-walking of the tree. Link: https://lkml.kernel.org/r/20230724183157.3939892-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit c1297987cc2ada57a7faea7985c2334548d110f9) Bug: 308042511 Change-Id: I9e026d0f103e3aa24b47998be6b83e28e7928540 Signed-off-by: Suren Baghdasaryan --- include/linux/maple_tree.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0ff8ce8cd06a..6053986cbdf6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -525,6 +525,22 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the + * current location. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * set the internal maple state values to a sub-range. + * Please use mas_set_range() if you do not know where you are in the tree. + */ +static inline void __mas_set_range(struct ma_state *mas, unsigned long start, + unsigned long last) +{ + mas->index = start; + mas->last = last; +} /** * mas_set_range() - Set up Maple Tree operation state for a different index. @@ -539,9 +555,8 @@ static inline void mas_reset(struct ma_state *mas) static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - mas->index = start; - mas->last = last; - mas->node = MAS_START; + __mas_set_range(mas, start, last); + mas->node = MAS_START; } /** From 4ddcdc519b4ae94f56dfa81769d5ce59f1cc41a8 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:36 +0800 Subject: [PATCH 47/78] FROMGIT: maple_tree: add mt_free_one() and mt_attr() helpers Patch series "Introduce __mt_dup() to improve the performance of fork()", v7. This series introduces __mt_dup() to improve the performance of fork(). During the duplication process of mmap, all VMAs are traversed and inserted one by one into the new maple tree, causing the maple tree to be rebalanced multiple times. Balancing the maple tree is a costly operation. To duplicate VMAs more efficiently, mtree_dup() and __mt_dup() are introduced for the maple tree. They can efficiently duplicate a maple tree. Here are some algorithmic details about {mtree,__mt}_dup(). We perform a DFS pre-order traversal of all nodes in the source maple tree. During this process, we fully copy the nodes from the source tree to the new tree. This involves memory allocation, and when encountering a new node, if it is a non-leaf node, all its child nodes are allocated at once. This idea was originally from Liam R. Howlett's Maple Tree Work email, and I added some of my own ideas to implement it. Some previous discussions can be found in [1]. For a more detailed analysis of the algorithm, please refer to the logs for patch [3/10] and patch [10/10]. There is a "spawn" in byte-unixbench[2], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% Thanks to Liam and Matthew for the review. This patch (of 10): Add two helpers: 1. mt_free_one(), used to free a maple node. 2. mt_attr(), used to obtain the attributes of maple tree. Link: https://lkml.kernel.org/r/20231027033845.90608-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20231027033845.90608-2-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4f2267b58a22d972be98edef8e6b3c7a67c9fb91 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: Ib9b13dee357ac4c85668901c20a3c370fbdd08da Signed-off-by: Suren Baghdasaryan --- lib/maple_tree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 826f7b8d5e05..d932bf27fa0b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -158,6 +158,11 @@ static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } +static inline void mt_free_one(struct maple_node *node) +{ + kmem_cache_free(maple_node_cache, node); +} + static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); @@ -199,6 +204,11 @@ static unsigned int mas_mt_height(struct ma_state *mas) return mt_height(mas->tree); } +static inline unsigned int mt_attr(struct maple_tree *mt) +{ + return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; +} + static inline enum maple_type mte_node_type(const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & @@ -5702,7 +5712,7 @@ void mas_destroy(struct ma_state *mas) mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } - kmem_cache_free(maple_node_cache, node); + mt_free_one(ma_mnode_ptr(node)); total--; } From dc9323545b03f3d1ebeeac46c5ceb366eced9314 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:37 +0800 Subject: [PATCH 48/78] FROMGIT: maple_tree: introduce {mtree,mas}_lock_nested() In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is introduced. For example, when duplicating maple tree, we need to hold the locks of two trees, in which case nested locks are needed. At the same time, add the definition of spin_lock_nested() in tools for testing. Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit b2472efe4316b2687c153919c1513a098bd82c17 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I06f0eb0a32a2f39b7842de08a0e5ce59895345c5 Signed-off-by: Suren Baghdasaryan --- include/linux/maple_tree.h | 4 ++++ tools/include/linux/spinlock.h | 1 + 2 files changed, 5 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 6053986cbdf6..73d9f342eac4 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -249,6 +249,8 @@ struct maple_tree { struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_lock_nested(mas, subclass) \ + spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* @@ -399,6 +401,8 @@ struct ma_wr_state { }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_lock_nested(mas, subclass) \ + spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 622266b197d0..a6cdf25b6b9d 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -11,6 +11,7 @@ #define spin_lock_init(x) pthread_mutex_init(x, NULL) #define spin_lock(x) pthread_mutex_lock(x) +#define spin_lock_nested(x, subclass) pthread_mutex_lock(x) #define spin_unlock(x) pthread_mutex_unlock(x) #define spin_lock_bh(x) pthread_mutex_lock(x) #define spin_unlock_bh(x) pthread_mutex_unlock(x) From eb5048ea90b2f2935edfce9e3182a8aee3f2c8fd Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:38 +0800 Subject: [PATCH 49/78] FROMGIT: maple_tree: introduce interfaces __mt_dup() and mtree_dup() Introduce interfaces __mt_dup() and mtree_dup(), which are used to duplicate a maple tree. They duplicate a maple tree in Depth-First Search (DFS) pre-order traversal. It uses memcopy() to copy nodes in the source tree and allocate new child nodes in non-leaf nodes. The new node is exactly the same as the source node except for all the addresses stored in it. It will be faster than traversing all elements in the source tree and inserting them one by one into the new tree. The time complexity of these two functions is O(n). The difference between __mt_dup() and mtree_dup() is that mtree_dup() handles locks internally. Analysis of the average time complexity of this algorithm: For simplicity, let's assume that the maximum branching factor of all non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a full tree. Under the given conditions, if there is a maple tree with n elements, the number of its leaves is n/16. From bottom to top, the number of nodes in each level is 1/16 of the number of nodes in the level below. So the total number of nodes in the entire tree is given by the sum of n/16 + n/16^2 + n/16^3 + ... + 1. This is a geometric series, and it has log(n) terms with base 16. According to the formula for the sum of a geometric series, the sum of this series can be calculated as (n-1)/15. Each node has only one parent node pointer, which can be considered as an edge. In total, there are (n-1)/15-1 edges. This algorithm consists of two operations: 1. Traversing all nodes in DFS order. 2. For each node, making a copy and performing necessary modifications to create a new node. For the first part, DFS traversal will visit each edge twice. Let T(ascend) represent the cost of taking one step downwards, and T(descend) represent the cost of taking one step upwards. And both of them are constants (although mas_ascend() may not be, as it contains a loop, but here we ignore it and treat it as a constant). So the time spent on the first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)). For the second part, each node will be copied, and the cost of copying a node is denoted as T(copy_node). For each non-leaf node, it is necessary to reallocate all child nodes, and the cost of this operation is denoted as T(dup_alloc). The behavior behind memory allocation is complex and not specific to the maple tree operation. Here, we assume that the time required for a single allocation is constant. Since the size of a node is fixed, both of these symbols are also constants. We can calculate that the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15 - n/16) * T(dup_alloc). Adding both parts together, the total time spent by the algorithm can be represented as: ((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) - n/16 * T(dup_alloc) - (T(ascend) + T(descend)) Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc) Let C2 = T(dup_alloc) Let C3 = T(ascend) + T(descend) Finally, the expression can be simplified as: ((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3). This is a linear function, so the average time complexity is O(n). Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I385759a1184a202498e086458b572c203616b9b4 Signed-off-by: Suren Baghdasaryan --- include/linux/maple_tree.h | 3 + lib/maple_tree.c | 274 +++++++++++++++++++++++++++++++++++++ 2 files changed, 277 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 73d9f342eac4..7ccb05ba08ce 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -322,6 +322,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); + void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d932bf27fa0b..3bc5414f78ab 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4,6 +4,8 @@ * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett * Matthew Wilcox + * Copyright (c) 2023 ByteDance + * Author: Peng Zhang */ /* @@ -6537,6 +6539,278 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index) } EXPORT_SYMBOL(mtree_erase); +/* + * mas_dup_free() - Free an incomplete duplication of a tree. + * @mas: The maple state of a incomplete tree. + * + * The parameter @mas->node passed in indicates that the allocation failed on + * this node. This function frees all nodes starting from @mas->node in the + * reverse order of mas_dup_build(). There is no need to hold the source tree + * lock at this time. + */ +static void mas_dup_free(struct ma_state *mas) +{ + struct maple_node *node; + enum maple_type type; + void __rcu **slots; + unsigned char count, i; + + /* Maybe the first node allocation failed. */ + if (mas_is_none(mas)) + return; + + while (!mte_is_root(mas->node)) { + mas_ascend(mas); + if (mas->offset) { + mas->offset--; + do { + mas_descend(mas); + mas->offset = mas_data_end(mas); + } while (!mte_is_leaf(mas->node)); + + mas_ascend(mas); + } + + node = mte_to_node(mas->node); + type = mte_node_type(mas->node); + slots = ma_slots(node, type); + count = mas_data_end(mas) + 1; + for (i = 0; i < count; i++) + ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; + mt_free_bulk(count, slots); + } + + node = mte_to_node(mas->node); + mt_free_one(node); +} + +/* + * mas_copy_node() - Copy a maple node and replace the parent. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @parent: The parent of the new node. + * + * Copy @mas->node to @new_mas->node, set @parent to be the parent of + * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, + struct maple_pnode *parent) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + unsigned long val; + + /* Copy the node completely. */ + memcpy(new_node, node, sizeof(struct maple_node)); + /* Update the parent node pointer. */ + val = (unsigned long)node->parent & MAPLE_NODE_MASK; + new_node->parent = ma_parent_ptr(val | (unsigned long)parent); +} + +/* + * mas_dup_alloc() - Allocate child nodes for a maple node. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function allocates child nodes for @new_mas->node during the duplication + * process. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + enum maple_type type; + unsigned char request, count, i; + void __rcu **slots; + void __rcu **new_slots; + unsigned long val; + + /* Allocate memory for child nodes. */ + type = mte_node_type(mas->node); + new_slots = ma_slots(new_node, type); + request = mas_data_end(mas) + 1; + count = mt_alloc_bulk(gfp, request, (void **)new_slots); + if (unlikely(count < request)) { + memset(new_slots, 0, request * sizeof(void *)); + mas_set_err(mas, -ENOMEM); + return; + } + + /* Restore node type information in slots. */ + slots = ma_slots(node, type); + for (i = 0; i < count; i++) { + val = (unsigned long)mt_slot_locked(mas->tree, slots, i); + val &= MAPLE_NODE_MASK; + ((unsigned long *)new_slots)[i] |= val; + } +} + +/* + * mas_dup_build() - Build a new maple tree from a source tree + * @mas: The maple state of source tree, need to be in MAS_START state. + * @new_mas: The maple state of new tree, need to be in MAS_START state. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function builds a new tree in DFS preorder. If the memory allocation + * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the + * last node. mas_dup_free() will free the incomplete duplication of a tree. + * + * Note that the attributes of the two trees need to be exactly the same, and the + * new tree needs to be empty, otherwise -EINVAL will be set in @mas. + */ +static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node; + struct maple_pnode *parent = NULL; + struct maple_enode *root; + enum maple_type type; + + if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || + unlikely(!mtree_empty(new_mas->tree))) { + mas_set_err(mas, -EINVAL); + return; + } + + root = mas_start(mas); + if (mas_is_ptr(mas) || mas_is_none(mas)) + goto set_new_tree; + + node = mt_alloc_one(gfp); + if (!node) { + new_mas->node = MAS_NONE; + mas_set_err(mas, -ENOMEM); + return; + } + + type = mte_node_type(mas->node); + root = mt_mk_node(node, type); + new_mas->node = root; + new_mas->min = 0; + new_mas->max = ULONG_MAX; + root = mte_mk_root(root); + while (1) { + mas_copy_node(mas, new_mas, parent); + if (!mte_is_leaf(mas->node)) { + /* Only allocate child nodes for non-leaf nodes. */ + mas_dup_alloc(mas, new_mas, gfp); + if (unlikely(mas_is_err(mas))) + return; + } else { + /* + * This is the last leaf node and duplication is + * completed. + */ + if (mas->max == ULONG_MAX) + goto done; + + /* This is not the last leaf node and needs to go up. */ + do { + mas_ascend(mas); + mas_ascend(new_mas); + } while (mas->offset == mas_data_end(mas)); + + /* Move to the next subtree. */ + mas->offset++; + new_mas->offset++; + } + + mas_descend(mas); + parent = ma_parent_ptr(mte_to_node(new_mas->node)); + mas_descend(new_mas); + mas->offset = 0; + new_mas->offset = 0; + } +done: + /* Specially handle the parent of the root node. */ + mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); +set_new_tree: + /* Make them the same height */ + new_mas->tree->ma_flags = mas->tree->ma_flags; + rcu_assign_pointer(new_mas->tree->ma_root, root); +} + +/** + * __mt_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * Note that the user needs to manually lock the source tree and the new tree. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_dup_build(&mas, &new_mas, gfp); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + return ret; +} +EXPORT_SYMBOL(__mt_dup); + +/** + * mtree_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_lock(&new_mas); + mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); + mas_dup_build(&mas, &new_mas, gfp); + mas_unlock(&mas); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + mas_unlock(&new_mas); + return ret; +} +EXPORT_SYMBOL(mtree_dup); + /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree From f73f881af49ecfcc16be7e460ed8046a91d259ad Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:39 +0800 Subject: [PATCH 50/78] FROMGIT: radix tree test suite: align kmem_cache_alloc_bulk() with kernel behavior. When kmem_cache_alloc_bulk() fails to allocate, leave the freed pointers in the array. This enables a more accurate simulation of the kernel's behavior and allows for testing potential double-free scenarios. Link: https://lkml.kernel.org/r/20231027033845.90608-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 46c99e26f2f86260fed226cab217d0b3ca8dca56 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: If822e9d219066e1573b7c044ef9a7344f652e365 Signed-off-by: Suren Baghdasaryan --- tools/testing/radix-tree/linux.c | 45 +++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index d587a558997f..64a1645ff94c 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -93,13 +93,9 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return p; } -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) { assert(objp); - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); if (cachep->nr_objs > 10 || cachep->align) { memset(objp, POISON_FREE, cachep->size); free(objp); @@ -111,6 +107,15 @@ void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) } } +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + void kmem_cache_free(struct kmem_cache *cachep, void *objp) { pthread_mutex_lock(&cachep->lock); @@ -141,18 +146,17 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, if (kmalloc_verbose) pr_debug("Bulk alloc %lu\n", size); - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (cachep->non_kernel < size) - return 0; - - cachep->non_kernel -= size; - } - pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs >= size) { struct radix_tree_node *node; for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + node = cachep->objs; cachep->nr_objs--; cachep->objs = node->parent; @@ -163,11 +167,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } else { pthread_mutex_unlock(&cachep->lock); for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + if (cachep->align) { posix_memalign(&p[i], cachep->align, cachep->size * size); } else { p[i] = malloc(cachep->size * size); + if (!p[i]) + break; } if (cachep->ctor) cachep->ctor(p[i]); @@ -176,6 +188,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } } + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + for (i = 0; i < size; i++) { uatomic_inc(&nr_allocated); uatomic_inc(&cachep->nr_allocated); From 7befa7bbc90ae65849a08dce9bf5ce1845f19494 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:40 +0800 Subject: [PATCH 51/78] FROMGIT: maple_tree: add test for mtree_dup() Add test for mtree_dup(). Test by duplicating different maple trees and then comparing the two trees. Includes tests for duplicating full trees and memory allocation failures on different nodes. Link: https://lkml.kernel.org/r/20231027033845.90608-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit a2587a7e8d37885dc063255f5400a66299b42e48 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I7501db5735b1dfd15240ef2946b26d63ffe1d8e0 Signed-off-by: Suren Baghdasaryan --- tools/testing/radix-tree/maple.c | 361 +++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index b598b7fe4419..916cf8b45ddf 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35753,6 +35753,363 @@ static noinline void __init check_locky(struct maple_tree *mt) mt_clear_in_rcu(mt); } +/* + * Compares two nodes except for the addresses stored in the nodes. + * Returns zero if they are the same, otherwise returns non-zero. + */ +static int __init compare_node(struct maple_enode *enode_a, + struct maple_enode *enode_b) +{ + struct maple_node *node_a, *node_b; + struct maple_node a, b; + void **slots_a, **slots_b; /* Do not use the rcu tag. */ + enum maple_type type; + int i; + + if (((unsigned long)enode_a & MAPLE_NODE_MASK) != + ((unsigned long)enode_b & MAPLE_NODE_MASK)) { + pr_err("The lower 8 bits of enode are different.\n"); + return -1; + } + + type = mte_node_type(enode_a); + node_a = mte_to_node(enode_a); + node_b = mte_to_node(enode_b); + a = *node_a; + b = *node_b; + + /* Do not compare addresses. */ + if (ma_is_root(node_a) || ma_is_root(node_b)) { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MA_ROOT_PARENT); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MA_ROOT_PARENT); + } else { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MAPLE_NODE_MASK); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MAPLE_NODE_MASK); + } + + if (a.parent != b.parent) { + pr_err("The lower 8 bits of parents are different. %p %p\n", + a.parent, b.parent); + return -1; + } + + /* + * If it is a leaf node, the slots do not contain the node address, and + * no special processing of slots is required. + */ + if (ma_is_leaf(type)) + goto cmp; + + slots_a = ma_slots(&a, type); + slots_b = ma_slots(&b, type); + + for (i = 0; i < mt_slots[type]; i++) { + if (!slots_a[i] && !slots_b[i]) + break; + + if (!slots_a[i] || !slots_b[i]) { + pr_err("The number of slots is different.\n"); + return -1; + } + + /* Do not compare addresses in slots. */ + ((unsigned long *)slots_a)[i] &= MAPLE_NODE_MASK; + ((unsigned long *)slots_b)[i] &= MAPLE_NODE_MASK; + } + +cmp: + /* + * Compare all contents of two nodes, including parent (except address), + * slots (except address), pivots, gaps and metadata. + */ + return memcmp(&a, &b, sizeof(struct maple_node)); +} + +/* + * Compare two trees and return 0 if they are the same, non-zero otherwise. + */ +static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) +{ + MA_STATE(mas_a, mt_a, 0, 0); + MA_STATE(mas_b, mt_b, 0, 0); + + if (mt_a->ma_flags != mt_b->ma_flags) { + pr_err("The flags of the two trees are different.\n"); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + + if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { + if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { + pr_err("One is MAS_ROOT and the other is not.\n"); + return -1; + } + return 0; + } + + while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { + + if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { + pr_err("One is MAS_NONE and the other is not.\n"); + return -1; + } + + if (mas_a.min != mas_b.min || + mas_a.max != mas_b.max) { + pr_err("mas->min, mas->max do not match.\n"); + return -1; + } + + if (compare_node(mas_a.node, mas_b.node)) { + pr_err("The contents of nodes %p and %p are different.\n", + mas_a.node, mas_b.node); + mt_dump(mt_a, mt_dump_dec); + mt_dump(mt_b, mt_dump_dec); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + } + + return 0; +} + +static __init void mas_subtree_max_range(struct ma_state *mas) +{ + unsigned long limit = mas->max; + MA_STATE(newmas, mas->tree, 0, 0); + void *entry; + + mas_for_each(mas, entry, limit) { + if (mas->last - mas->index >= + newmas.last - newmas.index) { + newmas = *mas; + } + } + + *mas = newmas; +} + +/* + * build_full_tree() - Build a full tree. + * @mt: The tree to build. + * @flags: Use @flags to build the tree. + * @height: The height of the tree to build. + * + * Build a tree with full leaf nodes and internal nodes. Note that the height + * should not exceed 3, otherwise it will take a long time to build. + * Return: zero if the build is successful, non-zero if it fails. + */ +static __init int build_full_tree(struct maple_tree *mt, unsigned int flags, + int height) +{ + MA_STATE(mas, mt, 0, 0); + unsigned long step; + int ret = 0, cnt = 1; + enum maple_type type; + + mt_init_flags(mt, flags); + mtree_insert_range(mt, 0, ULONG_MAX, xa_mk_value(5), GFP_KERNEL); + + mtree_lock(mt); + + while (1) { + mas_set(&mas, 0); + if (mt_height(mt) < height) { + mas.max = ULONG_MAX; + goto store; + } + + while (1) { + mas_dfs_preorder(&mas); + if (mas_is_none(&mas)) + goto unlock; + + type = mte_node_type(mas.node); + if (mas_data_end(&mas) + 1 < mt_slots[type]) { + mas_set(&mas, mas.min); + goto store; + } + } +store: + mas_subtree_max_range(&mas); + step = mas.last - mas.index; + if (step < 1) { + ret = -1; + goto unlock; + } + + step /= 2; + mas.last = mas.index + step; + mas_store_gfp(&mas, xa_mk_value(5), + GFP_KERNEL); + ++cnt; + } +unlock: + mtree_unlock(mt); + + MT_BUG_ON(mt, mt_height(mt) != height); + /* pr_info("height:%u number of elements:%d\n", mt_height(mt), cnt); */ + return ret; +} + +static noinline void __init check_mtree_dup(struct maple_tree *mt) +{ + DEFINE_MTREE(new); + int i, j, ret, count = 0; + unsigned int rand_seed = 17, rand; + + /* store a value at [0, 0] */ + mt_init_flags(mt, 0); + mtree_store_range(mt, 0, 0, xa_mk_value(0), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + + /* The two trees have different attributes. */ + mt_init_flags(mt, 0); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* The new tree is not empty */ + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + mtree_store(&new, 5, xa_mk_value(5), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Test for duplicating full trees. */ + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, 0, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, 0); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test for normal duplicating. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test memory allocation failed. */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i < 30; i += 3) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + /* Failed at the first node. */ + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + mt_set_non_kernel(0); + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + MT_BUG_ON(&new, ret != -ENOMEM); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Random maple tree fails at a random node. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + /* + * The rand() library function is not used, so we can generate + * the same random numbers on any platform. + */ + rand_seed = rand_seed * 1103515245 + 12345; + rand = rand_seed / 65536 % 128; + mt_set_non_kernel(rand); + + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + if (ret != 0) { + MT_BUG_ON(&new, ret != -ENOMEM); + count++; + mtree_destroy(mt); + continue; + } + + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* pr_info("mtree_dup() fail %d times\n", count); */ + BUG_ON(!count); +} + extern void test_kmem_cache_bulk(void); void farmer_tests(void) @@ -35800,6 +36157,10 @@ void farmer_tests(void) check_null_expand(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, 0); + check_mtree_dup(&tree); + mtree_destroy(&tree); + /* RCU testing */ mt_init_flags(&tree, 0); check_erase_testset(&tree); From c79ca61edc7b3d27a6fd3acbeb81d4638e1acdec Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:41 +0800 Subject: [PATCH 52/78] FROMGIT: maple_tree: update the documentation of maple tree Introduce the new interface mtree_dup() in the documentation. Link: https://lkml.kernel.org/r/20231027033845.90608-7-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 9bc1d3cdb904170214456bca96c4924f28522ab8 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I3eb330f0be49f7e8d8b37ecb64de3b7ef349c05b Signed-off-by: Suren Baghdasaryan --- Documentation/core-api/maple_tree.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst index 45defcf15da7..285e2d2b21ae 100644 --- a/Documentation/core-api/maple_tree.rst +++ b/Documentation/core-api/maple_tree.rst @@ -81,6 +81,9 @@ section. Sometimes it is necessary to ensure the next call to store to a maple tree does not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case. +You can use mtree_dup() to duplicate an entire maple tree. It is a more +efficient way than inserting all elements one by one into a new tree. + Finally, you can remove all entries from a maple tree by calling mtree_destroy(). If the maple tree entries are pointers, you may wish to free the entries first. @@ -112,6 +115,7 @@ Takes ma_lock internally: * mtree_insert() * mtree_insert_range() * mtree_erase() + * mtree_dup() * mtree_destroy() * mt_set_in_rcu() * mt_clear_in_rcu() From e57d333531ad6cfac38e42826d0229c03d6b6b41 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:42 +0800 Subject: [PATCH 53/78] FROMGIT: maple_tree: skip other tests when BENCH is enabled Skip other tests when BENCH is enabled so that performance can be measured in user space. Link: https://lkml.kernel.org/r/20231027033845.90608-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit f670fa1caadb4ea532a89012c5451e4c6789bfcc https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I0a761a4b6211b19ec80c97d5aef80f3979523bcb Signed-off-by: Suren Baghdasaryan --- lib/test_maple_tree.c | 8 ++++---- tools/testing/radix-tree/maple.c | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index ab9d4461abc9..b80e28cdaa96 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2741,10 +2741,6 @@ static int __init maple_tree_seed(void) pr_info("\nTEST STARTING\n\n"); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_root_expand(&tree); - mtree_destroy(&tree); - #if defined(BENCH_SLOT_STORE) #define BENCH mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); @@ -2788,6 +2784,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_root_expand(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); mtree_destroy(&tree); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 916cf8b45ddf..a2626f02f385 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36195,7 +36195,9 @@ void farmer_tests(void) void maple_tree_tests(void) { +#if !defined(BENCH) farmer_tests(); +#endif maple_tree_seed(); maple_tree_harvest(); } From 1bec2dd52eabfd5c2908353ab7f3c2f324131da9 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:43 +0800 Subject: [PATCH 54/78] FROMGIT: maple_tree: update check_forking() and bench_forking() Updated check_forking() and bench_forking() to use __mt_dup() to duplicate maple tree. Link: https://lkml.kernel.org/r/20231027033845.90608-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 446e1867e6df3cbdd19af6be8f8f4ed56176adb4 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I3b64ad1cb5ae40b10dc86ee55501f12522c7e2f5 Signed-off-by: Suren Baghdasaryan --- lib/test_maple_tree.c | 117 ++++++++++++++++++------------------ tools/include/linux/rwsem.h | 4 ++ 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index b80e28cdaa96..68b2c387fddb 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1671,47 +1671,48 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt) #endif /* check_forking - simulate the kernel forking sequence with the tree. */ -static noinline void __init check_forking(struct maple_tree *mt) +static noinline void __init check_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134; + struct maple_tree mt, newmt; + int i, nr_entries = 134, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); - mt_set_non_kernel(99999); mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&newmt, &newmt_lock); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - down_write(&newmt_lock); - mas.index = 0; - mas.last = 0; - if (mas_expected_entries(&newmas, nr_entries)) { + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { pr_err("OOM!"); BUG_ON(1); } - rcu_read_lock(); - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } - rcu_read_unlock(); + mas_destroy(&newmas); + mas_destroy(&mas); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); + __mt_destroy(&mt); up_write(&newmt_lock); + up_write(&mt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1815,49 +1816,51 @@ static noinline void __init check_mas_store_gfp(struct maple_tree *mt) } #if defined(BENCH_FORK) -static noinline void __init bench_forking(struct maple_tree *mt) +static noinline void __init bench_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134, nr_fork = 80000; + struct maple_tree mt, newmt; + int i, nr_entries = 134, nr_fork = 80000, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - mt_set_external_lock(&newmt, &newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } for (i = 0; i < nr_fork; i++) { - mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - mas.index = 0; - mas.last = 0; - rcu_read_lock(); - down_write(&newmt_lock); - if (mas_expected_entries(&newmas, nr_entries)) { - printk("OOM!"); + mt_init_flags(&newmt, + MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { + pr_err("OOM!"); BUG_ON(1); } - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } + mas_destroy(&newmas); - rcu_read_unlock(); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); up_write(&newmt_lock); } + mas_destroy(&mas); + __mt_destroy(&mt); + up_write(&mt_lock); } #endif @@ -2771,9 +2774,7 @@ static int __init maple_tree_seed(void) #endif #if defined(BENCH_FORK) #define BENCH - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - bench_forking(&tree); - mtree_destroy(&tree); + bench_forking(); goto skip; #endif #if defined(BENCH_MT_FOR_EACH) @@ -2792,9 +2793,7 @@ static int __init maple_tree_seed(void) check_iteration(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_forking(&tree); - mtree_destroy(&tree); + check_forking(); mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_mas_store_gfp(&tree); diff --git a/tools/include/linux/rwsem.h b/tools/include/linux/rwsem.h index 83971b3cbfce..f8bffd4a987c 100644 --- a/tools/include/linux/rwsem.h +++ b/tools/include/linux/rwsem.h @@ -37,4 +37,8 @@ static inline int up_write(struct rw_semaphore *sem) { return pthread_rwlock_unlock(&sem->lock); } + +#define down_read_nested(sem, subclass) down_read(sem) +#define down_write_nested(sem, subclass) down_write(sem) + #endif /* _TOOLS_RWSEM_H */ From 3743b40f655249489c00b52a1481f789eba2f4bb Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:44 +0800 Subject: [PATCH 55/78] FROMGIT: maple_tree: preserve the tree attributes when destroying maple tree When destroying maple tree, preserve its attributes and then turn it into an empty tree. This allows it to be reused without needing to be reinitialized. Link: https://lkml.kernel.org/r/20231027033845.90608-10-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 8e50d32c7a89bde896945e4e572ef28ccd87bbf8 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: If1725d5a37dcd26bec23e6bffe95d877903dfab1 Signed-off-by: Suren Baghdasaryan --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3bc5414f78ab..1200ff73c1b0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6825,7 +6825,7 @@ void __mt_destroy(struct maple_tree *mt) if (xa_is_node(root)) mte_destroy_walk(root, mt); - mt->ma_flags = 0; + mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); From ed9b660cd1ad6746e8357e9717981f0c7ceb73ce Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: [PATCH 56/78] BACKPORT: FROMGIT fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). Since it is only a replacement operation for each element, this process is also O(n). Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop releasing VMAs that have not been duplicated after this point. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% [1] https://github.com/kdlucas/byte-unixbench/tree/master Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit d2406291483775ecddaee929231a39c70c08fda2 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) [surenb: open-coded vma_iter_clear_gfp(), vma_iter_bulk_store(); replaced vma_next() with mas_find()] Bug: 308042511 Change-Id: I42d6620e8ce6a0b16211c231a9b72ba16ba9c0d2 Signed-off-by: Suren Baghdasaryan --- kernel/fork.c | 40 +++++++++++++++++++++++++++++----------- mm/memory.c | 7 ++++++- mm/mmap.c | 9 ++++++--- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 9ef103c05891..1109a10c5ccd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); @@ -687,16 +686,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); - if (retval) + /* Use __mt_dup() to efficiently build an identical maple tree. */ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) goto out; mt_clear_in_rcu(mas.tree); - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + mas_for_each(&mas, mpnt, ULONG_MAX) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { + __mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + if (unlikely(mas_is_err(&mas))) { + retval = -ENOMEM; + goto loop_out; + } vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } @@ -758,12 +764,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); - /* Link the vma into the MT */ + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -772,15 +779,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = mas_find(&mas, ULONG_MAX); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: mas_destroy(&mas); - if (!retval) + if (!retval) { mt_set_in_rcu(mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&mas, XA_ZERO_ENTRY); + } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -790,8 +810,6 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: - unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: diff --git a/mm/memory.c b/mm/memory.c index 7f29d9394bdf..5e161551a5d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -411,6 +411,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, * be 0. This will underflow and is okay. */ next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing @@ -432,6 +434,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1736,7 +1740,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details, mm_wr_locked); - } while ((vma = mas_find(&mas, end_t - 1)) != NULL); + vma = mas_find(&mas, end_t - 1); + } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index bd2140cfcf36..d5c48b243869 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3303,10 +3303,11 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); - if (!vma) { + if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); - return; + mmap_write_lock(mm); + goto destroy; } lru_add_drain(); @@ -3339,11 +3340,13 @@ void exit_mmap(struct mm_struct *mm) remove_vma(vma, true); count++; cond_resched(); - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + vma = mas_find(&mas, ULONG_MAX); + } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); +destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); From 8a597e7a2d06d699fb7b6c7787291e3916a502e2 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Tue, 31 Oct 2023 12:15:46 +0000 Subject: [PATCH 57/78] ANDROID: KVM: arm64: Don't prepopulate MMIO regions for host stage-2 As we reserve only 1GB of memory for the MMIO region don't prepopulate the entire remaining address space with MMIO as this is prone to failure. Instead, let the MMIO regions to be created lazily on the fault path and keep only the RAM regions prepopulated. Bug: 307805059 Test: Boot pKVM with CONFIG_ARM64_16K_PAGES Change-Id: I6327f42eb17c6588335a1e04736393c9032114ab Signed-off-by: Sebastian Ene --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2c1032a59826..b3920a37f334 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -149,22 +149,16 @@ static void prepare_host_vtcr(void) static int prepopulate_host_stage2(void) { struct memblock_region *reg; - u64 addr = 0; - int i, ret; + int i, ret = 0; for (i = 0; i < hyp_memblock_nr; i++) { reg = &hyp_memory[i]; - ret = host_stage2_idmap_locked(addr, reg->base - addr, PKVM_HOST_MMIO_PROT, false); - if (ret) - return ret; ret = host_stage2_idmap_locked(reg->base, reg->size, PKVM_HOST_MEM_PROT, false); if (ret) return ret; - addr = reg->base + reg->size; } - return host_stage2_idmap_locked(addr, BIT(host_mmu.pgt.ia_bits) - addr, PKVM_HOST_MMIO_PROT, - false); + return ret; } int kvm_host_prepare_stage2(void *pgt_pool_base) From 934a40576ee2c3f07dc59e38a339767d51a84238 Mon Sep 17 00:00:00 2001 From: Stanley Chang Date: Wed, 19 Apr 2023 10:00:42 +0800 Subject: [PATCH 58/78] UPSTREAM: usb: dwc3: core: add support for disabling High-speed park mode Setting the PARKMODE_DISABLE_HS bit in the DWC3_USB3_GUCTL1. When this bit is set to '1' all HS bus instances in park mode are disabled For some USB wifi devices, if enable this feature it will reduce the performance. Therefore, add an option for disabling HS park mode by device-tree. In Synopsys's dwc3 data book: In a few high speed devices when an IN request is sent within 900ns of the ACK of the previous packet, these devices send a NAK. When connected to these devices, if required, the software can disable the park mode if you see performance drop in your system. When park mode is disabled, pipelining of multiple packet is disabled and instead one packet at a time is requested by the scheduler. This allows up to 12 NAKs in a micro-frame and improves performance of these slow devices. Bug: 300024866 Acked-by: Thinh Nguyen Signed-off-by: Stanley Chang Link: https://lore.kernel.org/r/20230419020044.15475-1-stanley_chang@realtek.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: William Wu (cherry picked from commit d21a797a3eeb2b001e07ff943e5611eab67a71a3) Change-Id: I43ee416e54779a073a0ba4057edf4be8bd7886de Signed-off-by: Kever Yang --- drivers/usb/dwc3/core.c | 5 +++++ drivers/usb/dwc3/core.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 3ee70ffaf003..2b01c3e05ebe 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1233,6 +1233,9 @@ static int dwc3_core_init(struct dwc3 *dwc) if (dwc->parkmode_disable_ss_quirk) reg |= DWC3_GUCTL1_PARKMODE_DISABLE_SS; + if (dwc->parkmode_disable_hs_quirk) + reg |= DWC3_GUCTL1_PARKMODE_DISABLE_HS; + if (DWC3_VER_IS_WITHIN(DWC3, 290A, ANY) && (dwc->maximum_speed == USB_SPEED_HIGH || dwc->maximum_speed == USB_SPEED_FULL)) @@ -1539,6 +1542,8 @@ static void dwc3_get_properties(struct dwc3 *dwc) "snps,resume-hs-terminations"); dwc->parkmode_disable_ss_quirk = device_property_read_bool(dev, "snps,parkmode-disable-ss-quirk"); + dwc->parkmode_disable_hs_quirk = device_property_read_bool(dev, + "snps,parkmode-disable-hs-quirk"); dwc->gfladj_refclk_lpm_sel = device_property_read_bool(dev, "snps,gfladj-refclk-lpm-sel-quirk"); diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 89219a14efb0..d21888658806 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -263,6 +263,7 @@ #define DWC3_GUCTL1_DEV_FORCE_20_CLK_FOR_30_CLK BIT(26) #define DWC3_GUCTL1_DEV_L1_EXIT_BY_HW BIT(24) #define DWC3_GUCTL1_PARKMODE_DISABLE_SS BIT(17) +#define DWC3_GUCTL1_PARKMODE_DISABLE_HS BIT(16) #define DWC3_GUCTL1_RESUME_OPMODE_HS_HOST BIT(10) /* Global Status Register */ @@ -1113,6 +1114,8 @@ struct dwc3_scratchpad_array { * generation after resume from suspend. * @parkmode_disable_ss_quirk: set if we need to disable all SuperSpeed * instances in park mode. + * @parkmode_disable_hs_quirk: set if we need to disable all HishSpeed + * instances in park mode. * @tx_de_emphasis_quirk: set if we enable Tx de-emphasis quirk * @tx_de_emphasis: Tx de-emphasis value * 0 - -6dB de-emphasis @@ -1330,6 +1333,7 @@ struct dwc3 { unsigned dis_tx_ipgap_linecheck_quirk:1; unsigned resume_hs_terminations:1; unsigned parkmode_disable_ss_quirk:1; + unsigned parkmode_disable_hs_quirk:1; unsigned gfladj_refclk_lpm_sel:1; unsigned tx_de_emphasis_quirk:1; From ef67750d99e2a666ee18a46aeb0615516845c089 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Wed, 3 Jan 2024 06:56:23 +0000 Subject: [PATCH 59/78] ANDROID: sched: Export symbols for vendor modules Export sysctl_sched_min_granularity and sysctl_sched_idle_min_granularity. In the vendor module, it will use several static function in GKI, while we do not want to export these static functions, which will need to make them not static, we copied them to the vendor module, so we need the export the symbols used in those static functions. For example, sysctl_sched_min_granularity and sysctl_sched_idle_min_granularity are referred in sched_slice(), and they are only used as read-only. Bug: 316276520 Change-Id: I976d0a1f3a70e8e60099e55fdd3cc99a90053fbb Signed-off-by: Rick Yiu --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 507d1fc4e163..ac870e416e2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,6 +96,7 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity); static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* @@ -105,6 +106,7 @@ static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; * (default: 0.75 msec) */ unsigned int sysctl_sched_idle_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_idle_min_granularity); /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity From ac90f0829243d0c6d2f9b219b519c0d0be696ab6 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Wed, 3 Jan 2024 07:22:49 +0000 Subject: [PATCH 60/78] ANDROID: Update the ABI symbol list Adding the following symbols: - sysctl_sched_idle_min_granularity - sysctl_sched_min_granularity Bug: 316276520 Change-Id: I8e33c3105a3ca62d168a6289ceafc31404757453 Signed-off-by: Rick Yiu --- android/abi_gki_aarch64.stg | 20 ++++++++++++++++++++ android/abi_gki_aarch64_pixel | 2 ++ 2 files changed, 22 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 4a190c90a757..113fdec98f04 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -388738,6 +388738,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_features" } +elf_symbol { + id: 0xe6ea21b1 + name: "sysctl_sched_idle_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x69545cfa + type_id: 0x4585663f + full_name: "sysctl_sched_idle_min_granularity" +} elf_symbol { id: 0x87812861 name: "sysctl_sched_latency" @@ -388747,6 +388756,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_latency" } +elf_symbol { + id: 0x34555a8a + name: "sysctl_sched_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x04390257 + type_id: 0x4585663f + full_name: "sysctl_sched_min_granularity" +} elf_symbol { id: 0x18d0dd21 name: "sysctl_vals" @@ -405076,7 +405094,9 @@ interface { symbol_id: 0x2f857527 symbol_id: 0x3e5f4f82 symbol_id: 0xbf1515af + symbol_id: 0xe6ea21b1 symbol_id: 0x87812861 + symbol_id: 0x34555a8a symbol_id: 0x18d0dd21 symbol_id: 0x92705587 symbol_id: 0xdbe66171 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index a01a49721a1a..1acd4b663615 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -2091,7 +2091,9 @@ synchronize_rcu syscon_regmap_lookup_by_phandle sysctl_sched_features + sysctl_sched_idle_min_granularity sysctl_sched_latency + sysctl_sched_min_granularity sysfs_add_file_to_group sysfs_add_link_to_group sysfs_create_file_ns From 98b0e4cf0968083bfeaf1b63e5b5873acf534566 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 15 Sep 2023 17:31:06 +0300 Subject: [PATCH 61/78] BACKPORT: xhci: track port suspend state correctly in unsuccessful resume cases xhci-hub.c tracks suspended ports in a suspended_port bitfield. This is checked when responding to a Get_Status(PORT) request to see if a port in running U0 state was recently resumed, and adds the required USB_PORT_STAT_C_SUSPEND change bit in those cases. The suspended_port bit was left uncleared if a device is disconnected during suspend. The bit remained set even when a new device was connected and enumerated. The set bit resulted in a incorrect Get_Status(PORT) response with a bogus USB_PORT_STAT_C_SUSPEND change bit set once the new device reached U0 link state. USB_PORT_STAT_C_SUSPEND change bit is only used for USB2 ports, but xhci-hub keeps track of both USB2 and USB3 suspended ports. Cc: stable@vger.kernel.org Reported-by: Wesley Cheng Closes: https://lore.kernel.org/linux-usb/d68aa806-b26a-0e43-42fb-b8067325e967@quicinc.com/ Fixes: 1d5810b6923c ("xhci: Rework port suspend structures for limited ports.") Tested-by: Wesley Cheng Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20230915143108.1532163-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman Bug: 200589374 (cherry picked from commit d7cdfc319b2bcf6899ab0a05eec0958bc802a9a1 https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb-next) [wcheng: modified change to remove dependency on updated resume timestamp tracking] Change-Id: Icccc1778a1f193b4b4c03532d291db88772bd454 Signed-off-by: Wesley Cheng --- drivers/usb/host/xhci-hub.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 544028862de0..10eb2175e749 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1053,19 +1053,19 @@ static void xhci_get_usb3_port_status(struct xhci_port *port, u32 *status, *status |= USB_PORT_STAT_C_CONFIG_ERROR << 16; /* USB3 specific wPortStatus bits */ - if (portsc & PORT_POWER) { + if (portsc & PORT_POWER) *status |= USB_SS_PORT_STAT_POWER; - /* link state handling */ - if (link_state == XDEV_U0) - bus_state->suspended_ports &= ~(1 << portnum); - } - /* remote wake resume signaling complete */ - if (bus_state->port_remote_wakeup & (1 << portnum) && + /* no longer suspended or resuming */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME && link_state != XDEV_RECOVERY) { - bus_state->port_remote_wakeup &= ~(1 << portnum); - usb_hcd_end_port_resume(&hcd->self, portnum); + /* remote wake resume signaling complete */ + if (bus_state->port_remote_wakeup & (1 << portnum)) { + bus_state->port_remote_wakeup &= ~(1 << portnum); + usb_hcd_end_port_resume(&hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); } xhci_hub_report_usb3_link_state(xhci, status, portsc); @@ -1111,6 +1111,21 @@ static void xhci_get_usb2_port_status(struct xhci_port *port, u32 *status, return; } } + + /* + * Clear usb2 resume signalling variables if port is no longer suspended + * or resuming. Port either resumed to U0/U1/U2, disconnected, or in a + * error state. Resume related variables should be cleared in all those cases. + */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME) { + if (bus_state->resume_done[portnum] || + test_bit(portnum, &bus_state->resuming_ports)) { + bus_state->resume_done[portnum] = 0; + clear_bit(portnum, &bus_state->resuming_ports); + usb_hcd_end_port_resume(&port->rhub->hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); + } } /* From ec46fe0ac7cb11d1f9b6b4709745af317a28c489 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Dec 2023 09:30:40 +0100 Subject: [PATCH 62/78] UPSTREAM: bpf: Fix prog_array_map_poke_run map poke update commit 4b7de801606e504e69689df71475d27e35336fb3 upstream. Lee pointed out issue found by syscaller [0] hitting BUG in prog array map poke update in prog_array_map_poke_run function due to error value returned from bpf_arch_text_poke function. There's race window where bpf_arch_text_poke can fail due to missing bpf program kallsym symbols, which is accounted for with check for -EINVAL in that BUG_ON call. The problem is that in such case we won't update the tail call jump and cause imbalance for the next tail call update check which will fail with -EBUSY in bpf_arch_text_poke. I'm hitting following race during the program load: CPU 0 CPU 1 bpf_prog_load bpf_check do_misc_fixups prog_array_map_poke_track map_update_elem bpf_fd_array_map_update_elem prog_array_map_poke_run bpf_arch_text_poke returns -EINVAL bpf_prog_kallsyms_add After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next poke update fails on expected jump instruction check in bpf_arch_text_poke with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run. Similar race exists on the program unload. Fixing this by moving the update to bpf_arch_poke_desc_update function which makes sure we call __bpf_arch_text_poke that skips the bpf address check. Each architecture has slightly different approach wrt looking up bpf address in bpf_arch_text_poke, so instead of splitting the function or adding new 'checkip' argument in previous version, it seems best to move the whole map_poke_run update as arch specific code. [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810 Bug: 309551558 Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Cc: Lee Jones Cc: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 57a6b0a464eb322bd62a78469d251f1d428c5ebb) Signed-off-by: Lee Jones Change-Id: I251c3da579e5d48cd7de4043913fd42d0671d6b5 --- arch/x86/net/bpf_jit_comp.c | 46 +++++++++++++++++++++++++++++ include/linux/bpf.h | 3 ++ kernel/bpf/arraymap.c | 58 +++++++------------------------------ 3 files changed, 59 insertions(+), 48 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5e680e039d0e..4686c1d9d0cf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2553,3 +2553,49 @@ void bpf_jit_free(struct bpf_prog *prog) bpf_prog_unlock_free(prog); } + +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + u8 *old_addr, *new_addr, *old_bypass_addr; + int ret; + + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + /* + * On program loading or teardown, the program's kallsym entry + * might not be in place, so we use __bpf_arch_text_poke to skip + * the kallsyms check. + */ + if (new) { + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0); + if (!old) { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0); + } + } else { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0); + } +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 320d3b287ed0..6a6ff277501d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2697,6 +2697,9 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old); + void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 832b2659e96e..00f23febb9a7 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -997,11 +997,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map, mutex_unlock(&aux->poke_mutex); } +void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + WARN_ON_ONCE(1); +} + static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -1010,7 +1015,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; - int i, ret; + int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; @@ -1029,21 +1034,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. - * 3) On program teardown, the program's kallsym entry gets - * removed out of RCU callback, but we can only untrack - * from sleepable context, therefore bpf_arch_text_poke() - * might not see that this is in BPF text section and - * bails out with -EINVAL. As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1053,39 +1047,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } From 02f444ba07d22d35f4f2ddbf7d7da0643811af15 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Oct 2023 07:30:27 -0600 Subject: [PATCH 63/78] UPSTREAM: io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid commit 7644b1a1c9a7ae8ab99175989bfc8676055edb46 upstream. We could race with SQ thread exit, and if we do, we'll hit a NULL pointer dereference when the thread is cleared. Grab the SQPOLL data lock before attempting to get the task cpu and pid for fdinfo, this ensures we have a stable view of it. Bug: 309790656 Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=218032 Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin (cherry picked from commit 9236d2ea6465b37c0a73d994c1ad31753d31e5f5) Signed-off-by: Lee Jones Change-Id: I044e0285d4535440606ff593230b873e3145db91 --- io_uring/fdinfo.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 882bd56b01ed..ea2c2ded4e41 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -51,7 +51,6 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_sq_data *sq = NULL; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; @@ -62,6 +61,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int cq_shift = 0; unsigned int sq_shift = 0; unsigned int sq_entries, cq_entries; + int sq_pid = -1, sq_cpu = -1; bool has_lock; unsigned int i; @@ -139,13 +139,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, has_lock = mutex_trylock(&ctx->uring_lock); if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; + struct io_sq_data *sq = ctx->sq_data; + + if (mutex_trylock(&sq->lock)) { + if (sq->thread) { + sq_pid = task_pid_nr(sq->thread); + sq_cpu = task_cpu(sq->thread); + } + mutex_unlock(&sq->lock); + } } - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); + seq_printf(m, "SqThread:\t%d\n", sq_pid); + seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct file *f = io_file_from_index(&ctx->file_table, i); From 29544d41573d0085debd0416803ec5287948fccd Mon Sep 17 00:00:00 2001 From: Zhipeng Wang Date: Fri, 5 Jan 2024 19:17:45 +0900 Subject: [PATCH 64/78] ANDROID: ABI: Update symbol list for imx 1 function symbol(s) added 'bool iio_trigger_using_own(struct iio_dev*)' Bug: 318788290 Change-Id: I5b17b2380f7087dabf51f4ed207e9ea4cab1ba38 Signed-off-by: Zhipeng Wang --- android/abi_gki_aarch64.stg | 10 ++++++++++ android/abi_gki_aarch64_imx | 1 + 2 files changed, 11 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 113fdec98f04..dc2644ea5502 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -365298,6 +365298,15 @@ elf_symbol { type_id: 0x16dc304e full_name: "iio_trigger_unregister" } +elf_symbol { + id: 0xfb09b362 + name: "iio_trigger_using_own" + is_defined: true + symbol_type: FUNCTION + crc: 0xe2c1359e + type_id: 0xf886bca4 + full_name: "iio_trigger_using_own" +} elf_symbol { id: 0xdf3e8655 name: "iio_update_buffers" @@ -402489,6 +402498,7 @@ interface { symbol_id: 0x7551a60b symbol_id: 0x08fd4b84 symbol_id: 0xc6d8f246 + symbol_id: 0xfb09b362 symbol_id: 0xdf3e8655 symbol_id: 0x6f2f4bd1 symbol_id: 0xf87ecda4 diff --git a/android/abi_gki_aarch64_imx b/android/abi_gki_aarch64_imx index c3797c477a4c..1b556c8705f6 100644 --- a/android/abi_gki_aarch64_imx +++ b/android/abi_gki_aarch64_imx @@ -1025,6 +1025,7 @@ iio_trigger_poll_chained iio_trigger_register iio_trigger_unregister + iio_trigger_using_own import_iovec in4_pton inet_csk_get_port From d6554d1262c4f39c7b652e02e54bb692e7f20747 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Tue, 21 Nov 2023 14:52:53 -0800 Subject: [PATCH 65/78] FROMGIT: usb: dwc3: gadget: Handle EP0 request dequeuing properly Current EP0 dequeue path will share the same as other EPs. However, there are some special considerations that need to be made for EP0 transfers: - EP0 transfers never transition into the started_list - EP0 only has one active request at a time In case there is a vendor specific control message for a function over USB FFS, then there is no guarantee on the timeline which the DATA/STATUS stage is responded to. While this occurs, any attempt to end transfers on non-control EPs will end up having the DWC3_EP_DELAY_STOP flag set, and defer issuing of the end transfer command. If the USB FFS application decides to timeout the control transfer, or if USB FFS AIO path exits, the USB FFS driver will issue a call to usb_ep_dequeue() for the ep0 request. In case of the AIO exit path, the AIO FS blocks until all pending USB requests utilizing the AIO path is completed. However, since the dequeue of ep0 req does not happen properly, all non-control EPs with the DWC3_EP_DELAY_STOP flag set will not be handled, and the AIO exit path will be stuck waiting for the USB FFS data endpoints to receive a completion callback. Fix is to utilize dwc3_ep0_reset_state() in the dequeue API to ensure EP0 is brought back to the SETUP state, and ensures that any deferred end transfer commands are handled. This also will end any active transfers on EP0, compared to the previous implementation which directly called giveback only. Fixes: fcd2def66392 ("usb: dwc3: gadget: Refactor dwc3_gadget_ep_dequeue") Acked-by: Thinh Nguyen Signed-off-by: Wesley Cheng Bug: 318577849 Change-Id: Ic00684db4b502f1aab128f7e49f22510dda24f60 (cherry picked from commit 730e12fbec53ab59dd807d981a204258a4cfb29a https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb-testing) Signed-off-by: Wesley Cheng --- drivers/usb/dwc3/gadget.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6fdac0fae461..121092e35ec6 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2093,7 +2093,17 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, list_for_each_entry(r, &dep->pending_list, list) { if (r == req) { - dwc3_gadget_giveback(dep, req, -ECONNRESET); + /* + * Explicitly check for EP0/1 as dequeue for those + * EPs need to be handled differently. Control EP + * only deals with one USB req, and giveback will + * occur during dwc3_ep0_stall_and_restart(). EP0 + * requests are never added to started_list. + */ + if (dep->number > 1) + dwc3_gadget_giveback(dep, req, -ECONNRESET); + else + dwc3_ep0_reset_state(dwc); goto out; } } From 02aa72665c85f3398ada5ba37bacd56732799e11 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 2 Oct 2023 13:54:28 +0300 Subject: [PATCH 66/78] UPSTREAM: nvmet-tcp: Fix a possible UAF in queue intialization setup commit d920abd1e7c4884f9ecd0749d1921b7ab19ddfbd upstream. From Alon: "Due to a logical bug in the NVMe-oF/TCP subsystem in the Linux kernel, a malicious user can cause a UAF and a double free, which may lead to RCE (may also lead to an LPE in case the attacker already has local privileges)." Hence, when a queue initialization fails after the ahash requests are allocated, it is guaranteed that the queue removal async work will be called, hence leave the deallocation to the queue removal. Also, be extra careful not to continue processing the socket, so set queue rcv_state to NVMET_TCP_RECV_ERR upon a socket error. Bug: 310114968 Cc: stable@vger.kernel.org Reported-by: Alon Zahavi Tested-by: Alon Zahavi Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Greg Kroah-Hartman (cherry picked from commit e985d78bdcf37f7ef73666a43b0d2407715f00d3) Signed-off-by: Lee Jones Change-Id: Ifd7ec8294182a6bf6d8c261aeda5d989e909f7ff --- drivers/nvme/target/tcp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 5e29da94f72d..355d80323b83 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -345,6 +345,7 @@ static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) { + queue->rcv_state = NVMET_TCP_RECV_ERR; if (status == -EPIPE || status == -ECONNRESET) kernel_sock_shutdown(queue->sock, SHUT_RDWR); else @@ -871,15 +872,11 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) iov.iov_len = sizeof(*icresp); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (ret < 0) - goto free_crypto; + return ret; /* queue removal will cleanup */ queue->state = NVMET_TCP_Q_LIVE; nvmet_prepare_receive_pdu(queue); return 0; -free_crypto: - if (queue->hdr_digest || queue->data_digest) - nvmet_tcp_free_crypto(queue); - return ret; } static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, From 5070b3b594e321d5abf9613b15f662c6638ea959 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 23 Nov 2023 15:13:14 +0800 Subject: [PATCH 67/78] UPSTREAM: ipv4: igmp: fix refcnt uaf issue when receiving igmp query packet [ Upstream commit e2b706c691905fe78468c361aaabc719d0a496f1 ] When I perform the following test operations: 1.ip link add br0 type bridge 2.brctl addif br0 eth0 3.ip addr add 239.0.0.1/32 dev eth0 4.ip addr add 239.0.0.1/32 dev br0 5.ip addr add 224.0.0.1/32 dev br0 6.while ((1)) do ifconfig br0 up ifconfig br0 down done 7.send IGMPv2 query packets to port eth0 continuously. For example, ./mausezahn ethX -c 0 "01 00 5e 00 00 01 00 72 19 88 aa 02 08 00 45 00 00 1c 00 01 00 00 01 02 0e 7f c0 a8 0a b7 e0 00 00 01 11 64 ee 9b 00 00 00 00" The preceding tests may trigger the refcnt uaf issue of the mc list. The stack is as follows: refcount_t: addition on 0; use-after-free. WARNING: CPU: 21 PID: 144 at lib/refcount.c:25 refcount_warn_saturate (lib/refcount.c:25) CPU: 21 PID: 144 Comm: ksoftirqd/21 Kdump: loaded Not tainted 6.7.0-rc1-next-20231117-dirty #80 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:refcount_warn_saturate (lib/refcount.c:25) RSP: 0018:ffffb68f00657910 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff8a00c3bf96c0 RCX: ffff8a07b6160908 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff8a07b6160900 RBP: ffff8a00cba36862 R08: 0000000000000000 R09: 00000000ffff7fff R10: ffffb68f006577c0 R11: ffffffffb0fdcdc8 R12: ffff8a00c3bf9680 R13: ffff8a00c3bf96f0 R14: 0000000000000000 R15: ffff8a00d8766e00 FS: 0000000000000000(0000) GS:ffff8a07b6140000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f10b520b28 CR3: 000000039741a000 CR4: 00000000000006f0 Call Trace: igmp_heard_query (net/ipv4/igmp.c:1068) igmp_rcv (net/ipv4/igmp.c:1132) ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205) ip_local_deliver_finish (net/ipv4/ip_input.c:234) __netif_receive_skb_one_core (net/core/dev.c:5529) netif_receive_skb_internal (net/core/dev.c:5729) netif_receive_skb (net/core/dev.c:5788) br_handle_frame_finish (net/bridge/br_input.c:216) nf_hook_bridge_pre (net/bridge/br_input.c:294) __netif_receive_skb_core (net/core/dev.c:5423) __netif_receive_skb_list_core (net/core/dev.c:5606) __netif_receive_skb_list (net/core/dev.c:5674) netif_receive_skb_list_internal (net/core/dev.c:5764) napi_gro_receive (net/core/gro.c:609) e1000_clean_rx_irq (drivers/net/ethernet/intel/e1000/e1000_main.c:4467) e1000_clean (drivers/net/ethernet/intel/e1000/e1000_main.c:3805) __napi_poll (net/core/dev.c:6533) net_rx_action (net/core/dev.c:6735) __do_softirq (kernel/softirq.c:554) run_ksoftirqd (kernel/softirq.c:913) smpboot_thread_fn (kernel/smpboot.c:164) kthread (kernel/kthread.c:388) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_64.S:250) The root causes are as follows: Thread A Thread B ... netif_receive_skb br_dev_stop ... br_multicast_leave_snoopers ... __ip_mc_dec_group ... __igmp_group_dropped igmp_rcv igmp_stop_timer igmp_heard_query //ref = 1 ip_ma_put igmp_mod_timer refcount_dec_and_test igmp_start_timer //ref = 0 ... refcount_inc //ref increases from 0 When the device receives an IGMPv2 Query message, it starts the timer immediately, regardless of whether the device is running. If the device is down and has left the multicast group, it will cause the mc list refcount uaf issue. Bug: 316932391 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zhengchao Shao Reviewed-by: Eric Dumazet Reviewed-by: Hangbin Liu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin (cherry picked from commit 94445d9583079e0ccc5dde1370076ff24800d86e) Signed-off-by: Lee Jones Change-Id: I277be2304e564994e05b981ccd6cd8cbb9dc85be --- net/ipv4/igmp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cbc4816ed7d8..ac53ef7eec91 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -216,8 +216,10 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) int tv = prandom_u32_max(max_delay); im->tm_running = 1; - if (!mod_timer(&im->timer, jiffies+tv+2)) - refcount_inc(&im->refcnt); + if (refcount_inc_not_zero(&im->refcnt)) { + if (mod_timer(&im->timer, jiffies + tv + 2)) + ip_ma_put(im); + } } static void igmp_gq_start_timer(struct in_device *in_dev) From c5dc4b4b3d11ade04153efa22dbaa77159d73be5 Mon Sep 17 00:00:00 2001 From: Jia-Shiuan Chen Date: Tue, 9 Jan 2024 07:12:12 +0000 Subject: [PATCH 68/78] ANDROID: Update the ABI symbol list Adding the following symbols: - dma_fence_array_ops Bug: 319196045 Change-Id: Id65c62e0aedd65c9c72d71c8e39f7fae1e1de740 Signed-off-by: Jia-Shiuan Chen --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 1acd4b663615..353946f63fd3 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -543,6 +543,7 @@ dmaengine_unmap_put dma_fence_add_callback dma_fence_array_create + dma_fence_array_ops dma_fence_context_alloc dma_fence_default_wait dma_fence_enable_sw_signaling From 031f804149b99bcec0f52fdfcf89b85a8141a5da Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 27 Nov 2023 09:31:46 +0000 Subject: [PATCH 69/78] ANDROID: KVM: arm64: Avoid BUG-ing from the host abort path Under certain circumstances __get_fault_info() may resolve the faulting address using the AT instruction. Given that this is being done outside of the host lock critical section, it is racy and the resolution via AT may fail. We currently BUG() in this situation, which is obviously less than ideal. Moving the address resolution to the critical section may have a performance impact, so let's keep it where it is, but bail out and return to the host to try a second time. Bug: 311830307 Change-Id: I26d61b04a4ccf040bd31802abb3c6b998ff4a48b Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index b3920a37f334..3a5193ca0fb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -875,7 +875,14 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) int ret = -EPERM; esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + if (!__get_fault_info(esr, &fault)) { + addr = (u64)-1; + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. + */ + goto return_to_host; + } fault.esr_el2 = esr; addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; @@ -902,6 +909,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) else BUG_ON(ret && ret != -EAGAIN); +return_to_host: trace_host_mem_abort(esr, addr); } From 928b3b5dde28ee2c7e85888e13ee2593ed864532 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Dec 2023 19:44:49 +0100 Subject: [PATCH 70/78] UPSTREAM: netfilter: nf_tables: skip set commit for deleted/destroyed sets commit 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a upstream. NFT_MSG_DELSET deactivates all elements in the set, skip set->ops->commit() to avoid the unnecessary clone (for the pipapo case) as well as the sync GC cycle, which could deactivate again expired elements in such set. Bug: 318548348 Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 0105571f80edb96f81bb4bbdd5233a9130dc345b) Signed-off-by: Lee Jones Change-Id: Ie733688e27d9568d797fc1bc477261883b7dc8c1 --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6b5f22dc1d94..30802f7f2114 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9474,7 +9474,7 @@ static void nft_set_commit_update(struct list_head *set_update_list) list_for_each_entry_safe(set, next, set_update_list, pending_update) { list_del_init(&set->pending_update); - if (!set->ops->commit) + if (!set->ops->commit || set->dead) continue; set->ops->commit(set); From c1b1201d39dcb59bf20e45f40459ea25b37392b7 Mon Sep 17 00:00:00 2001 From: Dezhi Huang Date: Mon, 25 Dec 2023 15:06:46 +0800 Subject: [PATCH 71/78] BACKPORT: FROMLIST: dma-buf: Move sysfs work out of DMA-BUF export path We have identified an animation lag issue on our Android 14-6.1 product which seems to be caused by contention in the rwsem lock during the dmabuf request process. It appears that other processes are holding sysfs read locks, resulting in the blocking of dmabuf sysfs node creation. We encountered an issue in android14-6.1 that is similar to the problem described in [1]. So we cherry-pick this commit to android14-6.1. [1] https://android-review.googlesource.com/c/kernel/common/+/2111974 Bug: 311282169 Bug: 206979019 Link: https://lore.kernel.org/lkml/CABdmKX2dNYhgOYdrrJU6-jt6F=LjCidbKhR6t4F7yaa0SPr+-A@mail.gmail.com/T/ Signed-off-by: Dezhi Huang Conflicts: include/linux/dma-buf.h 1. The android14-6.1 KMI is frozen, and the modification to struct dma_buf_sysfs_entry in the original patch triggers ABI check failures. Instead of an anonymous union, use the existing struct kobject directly as a work_struct with type punning. Signed-off-by: T.J. Mercier Change-Id: Ic0386849b6b248b0a72215633fc1a50782455bac --- drivers/dma-buf/dma-buf-sysfs-stats.c | 66 +++++++++++++++++++++------ drivers/dma-buf/dma-buf.c | 16 +++++-- 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 4b680e10c15a..46520c4d8ec9 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dma-buf-sysfs-stats.h" @@ -168,35 +169,72 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +static void sysfs_add_workfn(struct work_struct *work) +{ + /* The ABI would have to change for this to be false, but let's be paranoid. */ + _Static_assert(sizeof(struct kobject) >= sizeof(struct work_struct), + "kobject is smaller than work_struct"); + + struct dma_buf_sysfs_entry *sysfs_entry = + container_of((struct kobject *)work, struct dma_buf_sysfs_entry, kobj); + struct dma_buf *dmabuf = sysfs_entry->dmabuf; + + /* + * A dmabuf is ref-counted via its file member. If this handler holds the only + * reference to the dmabuf, there is no need for sysfs kobject creation. This is an + * optimization and a race; when the reference count drops to 1 immediately after + * this check it is not harmful as the sysfs entry will still get cleaned up in + * dma_buf_stats_teardown, which won't get called until the final dmabuf reference + * is released, and that can't happen until the end of this function. + */ + if (file_count(dmabuf->file) > 1) { + /* + * kobject_init_and_add expects kobject to be zero-filled, but we have populated it + * (the sysfs_add_work union member) to trigger this work function. + */ + memset(&dmabuf->sysfs_entry->kobj, 0, sizeof(dmabuf->sysfs_entry->kobj)); + dmabuf->sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; + if (kobject_init_and_add(&dmabuf->sysfs_entry->kobj, &dma_buf_ktype, NULL, + "%lu", file_inode(dmabuf->file)->i_ino)) { + kobject_put(&dmabuf->sysfs_entry->kobj); + dmabuf->sysfs_entry = NULL; + } + } else { + /* + * Free the sysfs_entry and reset the pointer so dma_buf_stats_teardown doesn't + * attempt to operate on it. + */ + kfree(dmabuf->sysfs_entry); + dmabuf->sysfs_entry = NULL; + } + dma_buf_put(dmabuf); +} + int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { struct dma_buf_sysfs_entry *sysfs_entry; - int ret; + struct work_struct *work; if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; } - sysfs_entry = kzalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); + sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); if (!sysfs_entry) return -ENOMEM; - sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; sysfs_entry->dmabuf = dmabuf; - dmabuf->sysfs_entry = sysfs_entry; - /* create the directory for buffer stats */ - ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, - "%lu", file_inode(file)->i_ino); - if (ret) - goto err_sysfs_dmabuf; + /* + * The use of kobj as a work_struct is an ugly hack + * to avoid an ABI break in this frozen kernel. + */ + work = (struct work_struct *)&dmabuf->sysfs_entry->kobj; + INIT_WORK(work, sysfs_add_workfn); + get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. */ + schedule_work(work); return 0; - -err_sysfs_dmabuf: - kobject_put(&sysfs_entry->kobj); - dmabuf->sysfs_entry = NULL; - return ret; } diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index e0d42ee76b43..fbe8a07552ef 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -727,10 +727,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) dmabuf->resv = resv; } - ret = dma_buf_stats_setup(dmabuf, file); - if (ret) - goto err_dmabuf; - file->private_data = dmabuf; file->f_path.dentry->d_fsdata = dmabuf; dmabuf->file = file; @@ -739,9 +735,19 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) list_add(&dmabuf->list_node, &db_list.head); mutex_unlock(&db_list.lock); + ret = dma_buf_stats_setup(dmabuf, file); + if (ret) + goto err_sysfs; + return dmabuf; -err_dmabuf: +err_sysfs: + mutex_lock(&db_list.lock); + list_del(&dmabuf->list_node); + mutex_unlock(&db_list.lock); + dmabuf->file = NULL; + file->f_path.dentry->d_fsdata = NULL; + file->private_data = NULL; if (!resv) dma_resv_fini(dmabuf->resv); kfree(dmabuf); From 227b55a7a3ccacd83510b176f4d879de1887b2fa Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Thu, 8 Dec 2022 16:16:37 +0530 Subject: [PATCH 72/78] ANDROID: dma-buf: don't re-purpose kobject as work_struct The commit 5aec776ef8c9 ("BACKPORT: ANDROID: dma-buf: Move sysfs work out of DMA-BUF export path) re-purposed kobject as work_struct temporarily to create the sysfs entries asynchronously. The author knows what he is doing and rightly added a build assert if kobject struct size is smaller than the work_struct size. We are hitting this build assert on a non-GKI platform where CONFIG_ANDROID_KABI_RESERVE is not set. Fix this problem by allocating a new union with dma_buf_sysfs_entry structure and temporary structure as members. We only end up allocating more memory (because of union) only when kobject size is smaller than work_struct which the original patch any way assumed would never be true. Bug: 261818147 Bug: 262666413 Change-Id: Ifb089bf80d8a3a44ece9f05fc0b99ee76cb11645 Signed-off-by: Pavankumar Kondeti (cherry picked from commit ce18af9b5d7d0baad2ac3eea4c732d2bf128d690) Signed-off-by: T.J. Mercier --- drivers/dma-buf/dma-buf-sysfs-stats.c | 44 +++++++++++++++------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 46520c4d8ec9..4f3ee92dbe1b 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -169,15 +169,21 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +struct dma_buf_create_sysfs_entry { + struct dma_buf *dmabuf; + struct work_struct work; +}; + +union dma_buf_create_sysfs_work_entry { + struct dma_buf_create_sysfs_entry create_entry; + struct dma_buf_sysfs_entry sysfs_entry; +}; + static void sysfs_add_workfn(struct work_struct *work) { - /* The ABI would have to change for this to be false, but let's be paranoid. */ - _Static_assert(sizeof(struct kobject) >= sizeof(struct work_struct), - "kobject is smaller than work_struct"); - - struct dma_buf_sysfs_entry *sysfs_entry = - container_of((struct kobject *)work, struct dma_buf_sysfs_entry, kobj); - struct dma_buf *dmabuf = sysfs_entry->dmabuf; + struct dma_buf_create_sysfs_entry *create_entry = + container_of(work, struct dma_buf_create_sysfs_entry, work); + struct dma_buf *dmabuf = create_entry->dmabuf; /* * A dmabuf is ref-counted via its file member. If this handler holds the only @@ -188,6 +194,7 @@ static void sysfs_add_workfn(struct work_struct *work) * is released, and that can't happen until the end of this function. */ if (file_count(dmabuf->file) > 1) { + dmabuf->sysfs_entry->dmabuf = dmabuf; /* * kobject_init_and_add expects kobject to be zero-filled, but we have populated it * (the sysfs_add_work union member) to trigger this work function. @@ -212,29 +219,26 @@ static void sysfs_add_workfn(struct work_struct *work) int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { - struct dma_buf_sysfs_entry *sysfs_entry; - struct work_struct *work; + struct dma_buf_create_sysfs_entry *create_entry; + union dma_buf_create_sysfs_work_entry *work_entry; if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; } - sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); - if (!sysfs_entry) + work_entry = kmalloc(sizeof(union dma_buf_create_sysfs_work_entry), GFP_KERNEL); + if (!work_entry) return -ENOMEM; - sysfs_entry->dmabuf = dmabuf; - dmabuf->sysfs_entry = sysfs_entry; + dmabuf->sysfs_entry = &work_entry->sysfs_entry; - /* - * The use of kobj as a work_struct is an ugly hack - * to avoid an ABI break in this frozen kernel. - */ - work = (struct work_struct *)&dmabuf->sysfs_entry->kobj; - INIT_WORK(work, sysfs_add_workfn); + create_entry = &work_entry->create_entry; + create_entry->dmabuf = dmabuf; + + INIT_WORK(&create_entry->work, sysfs_add_workfn); get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. */ - schedule_work(work); + schedule_work(&create_entry->work); return 0; } From bc4d82ee40515f0c770b28d0dc4fa532a2b1850e Mon Sep 17 00:00:00 2001 From: Norihiko Hama Date: Fri, 15 Dec 2023 12:04:47 +0900 Subject: [PATCH 73/78] ANDROID: KMI workaround for CONFIG_NETFILTER_FAMILY_BRIDGE Enabling CONFIG_NETFILTER_FAMILY_BRIDGE causes the new element, hooks_bridge[] to be added to netns_nf. Since the KMI is frozen this could not be added. The only instantiation of struct netns_nf is as an embedded field of struct net. So instead of adding the field to struct netns_nf, a new "struct ext_net" is added that contains struct net and the new hooks_bridge[] field. An accessor function, get_nf_hooks_bridge() is added to get a pointer to the new field. There is a global init_net of type struct net which must be special cased since it is not a member of a struct ext_net. All other instances of struct net are allocated via net_alloc() which now allocates a struct ext_net. Since CONFIG_NETFILTER_FAMILY_BRIDGE is a hidden config that is needed for vendor modules, it is enabled via init/Kconfig.gki. Bug: 316040984 Fixes: 0145780bfc78 ("fix KASAN-related kernel crash by KMI W/A for NETFILTER_FAMILY_BRIDGE") Change-Id: I2c7384e3df9b88f12464dc0138986fed12ca626a Signed-off-by: Norihiko Hama --- include/linux/netfilter.h | 2 +- include/net/net_namespace.h | 30 ++++++++++++++++++++++++++++++ include/net/netns/netfilter.h | 3 --- init/Kconfig.gki | 1 + net/bridge/br_input.c | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/net_namespace.c | 10 +++++++--- net/netfilter/core.c | 12 +++++++++--- net/netfilter/nf_queue.c | 2 +- net/netfilter/nfnetlink_hook.c | 4 ++-- 10 files changed, 53 insertions(+), 15 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 445494c502ba..49dc95d21f01 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,7 +243,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); + hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]); #endif break; default: diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8c3587d5c308..6641c4543d18 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -188,6 +188,36 @@ struct net { #endif } __randomize_layout; +/* + * To work around a KMI issue, hooks_bridge[] could not be + * added to struct netns_nf. Since the only use of netns_nf + * is embedded in struct net, struct ext_net is added to + * contain struct net plus the new field. Users of the new + * field must use get_nf_hooks_bridge() to access the field. + */ +struct ext_net { + struct net net; +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; +#endif + ANDROID_VENDOR_DATA(1); +}; + +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +extern struct net init_net; +extern struct nf_hook_entries **init_nf_hooks_bridgep; + +static inline struct nf_hook_entries __rcu **get_nf_hooks_bridge(const struct net *net) +{ + struct ext_net *ext_net; + + if (net == &init_net) + return init_nf_hooks_bridgep; + ext_net = container_of(net, struct ext_net, net); + return ext_net->hooks_bridge; +} +#endif + #include /* Init's network namespace */ diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 3b7eb0cb1201..56c72117b5b3 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -22,9 +22,6 @@ struct netns_nf { #ifdef CONFIG_NETFILTER_FAMILY_ARP struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS]; #endif -#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; -#endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) unsigned int defrag_ipv4_users; #endif diff --git a/init/Kconfig.gki b/init/Kconfig.gki index 081b1cdc9c7e..1a17a3d6e27b 100644 --- a/init/Kconfig.gki +++ b/init/Kconfig.gki @@ -202,6 +202,7 @@ config GKI_HIDDEN_NET_CONFIGS select PAGE_POOL select NET_PTP_CLASSIFY select NET_DEVLINK + select NETFILTER_FAMILY_BRIDGE help Dummy config option used to enable the networking hidden config, required by various SoC platforms. diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6bb272894c96..0da15e1f3f72 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -243,7 +243,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) goto frame_finish; #endif - e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); + e = rcu_dereference(get_nf_hooks_bridge(net)[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 01d690d9fe5f..c7f0aedf2244 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1016,7 +1016,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, unsigned int i; int ret; - e = rcu_dereference(net->nf.hooks_bridge[hook]); + e = rcu_dereference(get_nf_hooks_bridge(net)[hook]); if (!e) return okfn(net, sk, skb); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4c1707d0eb9b..1d0110152862 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1093,9 +1093,13 @@ void __init net_ns_init(void) struct net_generic *ng; #ifdef CONFIG_NET_NS - net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), - SMP_CACHE_BYTES, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + /* Allocate size for struct ext_net instead of struct net + * to fix a KMI issue when CONFIG_NETFILTER_FAMILY_BRIDGE + * is enabled + */ + net_cachep = kmem_cache_create("net_namespace", sizeof(struct ext_net), + SMP_CACHE_BYTES, + SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 55a7f72d547c..6c7a44f84b93 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -39,6 +39,12 @@ struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +struct nf_hook_entries __rcu *init_nf_hooks_bridge[NF_INET_NUMHOOKS]; +struct nf_hook_entries __rcu **init_nf_hooks_bridgep = &init_nf_hooks_bridge[0]; +EXPORT_SYMBOL_GPL(init_nf_hooks_bridgep); +#endif + static DEFINE_MUTEX(nf_hook_mutex); /* max hooks per family/hooknum */ @@ -278,9 +284,9 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) + if (WARN_ON_ONCE(hooknum >= NF_INET_NUMHOOKS)) return NULL; - return net->nf.hooks_bridge + hooknum; + return get_nf_hooks_bridge(net) + hooknum; #endif #ifdef CONFIG_NETFILTER_INGRESS case NFPROTO_INET: @@ -747,7 +753,7 @@ static int __net_init netfilter_net_init(struct net *net) __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); + __netfilter_net_init(get_nf_hooks_bridge(net), NF_INET_NUMHOOKS); #endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 63d1516816b1..566f7794bf58 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -281,7 +281,7 @@ static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf switch (pf) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - return rcu_dereference(net->nf.hooks_bridge[hooknum]); + return rcu_dereference(get_nf_hooks_bridge(net)[hooknum]); #endif case NFPROTO_IPV4: return rcu_dereference(net->nf.hooks_ipv4[hooknum]); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 8120aadf6a0f..3ca3c3a3ba01 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -210,9 +210,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + if (hook >= NF_INET_NUMHOOKS) return ERR_PTR(-EINVAL); - hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); + hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]); #endif break; #if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) From febcf1429fa1f4cc476fa9616fe81d3b29059818 Mon Sep 17 00:00:00 2001 From: Aran Dalton Date: Fri, 5 Jan 2024 19:07:02 +0800 Subject: [PATCH 74/78] ANDROID: gki_defconfig: Set CONFIG_IDLE_INJECT and CONFIG_CPU_IDLE_THERMAL into y Under certain circumstances a SoC can reach a critical temperaturelimit and is unable to stabilize the temperature around a temperaturecontrol. The system may ask for a specific power budget butbecause of the OPP density, we can only choose an OPP with a powerbudget lower than the requested one and under-utilize the CPU, thuslosing performance. In other words, one OPP under-utilizes the CPUwith a power less than the requested power budget and the next OPPexceeds the power budget. The cpu idle cooling can solve this problem. Bug: 299411923 Signed-off-by: Aran Dalton Change-Id: I1c17b340617e88be075097dc47f30ce94be2a4d7 --- arch/arm64/configs/gki_defconfig | 2 ++ arch/x86/configs/gki_defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 38c90d04264e..009675466150 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -431,6 +431,7 @@ CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y CONFIG_WATCHDOG=y @@ -580,6 +581,7 @@ CONFIG_IIO_TRIGGER=y CONFIG_PWM=y CONFIG_GENERIC_PHY=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 22797e912979..7e2df44033bc 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -396,6 +396,7 @@ CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y # CONFIG_X86_PKG_TEMP_THERMAL is not set @@ -523,6 +524,7 @@ CONFIG_IIO=y CONFIG_IIO_BUFFER=y CONFIG_IIO_TRIGGER=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y From 28154afe74965b64bfb305f609cbc4cda7cf7004 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:14 -0800 Subject: [PATCH 75/78] FROMLIST: scsi: ufs: Simplify power management during async scan ufshcd_init() calls pm_runtime_get_sync() before it calls async_schedule(). ufshcd_async_scan() calls pm_runtime_put_sync() directly or indirectly from ufshcd_add_lus(). Simplify ufshcd_async_scan() by always calling pm_runtime_put_sync() from ufshcd_async_scan(). Cc: stable@vger.kernel.org Change-Id: I4b6ede95360c665594963fff0962742728064fb0 Signed-off-by: Bart Van Assche Bug: 310401362 Link: https://lore.kernel.org/linux-scsi/20231218225229.2542156-2-bvanassche@acm.org/ Signed-off-by: Bart Van Assche --- drivers/ufs/core/ufshcd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 94db7033989a..0a86de0feb79 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8683,7 +8683,6 @@ static int ufshcd_add_lus(struct ufs_hba *hba) ufs_bsg_probe(hba); ufshpb_init(hba); scsi_scan_host(hba->host); - pm_runtime_put_sync(hba->dev); out: return ret; @@ -8916,15 +8915,15 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) /* Probe and add UFS logical units */ ret = ufshcd_add_lus(hba); + out: + pm_runtime_put_sync(hba->dev); /* * If we failed to initialize the device or the device is not * present, turn off the power/clocks etc. */ - if (ret) { - pm_runtime_put_sync(hba->dev); + if (ret) ufshcd_hba_exit(hba); - } } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) From 7c91752f5dd9d5c187e9c02282f726a0206e590b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:15 -0800 Subject: [PATCH 76/78] FROMLIST: scsi: ufs: Remove the ufshcd_hba_exit() call from ufshcd_async_scan() Calling ufshcd_hba_exit() from a function that is called asynchronously from ufshcd_init() is wrong because this triggers multiple race conditions. Instead of calling ufshcd_hba_exit(), log an error message. Reported-by: Daniel Mentz Closes: https://b.corp.google.com/issues/310401362 Fixes: 1d337ec2f35e ("ufs: improve init sequence") Change-Id: I1c056c2e42889301f69107468f2b3eb38bf3d734 Signed-off-by: Bart Van Assche Bug: 310401362 Link: https://lore.kernel.org/linux-scsi/20231218225229.2542156-3-bvanassche@acm.org/ Signed-off-by: Bart Van Assche --- drivers/ufs/core/ufshcd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0a86de0feb79..0f0cfea31cbf 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8918,12 +8918,9 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) out: pm_runtime_put_sync(hba->dev); - /* - * If we failed to initialize the device or the device is not - * present, turn off the power/clocks etc. - */ + if (ret) - ufshcd_hba_exit(hba); + dev_err(hba->dev, "%s failed: %d\n", __func__, ret); } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) From 0801d8a89de16b3371ac091b123081565d58f53a Mon Sep 17 00:00:00 2001 From: liangjlee Date: Tue, 9 Jan 2024 03:17:54 +0000 Subject: [PATCH 77/78] ANDROID: mm: export dump_tasks symbol. Export dump_tasks to dump per-task memory status when ramdump. Bug: 316372318 Change-Id: Ie0dd1a4c7ada280dc0c7696781b4b9a5e2a100ab Signed-off-by: liangjlee --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2c5b854f767b..76a1954071e1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -420,7 +420,7 @@ static int dump_task(struct task_struct *p, void *arg) * State information includes task's pid, uid, tgid, vm size, rss, * pgtables_bytes, swapents, oom_score_adj value, and name. */ -static void dump_tasks(struct oom_control *oc) +void dump_tasks(struct oom_control *oc) { pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); @@ -436,6 +436,7 @@ static void dump_tasks(struct oom_control *oc) rcu_read_unlock(); } } +EXPORT_SYMBOL_GPL(dump_tasks); static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) { From a41a4ee3704a02c39e67207df9ecc09a0e018e53 Mon Sep 17 00:00:00 2001 From: liangjlee Date: Wed, 10 Jan 2024 09:00:21 +0000 Subject: [PATCH 78/78] ANDROID: Update the ABI symbol list Adding the following symbols: - dump_tasks Bug: 316372318 Change-Id: Iddaed980a227d8beb966cf0fae24947f5bf8b473 Signed-off-by: liangjlee --- android/abi_gki_aarch64.stg | 15 +++++++++++++++ android/abi_gki_aarch64_pixel | 1 + 2 files changed, 16 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index dc2644ea5502..573e477d70d3 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -298635,6 +298635,11 @@ function { parameter_id: 0x3e10b518 parameter_id: 0x0bb0c019 } +function { + id: 0x1f821b4c + return_type_id: 0x48b5725f + parameter_id: 0x3c692b7e +} function { id: 0x1f835b6f return_type_id: 0x48b5725f @@ -359364,6 +359369,15 @@ elf_symbol { type_id: 0x10985193 full_name: "dump_stack" } +elf_symbol { + id: 0x652fbf96 + name: "dump_tasks" + is_defined: true + symbol_type: FUNCTION + crc: 0x6fe3e49b + type_id: 0x1f821b4c + full_name: "dump_tasks" +} elf_symbol { id: 0xda364c85 name: "dw_handle_msi_irq" @@ -401839,6 +401853,7 @@ interface { symbol_id: 0xe09fd784 symbol_id: 0xded28924 symbol_id: 0xe3421d56 + symbol_id: 0x652fbf96 symbol_id: 0xda364c85 symbol_id: 0x68e0756b symbol_id: 0x12cb063e diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 353946f63fd3..d0f6d7be74ff 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -813,6 +813,7 @@ drm_writeback_signal_completion dump_backtrace dump_stack + dump_tasks dw_handle_msi_irq dw_pcie_find_capability dw_pcie_host_init