From a635beeacc6d56d2b71c39e6c0103f85b53d108e Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Mon, 17 Oct 2022 10:38:06 +0000 Subject: [PATCH 01/38] tracing/histogram: Update document for KEYS_MAX size After commit 4f36c2d85ced ("tracing: Increase tracing map KEYS_MAX size"), 'keys' supports up to three fields. Signed-off-by: Zheng Yejian Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20221017103806.2479139-1-zhengyejian1@huawei.com Signed-off-by: Jonathan Corbet --- Documentation/trace/histogram.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index c1b685a38f6b..87bd772836c0 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -39,7 +39,7 @@ Documentation written by Tom Zanussi will use the event's kernel stacktrace as the key. The keywords 'keys' or 'key' can be used to specify keys, and the keywords 'values', 'vals', or 'val' can be used to specify values. Compound - keys consisting of up to two fields can be specified by the 'keys' + keys consisting of up to three fields can be specified by the 'keys' keyword. Hashing a compound key produces a unique entry in the table for each unique combination of component keys, and can be useful for providing more fine-grained summaries of event data. From 097a4a1612389c31d2c4b95dfa816b91212d7f54 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Sun, 9 Oct 2022 16:39:44 +0800 Subject: [PATCH 02/38] watchdog: sp805_wdt: fix spelling typo in comment Fix spelling typo in comment. Reported-by: k2ci Signed-off-by: Jiangshan Yi Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20221009083944.2988237-1-13667453960@163.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sp805_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c index 78ba36689eec..2756ed54ca3d 100644 --- a/drivers/watchdog/sp805_wdt.c +++ b/drivers/watchdog/sp805_wdt.c @@ -88,7 +88,7 @@ static bool wdt_is_running(struct watchdog_device *wdd) return (wdtcontrol & ENABLE_MASK) == ENABLE_MASK; } -/* This routine finds load value that will reset system in required timout */ +/* This routine finds load value that will reset system in required timeout */ static int wdt_setload(struct watchdog_device *wdd, unsigned int timeout) { struct sp805_wdt *wdt = watchdog_get_drvdata(wdd); From 82ebbe65d781064cfb0a6a8af221a9cebcaaac9e Mon Sep 17 00:00:00 2001 From: Manank Patel Date: Thu, 13 Oct 2022 15:22:58 +0530 Subject: [PATCH 03/38] drivers: watchdog: exar_wdt.c fix use after free fix use after free by storing the result of PTR_ERR(n->pdev) to a local variable before returning. Signed-off-by: Manank Patel Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20221013095258.1424967-1-pmanank200502@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/exar_wdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/exar_wdt.c b/drivers/watchdog/exar_wdt.c index 35058d8b21bc..7c61ff343271 100644 --- a/drivers/watchdog/exar_wdt.c +++ b/drivers/watchdog/exar_wdt.c @@ -355,8 +355,10 @@ static int __init exar_wdt_register(struct wdt_priv *priv, const int idx) &priv->wdt_res, 1, priv, sizeof(*priv)); if (IS_ERR(n->pdev)) { + int err = PTR_ERR(n->pdev); + kfree(n); - return PTR_ERR(n->pdev); + return err; } list_add_tail(&n->list, &pdev_list); From 0469e56a14bf8cfb80507e51b7aeec0332cdbc13 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 30 Sep 2022 00:51:58 +0200 Subject: [PATCH 04/38] KVM: x86: Mask off reserved bits in CPUID.80000001H KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM actually supports. CPUID.80000001:EBX[27:16] are reserved bits and should be masked off. Fixes: 0771671749b5 ("KVM: Enhance guest cpuid management") Signed-off-by: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7065462378e2..834feeb0a828 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1133,6 +1133,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: + entry->ebx &= ~GENMASK(27, 16); cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; From eeb69eab57c6604ac90b3fd8e5ac43f24a5535b1 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 29 Sep 2022 15:51:59 -0700 Subject: [PATCH 05/38] KVM: x86: Mask off reserved bits in CPUID.80000006H KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM actually supports. CPUID.80000006H:EDX[17:16] are reserved bits and should be masked off. Fixes: 43d05de2bee7 ("KVM: pass through CPUID(0x80000006)") Signed-off-by: Jim Mattson Message-Id: <20220929225203.2234702-2-jmattson@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 834feeb0a828..8325a01cb1f1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1138,7 +1138,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000006: - /* L2 cache and TLB: pass through host info. */ + /* Drop reserved bits, pass host L2 cache and TLB info. */ + entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ From 7030d8530e533844e2f4b0e7476498afcd324634 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 29 Sep 2022 15:52:00 -0700 Subject: [PATCH 06/38] KVM: x86: Mask off reserved bits in CPUID.80000008H KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM actually supports. The following ranges of CPUID.80000008H are reserved and should be masked off: ECX[31:18] ECX[11:8] In addition, the PerfTscSize field at ECX[17:16] should also be zero because KVM does not set the PERFTSC bit at CPUID.80000001H.ECX[27]. Fixes: 24c82e576b78 ("KVM: Sanitize cpuid") Signed-off-by: Jim Mattson Message-Id: <20220929225203.2234702-3-jmattson@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8325a01cb1f1..489c028859e1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1169,6 +1169,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); + entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); break; From 079f6889818dd07903fb36c252532ab47ebb6d48 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 29 Sep 2022 15:52:01 -0700 Subject: [PATCH 07/38] KVM: x86: Mask off reserved bits in CPUID.8000001AH KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM actually supports. In the case of CPUID.8000001AH, only three bits are currently defined. The 125 reserved bits should be masked off. Fixes: 24c82e576b78 ("KVM: Sanitize cpuid") Signed-off-by: Jim Mattson Message-Id: <20220929225203.2234702-4-jmattson@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 489c028859e1..a0292ba650df 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1189,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ecx = entry->edx = 0; break; case 0x8000001a: + entry->eax &= GENMASK(2, 0); + entry->ebx = entry->ecx = entry->edx = 0; + break; case 0x8000001e: break; case 0x8000001F: From f15fb2cd979a07fbfc666e2f04b8b30ec9233b2a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:06 +0800 Subject: [PATCH 08/38] btrfs: raid56: properly handle the error when unable to find the missing stripe In raid56_alloc_missing_rbio(), if we can not determine where the missing device is inside the full stripe, we just BUG_ON(). This is not necessary especially the only caller inside scrub.c is already properly checking the return value, and will treat it as a memory allocation failure. Fix the error handling by: - Add an extra warning for the reason Although personally speaking it may be better to be an ASSERT(). - Properly free the allocated rbio Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f6395e8288d6..892005f756cf 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2742,8 +2742,10 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { - BUG(); - kfree(rbio); + btrfs_warn_rl(fs_info, + "can not determine the failed stripe number for full stripe %llu", + bioc->raid_map[0]); + __free_raid_bio(rbio); return NULL; } From ab4c54c643a01067669df8332b64e3f31b69e071 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:07 +0800 Subject: [PATCH 09/38] btrfs: raid56: avoid double freeing for rbio if full_stripe_write() failed Currently if full_stripe_write() failed to allocate the pages for parity, it will call __free_raid_bio() first, then return -ENOMEM. But some caller of full_stripe_write() will also call __free_raid_bio() again, this would cause double freeing. And it's not a logically sound either, normally we should either free the memory at the same level where we allocated it, or let endio to handle everything. So this patch will solve the double freeing by make raid56_parity_write() to handle the error and free the rbio. Just like what we do in raid56_parity_recover(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 892005f756cf..82c8e991300e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1632,10 +1632,8 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio) int ret; ret = alloc_rbio_parity_pages(rbio); - if (ret) { - __free_raid_bio(rbio); + if (ret) return ret; - } ret = lock_stripe_add(rbio); if (ret == 0) @@ -1823,8 +1821,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) */ if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); - if (ret) + if (ret) { + __free_raid_bio(rbio); goto fail; + } return; } @@ -1838,8 +1838,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) list_add_tail(&rbio->plug_list, &plug->rbio_list); } else { ret = __raid56_parity_write(rbio); - if (ret) + if (ret) { + __free_raid_bio(rbio); goto fail; + } } return; From ae0e5df4d1a4a2694c9c203cc25334aaaf9f2dfa Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 11 Oct 2022 12:02:31 +0200 Subject: [PATCH 10/38] btrfs: reorder btrfs_bio for better packing After changes in commit 917f32a23501 ("btrfs: give struct btrfs_bio a real end_io handler") the layout of btrfs_bio can be improved. There are two holes and the structure size is 264 bytes on release build. By reordering the iterator we can get rid of the holes and the size is 256 bytes which fits to slabs much better. Final layout: struct btrfs_bio { unsigned int mirror_num; /* 0 4 */ struct bvec_iter iter; /* 4 20 */ u64 file_offset; /* 24 8 */ struct btrfs_device * device; /* 32 8 */ u8 * csum; /* 40 8 */ u8 csum_inline[64]; /* 48 64 */ /* --- cacheline 1 boundary (64 bytes) was 48 bytes ago --- */ btrfs_bio_end_io_t end_io; /* 112 8 */ void * private; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ struct work_struct end_io_work; /* 128 32 */ struct bio bio; /* 160 96 */ /* size: 256, cachelines: 4, members: 10 */ }; Fixes: 917f32a23501 ("btrfs: give struct btrfs_bio a real end_io handler") Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 599b9d5af349..f8b668dc8bf8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -395,6 +395,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); */ struct btrfs_bio { unsigned int mirror_num; + struct bvec_iter iter; /* for direct I/O */ u64 file_offset; @@ -403,7 +404,6 @@ struct btrfs_bio { struct btrfs_device *device; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - struct bvec_iter iter; /* End I/O information supplied to btrfs_bio_alloc */ btrfs_bio_end_io_t end_io; From 968b71583130b6104c9f33ba60446d598e327a8b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 08:52:46 -0400 Subject: [PATCH 11/38] btrfs: fix tree mod log mishandling of reallocated nodes We have been seeing the following panic in production kernel BUG at fs/btrfs/tree-mod-log.c:677! invalid opcode: 0000 [#1] SMP RIP: 0010:tree_mod_log_rewind+0x1b4/0x200 RSP: 0000:ffffc9002c02f890 EFLAGS: 00010293 RAX: 0000000000000003 RBX: ffff8882b448c700 RCX: 0000000000000000 RDX: 0000000000008000 RSI: 00000000000000a7 RDI: ffff88877d831c00 RBP: 0000000000000002 R08: 000000000000009f R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000100c40 R12: 0000000000000001 R13: ffff8886c26d6a00 R14: ffff88829f5424f8 R15: ffff88877d831a00 FS: 00007fee1d80c780(0000) GS:ffff8890400c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee1963a020 CR3: 0000000434f33002 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: btrfs_get_old_root+0x12b/0x420 btrfs_search_old_slot+0x64/0x2f0 ? tree_mod_log_oldest_root+0x3d/0xf0 resolve_indirect_ref+0xfd/0x660 ? ulist_alloc+0x31/0x60 ? kmem_cache_alloc_trace+0x114/0x2c0 find_parent_nodes+0x97a/0x17e0 ? ulist_alloc+0x30/0x60 btrfs_find_all_roots_safe+0x97/0x150 iterate_extent_inodes+0x154/0x370 ? btrfs_search_path_in_tree+0x240/0x240 iterate_inodes_from_logical+0x98/0xd0 ? btrfs_search_path_in_tree+0x240/0x240 btrfs_ioctl_logical_to_ino+0xd9/0x180 btrfs_ioctl+0xe2/0x2ec0 ? __mod_memcg_lruvec_state+0x3d/0x280 ? do_sys_openat2+0x6d/0x140 ? kretprobe_dispatcher+0x47/0x70 ? kretprobe_rethook_handler+0x38/0x50 ? rethook_trampoline_handler+0x82/0x140 ? arch_rethook_trampoline_callback+0x3b/0x50 ? kmem_cache_free+0xfb/0x270 ? do_sys_openat2+0xd5/0x140 __x64_sys_ioctl+0x71/0xb0 do_syscall_64+0x2d/0x40 Which is this code in tree_mod_log_rewind() switch (tm->op) { case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); This occurs because we replay the nodes in order that they happened, and when we do a REPLACE we will log a REMOVE_WHILE_FREEING for every slot, starting at 0. 'n' here is the number of items in this block, which in this case was 1, but we had 2 REMOVE_WHILE_FREEING operations. The actual root cause of this was that we were replaying operations for a block that shouldn't have been replayed. Consider the following sequence of events 1. We have an already modified root, and we do a btrfs_get_tree_mod_seq(). 2. We begin removing items from this root, triggering KEY_REPLACE for it's child slots. 3. We remove one of the 2 children this root node points to, thus triggering the root node promotion of the remaining child, and freeing this node. 4. We modify a new root, and re-allocate the above node to the root node of this other root. The tree mod log looks something like this logical 0 op KEY_REPLACE (slot 1) seq 2 logical 0 op KEY_REMOVE (slot 1) seq 3 logical 0 op KEY_REMOVE_WHILE_FREEING (slot 0) seq 4 logical 4096 op LOG_ROOT_REPLACE (old logical 0) seq 5 logical 8192 op KEY_REMOVE_WHILE_FREEING (slot 1) seq 6 logical 8192 op KEY_REMOVE_WHILE_FREEING (slot 0) seq 7 logical 0 op LOG_ROOT_REPLACE (old logical 8192) seq 8 >From here the bug is triggered by the following steps 1. Call btrfs_get_old_root() on the new_root. 2. We call tree_mod_log_oldest_root(btrfs_root_node(new_root)), which is currently logical 0. 3. tree_mod_log_oldest_root() calls tree_mod_log_search_oldest(), which gives us the KEY_REPLACE seq 2, and since that's not a LOG_ROOT_REPLACE we incorrectly believe that we don't have an old root, because we expect that the most recent change should be a LOG_ROOT_REPLACE. 4. Back in tree_mod_log_oldest_root() we don't have a LOG_ROOT_REPLACE, so we don't set old_root, we simply use our existing extent buffer. 5. Since we're using our existing extent buffer (logical 0) we call tree_mod_log_search(0) in order to get the newest change to start the rewind from, which ends up being the LOG_ROOT_REPLACE at seq 8. 6. Again since we didn't find an old_root we simply clone logical 0 at it's current state. 7. We call tree_mod_log_rewind() with the cloned extent buffer. 8. Set n = btrfs_header_nritems(logical 0), which would be whatever the original nritems was when we COWed the original root, say for this example it's 2. 9. We start from the newest operation and work our way forward, so we see LOG_ROOT_REPLACE which we ignore. 10. Next we see KEY_REMOVE_WHILE_FREEING for slot 0, which triggers the BUG_ON(tm->slot < n), because it expects if we've done this we have a completely empty extent buffer to replay completely. The correct thing would be to find the first LOG_ROOT_REPLACE, and then get the old_root set to logical 8192. In fact making that change fixes this particular problem. However consider the much more complicated case. We have a child node in this tree and the above situation. In the above case we freed one of the child blocks at the seq 3 operation. If this block was also re-allocated and got new tree mod log operations we would have a different problem. btrfs_search_old_slot(orig root) would get down to the logical 0 root that still pointed at that node. However in btrfs_search_old_slot() we call tree_mod_log_rewind(buf) directly. This is not context aware enough to know which operations we should be replaying. If the block was re-allocated multiple times we may only want to replay a range of operations, and determining what that range is isn't possible to determine. We could maybe solve this by keeping track of which root the node belonged to at every tree mod log operation, and then passing this around to make sure we're only replaying operations that relate to the root we're trying to rewind. However there's a simpler way to solve this problem, simply disallow reallocations if we have currently running tree mod log users. We already do this for leaf's, so we're simply expanding this to nodes as well. This is a relatively uncommon occurrence, and the problem is complicated enough I'm worried that we will still have corner cases in the reallocation case. So fix this in the most straightforward way possible. Fixes: bd989ba359f2 ("Btrfs: add tree modification log functions") CC: stable@vger.kernel.org # 3.3+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd2d36580f1a..2801c991814f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3295,21 +3295,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } /* - * If this is a leaf and there are tree mod log users, we may - * have recorded mod log operations that point to this leaf. - * So we must make sure no one reuses this leaf's extent before - * mod log operations are applied to a node, otherwise after - * rewinding a node using the mod log operations we get an - * inconsistent btree, as the leaf's extent may now be used as - * a node or leaf for another different btree. + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * * We are safe from races here because at this point no other * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins, it will not be able to - * find a node pointing to this leaf and record operations that - * point to this leaf. + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. */ - if (btrfs_header_level(buf) == 0 && - test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { From 3d17adea74a56a4965f7a603d8ed8c66bb9356d9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 18 Oct 2022 09:56:38 +0800 Subject: [PATCH 12/38] btrfs: make thaw time super block check to also verify checksum Previous commit a05d3c915314 ("btrfs: check superblock to ensure the fs was not modified at thaw time") only checks the content of the super block, but it doesn't really check if the on-disk super block has a matching checksum. This patch will add the checksum verification to thaw time superblock verification. This involves the following extra changes: - Export btrfs_check_super_csum() As we need to call it in super.c. - Change the argument list of btrfs_check_super_csum() Instead of passing a char *, directly pass struct btrfs_super_block * pointer. - Verify that our checksum type didn't change before checking the checksum value, like it's done at mount time Fixes: a05d3c915314 ("btrfs: check superblock to ensure the fs was not modified at thaw time") Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 ++++------ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/super.c | 16 ++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a2da9313c694..4b28263c3d32 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -166,11 +166,9 @@ static bool btrfs_supported_super_csum(u16 csum_type) * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ -static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, - char *raw_disk_sb) +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb) { - struct btrfs_super_block *disk_sb = - (struct btrfs_super_block *)raw_disk_sb; char result[BTRFS_CSUM_SIZE]; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -181,7 +179,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ - crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) @@ -3479,7 +3477,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { + if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; btrfs_release_disk_super(disk_super); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c67c15d4d20b..9fa923e005a3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -42,6 +42,8 @@ struct extent_buffer *btrfs_find_create_tree_block( void btrfs_clean_tree_block(struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9be4fd2db0f4..5942b9384088 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2555,6 +2555,7 @@ static int check_dev_super(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_super_block *sb; + u16 csum_type; int ret = 0; /* This should be called with fs still frozen. */ @@ -2569,6 +2570,21 @@ static int check_dev_super(struct btrfs_device *dev) if (IS_ERR(sb)) return PTR_ERR(sb); + /* Verify the checksum. */ + csum_type = btrfs_super_csum_type(sb); + if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + btrfs_err(fs_info, "csum type changed, has %u expect %u", + csum_type, btrfs_super_csum_type(fs_info->super_copy)); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_check_super_csum(fs_info, sb)) { + btrfs_err(fs_info, "csum for on-disk super block no longer matches"); + ret = -EUCLEAN; + goto out; + } + /* Btrfs_validate_super() includes fsid check against super->fsid. */ ret = btrfs_validate_super(fs_info, sb, 0); if (ret < 0) From 9b8be45f1ef29081c4b614aa559f934526e70d16 Mon Sep 17 00:00:00 2001 From: BingJing Chang Date: Sun, 16 Oct 2022 23:33:46 +0800 Subject: [PATCH 13/38] btrfs: send: fix send failure of a subcase of orphan inodes Commit 9ed0a72e5b35 ("btrfs: send: fix failures when processing inodes with no links") tries to fix all incremental send cases of orphan inodes the send operation will meet. However, there's still a bug causing the corner subcase fails with a ENOENT error. Here's shortened steps of that subcase: $ btrfs subvolume create vol $ touch vol/foo $ btrfs subvolume snapshot -r vol snap1 $ btrfs subvolume snapshot -r vol snap2 # Turn the second snapshot to RW mode and delete the file while # holding an open file descriptor on it $ btrfs property set snap2 ro false $ exec 73 /dev/null At subvol snap1 ERROR: send ioctl failed with -2: No such file or directory It's subcase 3 of BTRFS_COMPARE_TREE_CHANGED in the commit 9ed0a72e5b35 ("btrfs: send: fix failures when processing inodes with no links"). And it's not a common case. We still have not met it in the real world. Theoretically, this case can happen in a batch cascading snapshot backup. In cascading backups, the receive operation in the middle may cause orphan inodes to appear because of the open file descriptors on the snapshot files during receiving. And if we don't do the batch snapshot backups in their creation order, then we can have an inode, which is an orphan in the parent snapshot but refers to a file in the send snapshot. Since an orphan inode has no paths, the send operation will fail with a ENOENT error if it tries to generate a path for it. In that patch, this subcase will be treated as an inode with a new generation. However, when the routine tries to delete the old paths in the parent snapshot, the function process_all_refs() doesn't check whether there are paths recorded or not before it calls the function process_recorded_refs(). And the function process_recorded_refs() try to get the first path in the parent snapshot in the beginning. Since it has no paths in the parent snapshot, the send operation fails. To fix this, we can easily put a link count check to avoid entering the deletion routine like what we do a link count check to avoid creating a new one. Moreover, we can assume that the function process_all_refs() can always collect references to process because we know it has a positive link count. Fixes: 9ed0a72e5b35 ("btrfs: send: fix failures when processing inodes with no links") Reviewed-by: Filipe Manana Signed-off-by: BingJing Chang Signed-off-by: David Sterba --- fs/btrfs/send.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ec6e1752af2c..145c84b44fd0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6668,17 +6668,19 @@ static int changed_inode(struct send_ctx *sctx, /* * First, process the inode as if it was deleted. */ - sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = false; - sctx->cur_inode_deleted = true; - sctx->cur_inode_size = btrfs_inode_size( - sctx->right_path->nodes[0], right_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->right_path->nodes[0], right_ii); - ret = process_all_refs(sctx, - BTRFS_COMPARE_TREE_DELETED); - if (ret < 0) - goto out; + if (old_nlinks > 0) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + } /* * Now process the inode as if it was new. From 2398091f9c2c8e0040f4f9928666787a3e8108a7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Oct 2022 16:05:52 +0200 Subject: [PATCH 14/38] btrfs: fix type of parameter generation in btrfs_get_dentry The type of parameter generation has been u32 since the beginning, however all callers pass a u64 generation, so unify the types to prevent potential loss. CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/export.c | 2 +- fs/btrfs/export.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1d4c2397d0d6..fab7eb76e53b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index f32f4113c976..5afb7ca42828 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,7 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation); struct dentry *btrfs_get_parent(struct dentry *child); From ea522496afa1dd4ed295466e9c813b88ebda3284 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 14 Oct 2022 10:10:40 -0700 Subject: [PATCH 15/38] Documentation: process: replace outdated LTS table w/ link The existing table was a bit outdated. 3.16 was EOL in 2020. 4.4 was EOL in 2022. 5.10 is new in 2020. 5.15 is new in 2021. We'll see if 6.1 becomes LTS in 2022. Rather than keep this table updated, it does duplicate information from multiple kernel.org pages. Make one less duplication site that needs to be updated and simply refer to the kernel.org page on releases. Suggested-by: Tyler Hicks Suggested-by: Bagas Sanjaya Signed-off-by: Nick Desaulniers Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/stable/20221014171040.849726-1-ndesaulniers%40google.com Reviewed-by: Tyler Hicks (Microsoft) Link: https://lore.kernel.org/r/20221014171040.849726-1-ndesaulniers@google.com Signed-off-by: Jonathan Corbet --- Documentation/process/2.Process.rst | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/Documentation/process/2.Process.rst b/Documentation/process/2.Process.rst index e05fb1b8f8b6..6a919cffcbfd 100644 --- a/Documentation/process/2.Process.rst +++ b/Documentation/process/2.Process.rst @@ -126,17 +126,10 @@ than one development cycle past their initial release. So, for example, the 5.2.21 was the final stable update of the 5.2 release. Some kernels are designated "long term" kernels; they will receive support -for a longer period. As of this writing, the current long term kernels -and their maintainers are: +for a longer period. Please refer to the following link for the list of active +long term kernel versions and their maintainers: - ====== ================================ ======================= - 3.16 Ben Hutchings (very long-term kernel) - 4.4 Greg Kroah-Hartman & Sasha Levin (very long-term kernel) - 4.9 Greg Kroah-Hartman & Sasha Levin - 4.14 Greg Kroah-Hartman & Sasha Levin - 4.19 Greg Kroah-Hartman & Sasha Levin - 5.4 Greg Kroah-Hartman & Sasha Levin - ====== ================================ ======================= + https://www.kernel.org/category/releases.html The selection of a kernel for long-term support is purely a matter of a maintainer having the need and the time to maintain that release. There From e648174b53f1e29ee72ef33756a97ffb8241b6a5 Mon Sep 17 00:00:00 2001 From: Mushahid Hussain Date: Mon, 17 Oct 2022 16:20:26 +0500 Subject: [PATCH 16/38] Documentation: Fix spelling mistake in hacking.rst Fix `botton half locks` to `bottom half locks`. Signed-off-by: Mushahid Hussain Link: https://lore.kernel.org/r/20221017112026.88324-1-mushi.shar@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/kernel-hacking/hacking.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kernel-hacking/hacking.rst b/Documentation/kernel-hacking/hacking.rst index 9a1f020c8449..1717348a4404 100644 --- a/Documentation/kernel-hacking/hacking.rst +++ b/Documentation/kernel-hacking/hacking.rst @@ -120,7 +120,7 @@ You can tell you are in a softirq (or tasklet) using the .. warning:: Beware that this will return a false positive if a - :ref:`botton half lock ` is held. + :ref:`bottom half lock ` is held. Some Basic Rules ================ From 2f3f53d62307262f0086804ea7cea99b0e085450 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sat, 15 Oct 2022 18:22:01 +0900 Subject: [PATCH 17/38] docs/process/howto: Replace C89 with C11 Commit e8c07082a810 ("Kbuild: move to -std=gnu11") updated process/programming-language.rst, but failed to update process/howto.rst. Update howto.rst and resolve the inconsistency. Fixes: e8c07082a810 ("Kbuild: move to -std=gnu11") Signed-off-by: Akira Yokosawa Cc: Arnd Bergmann Cc: Federico Vaga Cc: Alex Shi Cc: Hu Haowen Cc: Tsugikazu Shibata Link: https://lore.kernel.org/r/20221015092201.32099-1-akiyks@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/process/howto.rst | 2 +- Documentation/translations/it_IT/process/howto.rst | 2 +- Documentation/translations/ja_JP/howto.rst | 2 +- Documentation/translations/ko_KR/howto.rst | 2 +- Documentation/translations/zh_CN/process/howto.rst | 2 +- Documentation/translations/zh_TW/process/howto.rst | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/process/howto.rst b/Documentation/process/howto.rst index bd15c393ba3c..cb6abcb2b6d0 100644 --- a/Documentation/process/howto.rst +++ b/Documentation/process/howto.rst @@ -36,7 +36,7 @@ experience, the following books are good for, if anything, reference: - "C: A Reference Manual" by Harbison and Steele [Prentice Hall] The kernel is written using GNU C and the GNU toolchain. While it -adheres to the ISO C89 standard, it uses a number of extensions that are +adheres to the ISO C11 standard, it uses a number of extensions that are not featured in the standard. The kernel is a freestanding C environment, with no reliance on the standard C library, so some portions of the C standard are not supported. Arbitrary long long diff --git a/Documentation/translations/it_IT/process/howto.rst b/Documentation/translations/it_IT/process/howto.rst index 15c08aea1dfe..052f1b3610cb 100644 --- a/Documentation/translations/it_IT/process/howto.rst +++ b/Documentation/translations/it_IT/process/howto.rst @@ -44,7 +44,7 @@ altro, utili riferimenti: - "C: A Reference Manual" di Harbison and Steele [Prentice Hall] Il kernel è stato scritto usando GNU C e la toolchain GNU. -Sebbene si attenga allo standard ISO C89, esso utilizza una serie di +Sebbene si attenga allo standard ISO C11, esso utilizza una serie di estensioni che non sono previste in questo standard. Il kernel è un ambiente C indipendente, che non ha alcuna dipendenza dalle librerie C standard, così alcune parti del C standard non sono supportate. diff --git a/Documentation/translations/ja_JP/howto.rst b/Documentation/translations/ja_JP/howto.rst index b47a682d8ded..b8eeb45a02d4 100644 --- a/Documentation/translations/ja_JP/howto.rst +++ b/Documentation/translations/ja_JP/howto.rst @@ -65,7 +65,7 @@ Linux カーネル開発のやり方 - 『新・詳説 C 言語 H&S リファレンス』 (サミュエル P ハービソン/ガイ L スティール共著 斉藤 信男監訳)[ソフトバンク] カーネルは GNU C と GNU ツールチェインを使って書かれています。カーネル -は ISO C89 仕様に準拠して書く一方で、標準には無い言語拡張を多く使って +は ISO C11 仕様に準拠して書く一方で、標準には無い言語拡張を多く使って います。カーネルは標準 C ライブラリに依存しない、C 言語非依存環境です。 そのため、C の標準の中で使えないものもあります。特に任意の long long の除算や浮動小数点は使えません。カーネルがツールチェインや C 言語拡張 diff --git a/Documentation/translations/ko_KR/howto.rst b/Documentation/translations/ko_KR/howto.rst index df53fafd1b10..969e91a95bb0 100644 --- a/Documentation/translations/ko_KR/howto.rst +++ b/Documentation/translations/ko_KR/howto.rst @@ -62,7 +62,7 @@ Documentation/process/howto.rst - "Practical C Programming" by Steve Oualline [O'Reilly] - "C: A Reference Manual" by Harbison and Steele [Prentice Hall] -커널은 GNU C와 GNU 툴체인을 사용하여 작성되었다. 이 툴들은 ISO C89 표준을 +커널은 GNU C와 GNU 툴체인을 사용하여 작성되었다. 이 툴들은 ISO C11 표준을 따르는 반면 표준에 있지 않은 많은 확장기능도 가지고 있다. 커널은 표준 C 라이브러리와는 관계없이 freestanding C 환경이어서 C 표준의 일부는 지원되지 않는다. 임의의 long long 나누기나 floating point는 지원되지 않는다. diff --git a/Documentation/translations/zh_CN/process/howto.rst b/Documentation/translations/zh_CN/process/howto.rst index 5bf953146929..888978a62db3 100644 --- a/Documentation/translations/zh_CN/process/howto.rst +++ b/Documentation/translations/zh_CN/process/howto.rst @@ -45,7 +45,7 @@ Linux内核大部分是由C语言写成的,一些体系结构相关的代码 - "C: A Reference Manual" by Harbison and Steele [Prentice Hall] 《C语言参考手册(原书第5版)》(邱仲潘 等译)[机械工业出版社] -Linux内核使用GNU C和GNU工具链开发。虽然它遵循ISO C89标准,但也用到了一些 +Linux内核使用GNU C和GNU工具链开发。虽然它遵循ISO C11标准,但也用到了一些 标准中没有定义的扩展。内核是自给自足的C环境,不依赖于标准C库的支持,所以 并不支持C标准中的部分定义。比如long long类型的大数除法和浮点运算就不允许 使用。有时候确实很难弄清楚内核对工具链的要求和它所使用的扩展,不幸的是目 diff --git a/Documentation/translations/zh_TW/process/howto.rst b/Documentation/translations/zh_TW/process/howto.rst index 86b0d4c6d6f9..8fb8edcaee66 100644 --- a/Documentation/translations/zh_TW/process/howto.rst +++ b/Documentation/translations/zh_TW/process/howto.rst @@ -48,7 +48,7 @@ Linux內核大部分是由C語言寫成的,一些體系結構相關的代碼 - "C: A Reference Manual" by Harbison and Steele [Prentice Hall] 《C語言參考手冊(原書第5版)》(邱仲潘 等譯)[機械工業出版社] -Linux內核使用GNU C和GNU工具鏈開發。雖然它遵循ISO C89標準,但也用到了一些 +Linux內核使用GNU C和GNU工具鏈開發。雖然它遵循ISO C11標準,但也用到了一些 標準中沒有定義的擴展。內核是自給自足的C環境,不依賴於標準C庫的支持,所以 並不支持C標準中的部分定義。比如long long類型的大數除法和浮點運算就不允許 使用。有時候確實很難弄清楚內核對工具鏈的要求和它所使用的擴展,不幸的是目 From 76a66ba101329316a5d7f4275070be22eb85fdf2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 21 Oct 2022 08:43:45 +0800 Subject: [PATCH 18/38] btrfs: don't use btrfs_chunk::sub_stripes from disk [BUG] There are two reports (the earliest one from LKP, a more recent one from kernel bugzilla) that we can have some chunks with 0 as sub_stripes. This will cause divide-by-zero errors at btrfs_rmap_block, which is introduced by a recent kernel patch ac0677348f3c ("btrfs: merge calculations for simple striped profiles in btrfs_rmap_block"): if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { stripe_nr = stripe_nr * map->num_stripes + i; stripe_nr = div_u64(stripe_nr, map->sub_stripes); <<< } [CAUSE] From the more recent report, it has been proven that we have some chunks with 0 as sub_stripes, mostly caused by older mkfs. It turns out that the mkfs.btrfs fix is only introduced in 6718ab4d33aa ("btrfs-progs: Initialize sub_stripes to 1 in btrfs_alloc_data_chunk") which is included in v5.4 btrfs-progs release. So there would be quite some old filesystems with such 0 sub_stripes. [FIX] Just don't trust the sub_stripes values from disk. We have a trusted btrfs_raid_array[] to fetch the correct sub_stripes numbers for each profile and that are fixed. By this, we can keep the compatibility with older filesystems while still avoid divide-by-zero bugs. Reported-by: kernel test robot Reported-by: Viktor Kuzmin Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216559 Fixes: ac0677348f3c ("btrfs: merge calculations for simple striped profiles in btrfs_rmap_block") CC: stable@vger.kernel.org # 6.0 Reviewed-by: Su Yue Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 94ba46d57920..a8d4bc6a1937 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7142,6 +7142,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 devid; u64 type; u8 uuid[BTRFS_UUID_SIZE]; + int index; int num_stripes; int ret; int i; @@ -7149,6 +7150,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); + index = btrfs_bg_flags_to_raid_index(type); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); #if BITS_PER_LONG == 32 @@ -7202,7 +7204,15 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; - map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + /* + * We can't use the sub_stripes value, as for profiles other than + * RAID10, they may have 0 as sub_stripes for filesystems created by + * older mkfs (sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { From 86c4f0d547f6460d0426ebb3ba0614f1134b8cda Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 29 Sep 2022 15:52:03 -0700 Subject: [PATCH 19/38] KVM: x86: Mask off reserved bits in CPUID.8000001FH KVM_GET_SUPPORTED_CPUID should only enumerate features that KVM actually supports. CPUID.8000001FH:EBX[31:16] are reserved bits and should be masked off. Fixes: 8765d75329a3 ("KVM: X86: Extend CPUID range to include new leaf") Signed-off-by: Jim Mattson Message-Id: <20220929225203.2234702-6-jmattson@google.com> Cc: stable@vger.kernel.org [Clear NumVMPL too. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a0292ba650df..0810e93cbedc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1199,7 +1199,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; } else { cpuid_entry_override(entry, CPUID_8000_001F_EAX); - + /* Clear NumVMPL since KVM does not support VMPL. */ + entry->ebx &= ~GENMASK(31, 12); /* * Enumerate '0' for "PA bits reduction", the adjusted * MAXPHYADDR is enumerated directly (see 0x80000008). From 5aa02366773376a1fd3a5c6a815e5f6e026ab391 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Fri, 14 Oct 2022 15:55:11 +0800 Subject: [PATCH 20/38] KVM: x86: Reduce refcount if single_open() fails in kvm_mmu_rmaps_stat_open() Refcount is increased before calling single_open() in kvm_mmu_rmaps_stat_open(), If single_open() fails, refcount should be restored, otherwise the vm couldn't be destroyed. Fixes: 3bcd0662d66fd ("KVM: X86: Introduce mmu_rmaps_stat per-vm debugfs file") Signed-off-by: Hou Wenlong Message-Id: [Preserved return value of single_open. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/debugfs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index cfed36aba2f7..c1390357126a 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -158,11 +158,16 @@ out: static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) { struct kvm *kvm = inode->i_private; + int r; if (!kvm_get_kvm_safe(kvm)) return -ENOENT; - return single_open(file, kvm_mmu_rmaps_stat_show, kvm); + r = single_open(file, kvm_mmu_rmaps_stat_show, kvm); + if (r < 0) + kvm_put_kvm(kvm); + + return r; } static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) From 180418e2eb33be5c8d0b703c843e0ebc045aef80 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 17 Oct 2022 11:06:10 +0800 Subject: [PATCH 21/38] KVM: debugfs: Return retval of simple_attr_open() if it fails Although simple_attr_open() fails only with -ENOMEM with current code base, it would be nicer to return retval of simple_attr_open() directly in kvm_debugfs_open(). No functional change intended. Signed-off-by: Hou Wenlong Message-Id: <69d64d93accd1f33691b8a383ae555baee80f943.1665975828.git.houwenlong.hwl@antgroup.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1376a47fedee..f1df24c2bc84 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5409,6 +5409,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { + int ret; struct kvm_stat_data *stat_data = (struct kvm_stat_data *) inode->i_private; @@ -5420,15 +5421,13 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; - if (simple_attr_open(inode, file, get, - kvm_stats_debugfs_mode(stat_data->desc) & 0222 - ? set : NULL, - fmt)) { + ret = simple_attr_open(inode, file, get, + kvm_stats_debugfs_mode(stat_data->desc) & 0222 + ? set : NULL, fmt); + if (ret) kvm_put_kvm(stat_data->kvm); - return -ENOMEM; - } - return 0; + return ret; } static int kvm_debugfs_release(struct inode *inode, struct file *file) From 44fc40a015af7511408f7b447e2c0c2da056fd95 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 13 Oct 2022 23:46:38 +0200 Subject: [PATCH 22/38] MAINTAINERS: git://github -> https://github.com for kvm-riscv Github deprecated the git:// links about a year ago, so let's move to the https:// URLs instead. Reported-by: Conor Dooley Link: https://github.blog/2021-09-01-improving-git-protocol-security-github/ Signed-off-by: Palmer Dabbelt Signed-off-by: Paolo Bonzini --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index cf0f18502372..7d62b1640930 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11241,7 +11241,7 @@ L: kvm@vger.kernel.org L: kvm-riscv@lists.infradead.org L: linux-riscv@lists.infradead.org S: Maintained -T: git git://github.com/kvm-riscv/linux.git +T: git https://github.com/kvm-riscv/linux.git F: arch/riscv/include/asm/kvm* F: arch/riscv/include/uapi/asm/kvm* F: arch/riscv/kvm/ From dea0d5a2fde62237ff14c41cb05dd151cebf84c0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Sep 2022 23:00:08 +0000 Subject: [PATCH 23/38] KVM: x86: Exempt pending triple fault from event injection sanity check Exempt pending triple faults, a.k.a. KVM_REQ_TRIPLE_FAULT, when asserting that KVM didn't attempt to queue a new exception during event injection. KVM needs to emulate the injection itself when emulating Real Mode due to lack of unrestricted guest support (VMX) and will queue a triple fault if that emulation fails. Ideally the assertion would more precisely filter out the emulated Real Mode triple fault case, but rmode.vm86_active is buried in vcpu_vmx and can't be queried without a new kvm_x86_ops. And unlike "regular" exceptions, triple fault cannot put the vCPU into an infinite loop; the triple fault will force either an exit to userspace or a nested VM-Exit, and triple fault after nested VM-Exit will force an exit to userspace. I.e. there is no functional issue, so just suppress the warning for triple faults. Opportunistically convert the warning to a one-time thing, when it fires, it fires _a lot_, and is usually user triggerable, i.e. can be used to spam the kernel log. Fixes: 7055fb113116 ("KVM: x86: Treat pending TRIPLE_FAULT requests as pending exceptions") Reported-by: kernel test robot Link: https://lore.kernel.org/r/202209301338.aca913c3-yujie.liu@intel.com Signed-off-by: Sean Christopherson Message-Id: <20220930230008.1636044-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cf1ba865562..104b72df33d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10044,7 +10044,20 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, kvm_x86_ops.nested_ops->has_events(vcpu)) *req_immediate_exit = true; - WARN_ON(kvm_is_exception_pending(vcpu)); + /* + * KVM must never queue a new exception while injecting an event; KVM + * is done emulating and should only propagate the to-be-injected event + * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an + * infinite loop as KVM will bail from VM-Enter to inject the pending + * exception and start the cycle all over. + * + * Exempt triple faults as they have special handling and won't put the + * vCPU into an infinite loop. Triple fault can be queued when running + * VMX without unrestricted guest, as that requires KVM to emulate Real + * Mode events (see kvm_inject_realmode_interrupt()). + */ + WARN_ON_ONCE(vcpu->arch.exception.pending || + vcpu->arch.exception_vmexit.pending); return 0; out: From 1c1a41497ab879ac9608f3047f230af833eeef3d Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Tue, 25 Oct 2022 08:37:49 -0400 Subject: [PATCH 24/38] KVM: VMX: fully disable SGX if SECONDARY_EXEC_ENCLS_EXITING unavailable Clear enable_sgx if ENCLS-exiting is not supported, i.e. if SGX cannot be virtualized. When KVM is loaded, adjust_vmx_controls checks that the bit is available before enabling the feature; however, other parts of the code check enable_sgx and not clearing the variable caused two different bugs, mostly affecting nested virtualization scenarios. First, because enable_sgx remained true, SECONDARY_EXEC_ENCLS_EXITING would be marked available in the capability MSR that are accessed by a nested hypervisor. KVM would then propagate the control from vmcs12 to vmcs02 even if it isn't supported by the processor, thus causing an unexpected VM-Fail (exit code 0x7) in L1. Second, vmx_set_cpu_caps() would not clear the SGX bits when hardware support is unavailable. This is a much less problematic bug as it only happens if SGX is soft-disabled (available in the processor but hidden in CPUID) or if SGX is supported for bare metal but not in the VMCS (will never happen when running on bare metal, but can theoertically happen when running in a VM). Last but not least, this ensures that module params in sysfs reflect KVM's actual configuration. RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=2127128 Fixes: 72add915fbd5 ("KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC") Cc: stable@vger.kernel.org Suggested-by: Sean Christopherson Suggested-by: Bandan Das Signed-off-by: Emanuele Giuseppe Esposito Message-Id: <20221025123749.2201649-1-eesposit@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9dba04b6b019..65f092e4a81b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8263,6 +8263,11 @@ static __init int hardware_setup(void) if (!cpu_has_virtual_nmis()) enable_vnmi = 0; +#ifdef CONFIG_X86_SGX_KVM + if (!cpu_has_vmx_encls_vmexit()) + enable_sgx = false; +#endif + /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not From 52491a38b2c2411f3f0229dc6ad610349c704a41 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:19 +0000 Subject: [PATCH 25/38] KVM: Initialize gfn_to_pfn_cache locks in dedicated helper Move the gfn_to_pfn_cache lock initialization to another helper and call the new helper during VM/vCPU creation. There are race conditions possible due to kvm_gfn_to_pfn_cache_init()'s ability to re-initialize the cache's locks. For example: a race between ioctl(KVM_XEN_HVM_EVTCHN_SEND) and kvm_gfn_to_pfn_cache_init() leads to a corrupted shinfo gpc lock. (thread 1) | (thread 2) | kvm_xen_set_evtchn_fast | read_lock_irqsave(&gpc->lock, ...) | | kvm_gfn_to_pfn_cache_init | rwlock_init(&gpc->lock) read_unlock_irqrestore(&gpc->lock, ...) | Rename "cache_init" and "cache_destroy" to activate+deactivate to avoid implying that the cache really is destroyed/freed. Note, there more races in the newly named kvm_gpc_activate() that will be addressed separately. Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support") Cc: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj [sean: call out that this is a bug fix] Signed-off-by: Sean Christopherson Message-Id: <20221013211234.1318131-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 +++++---- arch/x86/kvm/xen.c | 57 +++++++++++++++++++++------------------- include/linux/kvm_host.h | 24 ++++++++++++----- virt/kvm/pfncache.c | 21 ++++++++------- 4 files changed, 66 insertions(+), 48 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 104b72df33d6..521b433f978c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2315,11 +2315,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, /* we verify if the enable bit is set... */ if (system_time & 1) { - kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu, - KVM_HOST_USES_PFN, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info)); + kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, + KVM_HOST_USES_PFN, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); } else { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); } return; @@ -3388,7 +3388,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -11829,6 +11829,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + kvm_gpc_init(&vcpu->arch.pv_time); + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 93c628d3e3a9..b2be60c6efa4 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int idx = srcu_read_lock(&kvm->srcu); if (gfn == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(kvm, gpc); + kvm_gpc_deactivate(kvm, gpc); goto out; } do { - ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN, - gpa, PAGE_SIZE); + ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, + PAGE_SIZE); if (ret) goto out; @@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache, NULL, + KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct pvclock_vcpu_time_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; @@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); + r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_runstate_info)); break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: @@ -1816,7 +1815,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) { vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; vcpu->arch.xen.poll_evtchn = 0; + timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + + kvm_gpc_init(&vcpu->arch.xen.runstate_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -1824,18 +1828,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + del_timer_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) { idr_init(&kvm->arch.xen.evtchn_ports); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -1843,7 +1846,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 00c3448ba7f8..18592bdf4c1b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1240,8 +1240,18 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); /** - * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a - * given guest physical address. + * kvm_gpc_init - initialize gfn_to_pfn_cache. + * + * @gpc: struct gfn_to_pfn_cache object. + * + * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must + * be zero-allocated (or zeroed by the caller before init). + */ +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); + +/** + * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest + * physical address. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1265,9 +1275,9 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before * accessing the target page. */ -int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, - gpa_t gpa, unsigned long len); +int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, + gpa_t gpa, unsigned long len); /** * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache. @@ -1324,7 +1334,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); /** - * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache. + * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1332,7 +1342,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); * This removes a cache from the @kvm's list to be processed on MMU notifier * invocation. */ -void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); +void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 68ff41d39545..08f97cf97264 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -346,17 +346,20 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) } EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap); +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc) +{ + rwlock_init(&gpc->lock); + mutex_init(&gpc->refresh_lock); +} +EXPORT_SYMBOL_GPL(kvm_gpc_init); -int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, - gpa_t gpa, unsigned long len) +int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, + gpa_t gpa, unsigned long len) { WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage); if (!gpc->active) { - rwlock_init(&gpc->lock); - mutex_init(&gpc->refresh_lock); - gpc->khva = NULL; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->uhva = KVM_HVA_ERR_BAD; @@ -371,9 +374,9 @@ int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, } return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init); +EXPORT_SYMBOL_GPL(kvm_gpc_activate); -void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) +void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) { if (gpc->active) { spin_lock(&kvm->gpc_lock); @@ -384,4 +387,4 @@ void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) gpc->active = false; } } -EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy); +EXPORT_SYMBOL_GPL(kvm_gpc_deactivate); From ecbcf030b45666ad11bc98565e71dfbcb7be4393 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:20 +0000 Subject: [PATCH 26/38] KVM: Reject attempts to consume or refresh inactive gfn_to_pfn_cache Reject kvm_gpc_check() and kvm_gpc_refresh() if the cache is inactive. Not checking the active flag during refresh is particularly egregious, as KVM can end up with a valid, inactive cache, which can lead to a variety of use-after-free bugs, e.g. consuming a NULL kernel pointer or missing an mmu_notifier invalidation due to the cache not being on the list of gfns to invalidate. Note, "active" needs to be set if and only if the cache is on the list of caches, i.e. is reachable via mmu_notifier events. If a relevant mmu_notifier event occurs while the cache is "active" but not on the list, KVM will not acquire the cache's lock and so will not serailize the mmu_notifier event with active users and/or kvm_gpc_refresh(). A race between KVM_XEN_ATTR_TYPE_SHARED_INFO and KVM_XEN_HVM_EVTCHN_SEND can be exploited to trigger the bug. 1. Deactivate shinfo cache: kvm_xen_hvm_set_attr case KVM_XEN_ATTR_TYPE_SHARED_INFO kvm_gpc_deactivate kvm_gpc_unmap gpc->valid = false gpc->khva = NULL gpc->active = false Result: active = false, valid = false 2. Cause cache refresh: kvm_arch_vm_ioctl case KVM_XEN_HVM_EVTCHN_SEND kvm_xen_hvm_evtchn_send kvm_xen_set_evtchn kvm_xen_set_evtchn_fast kvm_gpc_check return -EWOULDBLOCK because !gpc->valid kvm_xen_set_evtchn_fast return -EWOULDBLOCK kvm_gpc_refresh hva_to_pfn_retry gpc->valid = true gpc->khva = not NULL Result: active = false, valid = true 3. Race ioctl KVM_XEN_HVM_EVTCHN_SEND against ioctl KVM_XEN_ATTR_TYPE_SHARED_INFO: kvm_arch_vm_ioctl case KVM_XEN_HVM_EVTCHN_SEND kvm_xen_hvm_evtchn_send kvm_xen_set_evtchn kvm_xen_set_evtchn_fast read_lock gpc->lock kvm_xen_hvm_set_attr case KVM_XEN_ATTR_TYPE_SHARED_INFO mutex_lock kvm->lock kvm_xen_shared_info_init kvm_gpc_activate gpc->khva = NULL kvm_gpc_check [ Check passes because gpc->valid is still true, even though gpc->khva is already NULL. ] shinfo = gpc->khva pending_bits = shinfo->evtchn_pending CRASH: test_and_set_bit(..., pending_bits) Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support") Cc: stable@vger.kernel.org Reported-by: : Michal Luczaj Signed-off-by: Sean Christopherson Message-Id: <20221013211234.1318131-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/pfncache.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 08f97cf97264..346e47f15572 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -81,6 +81,9 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, { struct kvm_memslots *slots = kvm_memslots(kvm); + if (!gpc->active) + return false; + if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE) return false; @@ -240,10 +243,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, { struct kvm_memslots *slots = kvm_memslots(kvm); unsigned long page_offset = gpa & ~PAGE_MASK; - kvm_pfn_t old_pfn, new_pfn; + bool unmap_old = false; unsigned long old_uhva; + kvm_pfn_t old_pfn; void *old_khva; - int ret = 0; + int ret; /* * If must fit within a single page. The 'len' argument is @@ -261,6 +265,11 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, write_lock_irq(&gpc->lock); + if (!gpc->active) { + ret = -EINVAL; + goto out_unlock; + } + old_pfn = gpc->pfn; old_khva = gpc->khva - offset_in_page(gpc->khva); old_uhva = gpc->uhva; @@ -291,6 +300,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, /* If the HVA→PFN mapping was already valid, don't unmap it. */ old_pfn = KVM_PFN_ERR_FAULT; old_khva = NULL; + ret = 0; } out: @@ -305,14 +315,15 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpc->khva = NULL; } - /* Snapshot the new pfn before dropping the lock! */ - new_pfn = gpc->pfn; + /* Detect a pfn change before dropping the lock! */ + unmap_old = (old_pfn != gpc->pfn); +out_unlock: write_unlock_irq(&gpc->lock); mutex_unlock(&gpc->refresh_lock); - if (old_pfn != new_pfn) + if (unmap_old) gpc_unmap_khva(kvm, old_pfn, old_khva); return ret; @@ -366,11 +377,19 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpc->vcpu = vcpu; gpc->usage = usage; gpc->valid = false; - gpc->active = true; spin_lock(&kvm->gpc_lock); list_add(&gpc->list, &kvm->gpc_list); spin_unlock(&kvm->gpc_lock); + + /* + * Activate the cache after adding it to the list, a concurrent + * refresh must not establish a mapping until the cache is + * reachable by mmu_notifier events. + */ + write_lock_irq(&gpc->lock); + gpc->active = true; + write_unlock_irq(&gpc->lock); } return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len); } @@ -379,12 +398,20 @@ EXPORT_SYMBOL_GPL(kvm_gpc_activate); void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) { if (gpc->active) { + /* + * Deactivate the cache before removing it from the list, KVM + * must stall mmu_notifier events until all users go away, i.e. + * until gpc->lock is dropped and refresh is guaranteed to fail. + */ + write_lock_irq(&gpc->lock); + gpc->active = false; + write_unlock_irq(&gpc->lock); + spin_lock(&kvm->gpc_lock); list_del(&gpc->list); spin_unlock(&kvm->gpc_lock); kvm_gfn_to_pfn_cache_unmap(kvm, gpc); - gpc->active = false; } } EXPORT_SYMBOL_GPL(kvm_gpc_deactivate); From a51abbbf25317c07cb00b40ae7d04a209d2a3d54 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:33 +0000 Subject: [PATCH 27/38] KVM: selftests: Add tests in xen_shinfo_test to detect lock races Tests for races between shinfo_cache (de)activation and hypercall+ioctl() processing. KVM has had bugs where activating the shared info cache multiple times and/or with concurrent users results in lock corruption, NULL pointer dereferences, and other fun. For the timer injection testcase (#22), re-arm the timer until the IRQ is successfully injected. If the timer expires while the shared info is deactivated (invalid), KVM will drop the event. Signed-off-by: Michal Luczaj Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20221013211234.1318131-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/xen_shinfo_test.c | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 8a5cb800f50e..caa3f5ab9e10 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -15,9 +15,13 @@ #include #include #include +#include #include +/* Defined in include/linux/kvm_types.h */ +#define GPA_INVALID (~(ulong)0) + #define SHINFO_REGION_GVA 0xc0000000ULL #define SHINFO_REGION_GPA 0xc0000000ULL #define SHINFO_REGION_SLOT 10 @@ -44,6 +48,8 @@ #define MIN_STEAL_TIME 50000 +#define SHINFO_RACE_TIMEOUT 2 /* seconds */ + #define __HYPERVISOR_set_timer_op 15 #define __HYPERVISOR_sched_op 29 #define __HYPERVISOR_event_channel_op 32 @@ -148,6 +154,7 @@ static void guest_wait_for_irq(void) static void guest_code(void) { struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; + int i; __asm__ __volatile__( "sti\n" @@ -325,6 +332,49 @@ static void guest_code(void) guest_wait_for_irq(); GUEST_SYNC(21); + /* Racing host ioctls */ + + guest_wait_for_irq(); + + GUEST_SYNC(22); + /* Racing vmcall against host ioctl */ + + ports[0] = 0; + + p = (struct sched_poll) { + .ports = ports, + .nr_ports = 1, + .timeout = 0 + }; + +wait_for_timer: + /* + * Poll for a timer wake event while the worker thread is mucking with + * the shared info. KVM XEN drops timer IRQs if the shared info is + * invalid when the timer expires. Arbitrarily poll 100 times before + * giving up and asking the VMM to re-arm the timer. 100 polls should + * consume enough time to beat on KVM without taking too long if the + * timer IRQ is dropped due to an invalid event channel. + */ + for (i = 0; i < 100 && !guest_saw_irq; i++) + asm volatile("vmcall" + : "=a" (rax) + : "a" (__HYPERVISOR_sched_op), + "D" (SCHEDOP_poll), + "S" (&p) + : "memory"); + + /* + * Re-send the timer IRQ if it was (likely) dropped due to the timer + * expiring while the event channel was invalid. + */ + if (!guest_saw_irq) { + GUEST_SYNC(23); + goto wait_for_timer; + } + guest_saw_irq = false; + + GUEST_SYNC(24); } static int cmp_timespec(struct timespec *a, struct timespec *b) @@ -352,11 +402,36 @@ static void handle_alrm(int sig) TEST_FAIL("IRQ delivery timed out"); } +static void *juggle_shinfo_state(void *arg) +{ + struct kvm_vm *vm = (struct kvm_vm *)arg; + + struct kvm_xen_hvm_attr cache_init = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE + }; + + struct kvm_xen_hvm_attr cache_destroy = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = GPA_INVALID + }; + + for (;;) { + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_init); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_destroy); + pthread_testcancel(); + }; + + return NULL; +} + int main(int argc, char *argv[]) { struct timespec min_ts, max_ts, vm_ts; struct kvm_vm *vm; + pthread_t thread; bool verbose; + int ret; verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) || !strncmp(argv[1], "--verbose", 10)); @@ -785,6 +860,71 @@ int main(int argc, char *argv[]) case 21: TEST_ASSERT(!evtchn_irq_expected, "Expected event channel IRQ but it didn't happen"); + alarm(0); + + if (verbose) + printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n"); + + ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm); + TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret)); + + struct kvm_irq_routing_xen_evtchn uxe = { + .port = 1, + .vcpu = vcpu->id, + .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL + }; + + evtchn_irq_expected = true; + for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;) + __vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe); + break; + + case 22: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + if (verbose) + printf("Testing shinfo lock corruption (SCHEDOP_poll)\n"); + + shinfo->evtchn_pending[0] = 1; + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time + + SHINFO_RACE_TIMEOUT * 1000000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + break; + + case 23: + /* + * Optional and possibly repeated sync point. + * Injecting the timer IRQ may fail if the + * shinfo is invalid when the timer expires. + * If the timer has expired but the IRQ hasn't + * been delivered, rearm the timer and retry. + */ + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + + /* Resume the guest if the timer is still pending. */ + if (tmr.u.timer.expires_ns) + break; + + /* All done if the IRQ was delivered. */ + if (!evtchn_irq_expected) + break; + + tmr.u.timer.expires_ns = rs->state_entry_time + + SHINFO_RACE_TIMEOUT * 1000000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + break; + case 24: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + ret = pthread_cancel(thread); + TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret)); + + ret = pthread_join(thread, 0); + TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret)); goto done; case 0x20: From 5addaf530995ac203fa46efde0d1ded4c15ff98e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:34 +0000 Subject: [PATCH 28/38] KVM: selftests: Mark "guest_saw_irq" as volatile in xen_shinfo_test Tag "guest_saw_irq" as "volatile" to ensure that the compiler will never optimize away lookups. Relying on the compiler thinking that the flag is global and thus might change also works, but it's subtle, less robust, and looks like a bug at first glance, e.g. risks being "fixed" and breaking the test. Make the flag "static" as well since convincing the compiler it's global is no longer necessary. Alternatively, the flag could be accessed with {READ,WRITE}_ONCE(), but literally every access would need the wrappers, and eking out performance isn't exactly top priority for selftests. Signed-off-by: Sean Christopherson Message-Id: <20221013211234.1318131-17-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index caa3f5ab9e10..2a5727188c8d 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -132,7 +132,7 @@ struct { struct kvm_irq_routing_entry entries[2]; } irq_routes; -bool guest_saw_irq; +static volatile bool guest_saw_irq; static void evtchn_handler(struct ex_regs *regs) { From 5015bb89b58225f97df6ac44383e7e8c8662c8c9 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:28 +0300 Subject: [PATCH 29/38] KVM: x86: emulator: em_sysexit should update ctxt->mode SYSEXIT is one of the instructions that can change the processor mode, thus ctxt->mode should be updated after it. Note that this is likely a benign bug, because the only problematic mode change is from 32 bit to 64 bit which can lead to truncation of RIP, and it is not possible to do with sysexit, since sysexit running in 32 bit mode will be limited to 32 bit version. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-11-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3b27622d4642..261732957431 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2876,6 +2876,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ctxt->_eip = rdx; + ctxt->mode = usermode; *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; From d087e0f79fa0dd336a9a6b2f79ec23120f5eff73 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:29 +0300 Subject: [PATCH 30/38] KVM: x86: emulator: introduce emulator_recalc_and_set_mode Some instructions update the cpu execution mode, which needs to update the emulation mode. Extract this code, and make assign_eip_far use it. assign_eip_far now reads CS, instead of getting it via a parameter, which is ok, because callers always assign CS to the same value before calling this function. No functional change is intended. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-12-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 87 ++++++++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 261732957431..e5522a23d985 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -791,8 +791,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ctxt->mode, linear); } -static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, - enum x86emul_mode mode) +static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) { ulong linear; int rc; @@ -802,41 +801,71 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); - rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); + rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; } -static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt) { - return assign_eip(ctxt, dst, ctxt->mode); + u64 efer; + struct desc_struct cs; + u16 selector; + u32 base3; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) { + /* Real mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_REAL; + return X86EMUL_CONTINUE; + } + + if (ctxt->eflags & X86_EFLAGS_VM) { + /* Protected/VM86 mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_VM86; + return X86EMUL_CONTINUE; + } + + if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS)) + return X86EMUL_UNHANDLEABLE; + + if (efer & EFER_LMA) { + if (cs.l) { + /* Proper long mode */ + ctxt->mode = X86EMUL_MODE_PROT64; + } else if (cs.d) { + /* 32 bit compatibility mode*/ + ctxt->mode = X86EMUL_MODE_PROT32; + } else { + ctxt->mode = X86EMUL_MODE_PROT16; + } + } else { + /* Legacy 32 bit / 16 bit mode */ + ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + } + + return X86EMUL_CONTINUE; } -static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, - const struct desc_struct *cs_desc) +static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) { - enum x86emul_mode mode = ctxt->mode; - int rc; + return assign_eip(ctxt, dst); +} -#ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT16) { - if (cs_desc->l) { - u64 efer = 0; +static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + int rc = emulator_recalc_and_set_mode(ctxt); - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; - } else - mode = X86EMUL_MODE_PROT32; /* temporary value */ - } -#endif - if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) - mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - rc = assign_eip(ctxt, dst, mode); - if (rc == X86EMUL_CONTINUE) - ctxt->mode = mode; - return rc; + if (rc != X86EMUL_CONTINUE) + return rc; + + return assign_eip(ctxt, dst); } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -2172,7 +2201,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2250,7 +2279,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, eip, &new_desc); + rc = assign_eip_far(ctxt, eip); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -3470,7 +3499,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); if (rc != X86EMUL_CONTINUE) goto fail; From 055f37f84e304e59c046d1accfd8f08462f52c4c Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:30 +0300 Subject: [PATCH 31/38] KVM: x86: emulator: update the emulation mode after rsm Update the emulation mode after RSM so that RIP will be correctly written back, because the RSM instruction can switch the CPU mode from 32 bit (or less) to 64 bit. This fixes a guest crash in case the #SMI is received while the guest runs a code from an address > 32 bit. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-13-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e5522a23d985..33385ebae100 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2662,7 +2662,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) * those side effects need to be explicitly handled for both success * and shutdown. */ - return X86EMUL_CONTINUE; + return emulator_recalc_and_set_mode(ctxt); emulate_shutdown: ctxt->ops->triple_fault(ctxt); From ad8f9e69942c7db90758d9d774157e53bce94840 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:31 +0300 Subject: [PATCH 32/38] KVM: x86: emulator: update the emulation mode after CR0 write Update the emulation mode when handling writes to CR0, because toggling CR0.PE switches between Real and Protected Mode, and toggling CR0.PG when EFER.LME=1 switches between Long and Protected Mode. This is likely a benign bug because there is no writeback of state, other than the RIP increment, and when toggling CR0.PE, the CPU has to execute code from a very low memory address. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-14-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 33385ebae100..2954c046740b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3641,11 +3641,25 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) static int em_cr_write(struct x86_emulate_ctxt *ctxt) { - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + int cr_num = ctxt->modrm_reg; + int r; + + if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val)) return emulate_gp(ctxt, 0); /* Disable writeback. */ ctxt->dst.type = OP_NONE; + + if (cr_num == 0) { + /* + * CR0 write might have updated CR0.PE and/or CR0.PG + * which can affect the cpu's execution mode. + */ + r = emulator_recalc_and_set_mode(ctxt); + if (r != X86EMUL_CONTINUE) + return r; + } + return X86EMUL_CONTINUE; } From 696db303e54f7352623d9f640e6c51d8fa9d5588 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:32 +0300 Subject: [PATCH 33/38] KVM: x86: smm: number of GPRs in the SMRAM image depends on the image format On 64 bit host, if the guest doesn't have X86_FEATURE_LM, KVM will access 16 gprs to 32-bit smram image, causing out-ouf-bound ram access. On 32 bit host, the rsm_load_state_64/enter_smm_save_state_64 is compiled out, thus access overflow can't happen. Fixes: b443183a25ab61 ("KVM: x86: Reduce the number of emulator GPRs to '8' for 32-bit KVM") Signed-off-by: Maxim Levitsky Reviewed-by: Sean Christopherson Message-Id: <20221025124741.228045-15-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2954c046740b..4a43261d25a2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2461,7 +2461,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < NR_EMULATOR_GPRS; i++) + for (i = 0; i < 8; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2518,7 +2518,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for (i = 0; i < NR_EMULATOR_GPRS; i++) + for (i = 0; i < 16; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); From 8cf0a1bc12870d148ae830a4ba88cfdf0e879cee Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 25 Oct 2022 21:33:57 +0800 Subject: [PATCH 34/38] capabilities: fix potential memleak on error path from vfs_getxattr_alloc() In cap_inode_getsecurity(), we will use vfs_getxattr_alloc() to complete the memory allocation of tmpbuf, if we have completed the memory allocation of tmpbuf, but failed to call handler->get(...), there will be a memleak in below logic: |-- ret = (int)vfs_getxattr_alloc(mnt_userns, ...) | /* ^^^ alloc for tmpbuf */ |-- value = krealloc(*xattr_value, error + 1, flags) | /* ^^^ alloc memory */ |-- error = handler->get(handler, ...) | /* error! */ |-- *xattr_value = value | /* xattr_value is &tmpbuf (memory leak!) */ So we will try to free(tmpbuf) after vfs_getxattr_alloc() fails to fix it. Cc: stable@vger.kernel.org Fixes: 8db6c34f1dbc ("Introduce v3 namespaced file capabilities") Signed-off-by: Gaosheng Cui Acked-by: Serge Hallyn [PM: subject line and backtrace tweaks] Signed-off-by: Paul Moore --- security/commoncap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 5fc8986c3c77..bc751fa5adad 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -401,8 +401,10 @@ int cap_inode_getsecurity(struct user_namespace *mnt_userns, &tmpbuf, size, GFP_NOFS); dput(dentry); - if (ret < 0 || !tmpbuf) - return ret; + if (ret < 0 || !tmpbuf) { + size = ret; + goto out_free; + } fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; From 7353633814f6e5b4899fb9ee1483709d6bb0e1cd Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Fri, 28 Oct 2022 09:26:31 +0000 Subject: [PATCH 35/38] KVM: x86/xen: Fix eventfd error handling in kvm_xen_eventfd_assign() Should not call eventfd_ctx_put() in case of error. Fixes: 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests") Reported-by: syzbot+6f0c896c5a9449a10ded@syzkaller.appspotmail.com Signed-off-by: Eiichi Tsukata Message-Id: <20221028092631.117438-1-eiichi.tsukata@nutanix.com> [Introduce new goto target instead. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index b2be60c6efa4..2dae413bd62a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1666,18 +1666,18 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, case EVTCHNSTAT_ipi: /* IPI must map back to the same port# */ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) - goto out; /* -EINVAL */ + goto out_noeventfd; /* -EINVAL */ break; case EVTCHNSTAT_interdomain: if (data->u.evtchn.deliver.port.port) { if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) - goto out; /* -EINVAL */ + goto out_noeventfd; /* -EINVAL */ } else { eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); if (IS_ERR(eventfd)) { ret = PTR_ERR(eventfd); - goto out; + goto out_noeventfd; } } break; @@ -1717,6 +1717,7 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, out: if (eventfd) eventfd_ctx_put(eventfd); +out_noeventfd: kfree(evtchnfd); return ret; } From bfc3b0f05653a28c8d41067a2aa3875d1f982e3e Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 9 Oct 2022 20:29:36 +0200 Subject: [PATCH 36/38] tools/nolibc: Fix missing strlen() definition and infinite loop with gcc-12 When built at -Os, gcc-12 recognizes an strlen() pattern in nolibc_strlen() and replaces it with a jump to strlen(), which is not defined as a symbol and breaks compilation. Worse, when the function is called strlen(), the function is simply replaced with a jump to itself, hence becomes an infinite loop. One way to avoid this is to always set -ffreestanding, but the calling code doesn't know this and there's no way (either via attributes or pragmas) to globally enable it from include files, effectively leaving a painful situation for the caller. Alexey suggested to place an empty asm() statement inside the loop to stop gcc from recognizing a well-known pattern, which happens to work pretty fine. At least it allows us to make sure our local definition is not replaced with a self jump. The function only needs to be renamed back to strlen() so that the symbol exists, which implies that nolibc_strlen() which is used on variable strings has to be declared as a macro that points back to it before the strlen() macro is redifined. It was verified to produce valid code with gcc 3.4 to 12.1 at different optimization levels, and both with constant and variable strings. In case this problem surfaces again in the future, an alternate approach consisting in adding an optimize("no-tree-loop-distribute-patterns") function attribute for gcc>=12 worked as well but is less pretty. Reported-by: kernel test robot Link: https://lore.kernel.org/r/202210081618.754a77db-yujie.liu@intel.com Fixes: 66b6f755ad45 ("rcutorture: Import a copy of nolibc") Fixes: 96980b833a21 ("tools/nolibc/string: do not use __builtin_strlen() at -O0") Cc: "Paul E. McKenney" Cc: Alexey Dobriyan Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index bef35bee9c44..718a405ffbc3 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -125,14 +125,18 @@ char *strcpy(char *dst, const char *src) } /* this function is only used with arguments that are not constants or when - * it's not known because optimizations are disabled. + * it's not known because optimizations are disabled. Note that gcc 12 + * recognizes an strlen() pattern and replaces it with a jump to strlen(), + * thus itself, hence the asm() statement below that's meant to disable this + * confusing practice. */ static __attribute__((unused)) -size_t nolibc_strlen(const char *str) +size_t strlen(const char *str) { size_t len; - for (len = 0; str[len]; len++); + for (len = 0; str[len]; len++) + asm(""); return len; } @@ -140,13 +144,12 @@ size_t nolibc_strlen(const char *str) * the two branches, then will rely on an external definition of strlen(). */ #if defined(__OPTIMIZE__) +#define nolibc_strlen(x) strlen(x) #define strlen(str) ({ \ __builtin_constant_p((str)) ? \ __builtin_strlen((str)) : \ nolibc_strlen((str)); \ }) -#else -#define strlen(str) nolibc_strlen((str)) #endif static __attribute__((unused)) From b3f4f51ea68a495f8a5956064c33dce711a2df91 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 21 Oct 2022 08:01:53 +0200 Subject: [PATCH 37/38] tools/nolibc/string: Fix memcmp() implementation The C standard says that memcmp() must treat the buffers as consisting of "unsigned chars". If char happens to be unsigned, the casts are ok, but then obviously the c1 variable can never contain a negative value. And when char is signed, the casts are wrong, and there's still a problem with using an 8-bit quantity to hold the difference, because that can range from -255 to +255. For example, assuming char is signed, comparing two 1-byte buffers, one containing 0x00 and another 0x80, the current implementation would return -128 for both memcmp(a, b, 1) and memcmp(b, a, 1), whereas one of those should of course return something positive. Signed-off-by: Rasmus Villemoes Fixes: 66b6f755ad45 ("rcutorture: Import a copy of nolibc") Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 718a405ffbc3..ad97c0d522b8 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -19,9 +19,9 @@ static __attribute__((unused)) int memcmp(const void *s1, const void *s2, size_t n) { size_t ofs = 0; - char c1 = 0; + int c1 = 0; - while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { + while (ofs < n && !(c1 = ((unsigned char *)s1)[ofs] - ((unsigned char *)s2)[ofs])) { ofs++; } return c1; From d3aefd2b29ff5ffdeb5c06a7d3191a027a18cdb8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 31 Oct 2022 11:49:21 -0400 Subject: [PATCH 38/38] nfsd: fix net-namespace logic in __nfsd_file_cache_purge If the namespace doesn't match the one in "net", then we'll continue, but that doesn't cause another rhashtable_walk_next call, so it will loop infinitely. Fixes: ce502f81ba88 ("NFSD: Convert the filecache to use rhashtable") Reported-by: Petr Vorel Link: https://lore.kernel.org/ltp/Y1%2FP8gDAcWC%2F+VR3@pevik/ Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 29a62db155fb..adc4e87a71d2 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -893,9 +893,8 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { - if (net && nf->nf_net != net) - continue; - nfsd_file_unhash_and_dispose(nf, &dispose); + if (!net || nf->nf_net == net) + nfsd_file_unhash_and_dispose(nf, &dispose); nf = rhashtable_walk_next(&iter); }