From d01b5fc061683e8fc28499b283fab39838730c9b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Mar 2021 17:44:12 +0100 Subject: [PATCH 01/26] ext4: handle error of ext4_setup_system_zone() on remount commit d176b1f62f242ab259ff665a26fbac69db1aecba upstream. ext4_setup_system_zone() can fail. Handle the failure in ext4_remount(). Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200728130437.7804-2-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 84d28d3efc60..f70c5541d718 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5148,7 +5148,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_register_li_request(sb, first_not_zeroed); } - ext4_setup_system_zone(sb); + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) ext4_commit_super(sb, 1); From f175d63724fa0738c060a1e07f796f353e4e729f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Mar 2021 17:44:13 +0100 Subject: [PATCH 02/26] ext4: don't allow overlapping system zones commit bf9a379d0980e7413d94cb18dac73db2bfc5f470 upstream. Currently, add_system_zone() just silently merges two added system zones that overlap. However the overlap should not happen and it generally suggests that some unrelated metadata overlap which indicates the fs is corrupted. We should have caught such problems earlier (e.g. in ext4_check_descriptors()) but add this check as another line of defense. In later patch we also use this for stricter checking of journal inode extent tree. Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200728130437.7804-3-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/block_validity.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 45c7b0f9a8e3..bc15226518ff 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -57,7 +57,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count) { - struct ext4_system_zone *new_entry = NULL, *entry; + struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &sbi->system_blks.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; @@ -68,30 +68,20 @@ static int add_system_zone(struct ext4_sb_info *sbi, n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; - else { - if (start_blk + count > (entry->start_blk + - entry->count)) - entry->count = (start_blk + count - - entry->start_blk); - new_node = *n; - new_entry = rb_entry(new_node, struct ext4_system_zone, - node); - break; - } + else /* Unexpected overlap of system zones. */ + return -EFSCORRUPTED; } - if (!new_entry) { - new_entry = kmem_cache_alloc(ext4_system_zone_cachep, - GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - new_entry->start_blk = start_blk; - new_entry->count = count; - new_node = &new_entry->node; + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &sbi->system_blks); - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); /* Can we merge to the left? */ node = rb_prev(new_node); From 58ef3832e6864e862e484123bf8501258e968ce0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Mar 2021 17:44:14 +0100 Subject: [PATCH 03/26] ext4: check journal inode extents more carefully commit ce9f24cccdc019229b70a5c15e2b09ad9c0ab5d1 upstream. Currently, system zones just track ranges of block, that are "important" fs metadata (bitmaps, group descriptors, journal blocks, etc.). This however complicates how extent tree (or indirect blocks) can be checked for inodes that actually track such metadata - currently the journal inode but arguably we should be treating quota files or resize inode similarly. We cannot run __ext4_ext_check() on such metadata inodes when loading their extents as that would immediately trigger the validity checks and so we just hack around that and special-case the journal inode. This however leads to a situation that a journal inode which has extent tree of depth at least one can have invalid extent tree that gets unnoticed until ext4_cache_extents() crashes. To overcome this limitation, track inode number each system zone belongs to (0 is used for zones not belonging to any inode). We can then verify inode number matches the expected one when verifying extent tree and thus avoid the false errors. With this there's no need to to special-case journal inode during extent tree checking anymore so remove it. Fixes: 0a944e8a6c66 ("ext4: don't perform block validity checks on the journal inode") Reported-by: Wolfgang Frisch Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200728130437.7804-4-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/block_validity.c | 37 +++++++++++++++++++++---------------- fs/ext4/ext4.h | 6 +++--- fs/ext4/extents.c | 16 ++++++---------- fs/ext4/indirect.c | 6 ++---- fs/ext4/inode.c | 5 ++--- fs/ext4/mballoc.c | 4 ++-- 6 files changed, 36 insertions(+), 38 deletions(-) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index bc15226518ff..c50dcda7b708 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -23,6 +23,7 @@ struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; + u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; @@ -43,7 +44,8 @@ void ext4_exit_system_zone(void) static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { - if ((entry1->start_blk + entry1->count) == entry2->start_blk) + if ((entry1->start_blk + entry1->count) == entry2->start_blk && + entry1->ino == entry2->ino) return 1; return 0; } @@ -55,7 +57,7 @@ static inline int can_merge(struct ext4_system_zone *entry1, */ static int add_system_zone(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, - unsigned int count) + unsigned int count, u32 ino) { struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &sbi->system_blks.rb_node, *node; @@ -78,6 +80,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, return -ENOMEM; new_entry->start_blk = start_blk; new_entry->count = count; + new_entry->ino = ino; new_node = &new_entry->node; rb_link_node(new_node, parent, n); @@ -153,16 +156,16 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) if (n == 0) { i++; } else { - if (!ext4_data_block_valid(sbi, map.m_pblk, n)) { - ext4_error(sb, "blocks %llu-%llu from inode %u " + err = add_system_zone(sbi, map.m_pblk, n, ino); + if (err < 0) { + if (err == -EFSCORRUPTED) { + ext4_error(sb, + "blocks %llu-%llu from inode %u " "overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1, ino); - err = -EFSCORRUPTED; + } break; } - err = add_system_zone(sbi, map.m_pblk, n); - if (err < 0) - break; i += n; } } @@ -191,16 +194,16 @@ int ext4_setup_system_zone(struct super_block *sb) if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1); + ext4_bg_num_gdb(sb, i) + 1, 0); gdp = ext4_get_group_desc(sb, i, NULL); - ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1, 0); if (ret) return ret; - ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) return ret; ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), - sbi->s_itb_per_group); + sbi->s_itb_per_group, 0); if (ret) return ret; } @@ -233,10 +236,11 @@ void ext4_release_system_zone(struct super_block *sb) * start_blk+count) is valid; 0 if some part of the block region * overlaps with filesystem metadata blocks. */ -int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, - unsigned int count) +int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, + unsigned int count) { struct ext4_system_zone *entry; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct rb_node *n = sbi->system_blks.rb_node; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || @@ -252,6 +256,8 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; else { + if (entry->ino == inode->i_ino) + return 1; sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); return 0; } @@ -274,8 +280,7 @@ int ext4_check_blockref(const char *function, unsigned int line, while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { + unlikely(!ext4_inode_block_valid(inode, blk, 1))) { es->s_last_error_block = cpu_to_le64(blk); ext4_error_inode(inode, function, line, blk, "invalid block"); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 54a7f3b8b1bf..1b62bd9e7312 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3136,9 +3136,9 @@ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); -extern int ext4_data_block_valid(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ab19f61bd04b..7f291b7be7f3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -389,7 +389,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) */ if (lblock + len <= lblock) return 0; - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); + return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, @@ -397,7 +397,7 @@ static int ext4_valid_extent_idx(struct inode *inode, { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); + return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, @@ -554,14 +554,10 @@ __read_extent_tree_block(const char *function, unsigned int line, } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; - if (!ext4_has_feature_journal(inode->i_sb) || - (inode->i_ino != - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) { - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); - if (err) - goto errout; - } + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index d2844fe9040d..ff7e1ac6ee53 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -840,8 +840,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { + if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); @@ -1003,8 +1002,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { + if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ccce89de6d7e..17b16bbf8361 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -378,8 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func, (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, - map->m_len)) { + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, @@ -4704,7 +4703,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = 0; if (ei->i_file_acl && - !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1d543f8b3814..807331da9dfc 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2963,7 +2963,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (!ext4_data_block_valid(sbi, block, len)) { + if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error @@ -4725,7 +4725,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && - !ext4_data_block_valid(sbi, block, count)) { + !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return; From c7f1c92a983a3ce12e58c913395ac3c0d3172e8e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Feb 2021 14:30:10 -0800 Subject: [PATCH 04/26] net: dsa: b53: Support setting learning on port commit f9b3827ee66cfcf297d0acd6ecf33653a5f297ef upstream. Add support for being able to set the learning attribute on port, and make sure that the standalone ports start up with learning disabled. We can remove the code in bcm_sf2 that configured the ports learning attribute because we want the standalone ports to have learning disabled by default and port 7 cannot be bridged, so its learning attribute will not change past its initial configuration. Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/b53/b53_common.c | 20 ++++++++++++++++++++ drivers/net/dsa/b53/b53_regs.h | 1 + drivers/net/dsa/bcm_sf2.c | 5 +++++ drivers/net/dsa/bcm_sf2_regs.h | 2 ++ 4 files changed, 28 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 5ec0042bc384..b6867a8915da 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -502,6 +502,19 @@ static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) } } +static void b53_port_set_learning(struct b53_device *dev, int port, + bool learning) +{ + u16 reg; + + b53_read16(dev, B53_CTRL_PAGE, B53_DIS_LEARNING, ®); + if (learning) + reg &= ~BIT(port); + else + reg |= BIT(port); + b53_write16(dev, B53_CTRL_PAGE, B53_DIS_LEARNING, reg); +} + static int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { @@ -509,6 +522,8 @@ static int b53_enable_port(struct dsa_switch *ds, int port, unsigned int cpu_port = dev->cpu_port; u16 pvlan; + b53_port_set_learning(dev, port, false); + /* Clear the Rx and Tx disable bits and set to no spanning tree */ b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), 0); @@ -552,6 +567,8 @@ static void b53_enable_cpu_port(struct b53_device *dev) PORT_CTRL_RX_MCST_EN | PORT_CTRL_RX_UCST_EN; b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(cpu_port), port_ctrl); + + b53_port_set_learning(dev, cpu_port, false); } static void b53_enable_mib(struct b53_device *dev) @@ -1375,6 +1392,8 @@ static int b53_br_join(struct dsa_switch *ds, int port, b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(port), pvlan); dev->ports[port].vlan_ctl_mask = pvlan; + b53_port_set_learning(dev, port, true); + return 0; } @@ -1426,6 +1445,7 @@ static void b53_br_leave(struct dsa_switch *ds, int port) vl->untag |= BIT(port) | BIT(dev->cpu_port); b53_set_vlan_entry(dev, pvid, vl); } + b53_port_set_learning(dev, port, false); } static void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state) diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index 3cf246c6bdcc..aed70e76006d 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -112,6 +112,7 @@ #define B53_UC_FLOOD_MASK 0x32 #define B53_MC_FLOOD_MASK 0x34 #define B53_IPMC_FLOOD_MASK 0x36 +#define B53_DIS_LEARNING 0x3c /* * Override Ports 0-7 State on devices with xMII interfaces (8 bit) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index a3742a3b413c..0c69d5858558 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -224,6 +224,11 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, reg &= ~P_TXQ_PSM_VDD(port); core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); + /* Disable learning */ + reg = core_readl(priv, CORE_DIS_LEARN); + reg |= BIT(port); + core_writel(priv, reg, CORE_DIS_LEARN); + /* Clear the Rx and Tx disable bits and set to no spanning tree */ core_writel(priv, 0, CORE_G_PCTL_PORT(port)); diff --git a/drivers/net/dsa/bcm_sf2_regs.h b/drivers/net/dsa/bcm_sf2_regs.h index 838fe373cd6f..ca1d1f2e1161 100644 --- a/drivers/net/dsa/bcm_sf2_regs.h +++ b/drivers/net/dsa/bcm_sf2_regs.h @@ -138,6 +138,8 @@ #define CORE_SWITCH_CTRL 0x00088 #define MII_DUMB_FWDG_EN (1 << 6) +#define CORE_DIS_LEARN 0x000f0 + #define CORE_SFT_LRN_CTRL 0x000f8 #define SW_LEARN_CNTL(x) (1 << (x)) From c31cd5fd02c26c15fb1814f1e8c389362f4e17cb Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 3 May 2017 10:29:04 -0700 Subject: [PATCH 05/26] ixgbe: check for Tx timestamp timeouts during watchdog commit 622a2ef538fb3ca8eccf49716aba8267d6e95a47 upstream. The ixgbe driver has logic to handle only one Tx timestamp at a time, using a state bit lock to avoid multiple requests at once. It may be possible, if incredibly unlikely, that a Tx timestamp event is requested but never completes. Since we use an interrupt scheme to determine when the Tx timestamp occurred we would never clear the state bit in this case. Add an ixgbe_ptp_tx_hang() function similar to the already existing ixgbe_ptp_rx_hang() function. This function runs in the watchdog routine and makes sure we eventually recover from this case instead of permanently disabling Tx timestamps. Note: there is no currently known way to cause this without hacking the driver code to force it. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 27 +++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index b06e32d0d22a..293ff8cc6994 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -991,6 +991,7 @@ void ixgbe_ptp_suspend(struct ixgbe_adapter *adapter); void ixgbe_ptp_stop(struct ixgbe_adapter *adapter); void ixgbe_ptp_overflow_check(struct ixgbe_adapter *adapter); void ixgbe_ptp_rx_hang(struct ixgbe_adapter *adapter); +void ixgbe_ptp_tx_hang(struct ixgbe_adapter *adapter); void ixgbe_ptp_rx_pktstamp(struct ixgbe_q_vector *, struct sk_buff *); void ixgbe_ptp_rx_rgtstamp(struct ixgbe_q_vector *, struct sk_buff *skb); static inline void ixgbe_ptp_rx_hwtstamp(struct ixgbe_ring *rx_ring, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4c729faeb713..66b1cc0247b8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7258,6 +7258,7 @@ static void ixgbe_service_task(struct work_struct *work) if (test_bit(__IXGBE_PTP_RUNNING, &adapter->state)) { ixgbe_ptp_overflow_check(adapter); ixgbe_ptp_rx_hang(adapter); + ixgbe_ptp_tx_hang(adapter); } ixgbe_service_event_complete(adapter); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index a92277683a64..a93a1b3bb8e4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -662,6 +662,33 @@ static void ixgbe_ptp_clear_tx_timestamp(struct ixgbe_adapter *adapter) clear_bit_unlock(__IXGBE_PTP_TX_IN_PROGRESS, &adapter->state); } +/** + * ixgbe_ptp_tx_hang - detect error case where Tx timestamp never finishes + * @adapter: private network adapter structure + */ +void ixgbe_ptp_tx_hang(struct ixgbe_adapter *adapter) +{ + bool timeout = time_is_before_jiffies(adapter->ptp_tx_start + + IXGBE_PTP_TX_TIMEOUT); + + if (!adapter->ptp_tx_skb) + return; + + if (!test_bit(__IXGBE_PTP_TX_IN_PROGRESS, &adapter->state)) + return; + + /* If we haven't received a timestamp within the timeout, it is + * reasonable to assume that it will never occur, so we can unlock the + * timestamp bit when this occurs. + */ + if (timeout) { + cancel_work_sync(&adapter->ptp_tx_work); + ixgbe_ptp_clear_tx_timestamp(adapter); + adapter->tx_hwtstamp_timeouts++; + e_warn(drv, "clearing Tx timestamp hang\n"); + } +} + /** * ixgbe_ptp_tx_hwtstamp - utility function which checks for TX time stamp * @adapter: the private adapter struct From 13f759dcb05d8b27ac83eca9cae87ee738957ad1 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 29 Jan 2018 15:57:48 -0800 Subject: [PATCH 06/26] ixgbe: prevent ptp_rx_hang from running when in FILTER_ALL mode commit 6704a3abf4cf4181a1ee64f5db4969347b88ca1d upstream. On hardware which supports timestamping all packets, the timestamps are recorded in the packet buffer, and the driver no longer uses or reads the registers. This makes the logic for checking and clearing Rx timestamp hangs meaningless. If we run the ixgbe_ptp_rx_hang() function in this case, then the driver will continuously spam the log output with "Clearing Rx timestamp hang". These messages are spurious, and confusing to end users. The original code in commit a9763f3cb54c ("ixgbe: Update PTP to support X550EM_x devices", 2015-12-03) did have a flag PTP_RX_TIMESTAMP_IN_REGISTER which was intended to be used to avoid the Rx timestamp hang check, however it did not actually check the flag before calling the function. Do so now in order to stop the checks and prevent the spurious log messages. Fixes: a9763f3cb54c ("ixgbe: Update PTP to support X550EM_x devices", 2015-12-03) Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wen Yang --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 66b1cc0247b8..36d73bf32f4f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7257,7 +7257,8 @@ static void ixgbe_service_task(struct work_struct *work) if (test_bit(__IXGBE_PTP_RUNNING, &adapter->state)) { ixgbe_ptp_overflow_check(adapter); - ixgbe_ptp_rx_hang(adapter); + if (adapter->flags & IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER) + ixgbe_ptp_rx_hang(adapter); ixgbe_ptp_tx_hang(adapter); } From ca403b79f4330bb5a8df3551e39610db6c06c46f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Mar 2021 14:31:05 +0000 Subject: [PATCH 07/26] btrfs: fix race when cloning extent buffer during rewind of an old root commit dbcc7d57bffc0c8cac9dac11bec548597d59a6a5 upstream. While resolving backreferences, as part of a logical ino ioctl call or fiemap, we can end up hitting a BUG_ON() when replaying tree mod log operations of a root, triggering a stack trace like the following: ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.c:1210! invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 19054 Comm: crawl_335 Tainted: G W 5.11.0-2d11c0084b02-misc-next+ #89 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 RIP: 0010:__tree_mod_log_rewind+0x3b1/0x3c0 Code: 05 48 8d 74 10 (...) RSP: 0018:ffffc90001eb70b8 EFLAGS: 00010297 RAX: 0000000000000000 RBX: ffff88812344e400 RCX: ffffffffb28933b6 RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff88812344e42c RBP: ffffc90001eb7108 R08: 1ffff11020b60a20 R09: ffffed1020b60a20 R10: ffff888105b050f9 R11: ffffed1020b60a1f R12: 00000000000000ee R13: ffff8880195520c0 R14: ffff8881bc958500 R15: ffff88812344e42c FS: 00007fd1955e8700(0000) GS:ffff8881f5600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007efdb7928718 CR3: 000000010103a006 CR4: 0000000000170ee0 Call Trace: btrfs_search_old_slot+0x265/0x10d0 ? lock_acquired+0xbb/0x600 ? btrfs_search_slot+0x1090/0x1090 ? free_extent_buffer.part.61+0xd7/0x140 ? free_extent_buffer+0x13/0x20 resolve_indirect_refs+0x3e9/0xfc0 ? lock_downgrade+0x3d0/0x3d0 ? __kasan_check_read+0x11/0x20 ? add_prelim_ref.part.11+0x150/0x150 ? lock_downgrade+0x3d0/0x3d0 ? __kasan_check_read+0x11/0x20 ? lock_acquired+0xbb/0x600 ? __kasan_check_write+0x14/0x20 ? do_raw_spin_unlock+0xa8/0x140 ? rb_insert_color+0x30/0x360 ? prelim_ref_insert+0x12d/0x430 find_parent_nodes+0x5c3/0x1830 ? resolve_indirect_refs+0xfc0/0xfc0 ? lock_release+0xc8/0x620 ? fs_reclaim_acquire+0x67/0xf0 ? lock_acquire+0xc7/0x510 ? lock_downgrade+0x3d0/0x3d0 ? lockdep_hardirqs_on_prepare+0x160/0x210 ? lock_release+0xc8/0x620 ? fs_reclaim_acquire+0x67/0xf0 ? lock_acquire+0xc7/0x510 ? poison_range+0x38/0x40 ? unpoison_range+0x14/0x40 ? trace_hardirqs_on+0x55/0x120 btrfs_find_all_roots_safe+0x142/0x1e0 ? find_parent_nodes+0x1830/0x1830 ? btrfs_inode_flags_to_xflags+0x50/0x50 iterate_extent_inodes+0x20e/0x580 ? tree_backref_for_extent+0x230/0x230 ? lock_downgrade+0x3d0/0x3d0 ? read_extent_buffer+0xdd/0x110 ? lock_downgrade+0x3d0/0x3d0 ? __kasan_check_read+0x11/0x20 ? lock_acquired+0xbb/0x600 ? __kasan_check_write+0x14/0x20 ? _raw_spin_unlock+0x22/0x30 ? __kasan_check_write+0x14/0x20 iterate_inodes_from_logical+0x129/0x170 ? iterate_inodes_from_logical+0x129/0x170 ? btrfs_inode_flags_to_xflags+0x50/0x50 ? iterate_extent_inodes+0x580/0x580 ? __vmalloc_node+0x92/0xb0 ? init_data_container+0x34/0xb0 ? init_data_container+0x34/0xb0 ? kvmalloc_node+0x60/0x80 btrfs_ioctl_logical_to_ino+0x158/0x230 btrfs_ioctl+0x205e/0x4040 ? __might_sleep+0x71/0xe0 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? getrusage+0x4b6/0x9c0 ? __kasan_check_read+0x11/0x20 ? lock_release+0xc8/0x620 ? __might_fault+0x64/0xd0 ? lock_acquire+0xc7/0x510 ? lock_downgrade+0x3d0/0x3d0 ? lockdep_hardirqs_on_prepare+0x210/0x210 ? lockdep_hardirqs_on_prepare+0x210/0x210 ? __kasan_check_read+0x11/0x20 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? lock_downgrade+0x3d0/0x3d0 ? lockdep_hardirqs_on_prepare+0x210/0x210 ? __kasan_check_read+0x11/0x20 ? lock_release+0xc8/0x620 ? __task_pid_nr_ns+0xd3/0x250 ? lock_acquire+0xc7/0x510 ? __fget_files+0x160/0x230 ? __fget_light+0xf2/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fd1976e2427 Code: 00 00 90 48 8b 05 (...) RSP: 002b:00007fd1955e5cf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fd1955e5f40 RCX: 00007fd1976e2427 RDX: 00007fd1955e5f48 RSI: 00000000c038943b RDI: 0000000000000004 RBP: 0000000001000000 R08: 0000000000000000 R09: 00007fd1955e6120 R10: 0000557835366b00 R11: 0000000000000246 R12: 0000000000000004 R13: 00007fd1955e5f48 R14: 00007fd1955e5f40 R15: 00007fd1955e5ef8 Modules linked in: ---[ end trace ec8931a1c36e57be ]--- (gdb) l *(__tree_mod_log_rewind+0x3b1) 0xffffffff81893521 is in __tree_mod_log_rewind (fs/btrfs/ctree.c:1210). 1205 * the modification. as we're going backwards, we do the 1206 * opposite of each operation here. 1207 */ 1208 switch (tm->op) { 1209 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1210 BUG_ON(tm->slot < n); 1211 fallthrough; 1212 case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 1213 case MOD_LOG_KEY_REMOVE: 1214 btrfs_set_node_key(eb, &tm->key, tm->slot); Here's what happens to hit that BUG_ON(): 1) We have one tree mod log user (through fiemap or the logical ino ioctl), with a sequence number of 1, so we have fs_info->tree_mod_seq == 1; 2) Another task is at ctree.c:balance_level() and we have eb X currently as the root of the tree, and we promote its single child, eb Y, as the new root. Then, at ctree.c:balance_level(), we call: tree_mod_log_insert_root(eb X, eb Y, 1); 3) At tree_mod_log_insert_root() we create tree mod log elements for each slot of eb X, of operation type MOD_LOG_KEY_REMOVE_WHILE_FREEING each with a ->logical pointing to ebX->start. These are placed in an array named tm_list. Lets assume there are N elements (N pointers in eb X); 4) Then, still at tree_mod_log_insert_root(), we create a tree mod log element of operation type MOD_LOG_ROOT_REPLACE, ->logical set to ebY->start, ->old_root.logical set to ebX->start, ->old_root.level set to the level of eb X and ->generation set to the generation of eb X; 5) Then tree_mod_log_insert_root() calls tree_mod_log_free_eb() with tm_list as argument. After that, tree_mod_log_free_eb() calls __tree_mod_log_insert() for each member of tm_list in reverse order, from highest slot in eb X, slot N - 1, to slot 0 of eb X; 6) __tree_mod_log_insert() sets the sequence number of each given tree mod log operation - it increments fs_info->tree_mod_seq and sets fs_info->tree_mod_seq as the sequence number of the given tree mod log operation. This means that for the tm_list created at tree_mod_log_insert_root(), the element corresponding to slot 0 of eb X has the highest sequence number (1 + N), and the element corresponding to the last slot has the lowest sequence number (2); 7) Then, after inserting tm_list's elements into the tree mod log rbtree, the MOD_LOG_ROOT_REPLACE element is inserted, which gets the highest sequence number, which is N + 2; 8) Back to ctree.c:balance_level(), we free eb X by calling btrfs_free_tree_block() on it. Because eb X was created in the current transaction, has no other references and writeback did not happen for it, we add it back to the free space cache/tree; 9) Later some other task T allocates the metadata extent from eb X, since it is marked as free space in the space cache/tree, and uses it as a node for some other btree; 10) The tree mod log user task calls btrfs_search_old_slot(), which calls get_old_root(), and finally that calls __tree_mod_log_oldest_root() with time_seq == 1 and eb_root == eb Y; 11) First iteration of the while loop finds the tree mod log element with sequence number N + 2, for the logical address of eb Y and of type MOD_LOG_ROOT_REPLACE; 12) Because the operation type is MOD_LOG_ROOT_REPLACE, we don't break out of the loop, and set root_logical to point to tm->old_root.logical which corresponds to the logical address of eb X; 13) On the next iteration of the while loop, the call to tree_mod_log_search_oldest() returns the smallest tree mod log element for the logical address of eb X, which has a sequence number of 2, an operation type of MOD_LOG_KEY_REMOVE_WHILE_FREEING and corresponds to the old slot N - 1 of eb X (eb X had N items in it before being freed); 14) We then break out of the while loop and return the tree mod log operation of type MOD_LOG_ROOT_REPLACE (eb Y), and not the one for slot N - 1 of eb X, to get_old_root(); 15) At get_old_root(), we process the MOD_LOG_ROOT_REPLACE operation and set "logical" to the logical address of eb X, which was the old root. We then call tree_mod_log_search() passing it the logical address of eb X and time_seq == 1; 16) Then before calling tree_mod_log_search(), task T adds a key to eb X, which results in adding a tree mod log operation of type MOD_LOG_KEY_ADD to the tree mod log - this is done at ctree.c:insert_ptr() - but after adding the tree mod log operation and before updating the number of items in eb X from 0 to 1... 17) The task at get_old_root() calls tree_mod_log_search() and gets the tree mod log operation of type MOD_LOG_KEY_ADD just added by task T. Then it enters the following if branch: if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { (...) } (...) Calls read_tree_block() for eb X, which gets a reference on eb X but does not lock it - task T has it locked. Then it clones eb X while it has nritems set to 0 in its header, before task T sets nritems to 1 in eb X's header. From hereupon we use the clone of eb X which no other task has access to; 18) Then we call __tree_mod_log_rewind(), passing it the MOD_LOG_KEY_ADD mod log operation we just got from tree_mod_log_search() in the previous step and the cloned version of eb X; 19) At __tree_mod_log_rewind(), we set the local variable "n" to the number of items set in eb X's clone, which is 0. Then we enter the while loop, and in its first iteration we process the MOD_LOG_KEY_ADD operation, which just decrements "n" from 0 to (u32)-1, since "n" is declared with a type of u32. At the end of this iteration we call rb_next() to find the next tree mod log operation for eb X, that gives us the mod log operation of type MOD_LOG_KEY_REMOVE_WHILE_FREEING, for slot 0, with a sequence number of N + 1 (steps 3 to 6); 20) Then we go back to the top of the while loop and trigger the following BUG_ON(): (...) switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); fallthrough; (...) Because "n" has a value of (u32)-1 (4294967295) and tm->slot is 0. Fix this by taking a read lock on the extent buffer before cloning it at ctree.c:get_old_root(). This should be done regardless of the extent buffer having been freed and reused, as a concurrent task might be modifying it (while holding a write lock on it). Reported-by: Zygo Blaxell Link: https://lore.kernel.org/linux-btrfs/20210227155037.GN28049@hungrycats.org/ Fixes: 834328a8493079 ("Btrfs: tree mod log's old roots could still be part of the tree") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0eebd0bd6bc0..cad06c60ad66 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1422,7 +1422,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { + btrfs_tree_read_lock(old); eb = btrfs_clone_extent_buffer(old); + btrfs_tree_read_unlock(old); free_extent_buffer(old); } } else if (old_root) { From 8daf2ab0631d0ca0331a1c63e45c6cd2a46b76af Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 15 Mar 2021 15:34:51 -0700 Subject: [PATCH 08/26] nvmet: don't check iosqes,iocqes for discovery controllers commit d218a8a3003e84ab136e69a4e30dd4ec7dab2d22 upstream. From the base spec, Figure 78: "Controller Configuration, these fields are defined as parameters to configure an "I/O Controller (IOC)" and not to configure a "Discovery Controller (DC). ... If the controller does not support I/O queues, then this field shall be read-only with a value of 0h Just perform this check for I/O controllers. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Reported-by: Belanger, Martin Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/target/core.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 4b58f352c0c9..fb2461cc3c69 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -574,9 +574,20 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) { lockdep_assert_held(&ctrl->lock); - if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || - nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES || - nvmet_cc_mps(ctrl->cc) != 0 || + /* + * Only I/O controllers should verify iosqes,iocqes. + * Strictly speaking, the spec says a discovery controller + * should verify iosqes,iocqes are zeroed, however that + * would break backwards compatibility, so don't enforce it. + */ + if (ctrl->subsys->type != NVME_NQN_DISC && + (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || + nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + if (nvmet_cc_mps(ctrl->cc) != 0 || nvmet_cc_ams(ctrl->cc) != 0 || nvmet_cc_css(ctrl->cc) != 0) { ctrl->csts = NVME_CSTS_CFS; From 47de847588f68264659712961acedf5aa1c5eecf Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Fri, 26 Feb 2021 09:38:20 -0500 Subject: [PATCH 09/26] NFSD: Repair misuse of sv_lock in 5.10.16-rt30. commit c7de87ff9dac5f396f62d584f3908f80ddc0e07b upstream. [ This problem is in mainline, but only rt has the chops to be able to detect it. ] Lockdep reports a circular lock dependency between serv->sv_lock and softirq_ctl.lock on system shutdown, when using a kernel built with CONFIG_PREEMPT_RT=y, and a nfs mount exists. This is due to the definition of spin_lock_bh on rt: local_bh_disable(); rt_spin_lock(lock); which forces a softirq_ctl.lock -> serv->sv_lock dependency. This is not a problem as long as _every_ lock of serv->sv_lock is a: spin_lock_bh(&serv->sv_lock); but there is one of the form: spin_lock(&serv->sv_lock); This is what is causing the circular dependency splat. The spin_lock() grabs the lock without first grabbing softirq_ctl.lock via local_bh_disable. If later on in the critical region, someone does a local_bh_disable, we get a serv->sv_lock -> softirq_ctrl.lock dependency established. Deadlock. Fix is to make serv->sv_lock be locked with spin_lock_bh everywhere, no exceptions. [ OK ] Stopped target NFS client services. Stopping Logout off all iSCSI sessions on shutdown... Stopping NFS server and services... [ 109.442380] [ 109.442385] ====================================================== [ 109.442386] WARNING: possible circular locking dependency detected [ 109.442387] 5.10.16-rt30 #1 Not tainted [ 109.442389] ------------------------------------------------------ [ 109.442390] nfsd/1032 is trying to acquire lock: [ 109.442392] ffff994237617f60 ((softirq_ctrl.lock).lock){+.+.}-{2:2}, at: __local_bh_disable_ip+0xd9/0x270 [ 109.442405] [ 109.442405] but task is already holding lock: [ 109.442406] ffff994245cb00b0 (&serv->sv_lock){+.+.}-{0:0}, at: svc_close_list+0x1f/0x90 [ 109.442415] [ 109.442415] which lock already depends on the new lock. [ 109.442415] [ 109.442416] [ 109.442416] the existing dependency chain (in reverse order) is: [ 109.442417] [ 109.442417] -> #1 (&serv->sv_lock){+.+.}-{0:0}: [ 109.442421] rt_spin_lock+0x2b/0xc0 [ 109.442428] svc_add_new_perm_xprt+0x42/0xa0 [ 109.442430] svc_addsock+0x135/0x220 [ 109.442434] write_ports+0x4b3/0x620 [ 109.442438] nfsctl_transaction_write+0x45/0x80 [ 109.442440] vfs_write+0xff/0x420 [ 109.442444] ksys_write+0x4f/0xc0 [ 109.442446] do_syscall_64+0x33/0x40 [ 109.442450] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 109.442454] [ 109.442454] -> #0 ((softirq_ctrl.lock).lock){+.+.}-{2:2}: [ 109.442457] __lock_acquire+0x1264/0x20b0 [ 109.442463] lock_acquire+0xc2/0x400 [ 109.442466] rt_spin_lock+0x2b/0xc0 [ 109.442469] __local_bh_disable_ip+0xd9/0x270 [ 109.442471] svc_xprt_do_enqueue+0xc0/0x4d0 [ 109.442474] svc_close_list+0x60/0x90 [ 109.442476] svc_close_net+0x49/0x1a0 [ 109.442478] svc_shutdown_net+0x12/0x40 [ 109.442480] nfsd_destroy+0xc5/0x180 [ 109.442482] nfsd+0x1bc/0x270 [ 109.442483] kthread+0x194/0x1b0 [ 109.442487] ret_from_fork+0x22/0x30 [ 109.442492] [ 109.442492] other info that might help us debug this: [ 109.442492] [ 109.442493] Possible unsafe locking scenario: [ 109.442493] [ 109.442493] CPU0 CPU1 [ 109.442494] ---- ---- [ 109.442495] lock(&serv->sv_lock); [ 109.442496] lock((softirq_ctrl.lock).lock); [ 109.442498] lock(&serv->sv_lock); [ 109.442499] lock((softirq_ctrl.lock).lock); [ 109.442501] [ 109.442501] *** DEADLOCK *** [ 109.442501] [ 109.442501] 3 locks held by nfsd/1032: [ 109.442503] #0: ffffffff93b49258 (nfsd_mutex){+.+.}-{3:3}, at: nfsd+0x19a/0x270 [ 109.442508] #1: ffff994245cb00b0 (&serv->sv_lock){+.+.}-{0:0}, at: svc_close_list+0x1f/0x90 [ 109.442512] #2: ffffffff93a81b20 (rcu_read_lock){....}-{1:2}, at: rt_spin_lock+0x5/0xc0 [ 109.442518] [ 109.442518] stack backtrace: [ 109.442519] CPU: 0 PID: 1032 Comm: nfsd Not tainted 5.10.16-rt30 #1 [ 109.442522] Hardware name: Supermicro X9DRL-3F/iF/X9DRL-3F/iF, BIOS 3.2 09/22/2015 [ 109.442524] Call Trace: [ 109.442527] dump_stack+0x77/0x97 [ 109.442533] check_noncircular+0xdc/0xf0 [ 109.442546] __lock_acquire+0x1264/0x20b0 [ 109.442553] lock_acquire+0xc2/0x400 [ 109.442564] rt_spin_lock+0x2b/0xc0 [ 109.442570] __local_bh_disable_ip+0xd9/0x270 [ 109.442573] svc_xprt_do_enqueue+0xc0/0x4d0 [ 109.442577] svc_close_list+0x60/0x90 [ 109.442581] svc_close_net+0x49/0x1a0 [ 109.442585] svc_shutdown_net+0x12/0x40 [ 109.442588] nfsd_destroy+0xc5/0x180 [ 109.442590] nfsd+0x1bc/0x270 [ 109.442595] kthread+0x194/0x1b0 [ 109.442600] ret_from_fork+0x22/0x30 [ 109.518225] nfsd: last server has exited, flushing export cache [ OK ] Stopped NFSv4 ID-name mapping service. [ OK ] Stopped GSSAPI Proxy Daemon. [ OK ] Stopped NFS Mount Daemon. [ OK ] Stopped NFS status monitor for NFSv2/3 locking.. Fixes: 719f8bcc883e ("svcrpc: fix xpt_list traversal locking on shutdown") Signed-off-by: Joe Korty Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc_xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 56e4ac8e2e99..7d366f6bef86 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1104,7 +1104,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st struct svc_xprt *xprt; int ret = 0; - spin_lock(&serv->sv_lock); + spin_lock_bh(&serv->sv_lock); list_for_each_entry(xprt, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; @@ -1112,7 +1112,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); } - spin_unlock(&serv->sv_lock); + spin_unlock_bh(&serv->sv_lock); return ret; } From edf3c31a1a34323ff59ec91e7b064d1e5b4f186e Mon Sep 17 00:00:00 2001 From: Timo Rothenpieler Date: Tue, 23 Feb 2021 00:36:19 +0100 Subject: [PATCH 10/26] svcrdma: disable timeouts on rdma backchannel commit 6820bf77864d5894ff67b5c00d7dba8f92011e3d upstream. This brings it in line with the regular tcp backchannel, which also has all those timeouts disabled. Prevents the backchannel from timing out, getting some async operations like server side copying getting stuck indefinitely on the client side. Signed-off-by: Timo Rothenpieler Fixes: 5d252f90a800 ("svcrdma: Add class for RDMA backwards direction transport") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index b3d48c6243c8..791072cd9492 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -328,9 +328,9 @@ xprt_setup_rdma_bc(struct xprt_create *args) xprt->timeout = &xprt_rdma_bc_timeout; xprt_set_bound(xprt); xprt_set_connected(xprt); - xprt->bind_timeout = RPCRDMA_BIND_TO; - xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + xprt->bind_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; xprt->prot = XPRT_TRANSPORT_BC_RDMA; xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); From b932f0813aac33a6d9af858c1701a1d6f283eab8 Mon Sep 17 00:00:00 2001 From: Daniel Kobras Date: Sat, 27 Feb 2021 00:04:37 +0100 Subject: [PATCH 11/26] sunrpc: fix refcount leak for rpc auth modules commit f1442d6349a2e7bb7a6134791bdc26cb776c79af upstream. If an auth module's accept op returns SVC_CLOSE, svc_process_common() enters a call path that does not call svc_authorise() before leaving the function, and thus leaks a reference on the auth module's refcount. Hence, make sure calls to svc_authenticate() and svc_authorise() are paired for all call paths, to make sure rpc auth modules can be unloaded. Signed-off-by: Daniel Kobras Fixes: 4d712ef1db05 ("svcauth_gss: Close connection when dropping an incoming message") Link: https://lore.kernel.org/linux-nfs/3F1B347F-B809-478F-A1E9-0BE98E22B0F0@oracle.com/T/#t Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index eea18a124e4f..ab6b7852a11c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1306,7 +1306,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) sendit: if (svc_authorise(rqstp)) - goto close; + goto close_xprt; return 1; /* Caller can now send it */ dropit: @@ -1315,6 +1315,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) return 0; close: + svc_authorise(rqstp); +close_xprt: if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) svc_close_xprt(rqstp->rq_xprt); dprintk("svc: svc_process close\n"); @@ -1323,7 +1325,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) err_short_len: svc_printk(rqstp, "short len %Zd, dropping request\n", argv->iov_len); - goto close; + goto close_xprt; err_bad_rpc: serv->sv_stats->rpcbadfmt++; From 18905249c22b343365e6a4a22954d7b97edbdd29 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Mon, 1 Mar 2021 02:22:40 +0300 Subject: [PATCH 12/26] net/qrtr: fix __netdev_alloc_skb call commit 093b036aa94e01a0bea31a38d7f0ee28a2749023 upstream. syzbot found WARNING in __alloc_pages_nodemask()[1] when order >= MAX_ORDER. It was caused by a huge length value passed from userspace to qrtr_tun_write_iter(), which tries to allocate skb. Since the value comes from the untrusted source there is no need to raise a warning in __alloc_pages_nodemask(). [1] WARNING in __alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:5014 Call Trace: __alloc_pages include/linux/gfp.h:511 [inline] __alloc_pages_node include/linux/gfp.h:524 [inline] alloc_pages_node include/linux/gfp.h:538 [inline] kmalloc_large_node+0x60/0x110 mm/slub.c:3999 __kmalloc_node_track_caller+0x319/0x3f0 mm/slub.c:4496 __kmalloc_reserve net/core/skbuff.c:150 [inline] __alloc_skb+0x4e4/0x5a0 net/core/skbuff.c:210 __netdev_alloc_skb+0x70/0x400 net/core/skbuff.c:446 netdev_alloc_skb include/linux/skbuff.h:2832 [inline] qrtr_endpoint_post+0x84/0x11b0 net/qrtr/qrtr.c:442 qrtr_tun_write_iter+0x11f/0x1a0 net/qrtr/tun.c:98 call_write_iter include/linux/fs.h:1901 [inline] new_sync_write+0x426/0x650 fs/read_write.c:518 vfs_write+0x791/0xa30 fs/read_write.c:605 ksys_write+0x12d/0x250 fs/read_write.c:658 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: syzbot+80dccaee7c6630fa9dcf@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Acked-by: Alexander Lobakin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/qrtr/qrtr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index a8253079902f..d62b7a7f65bb 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -232,7 +232,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA) return -EINVAL; - skb = netdev_alloc_skb(NULL, len); + skb = __netdev_alloc_skb(NULL, len, GFP_ATOMIC | __GFP_NOWARN); if (!skb) return -ENOMEM; From debd66577f059bab9d46130625920ee389858542 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 12 Mar 2021 10:42:11 +0300 Subject: [PATCH 13/26] scsi: lpfc: Fix some error codes in debugfs commit 19f1bc7edf0f97186810e13a88f5b62069d89097 upstream. If copy_from_user() or kstrtoull() fail then the correct behavior is to return a negative error code. Link: https://lore.kernel.org/r/YEsbU/UxYypVrC7/@mwanda Fixes: f9bb2da11db8 ("[SCSI] lpfc 8.3.27: T10 additions for SLI4") Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index a63542bac153..feb00585a7a4 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -1061,7 +1061,7 @@ lpfc_debugfs_dif_err_write(struct file *file, const char __user *buf, memset(dstbuf, 0, 33); size = (nbytes < 32) ? nbytes : 32; if (copy_from_user(dstbuf, buf, size)) - return 0; + return -EFAULT; if (dent == phba->debug_InjErrLBA) { if ((buf[0] == 'o') && (buf[1] == 'f') && (buf[2] == 'f')) @@ -1069,7 +1069,7 @@ lpfc_debugfs_dif_err_write(struct file *file, const char __user *buf, } if ((tmp == 0) && (kstrtoull(dstbuf, 0, &tmp))) - return 0; + return -EINVAL; if (dent == phba->debug_writeGuard) phba->lpfc_injerr_wgrd_cnt = (uint32_t)tmp; From fe0c15025a38372cfe08bda746bd2851408634fa Mon Sep 17 00:00:00 2001 From: Macpaul Lin Date: Thu, 18 Jun 2020 17:13:38 +0800 Subject: [PATCH 14/26] USB: replace hardcode maximum usb string length by definition commit 81c7462883b0cc0a4eeef0687f80ad5b5baee5f6 upstream. Replace hardcoded maximum USB string length (126 bytes) by definition "USB_MAX_STRING_LEN". Signed-off-by: Macpaul Lin Acked-by: Alan Stern Link: https://lore.kernel.org/r/1592471618-29428-1-git-send-email-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 4 ++-- drivers/usb/gadget/configfs.c | 2 +- drivers/usb/gadget/usbstring.c | 4 ++-- include/uapi/linux/usb/ch9.h | 3 +++ 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index c574bf981140..f33635ab967f 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1076,7 +1076,7 @@ static void collect_langs(struct usb_gadget_strings **sp, __le16 *buf) while (*sp) { s = *sp; language = cpu_to_le16(s->language); - for (tmp = buf; *tmp && tmp < &buf[126]; tmp++) { + for (tmp = buf; *tmp && tmp < &buf[USB_MAX_STRING_LEN]; tmp++) { if (*tmp == language) goto repeat; } @@ -1151,7 +1151,7 @@ static int get_string(struct usb_composite_dev *cdev, collect_langs(sp, s->wData); } - for (len = 0; len <= 126 && s->wData[len]; len++) + for (len = 0; len <= USB_MAX_STRING_LEN && s->wData[len]; len++) continue; if (!len) return -EINVAL; diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 0aa1b8fa9b47..7f0f30484d79 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -114,7 +114,7 @@ static int usb_string_copy(const char *s, char **s_copy) char *str; char *copy = *s_copy; ret = strlen(s); - if (ret > 126) + if (ret > USB_MAX_STRING_LEN) return -EOVERFLOW; str = kstrdup(s, GFP_KERNEL); diff --git a/drivers/usb/gadget/usbstring.c b/drivers/usb/gadget/usbstring.c index 73a4dfba0edb..0173a9969b9a 100644 --- a/drivers/usb/gadget/usbstring.c +++ b/drivers/usb/gadget/usbstring.c @@ -59,9 +59,9 @@ usb_gadget_get_string (struct usb_gadget_strings *table, int id, u8 *buf) return -EINVAL; /* string descriptors have length, tag, then UTF16-LE text */ - len = min ((size_t) 126, strlen (s->s)); + len = min((size_t)USB_MAX_STRING_LEN, strlen(s->s)); len = utf8s_to_utf16s(s->s, len, UTF16_LITTLE_ENDIAN, - (wchar_t *) &buf[2], 126); + (wchar_t *) &buf[2], USB_MAX_STRING_LEN); if (len < 0) return -EINVAL; buf [0] = (len + 1) * 2; diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 33c603dd7cd3..8e0e6d3091f1 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -358,6 +358,9 @@ struct usb_config_descriptor { /*-------------------------------------------------------------------------*/ +/* USB String descriptors can contain at most 126 characters. */ +#define USB_MAX_STRING_LEN 126 + /* USB_DT_STRING: String descriptor */ struct usb_string_descriptor { __u8 bLength; From 394ca034bb3eca17c173ccb7b175ab8f685264b8 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Thu, 11 Mar 2021 14:42:41 +0800 Subject: [PATCH 15/26] usb: gadget: configfs: Fix KASAN use-after-free commit 98f153a10da403ddd5e9d98a3c8c2bb54bb5a0b6 upstream. When gadget is disconnected, running sequence is like this. . composite_disconnect . Call trace: usb_string_copy+0xd0/0x128 gadget_config_name_configuration_store+0x4 gadget_config_name_attr_store+0x40/0x50 configfs_write_file+0x198/0x1f4 vfs_write+0x100/0x220 SyS_write+0x58/0xa8 . configfs_composite_unbind . configfs_composite_bind In configfs_composite_bind, it has "cn->strings.s = cn->configuration;" When usb_string_copy is invoked. it would allocate memory, copy input string, release previous pointed memory space, and use new allocated memory. When gadget is connected, host sends down request to get information. Call trace: usb_gadget_get_string+0xec/0x168 lookup_string+0x64/0x98 composite_setup+0xa34/0x1ee8 If gadget is disconnected and connected quickly, in the failed case, cn->configuration memory has been released by usb_string_copy kfree but configfs_composite_bind hasn't been run in time to assign new allocated "cn->configuration" pointer to "cn->strings.s". When "strlen(s->s) of usb_gadget_get_string is being executed, the dangling memory is accessed, "BUG: KASAN: use-after-free" error occurs. Cc: stable@vger.kernel.org Signed-off-by: Jim Lin Signed-off-by: Macpaul Lin Link: https://lore.kernel.org/r/1615444961-13376-1-git-send-email-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/configfs.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 7f0f30484d79..fe1b54a8fb0c 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -108,6 +108,8 @@ struct gadget_config_name { struct list_head list; }; +#define USB_MAX_STRING_WITH_NULL_LEN (USB_MAX_STRING_LEN+1) + static int usb_string_copy(const char *s, char **s_copy) { int ret; @@ -117,12 +119,16 @@ static int usb_string_copy(const char *s, char **s_copy) if (ret > USB_MAX_STRING_LEN) return -EOVERFLOW; - str = kstrdup(s, GFP_KERNEL); - if (!str) - return -ENOMEM; + if (copy) { + str = copy; + } else { + str = kmalloc(USB_MAX_STRING_WITH_NULL_LEN, GFP_KERNEL); + if (!str) + return -ENOMEM; + } + strcpy(str, s); if (str[ret - 1] == '\n') str[ret - 1] = '\0'; - kfree(copy); *s_copy = str; return 0; } From cd124fcd999d3aafaf4ea72ab2669d79cb67de40 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 16 Feb 2021 22:42:13 +0300 Subject: [PATCH 16/26] iio: adis16400: Fix an error code in adis16400_initial_setup() commit a71266e454b5df10d019b06f5ebacd579f76be28 upstream. This is to silence a new Smatch warning: drivers/iio/imu/adis16400.c:492 adis16400_initial_setup() warn: sscanf doesn't return error codes If the condition "if (st->variant->flags & ADIS16400_HAS_SLOW_MODE) {" is false then we return 1 instead of returning 0 and probe will fail. Fixes: 72a868b38bdd ("iio: imu: check sscanf return value") Signed-off-by: Dan Carpenter Cc: Link: https://lore.kernel.org/r/YCwgFb3JVG6qrlQ+@mwanda Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/adis16400_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/imu/adis16400_core.c b/drivers/iio/imu/adis16400_core.c index fb7c0dbed51c..67837905f7b4 100644 --- a/drivers/iio/imu/adis16400_core.c +++ b/drivers/iio/imu/adis16400_core.c @@ -288,8 +288,7 @@ static int adis16400_initial_setup(struct iio_dev *indio_dev) if (ret) goto err_ret; - ret = sscanf(indio_dev->name, "adis%u\n", &device_id); - if (ret != 1) { + if (sscanf(indio_dev->name, "adis%u\n", &device_id) != 1) { ret = -EINVAL; goto err_ret; } From ef8dc3d327cc799e3f6f1af41852f8f954f7115f Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Mon, 15 Mar 2021 15:48:21 -0600 Subject: [PATCH 17/26] PCI: rpadlpar: Fix potential drc_name corruption in store functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit cc7a0bb058b85ea03db87169c60c7cfdd5d34678 upstream. Both add_slot_store() and remove_slot_store() try to fix up the drc_name copied from the store buffer by placing a NUL terminator at nbyte + 1 or in place of a '\n' if present. However, the static buffer that we copy the drc_name data into is not zeroed and can contain anything past the n-th byte. This is problematic if a '\n' byte appears in that buffer after nbytes and the string copied into the store buffer was not NUL terminated to start with as the strchr() search for a '\n' byte will mark this incorrectly as the end of the drc_name string resulting in a drc_name string that contains garbage data after the n-th byte. Additionally it will cause us to overwrite that '\n' byte on the stack with NUL, potentially corrupting data on the stack. The following debugging shows an example of the drmgr utility writing "PHB 4543" to the add_slot sysfs attribute, but add_slot_store() logging a corrupted string value. drmgr: drmgr: -c phb -a -s PHB 4543 -d 1 add_slot_store: drc_name = PHB 4543°|<82>!, rc = -19 Fix this by using strscpy() instead of memcpy() to ensure the string is NUL terminated when copied into the static drc_name buffer. Further, since the string is now NUL terminated the code only needs to change '\n' to '\0' when present. Cc: stable@vger.kernel.org Signed-off-by: Tyrel Datwyler [mpe: Reformat change log and add mention of possible stack corruption] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210315214821.452959-1-tyreld@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- drivers/pci/hotplug/rpadlpar_sysfs.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/pci/hotplug/rpadlpar_sysfs.c b/drivers/pci/hotplug/rpadlpar_sysfs.c index a796301ea03f..ca9d832bd9f8 100644 --- a/drivers/pci/hotplug/rpadlpar_sysfs.c +++ b/drivers/pci/hotplug/rpadlpar_sysfs.c @@ -39,12 +39,11 @@ static ssize_t add_slot_store(struct kobject *kobj, struct kobj_attribute *attr, if (nbytes >= MAX_DRC_NAME_LEN) return 0; - memcpy(drc_name, buf, nbytes); + strscpy(drc_name, buf, nbytes + 1); end = strchr(drc_name, '\n'); - if (!end) - end = &drc_name[nbytes]; - *end = '\0'; + if (end) + *end = '\0'; rc = dlpar_add_slot(drc_name); if (rc) @@ -70,12 +69,11 @@ static ssize_t remove_slot_store(struct kobject *kobj, if (nbytes >= MAX_DRC_NAME_LEN) return 0; - memcpy(drc_name, buf, nbytes); + strscpy(drc_name, buf, nbytes + 1); end = strchr(drc_name, '\n'); - if (!end) - end = &drc_name[nbytes]; - *end = '\0'; + if (end) + *end = '\0'; rc = dlpar_remove_slot(drc_name); if (rc) From 6c2ab223a7286ecfa016f532b7231fb049fb2a02 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 12 Mar 2021 05:21:37 -0800 Subject: [PATCH 18/26] perf/x86/intel: Fix a crash caused by zero PEBS status commit d88d05a9e0b6d9356e97129d4ff9942d765f46ea upstream. A repeatable crash can be triggered by the perf_fuzzer on some Haswell system. https://lore.kernel.org/lkml/7170d3b-c17f-1ded-52aa-cc6d9ae999f4@maine.edu/ For some old CPUs (HSW and earlier), the PEBS status in a PEBS record may be mistakenly set to 0. To minimize the impact of the defect, the commit was introduced to try to avoid dropping the PEBS record for some cases. It adds a check in the intel_pmu_drain_pebs_nhm(), and updates the local pebs_status accordingly. However, it doesn't correct the PEBS status in the PEBS record, which may trigger the crash, especially for the large PEBS. It's possible that all the PEBS records in a large PEBS have the PEBS status 0. If so, the first get_next_pebs_record_by_bit() in the __intel_pmu_pebs_event() returns NULL. The at = NULL. Since it's a large PEBS, the 'count' parameter must > 1. The second get_next_pebs_record_by_bit() will crash. Besides the local pebs_status, correct the PEBS status in the PEBS record as well. Fixes: 01330d7288e0 ("perf/x86: Allow zero PEBS status with only single active event") Reported-by: Vince Weaver Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1615555298-140216-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/intel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index f562ddbeb20c..f39838d78976 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1473,7 +1473,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) */ if (!pebs_status && cpuc->pebs_enabled && !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; + pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); From 3d9fcc2502dd931aaf13920b61e1f3948030ffed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Mar 2021 20:26:47 +0100 Subject: [PATCH 19/26] x86/ioapic: Ignore IRQ2 again commit a501b048a95b79e1e34f03cac3c87ff1e9f229ad upstream. Vitaly ran into an issue with hotplugging CPU0 on an Amazon instance where the matrix allocator claimed to be out of vectors. He analyzed it down to the point that IRQ2, the PIC cascade interrupt, which is supposed to be not ever routed to the IO/APIC ended up having an interrupt vector assigned which got moved during unplug of CPU0. The underlying issue is that IRQ2 for various reasons (see commit af174783b925 ("x86: I/O APIC: Never configure IRQ2" for details) is treated as a reserved system vector by the vector core code and is not accounted as a regular vector. The Amazon BIOS has an routing entry of pin2 to IRQ2 which causes the IO/APIC setup to claim that interrupt which is granted by the vector domain because there is no sanity check. As a consequence the allocation counter of CPU0 underflows which causes a subsequent unplug to fail with: [ ... ] CPU 0 has 4294967295 vectors, 589 available. Cannot disable CPU There is another sanity check missing in the matrix allocator, but the underlying root cause is that the IO/APIC code lost the IRQ2 ignore logic during the conversion to irqdomains. For almost 6 years nobody complained about this wreckage, which might indicate that this requirement could be lifted, but for any system which actually has a PIC IRQ2 is unusable by design so any routing entry has no effect and the interrupt cannot be connected to a device anyway. Due to that and due to history biased paranoia reasons restore the IRQ2 ignore logic and treat it as non existent despite a routing entry claiming otherwise. Fixes: d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces") Reported-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Tested-by: Vitaly Kuznetsov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210318192819.636943062@linutronix.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/io_apic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3401b28f1312..f398612e8a81 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1042,6 +1042,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); From 376a76aa925722ecb0ab2e2b0fa765bf0cf06844 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Feb 2021 18:46:41 +0100 Subject: [PATCH 20/26] kernel, fs: Introduce and use set_restart_fn() and arch_set_restart_data() commit 5abbe51a526253b9f003e9a0a195638dc882d660 upstream. Preparation for fixing get_nr_restart_syscall() on X86 for COMPAT. Add a new helper which sets restart_block->fn and calls a dummy arch_set_restart_data() helper. Fixes: 609c19a385c8 ("x86/ptrace: Stop setting TS_COMPAT in ptrace code") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210201174641.GA17871@redhat.com Signed-off-by: Greg Kroah-Hartman --- fs/select.c | 10 ++++------ include/linux/thread_info.h | 13 +++++++++++++ kernel/futex.c | 3 +-- kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 2 +- 6 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/select.c b/fs/select.c index 3d4f85defeab..de675c79f479 100644 --- a/fs/select.c +++ b/fs/select.c @@ -961,10 +961,9 @@ static long do_restart_poll(struct restart_block *restart_block) ret = do_sys_poll(ufds, nfds, to); - if (ret == -EINTR) { - restart_block->fn = do_restart_poll; - ret = -ERESTART_RESTARTBLOCK; - } + if (ret == -EINTR) + ret = set_restart_fn(restart_block, do_restart_poll); + return ret; } @@ -986,7 +985,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, struct restart_block *restart_block; restart_block = ¤t->restart_block; - restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; @@ -997,7 +995,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, } else restart_block->poll.has_timeout = 0; - ret = -ERESTART_RESTARTBLOCK; + ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 5e6436741f96..330794252270 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -9,6 +9,7 @@ #include #include +#include struct timespec; struct compat_timespec; @@ -59,6 +60,18 @@ extern long do_no_restart_syscall(struct restart_block *parm); #ifdef __KERNEL__ +#ifndef arch_set_restart_data +#define arch_set_restart_data(restart) do { } while (0) +#endif + +static inline long set_restart_fn(struct restart_block *restart, + long (*fn)(struct restart_block *)) +{ + restart->fn = fn; + arch_set_restart_data(restart); + return -ERESTART_RESTARTBLOCK; +} + #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) /* diff --git a/kernel/futex.c b/kernel/futex.c index 0015c14ac2c0..796b1c860839 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2822,14 +2822,13 @@ retry: goto out; restart = ¤t->restart_block; - restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; - ret = -ERESTART_RESTARTBLOCK; + ret = set_restart_fn(restart, futex_wait_restart); out: if (to) { diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 6aef4a0bed29..d06c1d8dd4d2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -809,10 +809,10 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, } restart = ¤t->restart_block; - restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; restart->nanosleep.rmtp = rmtp; + set_restart_fn(restart, alarm_timer_nsleep_restart); ret = -ERESTART_RESTARTBLOCK; out: diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f31637ced7fa..df7750e03242 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1582,10 +1582,10 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, } restart = ¤t->restart_block; - restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + set_restart_fn(restart, hrtimer_nanosleep_restart); ret = -ERESTART_RESTARTBLOCK; out: diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 21a27bb73587..9fff077d0ffc 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1377,10 +1377,10 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; restart_block->nanosleep.rmtp = rmtp; restart_block->nanosleep.expires = timespec_to_ns(rqtp); + set_restart_fn(restart_block, posix_cpu_nsleep_restart); } return error; } From 4a3b824655b5319d272c6e5a7dc35f3f40d7bf6e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Feb 2021 18:46:49 +0100 Subject: [PATCH 21/26] x86: Move TS_COMPAT back to asm/thread_info.h commit 66c1b6d74cd7035e85c426f0af4aede19e805c8a upstream. Move TS_COMPAT back to asm/thread_info.h, close to TS_I386_REGS_POKED. It was moved to asm/processor.h by b9d989c7218a ("x86/asm: Move the thread_info::status field to thread_struct"), then later 37a8f7c38339 ("x86/asm: Move 'status' from thread_struct to thread_info") moved the 'status' field back but TS_COMPAT was forgotten. Preparatory patch to fix the COMPAT case for get_nr_restart_syscall() Fixes: 609c19a385c8 ("x86/ptrace: Stop setting TS_COMPAT in ptrace code") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210201174649.GA17880@redhat.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 9 --------- arch/x86/include/asm/thread_info.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7aa9a9bd9d98..f606ef598917 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -461,15 +461,6 @@ struct thread_struct { */ }; -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ - /* * Set IOPL bits in EFLAGS from given mask */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0438f7fbb383..e23c44fb98dc 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -221,6 +221,15 @@ static inline int arch_within_stack_frames(const void * const stack, #endif +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ + #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ #endif From 5b871095c08cc987ba5152ea37ce0c3b23c0ddcd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Feb 2021 18:47:09 +0100 Subject: [PATCH 22/26] x86: Introduce TS_COMPAT_RESTART to fix get_nr_restart_syscall() commit 8c150ba2fb5995c84a7a43848250d444a3329a7d upstream. The comment in get_nr_restart_syscall() says: * The problem is that we can get here when ptrace pokes * syscall-like values into regs even if we're not in a syscall * at all. Yes, but if not in a syscall then the status & (TS_COMPAT|TS_I386_REGS_POKED) check below can't really help: - TS_COMPAT can't be set - TS_I386_REGS_POKED is only set if regs->orig_ax was changed by 32bit debugger; and even in this case get_nr_restart_syscall() is only correct if the tracee is 32bit too. Suppose that a 64bit debugger plays with a 32bit tracee and * Tracee calls sleep(2) // TS_COMPAT is set * User interrupts the tracee by CTRL-C after 1 sec and does "(gdb) call func()" * gdb saves the regs by PTRACE_GETREGS * does PTRACE_SETREGS to set %rip='func' and %orig_rax=-1 * PTRACE_CONT // TS_COMPAT is cleared * func() hits int3. * Debugger catches SIGTRAP. * Restore original regs by PTRACE_SETREGS. * PTRACE_CONT get_nr_restart_syscall() wrongly returns __NR_restart_syscall==219, the tracee calls ia32_sys_call_table[219] == sys_madvise. Add the sticky TS_COMPAT_RESTART flag which survives after return to user mode. It's going to be removed in the next step again by storing the information in the restart block. As a further cleanup it might be possible to remove also TS_I386_REGS_POKED with that. Test-case: $ cvs -d :pserver:anoncvs:anoncvs@sourceware.org:/cvs/systemtap co ptrace-tests $ gcc -o erestartsys-trap-debuggee ptrace-tests/tests/erestartsys-trap-debuggee.c --m32 $ gcc -o erestartsys-trap-debugger ptrace-tests/tests/erestartsys-trap-debugger.c -lutil $ ./erestartsys-trap-debugger Unexpected: retval 1, errno 22 erestartsys-trap-debugger: ptrace-tests/tests/erestartsys-trap-debugger.c:421 Fixes: 609c19a385c8 ("x86/ptrace: Stop setting TS_COMPAT in ptrace code") Reported-by: Jan Kratochvil Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210201174709.GA17895@redhat.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/thread_info.h | 14 +++++++++++++- arch/x86/kernel/signal.c | 24 +----------------------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e23c44fb98dc..3ba5e2fce171 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -230,10 +230,22 @@ static inline int arch_within_stack_frames(const void * const stack, */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ +#ifndef __ASSEMBLY__ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ +#define TS_COMPAT_RESTART 0x0008 + +#define arch_set_restart_data arch_set_restart_data + +static inline void arch_set_restart_data(struct restart_block *restart) +{ + struct thread_info *ti = current_thread_info(); + if (ti->status & TS_COMPAT) + ti->status |= TS_COMPAT_RESTART; + else + ti->status &= ~TS_COMPAT_RESTART; +} #endif -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_32 #define in_ia32_syscall() true diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ca010dfb9682..4050d5092c86 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -767,30 +767,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { - /* - * This function is fundamentally broken as currently - * implemented. - * - * The idea is that we want to trigger a call to the - * restart_block() syscall and that we want in_ia32_syscall(), - * in_x32_syscall(), etc. to match whatever they were in the - * syscall being restarted. We assume that the syscall - * instruction at (regs->ip - 2) matches whatever syscall - * instruction we used to enter in the first place. - * - * The problem is that we can get here when ptrace pokes - * syscall-like values into regs even if we're not in a syscall - * at all. - * - * For now, we maintain historical behavior and guess based on - * stored state. We could do better by saving the actual - * syscall arch in restart_block or (with caveats on x32) by - * checking if regs->ip points to 'int $0x80'. The current - * behavior is incorrect if a tracer has a different bitness - * than the tracee. - */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current_thread_info()->status & TS_COMPAT_RESTART) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI From 5acfb54aebf93deea69a7d5c534392102c27fdde Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 3 Mar 2021 21:17:02 +0800 Subject: [PATCH 23/26] ext4: find old entry again if failed to rename whiteout commit b7ff91fd030dc9d72ed91b1aab36e445a003af4f upstream. If we failed to add new entry on rename whiteout, we cannot reset the old->de entry directly, because the old->de could have moved from under us during make indexed dir. So find the old entry again before reset is needed, otherwise it may corrupt the filesystem as below. /dev/sda: Entry '00000001' in ??? (12) has deleted/unused inode 15. CLEARED. /dev/sda: Unattached inode 75 /dev/sda: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. Fixes: 6b4b8e6b4ad ("ext4: fix bug for rename with RENAME_WHITEOUT") Cc: stable@vger.kernel.org Signed-off-by: zhangyi (F) Link: https://lore.kernel.org/r/20210303131703.330415-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6224b0e6fb64..e6e3eb8dd4d6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3425,6 +3425,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, return 0; } +static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + struct ext4_renament old = *ent; + int retval = 0; + + /* + * old->de could have moved from under us during make indexed dir, + * so the old->de may no longer valid and need to find it again + * before reset old inode info. + */ + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); + if (IS_ERR(old.bh)) + retval = PTR_ERR(old.bh); + if (!old.bh) + retval = -ENOENT; + if (retval) { + ext4_std_error(old.dir->i_sb, retval); + return; + } + + ext4_setent(handle, &old, ino, file_type); + brelse(old.bh); +} + static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, const struct qstr *d_name) { @@ -3734,8 +3759,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, end_rename: if (whiteout) { if (retval) { - ext4_setent(handle, &old, - old.inode->i_ino, old_file_type); + ext4_resetent(handle, &old, + old.inode->i_ino, old_file_type); drop_nlink(whiteout); } unlock_new_inode(whiteout); From 542e59b0d184937e7237062afe2010950f6da081 Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Fri, 12 Mar 2021 01:50:51 -0500 Subject: [PATCH 24/26] ext4: fix potential error in ext4_do_update_inode commit 7d8bd3c76da1d94b85e6c9b7007e20e980bfcfe6 upstream. If set_large_file = 1 and errors occur in ext4_handle_dirty_metadata(), the error code will be overridden, go to out_brelse to avoid this situation. Signed-off-by: Shijie Luo Link: https://lore.kernel.org/r/20210312065051.36314-1-luoshijie1@huawei.com Cc: stable@kernel.org Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 17b16bbf8361..aa97a3ed3d8f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4893,7 +4893,7 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; - int err = 0, rc, block; + int err = 0, block; int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; @@ -5003,9 +5003,9 @@ static int ext4_do_update_inode(handle_t *handle, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, NULL, bh); - if (!err) - err = rc; + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out_brelse; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); From 528d3b767ec5c1888f2a259240af9adcbceb0c6b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Mar 2021 15:38:52 +0100 Subject: [PATCH 25/26] genirq: Disable interrupts for force threaded handlers commit 81e2073c175b887398e5bca6c004efa89983f58d upstream. With interrupt force threading all device interrupt handlers are invoked from kernel threads. Contrary to hard interrupt context the invocation only disables bottom halfs, but not interrupts. This was an oversight back then because any code like this will have an issue: thread(irq_A) irq_handler(A) spin_lock(&foo->lock); interrupt(irq_B) irq_handler(B) spin_lock(&foo->lock); This has been triggered with networking (NAPI vs. hrtimers) and console drivers where printk() happens from an interrupt which interrupted the force threaded handler. Now people noticed and started to change the spin_lock() in the handler to spin_lock_irqsave() which affects performance or add IRQF_NOTHREAD to the interrupt request which in turn breaks RT. Fix the root cause and not the symptom and disable interrupts before invoking the force threaded handler which preserves the regular semantics and the usefulness of the interrupt force threading as a general debugging tool. For not RT this is not changing much, except that during the execution of the threaded handler interrupts are delayed until the handler returns. Vs. scheduling and softirq processing there is no difference. For RT kernels there is no issue. Fixes: 8d32a307e4fa ("genirq: Provide forced interrupt threading") Reported-by: Johan Hovold Signed-off-by: Thomas Gleixner Reviewed-by: Johan Hovold Acked-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20210317143859.513307808@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b2fc2a581b86..33770ef64d21 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -886,11 +886,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) irqreturn_t ret; local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) + local_irq_disable(); ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); + if (!IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) + local_irq_enable(); local_bh_enable(); return ret; } From 5023febc3a090aa8c812f6149dca451432f69067 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Mar 2021 10:59:26 +0100 Subject: [PATCH 26/26] Linux 4.9.263 Tested-by: Jon Hunter Tested-by: Florian Fainelli Tested-by: Guenter Roeck Tested-by: Jason Self Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20210322121920.399826335@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index be5eac0a12d3..80b265a383bb 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 262 +SUBLEVEL = 263 EXTRAVERSION = NAME = Roaring Lionus