From 1f1009ea8ca5a0271ad69afe8a86c887d530b5c8 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Fri, 9 Sep 2022 12:04:22 +0300 Subject: [PATCH 01/66] scsi: target: core: Fix preempt and abort for allreg res Match a key only if SARK is not zero according to SPC-4 and the comment above the code: If an all registrants persistent reservation is present and the SERVICE ACTION RESERVATION KEY field is set to zero, then all registrations shall be removed except for that of the I_T nexus that is being used for the PERSISTENT RESERVE OUT command; Without this patch in case of SARK==0 no registrants will be removed. Link: https://lore.kernel.org/r/20220909090425.14479-2-d.bogdanov@yadro.com Reviewed-by: Mike Christie Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index a1d67554709f..1521a97ddac2 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -3022,7 +3022,7 @@ core_scsi3_pro_preempt(struct se_cmd *cmd, int type, int scope, u64 res_key, if (calling_it_nexus) continue; - if (pr_reg->pr_res_key != sa_res_key) + if (sa_res_key && pr_reg->pr_res_key != sa_res_key) continue; pr_reg_nacl = pr_reg->pr_reg_nacl; From f050a7c66ca56aa2f49ab9b53e01d04b3e7e94c5 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Fri, 9 Sep 2022 12:04:23 +0300 Subject: [PATCH 02/66] scsi: target: core: Fix memory leak in preempt_and_abort Always release preempt_and_abort_list to avoid memory leak of t10_pr_registration objects in it. Link: https://lore.kernel.org/r/20220909090425.14479-3-d.bogdanov@yadro.com Reviewed-by: Mike Christie Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 1521a97ddac2..e3869576f254 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -2956,13 +2956,14 @@ core_scsi3_pro_preempt(struct se_cmd *cmd, int type, int scope, u64 res_key, __core_scsi3_complete_pro_preempt(dev, pr_reg_n, (preempt_type == PREEMPT_AND_ABORT) ? &preempt_and_abort_list : NULL, type, scope, preempt_type); - - if (preempt_type == PREEMPT_AND_ABORT) - core_scsi3_release_preempt_and_abort( - &preempt_and_abort_list, pr_reg_n); } + spin_unlock(&dev->dev_reservation_lock); + if (preempt_type == PREEMPT_AND_ABORT) + core_scsi3_release_preempt_and_abort( + &preempt_and_abort_list, pr_reg_n); + if (pr_tmpl->pr_aptpl_active) core_scsi3_update_and_write_aptpl(cmd->se_dev, true); From 49790e6a582012c36ca17174cda228444f9a2414 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Fri, 9 Sep 2022 12:04:24 +0300 Subject: [PATCH 03/66] scsi: target: core: Abort all preempted regs if requested According to SPC the preempted commands shall be always aborted. SPC-4: 5.12.11.2.6 Preempting and aborting c) all commands from the I_T nexus(es) associated with the persistent reservations or registrations being preempted (i.e., preempted commands) except the PERSISTENT RESERVE OUT command itself shall be aborted as defined in SAM-5; Link: https://lore.kernel.org/r/20220909090425.14479-4-d.bogdanov@yadro.com Reviewed-by: Mike Christie Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pr.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index e3869576f254..6a5f9504a481 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -2960,9 +2960,23 @@ core_scsi3_pro_preempt(struct se_cmd *cmd, int type, int scope, u64 res_key, spin_unlock(&dev->dev_reservation_lock); - if (preempt_type == PREEMPT_AND_ABORT) + /* + * SPC-4 5.12.11.2.6 Preempting and aborting + * The actions described in this subclause shall be performed + * for all I_T nexuses that are registered with the non-zero + * SERVICE ACTION RESERVATION KEY value, without regard for + * whether the preempted I_T nexuses hold the persistent + * reservation. If the SERVICE ACTION RESERVATION KEY field is + * set to zero and an all registrants persistent reservation is + * present, the device server shall abort all commands for all + * registered I_T nexuses. + */ + if (preempt_type == PREEMPT_AND_ABORT) { + core_tmr_lun_reset(dev, NULL, &preempt_and_abort_list, + cmd); core_scsi3_release_preempt_and_abort( &preempt_and_abort_list, pr_reg_n); + } if (pr_tmpl->pr_aptpl_active) core_scsi3_update_and_write_aptpl(cmd->se_dev, true); From 3e2deba7aa662862c8046aa24148b83b49298a9b Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Fri, 9 Sep 2022 12:04:25 +0300 Subject: [PATCH 04/66] scsi: target: core: New key must be used for moved PR According to SPC4 5.12.8: e) Retain the reservation key specified in the SERVICE ACTION RESERVATION KEY field and associated information; But currently sa_res_key is only used for the not existing I_T nexus. Add a changing of the key for the existing I_T nexus the PR moved to. Link: https://lore.kernel.org/r/20220909090425.14479-5-d.bogdanov@yadro.com Reviewed-by: Mike Christie Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 6a5f9504a481..1493b1d01194 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -3440,8 +3440,6 @@ after_iport_check: * transport protocols where port names are not required; * d) Register the reservation key specified in the SERVICE ACTION * RESERVATION KEY field; - * e) Retain the reservation key specified in the SERVICE ACTION - * RESERVATION KEY field and associated information; * * Also, It is not an error for a REGISTER AND MOVE service action to * register an I_T nexus that is already registered with the same @@ -3463,6 +3461,12 @@ after_iport_check: dest_pr_reg = __core_scsi3_locate_pr_reg(dev, dest_node_acl, iport_ptr); new_reg = 1; + } else { + /* + * e) Retain the reservation key specified in the SERVICE ACTION + * RESERVATION KEY field and associated information; + */ + dest_pr_reg->pr_res_key = sa_res_key; } /* * f) Release the persistent reservation for the persistent reservation From 6290e23f3bd8cee52fb8fd98980bb1eb31c8284d Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 13 Sep 2022 19:36:02 +0300 Subject: [PATCH 05/66] scsi: target: core: UA on all LUNs after reset Allocate UNIT ATTENTION "BUS DEVICE RESET OCCURRED" on all LUNs on all target ports of the device upon reception of TMF LUN RESET. This change passes libiscsi test SCSI.MultipathIO.Reset. Link: https://lore.kernel.org/r/20220913163602.20597-1-d.bogdanov@yadro.com Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen --- drivers/target/target_core_device.c | 19 +++++++++++++++++++ drivers/target/target_core_internal.h | 1 + drivers/target/target_core_transport.c | 3 +-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index b7f16ee8aa0e..cb4f7cc02f8f 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -284,6 +284,25 @@ void target_pr_kref_release(struct kref *kref) complete(&deve->pr_comp); } +/* + * Establish UA condition on SCSI device - all LUNs + */ +void target_dev_ua_allocate(struct se_device *dev, u8 asc, u8 ascq) +{ + struct se_dev_entry *se_deve; + struct se_lun *lun; + + spin_lock(&dev->se_port_lock); + list_for_each_entry(lun, &dev->dev_sep_list, lun_dev_link) { + + spin_lock(&lun->lun_deve_lock); + list_for_each_entry(se_deve, &lun->lun_deve_list, lun_link) + core_scsi3_ua_allocate(se_deve, asc, ascq); + spin_unlock(&lun->lun_deve_lock); + } + spin_unlock(&dev->se_port_lock); +} + static void target_luns_data_has_changed(struct se_node_acl *nacl, struct se_dev_entry *new, bool skip_new) diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index 30fcf69e1a1d..38a6d08f75b3 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -89,6 +89,7 @@ int target_configure_device(struct se_device *dev); void target_free_device(struct se_device *); int target_for_each_device(int (*fn)(struct se_device *dev, void *data), void *data); +void target_dev_ua_allocate(struct se_device *dev, u8 asc, u8 ascq); /* target_core_configfs.c */ extern struct configfs_item_operations target_core_dev_item_ops; diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 7838dc20f713..5926316252eb 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -3531,8 +3531,7 @@ static void target_tmr_work(struct work_struct *work) tmr->response = (!ret) ? TMR_FUNCTION_COMPLETE : TMR_FUNCTION_REJECTED; if (tmr->response == TMR_FUNCTION_COMPLETE) { - target_ua_allocate_lun(cmd->se_sess->se_node_acl, - cmd->orig_fe_lun, 0x29, + target_dev_ua_allocate(dev, 0x29, ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED); } break; From 9b78d8fadeee078ca947a3b44157f42035fdf8b1 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 14 Sep 2022 16:47:59 +0800 Subject: [PATCH 06/66] scsi: megaraid_sas: Correct value passed to scsi_device_lookup() The "id" parameter currently passed to scsi_device_lookup() when removing a device is incorrect. It should be "ld_target_id % MEGASAS_MAX_DEV_PER_CHANNEL". Link: https://lore.kernel.org/r/1663145283-4872-2-git-send-email-kanie@linux.alibaba.com Fixes: ae6874ba4b43 ("scsi: megaraid_sas: Early detection of VD deletion through RaidMap update") Acked-by: Sumit Saxena Signed-off-by: Guixin Liu Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index ae6b9a570fa9..1772b0be88fe 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -8924,7 +8924,7 @@ megasas_aen_polling(struct work_struct *work) sdev1 = scsi_device_lookup(instance->host, MEGASAS_MAX_PD_CHANNELS + (ld_target_id / MEGASAS_MAX_DEV_PER_CHANNEL), - (ld_target_id - MEGASAS_MAX_DEV_PER_CHANNEL), + (ld_target_id % MEGASAS_MAX_DEV_PER_CHANNEL), 0); if (sdev1) megasas_remove_scsi_device(sdev1); From 9b201b5dff81f298cebda10d51767cd25b432a1a Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 14 Sep 2022 16:48:00 +0800 Subject: [PATCH 07/66] scsi: megaraid_sas: Correct an error message Correct the error message logged when allocation of ioc_init_request fails. Link: https://lore.kernel.org/r/1663145283-4872-3-git-send-email-kanie@linux.alibaba.com Acked-by: Sumit Saxena Signed-off-by: Guixin Liu Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 1772b0be88fe..22c3e3370403 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -7226,7 +7226,7 @@ int megasas_alloc_ctrl_dma_buffers(struct megasas_instance *instance) if (!fusion->ioc_init_request) { dev_err(&pdev->dev, - "Failed to allocate PD list buffer\n"); + "Failed to allocate ioc init request\n"); return -ENOMEM; } From 17883cd59f5575ebe5b3cce2fd0f0d91738871bb Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 14 Sep 2022 16:48:01 +0800 Subject: [PATCH 08/66] scsi: megaraid_sas: Simplify megasas_update_device_list Remove unnecessary dcmd_ret check and goto statement. Link: https://lore.kernel.org/r/1663145283-4872-4-git-send-email-kanie@linux.alibaba.com Acked-by: Sumit Saxena Signed-off-by: Guixin Liu Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 22c3e3370403..4e8b7042e8b3 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -8768,33 +8768,26 @@ static int megasas_update_device_list(struct megasas_instance *instance, int event_type) { - int dcmd_ret = DCMD_SUCCESS; + int dcmd_ret; if (instance->enable_fw_dev_list) { - dcmd_ret = megasas_host_device_list_query(instance, false); - if (dcmd_ret != DCMD_SUCCESS) - goto out; + return megasas_host_device_list_query(instance, false); } else { if (event_type & SCAN_PD_CHANNEL) { dcmd_ret = megasas_get_pd_list(instance); - if (dcmd_ret != DCMD_SUCCESS) - goto out; + return dcmd_ret; } if (event_type & SCAN_VD_CHANNEL) { if (!instance->requestorId || megasas_get_ld_vf_affiliation(instance, 0)) { - dcmd_ret = megasas_ld_list_query(instance, + return megasas_ld_list_query(instance, MR_LD_QUERY_TYPE_EXPOSED_TO_HOST); - if (dcmd_ret != DCMD_SUCCESS) - goto out; } } } - -out: - return dcmd_ret; + return DCMD_SUCCESS; } /** From ad40d51992392a2336af861f83c17c0b08ca64b6 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 14 Sep 2022 16:48:02 +0800 Subject: [PATCH 09/66] scsi: megaraid_sas: Remove unnecessary memset() Remove memset() of pd_list and ld_ids in megasas_get_device_list(). These lists will be cleared by megasas_host_device_list_query(), megasas_get_pd_list(), and megasas_ld_list_query(). Link: https://lore.kernel.org/r/1663145283-4872-5-git-send-email-kanie@linux.alibaba.com Acked-by: Sumit Saxena Signed-off-by: Guixin Liu Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 4e8b7042e8b3..f5e8c7cd0dca 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5876,10 +5876,6 @@ fallback: static int megasas_get_device_list(struct megasas_instance *instance) { - memset(instance->pd_list, 0, - (MEGASAS_MAX_PD * sizeof(struct megasas_pd_list))); - memset(instance->ld_ids, 0xff, MEGASAS_MAX_LD_IDS); - if (instance->enable_fw_dev_list) { if (megasas_host_device_list_query(instance, true)) return FAILED; From 27b571cc454e5a5939b4940ed0bf20aaf37f5225 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 14 Sep 2022 16:48:03 +0800 Subject: [PATCH 10/66] scsi: megaraid_sas: Move megasas_dbg_lvl init to megasas_init() The megasas_dbg_lvl is a driver level parameter. Do not initialize it in the probe path. Otherwise we will miss the debug print when binding a new device to the megaraid driver. Link: https://lore.kernel.org/r/1663145283-4872-6-git-send-email-kanie@linux.alibaba.com Acked-by: Sumit Saxena Signed-off-by: Guixin Liu Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index f5e8c7cd0dca..465274ba9f1b 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -7441,7 +7441,6 @@ static inline void megasas_init_ctrl_params(struct megasas_instance *instance) (instance->pdev->device == PCI_DEVICE_ID_LSI_SAS0071SKINNY)) instance->flag_ieee = 1; - megasas_dbg_lvl = 0; instance->flag = 0; instance->unload = 1; instance->last_time = 0; @@ -9011,6 +9010,7 @@ static int __init megasas_init(void) */ pr_info("megasas: %s\n", MEGASAS_VERSION); + megasas_dbg_lvl = 0; support_poll_for_event = 2; support_device_change = 1; support_nvme_encapsulation = true; From 07e433614cdb91e6f85cc79d738bb0a3d8c741a2 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Fri, 23 Sep 2022 18:12:17 +0800 Subject: [PATCH 11/66] scsi: ufs: qcom: Remove redundant dev_err() call devm_ioremap_resource() already prints an error message. Remove the redundant dev_err() call. Link: https://lore.kernel.org/r/20220923101217.18345-1-shangxiaojing@huawei.com Reviewed-by: Bart Van Assche Signed-off-by: Shang XiaoJing Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom-ice.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ufs/host/ufs-qcom-ice.c b/drivers/ufs/host/ufs-qcom-ice.c index 745e48ec598f..62387ccd5b30 100644 --- a/drivers/ufs/host/ufs-qcom-ice.c +++ b/drivers/ufs/host/ufs-qcom-ice.c @@ -118,7 +118,6 @@ int ufs_qcom_ice_init(struct ufs_qcom_host *host) host->ice_mmio = devm_ioremap_resource(dev, res); if (IS_ERR(host->ice_mmio)) { err = PTR_ERR(host->ice_mmio); - dev_err(dev, "Failed to map ICE registers; err=%d\n", err); return err; } From 3ddeabd1536a71abf2b66a577c90df84514a0af2 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Mon, 26 Sep 2022 20:02:44 -0300 Subject: [PATCH 12/66] scsi: qla2xxx: Fix serialization of DCBX TLV data request Commit b6faaaf796d7 ("scsi: qla2xxx: Serialize mailbox request") serialized mailbox requests from userspace using the 'optrom' mutex. However, in the case of DCBX TLV data, if the memory for it is already allocated, then the mailbox request ends up not being serialized because it is done without holding the 'optrom' mutex. Link: https://lore.kernel.org/r/20220926230245.790508-1-rafaelmendsr@gmail.com Fixes: b6faaaf796d7 ("scsi: qla2xxx: Serialize mailbox request") Reviewed-by: Himanshu Madhani Signed-off-by: Rafael Mendonca Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index fa1fcbfb946f..c2bc7f9c728a 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -951,9 +951,9 @@ qla2x00_sysfs_read_dcbx_tlv(struct file *filp, struct kobject *kobj, if (!capable(CAP_SYS_ADMIN) || off != 0 || count > DCBX_TLV_DATA_SIZE) return 0; + mutex_lock(&vha->hw->optrom_mutex); if (ha->dcbx_tlv) goto do_read; - mutex_lock(&vha->hw->optrom_mutex); if (qla2x00_chip_is_down(vha)) { mutex_unlock(&vha->hw->optrom_mutex); return 0; From f915f58e382e907e2be0b2f5472617dc13f2c390 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Sep 2022 10:22:25 +0200 Subject: [PATCH 13/66] scsi: target: iblock: Fold iblock_emulate_read_cap_with_block_size() into iblock_get_blocks() Fold iblock_emulate_read_cap_with_block_size() into its only caller. Link: https://lore.kernel.org/r/20220927082225.271975-1-hch@lst.de Reviewed-by: Mike Christie Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/target/target_core_iblock.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 8351c974cee3..d9266cf558dc 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -230,14 +230,12 @@ static void iblock_unplug_device(struct se_dev_plug *se_plug) clear_bit(IBD_PLUGF_PLUGGED, &ib_dev_plug->flags); } -static unsigned long long iblock_emulate_read_cap_with_block_size( - struct se_device *dev, - struct block_device *bd, - struct request_queue *q) +static sector_t iblock_get_blocks(struct se_device *dev) { - u32 block_size = bdev_logical_block_size(bd); + struct iblock_dev *ib_dev = IBLOCK_DEV(dev); + u32 block_size = bdev_logical_block_size(ib_dev->ibd_bd); unsigned long long blocks_long = - div_u64(bdev_nr_bytes(bd), block_size) - 1; + div_u64(bdev_nr_bytes(ib_dev->ibd_bd), block_size) - 1; if (block_size == dev->dev_attrib.block_size) return blocks_long; @@ -829,15 +827,6 @@ fail: return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } -static sector_t iblock_get_blocks(struct se_device *dev) -{ - struct iblock_dev *ib_dev = IBLOCK_DEV(dev); - struct block_device *bd = ib_dev->ibd_bd; - struct request_queue *q = bdev_get_queue(bd); - - return iblock_emulate_read_cap_with_block_size(dev, bd, q); -} - static sector_t iblock_get_alignment_offset_lbas(struct se_device *dev) { struct iblock_dev *ib_dev = IBLOCK_DEV(dev); From 0b863257c17c5f57a41e0a48de140ed026957a63 Mon Sep 17 00:00:00 2001 From: Manish Rangankar Date: Tue, 27 Sep 2022 04:59:46 -0700 Subject: [PATCH 14/66] scsi: qla2xxx: Use transport-defined speed mask for supported_speeds One of the sysfs values reported for supported_speeds was not valid (20Gb/s reported instead of 64Gb/s). Instead of driver internal speed mask definition, use speed mask defined in transport_fc for reporting host->supported_speeds. Link: https://lore.kernel.org/r/20220927115946.17559-1-njavali@marvell.com Cc: stable@vger.kernel.org Reviewed-by: Himanshu Madhani Signed-off-by: Manish Rangankar Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_attr.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index c2bc7f9c728a..b67ad30d56e6 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -3330,11 +3330,34 @@ struct fc_function_template qla2xxx_transport_vport_functions = { .bsg_timeout = qla24xx_bsg_timeout, }; +static uint +qla2x00_get_host_supported_speeds(scsi_qla_host_t *vha, uint speeds) +{ + uint supported_speeds = FC_PORTSPEED_UNKNOWN; + + if (speeds & FDMI_PORT_SPEED_64GB) + supported_speeds |= FC_PORTSPEED_64GBIT; + if (speeds & FDMI_PORT_SPEED_32GB) + supported_speeds |= FC_PORTSPEED_32GBIT; + if (speeds & FDMI_PORT_SPEED_16GB) + supported_speeds |= FC_PORTSPEED_16GBIT; + if (speeds & FDMI_PORT_SPEED_8GB) + supported_speeds |= FC_PORTSPEED_8GBIT; + if (speeds & FDMI_PORT_SPEED_4GB) + supported_speeds |= FC_PORTSPEED_4GBIT; + if (speeds & FDMI_PORT_SPEED_2GB) + supported_speeds |= FC_PORTSPEED_2GBIT; + if (speeds & FDMI_PORT_SPEED_1GB) + supported_speeds |= FC_PORTSPEED_1GBIT; + + return supported_speeds; +} + void qla2x00_init_host_attr(scsi_qla_host_t *vha) { struct qla_hw_data *ha = vha->hw; - u32 speeds = FC_PORTSPEED_UNKNOWN; + u32 speeds = 0, fdmi_speed = 0; fc_host_dev_loss_tmo(vha->host) = ha->port_down_retry_count; fc_host_node_name(vha->host) = wwn_to_u64(vha->node_name); @@ -3344,7 +3367,8 @@ qla2x00_init_host_attr(scsi_qla_host_t *vha) fc_host_max_npiv_vports(vha->host) = ha->max_npiv_vports; fc_host_npiv_vports_inuse(vha->host) = ha->cur_vport_count; - speeds = qla25xx_fdmi_port_speed_capability(ha); + fdmi_speed = qla25xx_fdmi_port_speed_capability(ha); + speeds = qla2x00_get_host_supported_speeds(vha, fdmi_speed); fc_host_supported_speeds(vha->host) = speeds; } From 638eec06c7f4df8eb415a0b33dd18cc6dfc986e6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Sep 2022 23:21:16 +0100 Subject: [PATCH 15/66] scsi: lpfc: Fix spelling mistake "unsolicted" -> "unsolicited" There are spelling mistakes in a log message and two comments. Fix them. Link: https://lore.kernel.org/r/20220928222116.68294-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_bsg.c | 4 ++-- drivers/scsi/lpfc/lpfc_ct.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index ac0c7ccf2eae..852b025e2fec 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -2582,7 +2582,7 @@ static int lpfcdiag_loop_self_unreg(struct lpfc_hba *phba, uint16_t rpi) * * This function obtains the transmit and receive ids required to send * an unsolicited ct command with a payload. A special lpfc FsType and CmdRsp - * flags are used to the unsolicted response handler is able to process + * flags are used to the unsolicited response handler is able to process * the ct command sent on the same port. **/ static int lpfcdiag_loop_get_xri(struct lpfc_hba *phba, uint16_t rpi, @@ -2874,7 +2874,7 @@ out: * @len: Number of data bytes * * This function allocates and posts a data buffer of sufficient size to receive - * an unsolicted CT command. + * an unsolicited CT command. **/ static int lpfcdiag_sli3_loop_post_rxbufs(struct lpfc_hba *phba, uint16_t rxxri, size_t len) diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 75fd2bfc212b..e941a99aa965 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -90,7 +90,7 @@ lpfc_ct_ignore_hbq_buffer(struct lpfc_hba *phba, struct lpfc_iocbq *piocbq, get_job_ulpstatus(phba, piocbq)); } lpfc_printf_log(phba, KERN_INFO, LOG_ELS, - "0145 Ignoring unsolicted CT HBQ Size:%d " + "0145 Ignoring unsolicited CT HBQ Size:%d " "status = x%x\n", size, get_job_ulpstatus(phba, piocbq)); } From 67d0a917fb3f9e80c3fb6098ada2080d1b425c94 Mon Sep 17 00:00:00 2001 From: Markus Fuchs Date: Thu, 29 Sep 2022 00:22:42 +0200 Subject: [PATCH 16/66] scsi: ufs: core: Remove unneeded casts from void * The end_io_data member of the "struct request" type has type "void *", so no cast is necessary. Link: https://lore.kernel.org/r/20220928222241.131334-1-mklntf@gmail.com Reviewed-by: Bart Van Assche Signed-off-by: Markus Fuchs Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshpb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufshpb.c b/drivers/ufs/core/ufshpb.c index a1a7a1175a5a..0ce5063bedc5 100644 --- a/drivers/ufs/core/ufshpb.c +++ b/drivers/ufs/core/ufshpb.c @@ -615,14 +615,14 @@ static void ufshpb_activate_subregion(struct ufshpb_lu *hpb, static void ufshpb_umap_req_compl_fn(struct request *req, blk_status_t error) { - struct ufshpb_req *umap_req = (struct ufshpb_req *)req->end_io_data; + struct ufshpb_req *umap_req = req->end_io_data; ufshpb_put_req(umap_req->hpb, umap_req); } static void ufshpb_map_req_compl_fn(struct request *req, blk_status_t error) { - struct ufshpb_req *map_req = (struct ufshpb_req *) req->end_io_data; + struct ufshpb_req *map_req = req->end_io_data; struct ufshpb_lu *hpb = map_req->hpb; struct ufshpb_subregion *srgn; unsigned long flags; From 89ed0b769d6adf30364f60e6b1566961821a9893 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Sun, 9 Oct 2022 20:41:25 -0700 Subject: [PATCH 17/66] powerpc/pseries/vas: Add VAS IRQ primary handler irq_default_primary_handler() can be used only with IRQF_ONESHOT flag, but the flag disables IRQ before executing the thread handler and enables it after the interrupt is handled. But this IRQ disable sets the VAS IRQ OFF state in the hypervisor. In case if NX faults during this window, the hypervisor will not deliver the fault interrupt to the partition and the user space may wait continuously for the CSB update. So use VAS specific IRQ handler instead of calling the default primary handler. Increment pending_faults counter in IRQ handler and the bottom thread handler will process all faults based on this counter. In case if the another interrupt is received while the thread is running, it will be processed using this counter. The synchronization of top and bottom handlers will be done with IRQTF_RUNTHREAD flag and will re-enter to bottom half if this flag is set. Signed-off-by: Haren Myneni Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/aaad8813b4762a6753cfcd0b605a7574a5192ec7.camel@linux.ibm.com --- arch/powerpc/platforms/pseries/vas.c | 40 +++++++++++++++++++++++----- arch/powerpc/platforms/pseries/vas.h | 1 + 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 0e0524cbe20c..53b36381489e 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -200,16 +200,41 @@ static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) struct vas_user_win_ref *tsk_ref; int rc; - rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); - if (!rc) { - tsk_ref = &txwin->vas_win.task_ref; - vas_dump_crb(&crb); - vas_update_csb(&crb, tsk_ref); + while (atomic_read(&txwin->pending_faults)) { + rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); + if (!rc) { + tsk_ref = &txwin->vas_win.task_ref; + vas_dump_crb(&crb); + vas_update_csb(&crb, tsk_ref); + } + atomic_dec(&txwin->pending_faults); } return IRQ_HANDLED; } +/* + * irq_default_primary_handler() can be used only with IRQF_ONESHOT + * which disables IRQ before executing the thread handler and enables + * it after. But this disabling interrupt sets the VAS IRQ OFF + * state in the hypervisor. If the NX generates fault interrupt + * during this window, the hypervisor will not deliver this + * interrupt to the LPAR. So use VAS specific IRQ handler instead + * of calling the default primary handler. + */ +static irqreturn_t pseries_vas_irq_handler(int irq, void *data) +{ + struct pseries_vas_window *txwin = data; + + /* + * The thread hanlder will process this interrupt if it is + * already running. + */ + atomic_inc(&txwin->pending_faults); + + return IRQ_WAKE_THREAD; +} + /* * Allocate window and setup IRQ mapping. */ @@ -240,8 +265,9 @@ static int allocate_setup_window(struct pseries_vas_window *txwin, goto out_irq; } - rc = request_threaded_irq(txwin->fault_virq, NULL, - pseries_vas_fault_thread_fn, IRQF_ONESHOT, + rc = request_threaded_irq(txwin->fault_virq, + pseries_vas_irq_handler, + pseries_vas_fault_thread_fn, 0, txwin->name, txwin); if (rc) { pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n", diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 333ffa2f9f42..a2cb12a31c17 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -132,6 +132,7 @@ struct pseries_vas_window { u64 flags; char *name; int fault_virq; + atomic_t pending_faults; /* Number of pending faults */ }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); From 2147783d6bf0b7ca14c72a25527dc5135bd17f65 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Thu, 6 Oct 2022 22:29:59 -0700 Subject: [PATCH 18/66] powerpc/pseries: Use lparcfg to reconfig VAS windows for DLPAR CPU The hypervisor assigns VAS (Virtual Accelerator Switchboard) windows depends on cores configured in LPAR. The kernel uses OF reconfig notifier to reconfig VAS windows for DLPAR CPU event. In the case of shared CPU mode partition, the hypervisor assigns VAS windows depends on CPU entitled capacity, not based on vcpus. When the user changes CPU entitled capacity for the partition, drmgr uses /proc/ppc64/lparcfg interface to notify the kernel. This patch adds the following changes to update VAS resources for shared mode: - Call vas reconfig windows from lparcfg_write() - Ignore reconfig changes in the VAS notifier Signed-off-by: Haren Myneni [mpe: Rework error handling, report any errors as EIO] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/efa9c16e4a78dda4567a16f13dabfd73cb4674a2.camel@linux.ibm.com --- arch/powerpc/platforms/pseries/lparcfg.c | 11 ++++++ arch/powerpc/platforms/pseries/vas.c | 43 ++++++++++++++++-------- arch/powerpc/platforms/pseries/vas.h | 5 +++ 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 507dc0b5987d..63fd925ccbb8 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -35,6 +35,7 @@ #include #include "pseries.h" +#include "vas.h" /* pseries_vas_dlpar_cpu() */ /* * This isn't a module but we expose that to userspace @@ -748,6 +749,16 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, return -EINVAL; retval = update_ppp(new_entitled_ptr, NULL); + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + /* + * The hypervisor assigns VAS resources based + * on entitled capacity for shared mode. + * Reconfig VAS windows based on DLPAR CPU events. + */ + if (pseries_vas_dlpar_cpu() != 0) + retval = H_HARDWARE; + } } else if (!strcmp(kbuf, "capacity_weight")) { char *endp; *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 53b36381489e..4ad6e510d405 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -852,6 +852,25 @@ int vas_reconfig_capabilties(u8 type, int new_nr_creds) mutex_unlock(&vas_pseries_mutex); return rc; } + +int pseries_vas_dlpar_cpu(void) +{ + int new_nr_creds, rc; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds); + } + + if (rc) + pr_err("Failed reconfig VAS capabilities with DLPAR\n"); + + return rc; +} + /* * Total number of default credits available (target_credits) * in LPAR depends on number of cores configured. It varies based on @@ -866,7 +885,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int new_nr_creds, len, rc = 0; + int len; + + /* + * For shared CPU partition, the hypervisor assigns total credits + * based on entitled core capacity. So updating VAS windows will + * be called from lparcfg_write(). + */ + if (is_shared_processor()) + return NOTIFY_OK; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -878,19 +905,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, - vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (!rc) { - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, - new_nr_creds); - } - - if (rc) - pr_err("Failed reconfig VAS capabilities with DLPAR\n"); - - return rc; + return pseries_vas_dlpar_cpu(); } static struct notifier_block pseries_vas_nb = { diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index a2cb12a31c17..7115043ec488 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -141,10 +141,15 @@ int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS int vas_migration_handler(int action); +int pseries_vas_dlpar_cpu(void); #else static inline int vas_migration_handler(int action) { return 0; } +static inline int pseries_vas_dlpar_cpu(void) +{ + return 0; +} #endif #endif /* _VAS_H */ From be83d5485da549d934ec65463ea831709f2827b1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:08 +1000 Subject: [PATCH 19/66] powerpc/64s: Add lockdep for HPTE lock Add lockdep annotation for the HPTE bit-spinlock. Modern systems don't take the tlbie lock, so this shows up some of the same lockdep warnings that were being reported by the ppc970. And they're not taken in exactly the same places so this is nice to have in its own right. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-1-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 42 +++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 623a7b7ab38b..7a640e610ea3 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,6 +43,29 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); + +static void acquire_hpte_lock(void) +{ + lock_map_acquire(&hpte_lock_map); +} + +static void release_hpte_lock(void) +{ + lock_map_release(&hpte_lock_map); +} +#else +static void acquire_hpte_lock(void) +{ +} + +static void release_hpte_lock(void) +{ +} +#endif + static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) { @@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -279,6 +304,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hpte_v = hpte_old_to_new_v(hpte_v); } + release_hpte_lock(); hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -327,6 +353,7 @@ static long native_hpte_remove(unsigned long hpte_group) return -1; /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; return i; @@ -517,10 +544,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -580,10 +608,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -765,8 +791,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } From 35159b5717fa9c6031fdd6a2193c7a3dc717ce33 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:09 +1000 Subject: [PATCH 20/66] powerpc/64s: make HPTE lock and native_tlbie_lock irq-safe With kfence enabled, there are several cases where HPTE and TLBIE locks are called from softirq context, for example: WARNING: inconsistent lock state 6.0.0-11845-g0cbbc95b12ac #1 Tainted: G N -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. swapper/0/1 [HC0[0]:SC0[0]:HE1:SE1] takes: c000000002734de8 (native_tlbie_lock){+.?.}-{2:2}, at: .native_hpte_updateboltedpp+0x1a4/0x600 {IN-SOFTIRQ-W} state was registered at: .lock_acquire+0x20c/0x520 ._raw_spin_lock+0x4c/0x70 .native_hpte_invalidate+0x62c/0x840 .hash__kernel_map_pages+0x450/0x640 .kfence_protect+0x58/0xc0 .kfence_guarded_free+0x374/0x5a0 .__slab_free+0x3d0/0x630 .put_cred_rcu+0xcc/0x120 .rcu_core+0x3c4/0x14e0 .__do_softirq+0x1dc/0x7dc .do_softirq_own_stack+0x40/0x60 Fix this by consistently disabling irqs while taking either of these locks. Don't just disable bh because several of the more common cases already disable irqs, so this just makes the locks always irq-safe. Reported-by: Guenter Roeck Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 27 ++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 7a640e610ea3..9342e79870df 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -268,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -288,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -304,7 +309,6 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hpte_v = hpte_old_to_new_v(hpte_v); } - release_hpte_lock(); hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -312,10 +316,13 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } @@ -366,6 +373,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -409,6 +419,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -472,6 +484,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -490,6 +505,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -503,6 +520,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -520,6 +540,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } From b12eb279ff552bd67c167b0fe701ae602aa7311e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:10 +1000 Subject: [PATCH 21/66] powerpc/64s: make linear_map_hash_lock a raw spinlock This lock is taken while the raw kfence_freelist_lock is held, so it must also be a raw spinlock, as reported by lockdep when raw lock nesting checking is enabled. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-3-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index df008edf7be0..6df4c6d38b66 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1981,7 +1981,7 @@ repeat: } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) -static DEFINE_SPINLOCK(linear_map_hash_lock); +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { @@ -2005,10 +2005,10 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -2018,14 +2018,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); if (!(linear_map_hash_slots[lmi] & 0x80)) { - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); return; } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; From b9ef323ea1682f9837bf63ba10c5e3750f71a20a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:45 +1000 Subject: [PATCH 22/66] powerpc/64s: Disable preemption in hash lazy mmu mode apply_to_page_range on kernel pages does not disable preemption, which is a requirement for hash's lazy mmu mode, which keeps track of the TLBs to flush with a per-cpu array. Reported-by: Guenter Roeck Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-1-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index fab8332fe1ad..751921f6db46 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void) if (radix_enabled()) return; + /* + * apply_to_page_range can call us this preempt enabled when + * operating on kernel page tables. + */ + preempt_disable(); batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } @@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void) if (batch->index) __flush_tlb_pending(batch); batch->active = 0; + preempt_enable(); } #define arch_flush_lazy_mmu_mode() do {} while (0) From 2b2095f3a6b43ec36ff890febc588df1ec32e826 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:46 +1000 Subject: [PATCH 23/66] powerpc/64s: Fix hash__change_memory_range preemption warning stop_machine_cpuslocked takes a mutex so it must be called in a preemptible context, so it can't simply be fixed by disabling preemption. This is not a bug, because CPU hotplug is locked, so this processor will call in to the stop machine function. So raw_smp_processor_id() could be used. This leaves a small chance that this thread will be migrated to another CPU, so the master work would be done by a CPU from a different context. Better for test coverage to make that a common case by just having the first CPU to call in become the master. Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_pgtable.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 747492edb75a..51f48984abca 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -404,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -478,7 +479,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. + if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -516,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); From 00ff1eaac129a24516a3f6d75adfb9df1efb55dd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:47 +1000 Subject: [PATCH 24/66] powerpc: Fix reschedule bug in KUAP-unlocked user copy schedule must not be explicitly called while KUAP is unlocked, because the AMR register will not be saved across the context switch on 64s (preemption is allowed because that is driven by interrupts which do save the AMR). exit_vmx_usercopy() runs inside an unlocked user access region, and it calls preempt_enable() which will call schedule() if need_resched() was set while non-preemptible. This can cause tasks to run unprotected when the should not, and can cause the user copy to be improperly blocked when scheduling back to it. Fix this by avoiding the explicit resched for preempt kernels by generating an interrupt to reschedule the context if need_resched() got set. Reported-by: Samuel Holland Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-3-npiggin@gmail.com --- arch/powerpc/lib/vmx-helper.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index f76a50291fd7..d491da8d1838 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -36,7 +36,17 @@ int exit_vmx_usercopy(void) { disable_kernel_altivec(); pagefault_enable(); - preempt_enable(); + preempt_enable_no_resched(); + /* + * Must never explicitly call schedule (including preempt_enable()) + * while in a kuap-unlocked user copy, because the AMR register will + * not be saved and restored across context switch. However preempt + * kernels need to be preempted as soon as possible if need_resched is + * set and we are preemptible. The hack here is to schedule a + * decrementer to fire here and reschedule for us if necessary. + */ + if (IS_ENABLED(CONFIG_PREEMPT) && need_resched()) + set_dec(1); return 0; } From e59b3399fde5e173b026d4952b215043e77b4521 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 13:07:28 +1000 Subject: [PATCH 25/66] KVM: PPC: BookS PR-KVM and BookE do not support context tracking The context tracking code in PR-KVM and BookE implementations is not complete, and can cause host crashes if context tracking is enabled. Make these implementations depend on !CONTEXT_TRACKING_USER. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221014030729.2077151-2-npiggin@gmail.com --- arch/powerpc/kvm/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 61cdd782d3c5..a9f57dad6d91 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -51,6 +51,7 @@ config KVM_BOOK3S_HV_POSSIBLE config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT + depends on !CONTEXT_TRACKING_USER select KVM select KVM_BOOK3S_32_HANDLER select KVM_BOOK3S_PR_POSSIBLE @@ -105,6 +106,7 @@ config KVM_BOOK3S_64_HV config KVM_BOOK3S_64_PR tristate "KVM support without using hypervisor mode in host" depends on KVM_BOOK3S_64 + depends on !CONTEXT_TRACKING_USER select KVM_BOOK3S_PR_POSSIBLE help Support running guest kernels in virtual machines on processors @@ -190,6 +192,7 @@ config KVM_EXIT_TIMING config KVM_E500V2 bool "KVM support for PowerPC E500v2 processors" depends on PPC_E500 && !PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select MMU_NOTIFIER @@ -205,6 +208,7 @@ config KVM_E500V2 config KVM_E500MC bool "KVM support for PowerPC E500MC/E5500/E6500 processors" depends on PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select KVM_BOOKE_HV From a073672eb09670540e95a2a4aa1c46f5da74159f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 13:07:29 +1000 Subject: [PATCH 26/66] powerpc/64/interrupt: Prevent NMI PMI causing a dangerous warning NMI PMIs really should not return using the normal interrupt_return function. If such a PMI hits in code returning to user with the context switched to user mode, this warning can fire. This was enough to cause crashes when reproducing on 64s, because another perf interrupt would hit while reporting bug, and that would cause another bug, and so on until smashing the stack. Work around that particular crash for now by just disabling that context warning for PMIs. This is a hack and not a complete fix, there could be other such problems lurking in corners. But it does fix the known crash. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221014030729.2077151-3-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64e.S | 7 +++++++ arch/powerpc/kernel/interrupt.c | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 930e36099015..2f68fb2ee4fc 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -813,6 +813,13 @@ kernel_dbg_exc: EXCEPTION_COMMON(0x260) CHECK_NAPPING() addi r3,r1,STACK_FRAME_OVERHEAD + /* + * XXX: Returning from performance_monitor_exception taken as a + * soft-NMI (Linux irqs disabled) may be risky to use interrupt_return + * and could cause bugs in return or elsewhere. That case should just + * restore registers and return. There is a workaround for one known + * problem in interrupt_exit_kernel_prepare(). + */ bl performance_monitor_exception b interrupt_return diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index f9db0a172401..7bc93367de68 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -374,10 +374,16 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* - * CT_WARN_ON comes here via program_check_exception, - * so avoid recursion. + * CT_WARN_ON comes here via program_check_exception, so avoid + * recursion. + * + * Skip the assertion on PMIs to work around a problem caused by NMI + * PMIs incorrectly taking this interrupt return path, it's possible + * for this to hit after interrupt exit to user switches context to + * user. See also the comment in the performance monitor handler in + * exceptions-64e/s.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM) + if (TRAP(regs) != INTERRUPT_PROGRAM && TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); From dc398a084d459f065658855454e09f2778f8c5cc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 7 Oct 2022 00:04:11 +1000 Subject: [PATCH 27/66] powerpc/64s/interrupt: Perf NMI should not take normal exit path NMI interrupts should exit with EXCEPTION_RESTORE_REGS not with interrupt_return_srr, which is what the perf NMI handler currently does. This breaks if a PMI hits after interrupt_exit_user_prepare_main() has switched the context tracking to user mode, then the CT_WARN_ON() in interrupt_exit_kernel_prepare() fires because it returns to kernel with context set to user. This could possibly be solved by soft-disabling PMIs in the exit path, but that reduces our ability to profile that code. The warning could be removed, but it's potentially useful. All other NMIs and soft-NMIs return using EXCEPTION_RESTORE_REGS, so this makes perf interrupts consistent with that and seems like the best fix. Signed-off-by: Nicholas Piggin [mpe: Squash in fixups from Nick] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221006140413.126443-3-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 14 +++++++++++++- arch/powerpc/kernel/interrupt.c | 14 ++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5381a43e50fe..651c36b056bd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2357,9 +2357,21 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) EXC_COMMON_BEGIN(performance_monitor_common) GEN_COMMON performance_monitor addi r3,r1,STACK_FRAME_OVERHEAD - bl performance_monitor_exception + lbz r4,PACAIRQSOFTMASK(r13) + cmpdi r4,IRQS_ENABLED + bne 1f + bl performance_monitor_exception_async b interrupt_return_srr +1: + bl performance_monitor_exception_nmi + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + kuap_kernel_restore r9, r10 + + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL /** * Interrupt 0xf20 - Vector Unavailable Interrupt. diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 7bc93367de68..fc6631a80527 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -377,13 +377,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) * CT_WARN_ON comes here via program_check_exception, so avoid * recursion. * - * Skip the assertion on PMIs to work around a problem caused by NMI - * PMIs incorrectly taking this interrupt return path, it's possible - * for this to hit after interrupt exit to user switches context to - * user. See also the comment in the performance monitor handler in - * exceptions-64e/s.S + * Skip the assertion on PMIs on 64e to work around a problem caused + * by NMI PMIs incorrectly taking this interrupt return path, it's + * possible for this to hit after interrupt exit to user switches + * context to user. See also the comment in the performance monitor + * handler in exceptions-64e.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM && TRAP(regs) != INTERRUPT_PERFMON) + if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) && + TRAP(regs) != INTERRUPT_PROGRAM && + TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); From 48ee7952808183201b0601d85b89d2d8ccca95ff Mon Sep 17 00:00:00 2001 From: Dukhyun Kwon Date: Tue, 18 Oct 2022 16:30:03 +0900 Subject: [PATCH 28/66] scsi: ufs: core: Fix the error log in ufshcd_query_flag_retry() In ufshcd_query_flag_retry() failed log is incorrectly output as "ufs attibute". Signed-off-by: Dukhyun Kwon Link: https://lore.kernel.org/r/1891546521.01666080182092.JavaMail.epsvc@epcpadp4 Reviewed-by: Bean Huo Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 7256e6c43ca6..54ba80cc71bc 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -3098,7 +3098,7 @@ static int ufshcd_query_flag_retry(struct ufs_hba *hba, if (ret) dev_err(hba->dev, - "%s: query attribute, opcode %d, idn %d, failed with error %d after %d retries\n", + "%s: query flag, opcode %d, idn %d, failed with error %d after %d retries\n", __func__, opcode, idn, ret, retries); return ret; } From 181dfce9b63b80adbb861b219550ec9b27fe63d5 Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Fri, 7 Oct 2022 16:06:51 -0700 Subject: [PATCH 29/66] scsi: pm80xx: Display proc_name in sysfs The proc_name entry in sysfs for pm80xx is "(null)" because it is not initialized in scsi_host_template: Before: host:~# cat /sys/class/scsi_host/host6/proc_name (null) After: host:~# cat /sys/class/scsi_host/host6/proc_name pm80xx Signed-off-by: Igor Pylypiv Link: https://lore.kernel.org/r/20221007230651.308969-1-ipylypiv@google.com Reviewed-by: Jolly Shah Acked-by: Jack Wang Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 2ff2fac1e403..7a7d63aa90e2 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -99,6 +99,7 @@ static void pm8001_map_queues(struct Scsi_Host *shost) static struct scsi_host_template pm8001_sht = { .module = THIS_MODULE, .name = DRV_NAME, + .proc_name = DRV_NAME, .queuecommand = sas_queuecommand, .dma_need_drain = ata_scsi_dma_need_drain, .target_alloc = sas_target_alloc, From 4652b58fe3bb177a9b208bb7a8b7a3fb64184a00 Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Mon, 17 Oct 2022 18:58:15 +0900 Subject: [PATCH 30/66] scsi: ufs: core: Fix typo for register name in comments Change "UTRMLCLR" to "UTMRLCLR". The meaning is "UTP Task Management Request List CLear Register" Signed-off-by: Keoseong Park Link: https://lore.kernel.org/r/20221017095815epcms2p110e3421b99bb9a937620b4d065d0ed12@epcms2p1 Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 54ba80cc71bc..b1f59a5fe632 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -772,7 +772,7 @@ static inline void ufshcd_utrl_clear(struct ufs_hba *hba, u32 mask) } /** - * ufshcd_utmrl_clear - Clear a bit in UTRMLCLR register + * ufshcd_utmrl_clear - Clear a bit in UTMRLCLR register * @hba: per adapter instance * @pos: position of the bit to be cleared */ From 65244389b1b347447a7eed866d9d458963e851e8 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Mon, 17 Oct 2022 16:55:17 +0200 Subject: [PATCH 31/66] scsi: mpi3mr: Select CONFIG_SCSI_SAS_ATTRS Starting with commit 42fc9fee116f ("scsi: mpi3mr: Add helper functions to manage device's port"), kernel configured with CONFIG_SCSI_MPI3MR=m and CONFIG_SCSI_SAS_ATTRS=n fails to build because modpost cannot find symbols used in mpi3mr_transport.c: ERROR: modpost: "sas_port_alloc_num" [drivers/scsi/mpi3mr/mpi3mr.ko] undefined! ERROR: modpost: "sas_remove_host" [drivers/scsi/mpi3mr/mpi3mr.ko] undefined! ERROR: modpost: "sas_phy_alloc" [drivers/scsi/mpi3mr/mpi3mr.ko] undefined! ERROR: modpost: "sas_phy_free" [drivers/scsi/mpi3mr/mpi3mr.ko] undefined! ... Select CONFIG_SCSI_SAS_ATTRS when CONFIG_SCSI_MPI3MR is enabled to prevent inconsistent configs. Link: https://lore.kernel.org/r/20221017145517.93BCB6043B@lion.mk-sys.cz Fixes: 42fc9fee116f ("scsi: mpi3mr: Add helper functions to manage device's port") Acked-by: Sathya Prakash Veerichetty Signed-off-by: Michal Kubecek Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/mpi3mr/Kconfig b/drivers/scsi/mpi3mr/Kconfig index 8997531940c2..f48740cd5b95 100644 --- a/drivers/scsi/mpi3mr/Kconfig +++ b/drivers/scsi/mpi3mr/Kconfig @@ -4,5 +4,6 @@ config SCSI_MPI3MR tristate "Broadcom MPI3 Storage Controller Device Driver" depends on PCI && SCSI select BLK_DEV_BSGLIB + select SCSI_SAS_ATTRS help MPI3 based Storage & RAID Controllers Driver. From 307539eed46395d27e0ecc0ae4d9d6e99eb15fcd Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Wed, 19 Oct 2022 12:45:30 +0900 Subject: [PATCH 32/66] scsi: ufs: core: Fix typo in comment Change "drity" to "dirty". Signed-off-by: Keoseong Park Link: https://lore.kernel.org/r/20221019034530epcms2p2b10e072bb66b3fd6cdbe0e2423c11735@epcms2p2 Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshpb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshpb.c b/drivers/ufs/core/ufshpb.c index 7d56c9b4f7a8..b7f412d0f301 100644 --- a/drivers/ufs/core/ufshpb.c +++ b/drivers/ufs/core/ufshpb.c @@ -383,7 +383,7 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) rgn = hpb->rgn_tbl + rgn_idx; srgn = rgn->srgn_tbl + srgn_idx; - /* If command type is WRITE or DISCARD, set bitmap as drity */ + /* If command type is WRITE or DISCARD, set bitmap as dirty */ if (ufshpb_is_write_or_discard(cmd)) { ufshpb_iterate_rgn(hpb, rgn_idx, srgn_idx, srgn_offset, transfer_len, true); From 02341a08c9dec5a88527981b0bdf0fb6f7499cbf Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 22 Oct 2022 10:16:15 +0800 Subject: [PATCH 33/66] block: fix memory leak for elevator on add_disk failure The default elevator is allocated in the beginning of device_add_disk(), however, it's not freed in the following error path. Fixes: 50e34d78815e ("block: disable the elevator int del_gendisk") Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Jason Yan Link: https://lore.kernel.org/r/20221022021615.2756171-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 17b33c62423d..fee90eb98b4a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -410,9 +410,10 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, * Otherwise just allocate the device numbers for both the whole device * and all partitions from the extended dev_t space. */ + ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) - return -EINVAL; + goto out_exit_elevator; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -420,14 +421,14 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk->minors = DISK_MAX_PARTS; } if (disk->first_minor + disk->minors > MINORMASK + 1) - return -EINVAL; + goto out_exit_elevator; } else { if (WARN_ON(disk->minors)) - return -EINVAL; + goto out_exit_elevator; ret = blk_alloc_ext_minor(); if (ret < 0) - return ret; + goto out_exit_elevator; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } @@ -540,6 +541,9 @@ out_device_del: out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); +out_exit_elevator: + if (disk->queue->elevator) + elevator_exit(disk->queue); return ret; } EXPORT_SYMBOL(device_add_disk); From 5fa9add66b00ad0c796185ff7438eaa3e67c1187 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sat, 22 Oct 2022 19:46:36 +0200 Subject: [PATCH 34/66] nvme-tcp: replace sg_init_marker() with sg_init_table() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In nvme_tcp_ddgst_update(), sg_init_marker() is called with an uninitialized scatterlist. This is probably fine, but gcc complains: CC [M] drivers/nvme/host/tcp.o In file included from ./include/linux/dma-mapping.h:10, from ./include/linux/skbuff.h:31, from ./include/net/net_namespace.h:43, from ./include/linux/netdevice.h:38, from ./include/net/sock.h:46, from drivers/nvme/host/tcp.c:12: In function ‘sg_mark_end’, inlined from ‘sg_init_marker’ at ./include/linux/scatterlist.h:356:2, inlined from ‘nvme_tcp_ddgst_update’ at drivers/nvme/host/tcp.c:390:2: ./include/linux/scatterlist.h:234:11: error: ‘sg.page_link’ is used uninitialized [-Werror=uninitialized] 234 | sg->page_link |= SG_END; | ~~^~~~~~~~~~~ drivers/nvme/host/tcp.c: In function ‘nvme_tcp_ddgst_update’: drivers/nvme/host/tcp.c:388:28: note: ‘sg’ declared here 388 | struct scatterlist sg; | ^~ cc1: all warnings being treated as errors Use sg_init_table() instead, which basically memset the scatterlist to zero first before calling sg_init_marker(). Signed-off-by: Nam Cao Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1eed0fc26b3a..dc2def86076d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -387,7 +387,7 @@ static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, { struct scatterlist sg; - sg_init_marker(&sg, 1); + sg_init_table(&sg, 1); sg_set_page(&sg, page, len, off); ahash_request_set_crypt(hash, &sg, NULL, len); crypto_ahash_update(hash); From 83e1226b0ee2d7e3fb6e002fbbfc6ab36aabdc35 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 23 Oct 2022 11:04:43 +0300 Subject: [PATCH 35/66] nvme-tcp: fix possible circular locking when deleting a controller under memory pressure When destroying a queue, when calling sock_release, the network stack might need to allocate an skb to send a FIN/RST. When that happens during memory pressure, there is a need to reclaim memory, which in turn may ask the nvme-tcp device to write out dirty pages, however this is not possible due to a ctrl teardown that is going on. Set PF_MEMALLOC to the task that releases the socket to grant access to PF_MEMALLOC reserves. In addition, do the same for the nvme-tcp thread as this may also originate from the swap itself and should be more resilient to memory pressure situations. This fixes the following lockdep complaint: -- ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc2+ #25 Tainted: G W ------------------------------------------------------ kswapd0/92 is trying to acquire lock: ffff888114003240 (sk_lock-AF_INET-NVME){+.+.}-{0:0}, at: tcp_sendpage+0x23/0xa0 but task is already holding lock: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x11e/0x160 kmem_cache_alloc_node+0x44/0x530 __alloc_skb+0x158/0x230 tcp_send_active_reset+0x7e/0x730 tcp_disconnect+0x1272/0x1ae0 __tcp_close+0x707/0xd90 tcp_close+0x26/0x80 inet_release+0xfa/0x220 sock_release+0x85/0x1a0 nvme_tcp_free_queue+0x1fd/0x470 [nvme_tcp] nvme_do_delete_ctrl+0x130/0x13d [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x356/0x530 vfs_write+0x4e8/0xce0 ksys_write+0xfd/0x1d0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: __lock_acquire+0x2a0c/0x5690 lock_acquire+0x18e/0x4f0 lock_sock_nested+0x37/0xc0 tcp_sendpage+0x23/0xa0 inet_sendpage+0xad/0x120 kernel_sendpage+0x156/0x440 nvme_tcp_try_send+0x48a/0x2630 [nvme_tcp] nvme_tcp_queue_rq+0xefb/0x17e0 [nvme_tcp] __blk_mq_try_issue_directly+0x452/0x660 blk_mq_plug_issue_direct.constprop.0+0x207/0x700 blk_mq_flush_plug_list+0x6f5/0xc70 __blk_flush_plug+0x264/0x410 blk_finish_plug+0x4b/0xa0 shrink_lruvec+0x1263/0x1ea0 shrink_node+0x736/0x1a80 balance_pgdat+0x740/0x10d0 kswapd+0x5f2/0xaf0 kthread+0x256/0x2f0 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); lock(fs_reclaim); lock(sk_lock-AF_INET-NVME); *** DEADLOCK *** 3 locks held by kswapd0/92: #0: ffffffff97e95ca0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x987/0x10d0 #1: ffff88811f21b0b0 (q->srcu){....}-{0:0}, at: blk_mq_flush_plug_list+0x6b3/0xc70 #2: ffff888170b11470 (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0xeb9/0x17e0 [nvme_tcp] Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Reported-by: Daniel Wagner Signed-off-by: Sagi Grimberg Tested-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index dc2def86076d..9b47dcb2a7d9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1141,6 +1141,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; + unsigned int noreclaim_flag; int ret = 1; if (!queue->request) { @@ -1150,12 +1151,13 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) } req = queue->request; + noreclaim_flag = memalloc_noreclaim_save(); if (req->state == NVME_TCP_SEND_CMD_PDU) { ret = nvme_tcp_try_send_cmd_pdu(req); if (ret <= 0) goto done; if (!nvme_tcp_has_inline_data(req)) - return ret; + goto out; } if (req->state == NVME_TCP_SEND_H2C_PDU) { @@ -1181,6 +1183,8 @@ done: nvme_tcp_fail_request(queue->request); nvme_tcp_done_send_req(queue); } +out: + memalloc_noreclaim_restore(noreclaim_flag); return ret; } @@ -1296,6 +1300,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) struct page *page; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + unsigned int noreclaim_flag; if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; @@ -1308,7 +1313,11 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); queue->pf_cache.va = NULL; } + + noreclaim_flag = memalloc_noreclaim_save(); sock_release(queue->sock); + memalloc_noreclaim_restore(noreclaim_flag); + kfree(queue->pdu); mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); From fe8714b04fb137aa62e9a69424c48b5301b721b9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 24 Oct 2022 11:57:45 -0700 Subject: [PATCH 36/66] nvme-multipath: set queue dma alignment to 3 NVMe spec requires all transports support dword aligned addresses, which is already set in the namespace request_queue. Set the same limit in the multipath device's request_queue as well. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0ea7e441e080..93e2138a8b42 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -516,6 +516,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) /* set to a default value of 512 until the disk is validated */ blk_queue_logical_block_size(head->disk->queue, 512); blk_set_stacking_limits(&head->disk->queue->limits); + blk_queue_dma_alignment(head->disk->queue, 3); /* we need to propagate up the VMC settings */ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) From 65722736c3baf29e02e964a09e85c9ef71c48e8d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 22 Oct 2022 15:22:07 +1000 Subject: [PATCH 37/66] powerpc/64s/interrupt: Fix clear of PACA_IRQS_HARD_DIS when returning to soft-masked context Commit a4cb3651a1743 ("powerpc/64s/interrupt: Fix lost interrupts when returning to soft-masked context") fixed the problem of pending irqs being cleared when clearing the HARD_DIS bit, but then it didn't clear the bit at all. This change clears HARD_DIS without affecting other bits in the mask. When an interrupt hits in a soft-masked section that has MSR[EE]=1, it can hard disable and set PACA_IRQS_HARD_DIS, which must be cleared when returning to the EE=1 caller (unless it was set due to a MUST_HARD_MASK interrupt becoming pending). Failure to clear this leaves the returned-to context running with MSR[EE]=1 and PACA_IRQS_HARD_DIS, which confuses irq assertions and could be dangerous for code that might test the flag. This was observed in a hash MMU kernel where a kernel hash fault hits in a local_irqs_disabled region that has EE=1. The hash fault also runs with EE=1, then as it returns, a decrementer hits in the restart section and the irq restart code hard-masks which sets the PACA_IRQ_HARD_DIS flag, which is not clear when the original context is returned to. Reported-by: Sachin Sant Fixes: a4cb3651a1743 ("powerpc/64s/interrupt: Fix lost interrupts when returning to soft-masked context") Signed-off-by: Nicholas Piggin Tested-by: Sachin Sant Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221022052207.471328-1-npiggin@gmail.com --- arch/powerpc/kernel/interrupt_64.S | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 978a173eb339..a019ed6fc839 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -532,15 +532,24 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) * Returning to soft-disabled context. * Check if a MUST_HARD_MASK interrupt has become pending, in which * case we need to disable MSR[EE] in the return context. + * + * The MSR[EE] check catches among other things the short incoherency + * in hard_irq_disable() between clearing MSR[EE] and setting + * PACA_IRQ_HARD_DIS. */ ld r12,_MSR(r1) andi. r10,r12,MSR_EE beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled lbz r11,PACAIRQHAPPENED(r13) andi. r10,r11,PACA_IRQ_MUST_HARD_MASK - beq .Lfast_kernel_interrupt_return_\srr\() // No HARD_MASK pending + bne 1f // HARD_MASK is pending + // No HARD_MASK pending, clear possible HARD_DIS set by interrupt + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + stb r11,PACAIRQHAPPENED(r13) + b .Lfast_kernel_interrupt_return_\srr\() - /* Must clear MSR_EE from _MSR */ + +1: /* Must clear MSR_EE from _MSR */ #ifdef CONFIG_PPC_BOOK3S li r10,0 /* Clear valid before changing _MSR */ From 7f21735ffb2648a29e0fc79c4bdcb1b9ed8602cd Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 27 Oct 2022 17:19:18 +0800 Subject: [PATCH 38/66] rbd: fix possible memory leak in rbd_sysfs_init() If device_register() returns error in rbd_sysfs_init(), name of kobject which is allocated in dev_set_name() called in device_add() is leaked. As comment of device_add() says, it should call put_device() to drop the reference count that was set in device_initialize() when it fails, so the name can be freed in kobject_cleanup(). Fault injection test can trigger this problem: unreferenced object 0xffff88810173aa78 (size 8): comm "modprobe", pid 247, jiffies 4294714278 (age 31.789s) hex dump (first 8 bytes): 72 62 64 00 81 88 ff ff rbd..... backtrace: [<00000000f58fae56>] __kmalloc_node_track_caller+0x44/0x1b0 [<00000000bdd44fe7>] kstrdup+0x3a/0x70 [<00000000f7844d0b>] kstrdup_const+0x63/0x80 [<000000001b0a0eeb>] kvasprintf_const+0x10b/0x190 [<00000000a47bd894>] kobject_set_name_vargs+0x56/0x150 [<00000000d5edbf18>] dev_set_name+0xab/0xe0 [<00000000f5153e80>] device_add+0x106/0x1f20 Fixes: dfc5606dc513 ("rbd: replace the rbd sysfs interface") Signed-off-by: Yang Yingliang Reviewed-by: Alex Elder Link: https://lore.kernel.org/r/20221027091918.2294132-1-yangyingliang@huawei.com Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f9e39301c4af..04453f4a319c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7222,8 +7222,10 @@ static int __init rbd_sysfs_init(void) int ret; ret = device_register(&rbd_root_dev); - if (ret < 0) + if (ret < 0) { + put_device(&rbd_root_dev); return ret; + } ret = bus_register(&rbd_bus_type); if (ret < 0) From 2d87d455ead2cbdee7e60463cddc5bff3f98c912 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Oct 2022 16:57:09 +0800 Subject: [PATCH 39/66] blk-mq: don't add non-pt request with ->end_io to batch dm-rq implements ->end_io callback for request issued to underlying queue, and it isn't passthrough request. Commit ab3e1d3bbab9 ("block: allow end_io based requests in the completion batch handling") doesn't clear rq->bio and rq->__data_len for request with ->end_io in blk_mq_end_request_batch(), and this way is actually dangerous, but so far it is only for nvme passthrough request. dm-rq needs to clean up remained bios in case of partial completion, and req->bio is required, then use-after-free is triggered, so the underlying clone request can't be completed in blk_mq_end_request_batch. Fix panic by not adding such request into batch list, and the issue can be triggered simply by exposing nvme pci to dm-mpath simply. Fixes: ab3e1d3bbab9 ("block: allow end_io based requests in the completion batch handling") Cc: dm-devel@redhat.com Cc: Mike Snitzer Reported-by: Changhui Zhong Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20221027085709.513175-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ba18e9bdb799..d6119c5d1069 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -853,7 +853,8 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_ELV) || ioerror) + if (!iob || (req->rq_flags & RQF_ELV) || ioerror || + (req->end_io && !blk_rq_is_passthrough(req))) return false; if (!iob->complete) From 8de11cdc96bf58b324c59a28512eb9513fd02553 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 27 Oct 2022 07:44:28 -0700 Subject: [PATCH 40/66] io_uring: use io_run_local_work_locked helper prefer to use io_run_local_work_locked helper for consistency Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221027144429.3971400-2-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6cc16e39b27f..8a0ce7379e89 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1446,8 +1446,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - if (!llist_empty(&ctx->work_llist)) - __io_run_local_work(ctx, true); + (void) io_run_local_work_locked(ctx); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { From b3026767e15b488860d4bbf1649d69612bab2c25 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 27 Oct 2022 07:44:29 -0700 Subject: [PATCH 41/66] io_uring: unlock if __io_run_local_work locked inside It is possible for tw to lock the ring, and this was not propogated out to io_run_local_work. This can cause an unlock to be missed. Instead pass a pointer to locked into __io_run_local_work. Fixes: 8ac5d85a89b4 ("io_uring: add local task_work run helper that is entered locked") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221027144429.3971400-3-dylany@meta.com [axboe: WARN_ON() -> WARN_ON_ONCE() and add a minor comment] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++---- io_uring/io_uring.h | 13 +++++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8a0ce7379e89..ac8c488e3077 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1173,7 +1173,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -int __io_run_local_work(struct io_ring_ctx *ctx, bool locked) +int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) { struct llist_node *node; struct llist_node fake; @@ -1192,7 +1192,7 @@ again: struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, &locked); + req->io_task_work.func(req, locked); ret++; node = next; } @@ -1208,7 +1208,7 @@ again: goto again; } - if (locked) + if (*locked) io_submit_flush_completions(ctx); trace_io_uring_local_work_run(ctx, ret, loops); return ret; @@ -1225,7 +1225,7 @@ int io_run_local_work(struct io_ring_ctx *ctx) __set_current_state(TASK_RUNNING); locked = mutex_trylock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, locked); + ret = __io_run_local_work(ctx, &locked); if (locked) mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ef77d2aa3172..e99a79f2df9b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -27,7 +27,7 @@ enum { struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); -int __io_run_local_work(struct io_ring_ctx *ctx, bool locked); +int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); @@ -277,9 +277,18 @@ static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { + bool locked; + int ret; + if (llist_empty(&ctx->work_llist)) return 0; - return __io_run_local_work(ctx, true); + + locked = true; + ret = __io_run_local_work(ctx, &locked); + /* shouldn't happen! */ + if (WARN_ON_ONCE(!locked)) + mutex_lock(&ctx->uring_lock); + return ret; } static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) From e3c5a78cdb6237bfb9641b63cccf366325229eec Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 26 Oct 2022 18:35:13 +0800 Subject: [PATCH 42/66] blk-mq: Properly init requests from blk_mq_alloc_request_hctx() Function blk_mq_alloc_request_hctx() is missing zeroing/init of rq->bio, biotail, __sector, and __data_len members, which blk_mq_alloc_request() has, so duplicate what we do in blk_mq_alloc_request(). Fixes: 1f5bd336b9150 ("blk-mq: add blk_mq_alloc_request_hctx") Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1666780513-121650-1-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 33292c01875d..75c8296b6feb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -611,6 +611,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, .nr_tags = 1, }; u64 alloc_time_ns = 0; + struct request *rq; unsigned int cpu; unsigned int tag; int ret; @@ -660,8 +661,12 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, + rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, alloc_time_ns); + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + return rq; out_queue_exit: blk_queue_exit(q); From 9ef8eb6104527bfe9ed31f7a4ffa721390adf9a8 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 20 Oct 2022 23:36:14 +0100 Subject: [PATCH 43/66] squashfs: fix read regression introduced in readahead code Patch series "squashfs: fix some regressions introduced in the readahead code". This patchset fixes 3 regressions introduced by the recent readahead code changes. The first regression is causing "snaps" to randomly fail after a couple of hours or days, which how the regression came to light. This patch (of 3): If a file isn't a whole multiple of the page size, the last page will have trailing bytes unfilled. There was a mistake in the readahead code which did this. In particular it incorrectly assumed that the last page in the readahead page array (page[nr_pages - 1]) will always contain the last page in the block, which if we're at file end, will be the page that needs to be zero filled. But the readahead code may not return the last page in the block, which means it is unmapped and will be skipped by the decompressors (a temporary buffer used). In this case the zero filling code will zero out the wrong page, leading to data corruption. Fix this by by extending the "page actor" to return the last page if present, or NULL if a temporary buffer was used. Link: https://lkml.kernel.org/r/20221020223616.7571-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20221020223616.7571-2-phillip@squashfs.org.uk Fixes: 8fc78b6fe24c ("squashfs: implement readahead") Link: https://lore.kernel.org/lkml/b0c258c3-6dcf-aade-efc4-d62a8b3a1ce2@alu.unizg.hr/ Signed-off-by: Phillip Lougher Reported-by: Mirsad Goran Todorovac Tested-by: Mirsad Goran Todorovac Tested-by: Slade Watkins Tested-by: Bagas Sanjaya Reported-by: Marc Miltenberger Cc: Dimitri John Ledkov Cc: Hsin-Yi Wang Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 7 ++++--- fs/squashfs/page_actor.c | 3 +++ fs/squashfs/page_actor.h | 6 +++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e56510964b22..e526eb7a1658 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -557,6 +557,7 @@ static void squashfs_readahead(struct readahead_control *ractl) int res, bsize; u64 block = 0; unsigned int expected; + struct page *last_page; nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) @@ -593,15 +594,15 @@ static void squashfs_readahead(struct readahead_control *ractl) res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res == expected) { int bytes; /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (pages[nr_pages - 1]->index == file_end && bytes) - memzero_page(pages[nr_pages - 1], bytes, + if (index == file_end && bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); for (i = 0; i < nr_pages; i++) { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 54b93bf4a25c..81af6c4ca115 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -71,11 +71,13 @@ static void *handle_next_page(struct squashfs_page_actor *actor) (actor->next_index != actor->page[actor->next_page]->index)) { actor->next_index++; actor->returned_pages++; + actor->last_page = NULL; return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; actor->returned_pages++; + actor->last_page = actor->page[actor->next_page]; return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); } @@ -125,6 +127,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->returned_pages = 0; actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 95ffbb543d91..97d4983559b1 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -16,6 +16,7 @@ struct squashfs_page_actor { void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); + struct page *last_page; int pages; int length; int next_page; @@ -29,10 +30,13 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, struct page **page, int pages, int length); -static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor) +static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { + struct page *last_page = actor->last_page; + kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { From c9199de82bad03bceb94ec3c5195c879d7e11911 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 20 Oct 2022 23:36:15 +0100 Subject: [PATCH 44/66] squashfs: fix extending readahead beyond end of file The readahead code will try to extend readahead to the entire size of the Squashfs data block. But, it didn't take into account that the last block at the end of the file may not be a whole block. In this case, the code would extend readahead to beyond the end of the file, leaving trailing pages. Fix this by only requesting the expected number of pages. Link: https://lkml.kernel.org/r/20221020223616.7571-3-phillip@squashfs.org.uk Fixes: 8fc78b6fe24c ("squashfs: implement readahead") Signed-off-by: Phillip Lougher Tested-by: Bagas Sanjaya Reported-by: Marc Miltenberger Cc: Dimitri John Ledkov Cc: Hsin-Yi Wang Cc: Mirsad Goran Todorovac Cc: Slade Watkins Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e526eb7a1658..f0afd4d6fd30 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -559,6 +559,12 @@ static void squashfs_readahead(struct readahead_control *ractl) unsigned int expected; struct page *last_page; + expected = start >> msblk->block_log == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + max_pages = (expected + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) break; @@ -567,13 +573,10 @@ static void squashfs_readahead(struct readahead_control *ractl) goto skip_pages; index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) goto skip_pages; - expected = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - if (index == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, From e11c4e088be4c39d17f304fcf331670891905f42 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 20 Oct 2022 23:36:16 +0100 Subject: [PATCH 45/66] squashfs: fix buffer release race condition in readahead code Fix a buffer release race condition, where the error value was used after release. Link: https://lkml.kernel.org/r/20221020223616.7571-4-phillip@squashfs.org.uk Fixes: b09a7a036d20 ("squashfs: support reading fragments in readahead call") Signed-off-by: Phillip Lougher Tested-by: Bagas Sanjaya Reported-by: Marc Miltenberger Cc: Dimitri John Ledkov Cc: Hsin-Yi Wang Cc: Mirsad Goran Todorovac Cc: Slade Watkins Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index f0afd4d6fd30..8ba8c4c50770 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -506,8 +506,9 @@ static int squashfs_readahead_fragment(struct page **page, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + int error = buffer->error; - if (buffer->error) + if (error) goto out; expected += squashfs_i(inode)->fragment_offset; @@ -529,7 +530,7 @@ static int squashfs_readahead_fragment(struct page **page, out: squashfs_cache_put(buffer); - return buffer->error; + return error; } static void squashfs_readahead(struct readahead_control *ractl) From 984a608377cb623351b8a3670b285f32ebeb2d32 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 20 Oct 2022 13:56:19 -0400 Subject: [PATCH 46/66] mm/kmemleak: prevent soft lockup in kmemleak_scan()'s object iteration loops Commit 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") adds cond_resched() in the first object iteration loop of kmemleak_scan(). However, it turns that the 2nd objection iteration loop can still cause soft lockup to happen in some cases. So add a cond_resched() call in the 2nd and 3rd loops as well to prevent that and for completeness. Link: https://lkml.kernel.org/r/20221020175619.366317-1-longman@redhat.com Fixes: 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") Signed-off-by: Waiman Long Cc: Catalin Marinas Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/kmemleak.c | 61 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 37af2dc8dac9..646e2979641f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1460,6 +1460,27 @@ static void scan_gray_list(void) WARN_ON(!list_empty(&gray_list)); } +/* + * Conditionally call resched() in a object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. + * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + /* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the @@ -1471,7 +1492,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1480,7 +1501,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1514,24 +1534,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1598,7 +1605,15 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in @@ -1632,7 +1647,15 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in From b214fadff28d20f96456b73fe93340cb581ca891 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 20 Oct 2022 11:42:55 +0900 Subject: [PATCH 47/66] MAINTAINERS: git://github.com -> https://github.com for nilfs2 Github deprecated the git:// links about a year ago, so let's move to the https:// URLs instead. Link: https://lkml.kernel.org/r/20221020024255.5000-1-konishi.ryusuke@gmail.com Link: https://github.blog/2021-09-01-improving-git-protocol-security-github/ Link: https://lkml.kernel.org/r/20221013214638.30933-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt Signed-off-by: Ryusuke Konishi Reported-by: Conor Dooley Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index cf0f18502372..fe447637bf03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14520,7 +14520,7 @@ L: linux-nilfs@vger.kernel.org S: Supported W: https://nilfs.sourceforge.io/ W: https://nilfs.osdn.jp/ -T: git git://github.com/konis/nilfs2.git +T: git https://github.com/konis/nilfs2.git F: Documentation/filesystems/nilfs2.rst F: fs/nilfs2/ F: include/trace/events/nilfs2.h From 27d676a1c2010450d00d514a8a6c1c780cb8d77f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 20 Oct 2022 09:51:22 +0800 Subject: [PATCH 48/66] memory tier, sysfs: rename attribute "nodes" to "nodelist" In sysfs, we use attribute name "cpumap" or "cpus" for cpu mask and "cpulist" or "cpus_list" for cpu list. For example, in my system, $ cat /sys/devices/system/node/node0/cpumap f,ffffffff $ cat /sys/devices/system/cpu/cpu2/topology/core_cpus 0,00100004 $ cat cat /sys/devices/system/node/node0/cpulist 0-35 $ cat /sys/devices/system/cpu/cpu2/topology/core_cpus_list 2,20 It looks reasonable to use "nodemap" for node mask and "nodelist" for node list. So, rename the attribute to follow the naming convention. Link: https://lkml.kernel.org/r/20221020015122.290097-1-ying.huang@intel.com Fixes: 9832fb87834e2b ("mm/demotion: expose memory tier details via sysfs") Signed-off-by: "Huang, Ying" Acked-by: Wei Xu Reviewed-by: Aneesh Kumar K.V Reviewed-by: Yang Shi Reviewed-by: Davidlohr Bueso Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers | 4 ++-- mm/memory-tiers.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers index 45985e411f13..721a05b90109 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers +++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers @@ -10,7 +10,7 @@ Description: A collection of all the memory tiers allocated. What: /sys/devices/virtual/memory_tiering/memory_tierN/ - /sys/devices/virtual/memory_tiering/memory_tierN/nodes + /sys/devices/virtual/memory_tiering/memory_tierN/nodelist Date: August 2022 Contact: Linux memory management mailing list Description: Directory with details of a specific memory tier @@ -21,5 +21,5 @@ Description: Directory with details of a specific memory tier A smaller value of N implies a higher (faster) memory tier in the hierarchy. - nodes: NUMA nodes that are part of this memory tier. + nodelist: NUMA nodes that are part of this memory tier. diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f116b7b6333e..fa8c9d07f9ce 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev) kfree(tier); } -static ssize_t nodes_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; @@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev, mutex_unlock(&memory_tier_lock); return ret; } -static DEVICE_ATTR_RO(nodes); +static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { - &dev_attr_nodes.attr, + &dev_attr_nodelist.attr, NULL }; From 64b4c411a6c7a5f27555bfc2d6310b87bde3db67 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 20 Oct 2022 21:19:22 -0700 Subject: [PATCH 49/66] ipc/msg.c: fix percpu_counter use after free These percpu counters are referenced in free_ipcs->freeque, so destroy them later. Fixes: 72d1e611082e ("ipc/msg: mitigate the lock contention with percpu counter") Reported-by: syzbot+96e659d35b9d6b541152@syzkaller.appspotmail.com Tested-by: Mark Rutland Cc: Jiebin Sun Signed-off-by: Andrew Morton --- ipc/msg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index e4e0990e08f7..fd08b3cb36d7 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1329,11 +1329,11 @@ fail_msg_bytes: #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { - percpu_counter_destroy(&ns->percpu_msg_bytes); - percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif From bb2282cf01fcae7379314dc026f3b534c83c186c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 21 Oct 2022 08:05:49 -0700 Subject: [PATCH 50/66] fs/ext4/super.c: remove unused `deprecated_msg' fs/ext4/super.c:1744:19: warning: 'deprecated_msg' defined but not used [-Wunused-const-variable=] Reported-by: kernel test robot Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/super.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 989365b878a6..7950904fbf04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1741,10 +1741,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static const char deprecated_msg[] = - "Mount option \"%s\" will be removed by %s\n" - "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 From fba4eaf93164a6a6eb3cc12a3391b06f6187aa20 Mon Sep 17 00:00:00 2001 From: Maria Yu Date: Fri, 21 Oct 2022 18:15:55 +0800 Subject: [PATCH 51/66] mm/page_isolation: fix clang deadcode warning When !CONFIG_VM_BUG_ON, there is warning of clang-analyzer-deadcode.DeadStores: Value stored to 'mt' during its initialization is never read. Link: https://lkml.kernel.org/r/20221021101555.7992-2-quic_aiquny@quicinc.com Signed-off-by: Maria Yu Cc: David Hildenbrand Cc: Doug Berger Cc: Mike Kravetz Cc: Zi Yan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04141a9bea70..47fbc1696466 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); VM_BUG_ON(!is_migrate_isolate(mt)); } else { From 8ebe0a5eaaeb099de03d09ad20f54ed962e2261e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 21 Oct 2022 19:28:05 -0400 Subject: [PATCH 52/66] mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs A common use case for hugetlbfs is for the application to create memory pools backed by huge pages, which then get handed over to some malloc library (eg. jemalloc) for further management. That malloc library may be doing MADV_DONTNEED calls on memory that is no longer needed, expecting those calls to happen on PAGE_SIZE boundaries. However, currently the MADV_DONTNEED code rounds up any such requests to HPAGE_PMD_SIZE boundaries. This leads to undesired outcomes when jemalloc expects a 4kB MADV_DONTNEED, but 2MB of memory get zeroed out, instead. Use of pre-built shared libraries means that user code does not always know the page size of every memory arena in use. Avoid unexpected data loss with MADV_DONTNEED by rounding up only to PAGE_SIZE (in do_madvise), and rounding down to huge page granularity. That way programs will only get as much memory zeroed out as they requested. Link: https://lkml.kernel.org/r/20221021192805.366ad573@imladris.surriel.com Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings") Signed-off-by: Rik van Riel Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/madvise.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..c7105ec6d08c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. + */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ From 5aae9265ee1a30cf716d6caf6b29fe99b9d55130 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 22 Oct 2022 00:51:06 -0700 Subject: [PATCH 53/66] mm: prep_compound_tail() clear page->private Although page allocation always clears page->private in the first page or head page of an allocation, it has never made a point of clearing page->private in the tails (though 0 is often what is already there). But now commit 71e2d666ef85 ("mm/huge_memory: do not clobber swp_entry_t during THP split") issues a warning when page_tail->private is found to be non-0 (unless it's swapcache). Change that warning to dump page_tail (which also dumps head), instead of just the head: so far we have seen dead000000000122, dead000000000003, dead000000000001 or 0000000000000002 in the raw output for tail private. We could just delete the warning, but today's consensus appears to want page->private to be 0, unless there's a good reason for it to be set: so now clear it in prep_compound_tail() (more general than just for THP; but not for high order allocation, which makes no pass down the tails). Link: https://lkml.kernel.org/r/1c4233bb-4e4d-5969-fbd4-96604268a285@google.com Fixes: 71e2d666ef85 ("mm/huge_memory: do not clobber swp_entry_t during THP split") Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/page_alloc.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03fc7e5edf07..561a42567477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2462,7 +2462,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * Fix up and warn once if private is unexpectedly set. */ if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, head); + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); page_tail->private = 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5a6c815ae28..218b28ee49ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) From 67eae54bc227b30dedcce9db68b063ba1adb7838 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 24 Oct 2022 15:33:35 -0400 Subject: [PATCH 54/66] mm/uffd: fix vma check on userfault for wp We used to have a report that pte-marker code can be reached even when uffd-wp is not compiled in for file memories, here: https://lore.kernel.org/all/YzeR+R6b4bwBlBHh@x1n/T/#u I just got time to revisit this and found that the root cause is we simply messed up with the vma check, so that for !PTE_MARKER_UFFD_WP system, we will allow UFFDIO_REGISTER of MINOR & WP upon shmem as the check was wrong: if (vm_flags & VM_UFFD_MINOR) return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); Where we'll allow anything to pass on shmem as long as minor mode is requested. Axel did it right when introducing minor mode but I messed it up in b1f9e876862d when moving code around. Fix it. Link: https://lkml.kernel.org/r/20221024193336.1233616-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221024193336.1233616-2-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f07e6998bb68..9df0b9a762cc 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -146,9 +146,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { - if (vm_flags & VM_UFFD_MINOR) - return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); - + if ((vm_flags & VM_UFFD_MINOR) && + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) + return false; #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for From 03e5f82ea632af329e32ec03d952b2d99497eeaa Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 24 Oct 2022 16:34:21 +0800 Subject: [PATCH 55/66] mm: migrate: fix return value if all subpages of THPs are migrated successfully During THP migration, if THPs are not migrated but they are split and all subpages are migrated successfully, migrate_pages() will still return the number of THP pages that were not migrated. This will confuse the callers of migrate_pages(). For example, the longterm pinning will failed though all pages are migrated successfully. Thus we should return 0 to indicate that all pages are migrated in this case Link: https://lkml.kernel.org/r/de386aa864be9158d2f3b344091419ea7c38b2f7.1666599848.git.baolin.wang@linux.alibaba.com Fixes: b5bade978e9b ("mm: migrate: fix the return value of migrate_pages()") Signed-off-by: Baolin Wang Reviewed-by: Alistair Popple Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..dff333593a8a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1582,6 +1582,13 @@ out: */ list_splice(&ret_pages, from); + /* + * Return 0 in case all subpages of fail-to-migrate THPs are + * migrated successfully. + */ + if (list_empty(from)) + rc = 0; + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); From f59a3ee6912997fc56ecee78613fef53aae668d9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:40 +0200 Subject: [PATCH 56/66] mm: kmsan: export kmsan_copy_page_meta() Certain modules call copy_user_highpage(), which calls kmsan_copy_page_meta() under KMSAN, so we need to export the latter. Link: https://lkml.kernel.org/r/20221024212144.2852069-1-glider@google.com Link: https://github.com/google/kmsan/issues/89 Fixes: b073d7f8aee4 ("mm: kmsan: maintain KMSAN metadata for page operations") Signed-off-by: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kmsan/shadow.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 21e3e196ec3c..a787c04e9583 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src) __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); kmsan_leave_runtime(); } +EXPORT_SYMBOL(kmsan_copy_page_meta); void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { From 42855f588e187a6f22978e54422adbc010ac7630 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:41 +0200 Subject: [PATCH 57/66] x86/purgatory: disable KMSAN instrumentation The stand-alone purgatory.ro does not contain the KMSAN runtime, therefore it can't be built with KMSAN compiler instrumentation. Link: https://lkml.kernel.org/r/20221024212144.2852069-2-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Cc: Andrew Morton Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton --- arch/x86/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 58a200dc762d..17f09dc26381 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -26,6 +26,7 @@ GCOV_PROFILE := n KASAN_SANITIZE := n UBSAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n KCOV_INSTRUMENT := n # These are adjustments to the compiler flags used for objects that From 921757bc9b611efc483a548b86769934384e9c79 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:42 +0200 Subject: [PATCH 58/66] Kconfig.debug: disable CONFIG_FRAME_WARN for KMSAN by default KMSAN adds a lot of instrumentation to the code, which results in increased stack usage (up to 2048 bytes and more in some cases). It's hard to predict how big the stack frames can be, so we disable the warnings for KMSAN instead. Link: https://lkml.kernel.org/r/20221024212144.2852069-3-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Cc: Kees Cook Cc: Masahiro Yamada Cc: Nick Desaulniers Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3fc7abffc7aa..29280072dc0e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -400,8 +400,9 @@ config FRAME_WARN default 1536 if (!64BIT && XTENSA) default 1024 if !64BIT default 2048 if 64BIT + default 0 if KMSAN help - Tell gcc to warn at build time for stack frames larger than this. + Tell the compiler to warn at build time for stack frames larger than this. Setting this too low will cause a lot of warnings. Setting it to 0 disables the warning. From 59c8a02e24894e75639bcecc3cb1e768a2792220 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:43 +0200 Subject: [PATCH 59/66] x86: asm: make sure __put_user_size() evaluates pointer once User access macros must ensure their arguments are evaluated only once if they are used more than once in the macro body. Adding instrument_put_user() to __put_user_size() resulted in double evaluation of the `ptr` argument, which led to correctness issues when performing e.g. unsafe_put_user(..., p++, ...). To fix those issues, evaluate the `ptr` argument of __put_user_size() at the beginning of the macro. Link: https://lkml.kernel.org/r/20221024212144.2852069-4-glider@google.com Fixes: 888f84a6da4d ("x86: asm: instrument usercopy in get_user() and put_user()") Signed-off-by: Alexander Potapenko Reported-by: youling257 Cc: Borislav Petkov Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/uaccess.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bc614cfe21b..1cc756eafa44 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -254,24 +254,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ __typeof__(*(ptr)) __x = (x); /* eval x once */ \ - __chk_user_ptr(ptr); \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ switch (size) { \ case 1: \ - __put_user_goto(__x, ptr, "b", "iq", label); \ + __put_user_goto(__x, __ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(__x, ptr, "w", "ir", label); \ + __put_user_goto(__x, __ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(__x, ptr, "l", "ir", label); \ + __put_user_goto(__x, __ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(__x, ptr, label); \ + __put_user_goto_u64(__x, __ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ - instrument_put_user(__x, ptr, size); \ + instrument_put_user(__x, __ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT From 78a498c3a227f2ac773a8234b2ce092a4403f2c3 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:44 +0200 Subject: [PATCH 60/66] x86: fortify: kmsan: fix KMSAN fortify builds Ensure that KMSAN builds replace memset/memcpy/memmove calls with the respective __msan_XXX functions, and that none of the macros are redefined twice. This should allow building kernel with both CONFIG_KMSAN and CONFIG_FORTIFY_SOURCE. Link: https://lkml.kernel.org/r/20221024212144.2852069-5-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Reported-by: Tamas K Lengyel Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Kees Cook Signed-off-by: Andrew Morton --- arch/x86/include/asm/string_64.h | 11 +++++++---- include/linux/fortify-string.h | 17 +++++++++++++++-- include/linux/kmsan_string.h | 21 +++++++++++++++++++++ mm/kmsan/instrumentation.c | 1 + 4 files changed, 44 insertions(+), 6 deletions(-) create mode 100644 include/linux/kmsan_string.h diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 3b87d889b6e1..888731ccf1f6 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -10,10 +10,13 @@ /* Even with __builtin_ the compiler may decide to use the out of line function. */ +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) +#include +#endif + #define __HAVE_ARCH_MEMCPY 1 -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memcpy -void *__msan_memcpy(void *dst, const void *src, size_t size); #define memcpy __msan_memcpy #else extern void *memcpy(void *to, const void *from, size_t len); @@ -21,7 +24,7 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) extern void *__msan_memset(void *s, int c, size_t n); #undef memset #define memset __msan_memset @@ -67,7 +70,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memmove void *__msan_memmove(void *dest, const void *src, size_t len); #define memmove __msan_memmove diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 4029fe368a4f..18a31b125f9d 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -43,11 +43,24 @@ extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else -#define __underlying_memchr __builtin_memchr -#define __underlying_memcmp __builtin_memcmp + +#if defined(__SANITIZE_MEMORY__) +/* + * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the + * corresponding __msan_XXX functions. + */ +#include +#define __underlying_memcpy __msan_memcpy +#define __underlying_memmove __msan_memmove +#define __underlying_memset __msan_memset +#else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset +#endif + +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen diff --git a/include/linux/kmsan_string.h b/include/linux/kmsan_string.h new file mode 100644 index 000000000000..7287da6f52ef --- /dev/null +++ b/include/linux/kmsan_string.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN string functions API used in other headers. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_STRING_H +#define _LINUX_KMSAN_STRING_H + +/* + * KMSAN overrides the default memcpy/memset/memmove implementations in the + * kernel, which requires having __msan_XXX function prototypes in several other + * headers. Keep them in one place instead of open-coding. + */ +void *__msan_memcpy(void *dst, const void *src, size_t size); +void *__msan_memset(void *s, int c, size_t n); +void *__msan_memmove(void *dest, const void *src, size_t len); + +#endif /* _LINUX_KMSAN_STRING_H */ diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..271f135f97a1 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include +#include #include #include From 5521de7dddd211e3a9403d7bde0b614fd0936ac6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sun, 23 Oct 2022 21:34:52 -0700 Subject: [PATCH 61/66] mm/userfaultfd: replace kmap/kmap_atomic() with kmap_local_page() kmap() and kmap_atomic() are being deprecated in favor of kmap_local_page() which is appropriate for any thread local context.[1] A recent locking bug report with userfaultfd showed that the conversion of the kmap_atomic()'s in those code flows requires care with regard to the prevention of deadlock.[2] git archaeology implied that the recursion may not be an actual bug.[3] However, depending on the implementation of the mmap_lock and the condition of the call there may still be a deadlock.[4] So this is not purely a lockdep issue. Considering a single threaded call stack there are 3 options. 1) Different mm's are in play (no issue) 2) Readlock implementation is recursive and same mm is in play (no issue) 3) Readlock implementation is _not_ recursive (issue) The mmap_lock is recursive so with a single thread there is no issue. However, Matthew pointed out a deadlock scenario when you consider additional process' and threads thusly. "The readlock implementation is only recursive if nobody else has taken a write lock. If you have a multithreaded process, one of the other threads can call mmap() and that will prevent recursion (due to fairness). Even if it's a different process that you're trying to acquire the mmap read lock on, you can still get into a deadly embrace. eg: process A thread 1 takes read lock on own mmap_lock process A thread 2 calls mmap, blocks taking write lock process B thread 1 takes page fault, read lock on own mmap lock process B thread 2 calls mmap, blocks taking write lock process A thread 1 blocks taking read lock on process B process B thread 1 blocks taking read lock on process A Now all four threads are blocked waiting for each other." Regardless using pagefault_disable() ensures that no matter what locking implementation is used a deadlock will not occur. Complete kmap conversion in userfaultfd by replacing the kmap() and kmap_atomic() calls with kmap_local_page(). When replacing the kmap_atomic() call ensure page faults continue to be disabled to support the correct fall back behavior and add a comment to inform future souls of the requirement. [1] https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com/ [2] https://lore.kernel.org/all/Y1Mh2S7fUGQ%2FiKFR@iweiny-desk3/ [3] https://lore.kernel.org/all/Y1MymJ%2FINb45AdaY@iweiny-desk3/ [4] https://lore.kernel.org/lkml/Y1bXBtGTCym77%2FoD@casper.infradead.org/ [ira.weiny@intel.com: v2] Link: https://lkml.kernel.org/r/20221025220136.2366143-1-ira.weiny@intel.com Link: https://lkml.kernel.org/r/20221024043452.1491677-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Matthew Wilcox Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Peter Xu Cc: Axel Rasmussen Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e24e8a47ce8a..3d0fef3980b3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -646,11 +663,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out; From 5dc21f0c0b1c02ea2c9014cbe7cd3b28884ff306 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 25 Oct 2022 15:01:08 -0700 Subject: [PATCH 62/66] mm/shmem: ensure proper fallback if page faults The kernel test robot flagged a recursive lock as a result of a conversion from kmap_atomic() to kmap_local_folio()[Link] The cause was due to the code depending on the kmap_atomic() side effect of disabling page faults. In that case the code expects the fault to fail and take the fallback case. git archaeology implied that the recursion may not be an actual bug.[1] However, depending on the implementation of the mmap_lock and the condition of the call there may still be a deadlock.[2] So this is not purely a lockdep issue. Considering a single threaded call stack there are 3 options. 1) Different mm's are in play (no issue) 2) Readlock implementation is recursive and same mm is in play (no issue) 3) Readlock implementation is _not_ recursive (issue) The mmap_lock is recursive so with a single thread there is no issue. However, Matthew pointed out a deadlock scenario when you consider additional process' and threads thusly. "The readlock implementation is only recursive if nobody else has taken a write lock. If you have a multithreaded process, one of the other threads can call mmap() and that will prevent recursion (due to fairness). Even if it's a different process that you're trying to acquire the mmap read lock on, you can still get into a deadly embrace. eg: process A thread 1 takes read lock on own mmap_lock process A thread 2 calls mmap, blocks taking write lock process B thread 1 takes page fault, read lock on own mmap lock process B thread 2 calls mmap, blocks taking write lock process A thread 1 blocks taking read lock on process B process B thread 1 blocks taking read lock on process A Now all four threads are blocked waiting for each other." Regardless using pagefault_disable() ensures that no matter what locking implementation is used a deadlock will not occur. Add an explicit pagefault_disable() and a big comment to explain this for future souls looking at this code. [1] https://lore.kernel.org/all/Y1MymJ%2FINb45AdaY@iweiny-desk3/ [2] https://lore.kernel.org/lkml/Y1bXBtGTCym77%2FoD@casper.infradead.org/ Link: https://lkml.kernel.org/r/20221025220108.2366043-1-ira.weiny@intel.com Link: https://lore.kernel.org/r/202210211215.9dc6efb5-yujie.liu@intel.com Fixes: 7a7256d5f512 ("shmem: convert shmem_mfill_atomic_pte() to use a folio") Signed-off-by: Ira Weiny Reported-by: Matthew Wilcox (Oracle) Reported-by: kernel test robot Cc: Randy Dunlap Cc: Peter Xu Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- mm/shmem.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 8280a5cb48df..c1d8b8a1aa3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!zeropage) { /* COPY */ page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); + pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ From 1db43d3f3733351849ddca4b573c037c7821bfd8 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 25 Oct 2022 16:12:49 +0000 Subject: [PATCH 63/66] mmap: fix remap_file_pages() regression When using the VMA iterator, the final execution will set the variable 'next' to NULL which causes the function to fail out. Restore the break in the loop to exit the VMA iterator early without clearing NULL fixes the issue. Link: https://lore.kernel.org/lkml/29344.1666681759@jrobl/ Link: https://lkml.kernel.org/r/20221025161222.2634030-1-Liam.Howlett@oracle.com Fixes: 763ecb035029 (mm: remove the vma linked list) Signed-off-by: Liam R. Howlett Reported-by: "J. R. Okajima" Tested-by: "J. R. Okajima" Signed-off-by: Andrew Morton --- mm/mmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index e270057ed04e..2def55555e05 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2852,6 +2852,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; + if (start + size <= next->vm_end) + break; + prev = next; } From 1b9c918318476b4441ddd754ee6699b5367bb5ee Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 26 Oct 2022 14:00:29 +0200 Subject: [PATCH 64/66] lib: maple_tree: remove unneeded initialization in mtree_range_walk() Before the do-while loop in mtree_range_walk(), the variables next, min, max need to be initialized. The variables last, prev_min and prev_max are set within the loop body before they are eventually used after exiting the loop body. As it is a do-while loop, the loop body is executed at least once, so the variables last, prev_min and prev_max do not need to be initialized before the loop body. Remove unneeded initialization of last and prev_min. The needless initialization was reported by clang-analyzer as Dead Stores. As the compiler already identifies these assignments as unneeded, it optimizes the assignments away. Hence: No functional change. No change in object code. Link: https://lkml.kernel.org/r/20221026120029.12555-2-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e1743803c851..fbde494444b8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2903,8 +2903,8 @@ static inline void *mtree_range_walk(struct ma_state *mas) unsigned long max, min; unsigned long prev_max, prev_min; - last = next = mas->node; - prev_min = min = mas->min; + next = mas->node; + min = mas->min; max = mas->max; do { offset = 0; From dda1c41a07b4a4c3f99b5b28c1e8c485205fe860 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 26 Oct 2022 15:48:30 +0200 Subject: [PATCH 65/66] mm: multi-gen LRU: move lru_gen_add_mm() out of IRQ-off region lru_gen_add_mm() has been added within an IRQ-off region in the commit mentioned below. The other invocations of lru_gen_add_mm() are not within an IRQ-off region. The invocation within IRQ-off region is problematic on PREEMPT_RT because the function is using a spin_lock_t which must not be used within IRQ-disabled regions. The other invocations of lru_gen_add_mm() occur while task_struct::alloc_lock is acquired. Move lru_gen_add_mm() after interrupts are enabled and before task_unlock(). Link: https://lkml.kernel.org/r/20221026134830.711887-1-bigeasy@linutronix.de Fixes: bd74fdaea1460 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Sebastian Andrzej Siewior Acked-by: Yu Zhao Cc: Al Viro Cc: "Eric W . Biederman" Cc: Kees Cook Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 349a5da91efe..7ab1f27b805d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1012,7 +1012,6 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; - lru_gen_add_mm(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1025,6 +1024,7 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); + lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { From f5e4ec155d145002fd9840868453d785fab86d42 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 28 Oct 2022 17:00:42 +0100 Subject: [PATCH 66/66] random: use arch_get_random*_early() in random_init() While reworking the archrandom handling, commit d349ab99eec7 ("random: handle archrandom with multiple longs") switched to the non-early archrandom helpers in random_init(), which broke initialization of the entropy pool from the arm64 random generator. Indeed at that point the arm64 CPU features, which verify that all CPUs have compatible capabilities, are not finalized so arch_get_random_seed_longs() is unsuccessful. Instead random_init() should use the _early functions, which check only the boot CPU on arm64. On other architectures the _early functions directly call the normal ones. Fixes: d349ab99eec7 ("random: handle archrandom with multiple longs") Cc: stable@vger.kernel.org Signed-off-by: Jean-Philippe Brucker Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 2fe28eeb2f38..69754155300e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -791,13 +791,13 @@ void __init random_init_early(const char *command_line) #endif for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) { - longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i); + longs = arch_get_random_seed_longs_early(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; continue; } - longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i); + longs = arch_get_random_longs_early(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs;