From b7be6c737a179a76901c872f6b4c1d00552d9a1b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:22:18 +0200 Subject: [PATCH 001/216] netfilter: nf_tables: disallow timeout for anonymous sets commit e26d3009efda338f19016df4175f354a9bd0a4ab upstream. Never used from userspace, disallow these parameters. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_tables_api.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e21ec3ad8093..d3ba947f4376 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4752,6 +4752,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; @@ -4760,6 +4763,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } From ae5f10ed9539878f1128f3fa129f104ba97ffc86 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 30 May 2023 09:38:09 +0200 Subject: [PATCH 002/216] drm/meson: fix unbind path if HDMI fails to bind [ Upstream commit 6a044642988b5f8285f3173b8e88784bef2bc306 ] If the case the HDMI controller fails to bind, we try to unbind all components before calling drm_dev_put() which makes drm_bridge_detach() crash because unbinding the HDMI controller frees the bridge memory. The solution is the unbind all components at the end like in the remove path. Reviewed-by: Nicolas Belin Tested-by: Nicolas Belin Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20230512-amlogic-v6-4-upstream-dsi-ccf-vim3-v5-8-56eb7a4d5b8e@linaro.org Stable-dep-of: bd915ae73a2d ("drm/meson: Don't remove bridges which are created by other drivers") Signed-off-by: Sasha Levin --- drivers/gpu/drm/meson/meson_drv.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_drv.c b/drivers/gpu/drm/meson/meson_drv.c index 119544d88b58..fbac39aa38cc 100644 --- a/drivers/gpu/drm/meson/meson_drv.c +++ b/drivers/gpu/drm/meson/meson_drv.c @@ -316,32 +316,34 @@ static int meson_drv_bind_master(struct device *dev, bool has_components) goto exit_afbcd; if (has_components) { - ret = component_bind_all(drm->dev, drm); + ret = component_bind_all(dev, drm); if (ret) { dev_err(drm->dev, "Couldn't bind all components\n"); + /* Do not try to unbind */ + has_components = false; goto exit_afbcd; } } ret = meson_encoder_hdmi_init(priv); if (ret) - goto unbind_all; + goto exit_afbcd; ret = meson_plane_create(priv); if (ret) - goto unbind_all; + goto exit_afbcd; ret = meson_overlay_create(priv); if (ret) - goto unbind_all; + goto exit_afbcd; ret = meson_crtc_create(priv); if (ret) - goto unbind_all; + goto exit_afbcd; ret = request_irq(priv->vsync_irq, meson_irq, 0, drm->driver->name, drm); if (ret) - goto unbind_all; + goto exit_afbcd; drm_mode_config_reset(drm); @@ -359,15 +361,18 @@ static int meson_drv_bind_master(struct device *dev, bool has_components) uninstall_irq: free_irq(priv->vsync_irq, drm); -unbind_all: - if (has_components) - component_unbind_all(drm->dev, drm); exit_afbcd: if (priv->afbcd.ops) priv->afbcd.ops->exit(priv); free_drm: drm_dev_put(drm); + meson_encoder_hdmi_remove(priv); + meson_encoder_cvbs_remove(priv); + + if (has_components) + component_unbind_all(dev, drm); + return ret; } From 7d34b1078665e171f4883b8675e52d17ebfc5c64 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Thu, 15 Feb 2024 23:04:42 +0100 Subject: [PATCH 003/216] drm/meson: Don't remove bridges which are created by other drivers [ Upstream commit bd915ae73a2d78559b376ad2caf5e4ef51de2455 ] Stop calling drm_bridge_remove() for bridges allocated/managed by other drivers in the remove paths of meson_encoder_{cvbs,dsi,hdmi}. drm_bridge_remove() unregisters the bridge so it cannot be used anymore. Doing so for bridges we don't own can lead to the video pipeline not being able to come up after -EPROBE_DEFER of the VPU because we're unregistering a bridge that's managed by another driver. The other driver doesn't know that we have unregistered it's bridge and on subsequent .probe() we're not able to find those bridges anymore (since nobody re-creates them). This fixes probe errors on Meson8b boards with the CVBS outputs enabled. Fixes: 09847723c12f ("drm/meson: remove drm bridges at aggregate driver unbind time") Fixes: 42dcf15f901c ("drm/meson: add DSI encoder") Cc: Reported-by: Steve Morvai Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Tested-by: Steve Morvai Link: https://lore.kernel.org/r/20240215220442.1343152-1-martin.blumenstingl@googlemail.com Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240215220442.1343152-1-martin.blumenstingl@googlemail.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/meson/meson_encoder_cvbs.c | 1 - drivers/gpu/drm/meson/meson_encoder_hdmi.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_encoder_cvbs.c b/drivers/gpu/drm/meson/meson_encoder_cvbs.c index 3f73b211fa8e..3407450435e2 100644 --- a/drivers/gpu/drm/meson/meson_encoder_cvbs.c +++ b/drivers/gpu/drm/meson/meson_encoder_cvbs.c @@ -294,6 +294,5 @@ void meson_encoder_cvbs_remove(struct meson_drm *priv) if (priv->encoders[MESON_ENC_CVBS]) { meson_encoder_cvbs = priv->encoders[MESON_ENC_CVBS]; drm_bridge_remove(&meson_encoder_cvbs->bridge); - drm_bridge_remove(meson_encoder_cvbs->next_bridge); } } diff --git a/drivers/gpu/drm/meson/meson_encoder_hdmi.c b/drivers/gpu/drm/meson/meson_encoder_hdmi.c index b14e6e507c61..03062e7a02b6 100644 --- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c +++ b/drivers/gpu/drm/meson/meson_encoder_hdmi.c @@ -472,6 +472,5 @@ void meson_encoder_hdmi_remove(struct meson_drm *priv) if (priv->encoders[MESON_ENC_HDMI]) { meson_encoder_hdmi = priv->encoders[MESON_ENC_HDMI]; drm_bridge_remove(&meson_encoder_hdmi->bridge); - drm_bridge_remove(meson_encoder_hdmi->next_bridge); } } From cf33e6ca12d814e1be2263cb76960d0019d7fb94 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 29 Dec 2022 13:01:40 -0600 Subject: [PATCH 004/216] scsi: core: Add struct for args to execution functions [ Upstream commit d0949565811f0896c1c7e781ab2ad99d34273fdf ] Move the SCSI execution functions to use a struct for passing in optional args. This commit adds the new struct, temporarily converts scsi_execute() and scsi_execute_req() ands a new helper, scsi_execute_cmd(), which takes the scsi_exec_args struct. There should be no change in behavior. We no longer allow users to pass in any request->rq_flags value, but they were only passing in RQF_PM which we do support by allowing users to pass in the BLK_MQ_REQ flags used by blk_mq_alloc_request(). Subsequent commits will convert scsi_execute() and scsi_execute_req() users to the new helpers then remove scsi_execute() and scsi_execute_req(). Signed-off-by: Mike Christie Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Martin K. Petersen Stable-dep-of: 321da3dc1f3c ("scsi: sd: usb_storage: uas: Access media prior to querying device properties") Signed-off-by: Sasha Levin --- drivers/scsi/scsi_lib.c | 52 ++++++++++++++++++-------------------- include/scsi/scsi_device.h | 51 +++++++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 41 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5c5954b78585..edd296f950a3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -185,39 +185,37 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) __scsi_queue_insert(cmd, reason, true); } - /** - * __scsi_execute - insert request and wait for the result - * @sdev: scsi device + * scsi_execute_cmd - insert request and wait for the result + * @sdev: scsi_device * @cmd: scsi command - * @data_direction: data direction + * @opf: block layer request cmd_flags * @buffer: data buffer * @bufflen: len of buffer - * @sense: optional sense buffer - * @sshdr: optional decoded sense header * @timeout: request timeout in HZ * @retries: number of times to retry request - * @flags: flags for ->cmd_flags - * @rq_flags: flags for ->rq_flags - * @resid: optional residual length + * @args: Optional args. See struct definition for field descriptions * * Returns the scsi_cmnd result field if a command was executed, or a negative * Linux error code if we didn't get that far. */ -int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, - int data_direction, void *buffer, unsigned bufflen, - unsigned char *sense, struct scsi_sense_hdr *sshdr, - int timeout, int retries, blk_opf_t flags, - req_flags_t rq_flags, int *resid) +int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd, + blk_opf_t opf, void *buffer, unsigned int bufflen, + int timeout, int retries, + const struct scsi_exec_args *args) { + static const struct scsi_exec_args default_args; struct request *req; struct scsi_cmnd *scmd; int ret; - req = scsi_alloc_request(sdev->request_queue, - data_direction == DMA_TO_DEVICE ? - REQ_OP_DRV_OUT : REQ_OP_DRV_IN, - rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0); + if (!args) + args = &default_args; + else if (WARN_ON_ONCE(args->sense && + args->sense_len != SCSI_SENSE_BUFFERSIZE)) + return -EINVAL; + + req = scsi_alloc_request(sdev->request_queue, opf, args->req_flags); if (IS_ERR(req)) return PTR_ERR(req); @@ -232,8 +230,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, memcpy(scmd->cmnd, cmd, scmd->cmd_len); scmd->allowed = retries; req->timeout = timeout; - req->cmd_flags |= flags; - req->rq_flags |= rq_flags | RQF_QUIET; + req->rq_flags |= RQF_QUIET; /* * head injection *required* here otherwise quiesce won't work @@ -249,20 +246,21 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, if (unlikely(scmd->resid_len > 0 && scmd->resid_len <= bufflen)) memset(buffer + bufflen - scmd->resid_len, 0, scmd->resid_len); - if (resid) - *resid = scmd->resid_len; - if (sense && scmd->sense_len) - memcpy(sense, scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE); - if (sshdr) + if (args->resid) + *args->resid = scmd->resid_len; + if (args->sense) + memcpy(args->sense, scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE); + if (args->sshdr) scsi_normalize_sense(scmd->sense_buffer, scmd->sense_len, - sshdr); + args->sshdr); + ret = scmd->result; out: blk_mq_free_request(req); return ret; } -EXPORT_SYMBOL(__scsi_execute); +EXPORT_SYMBOL(scsi_execute_cmd); /* * Wake up the error handler if necessary. Avoid as follows that the error diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d2751ed536df..b407807cc669 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -479,28 +479,51 @@ extern const char *scsi_device_state_name(enum scsi_device_state); extern int scsi_is_sdev_device(const struct device *); extern int scsi_is_target_device(const struct device *); extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); -extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, - int data_direction, void *buffer, unsigned bufflen, - unsigned char *sense, struct scsi_sense_hdr *sshdr, - int timeout, int retries, blk_opf_t flags, - req_flags_t rq_flags, int *resid); + +/* Optional arguments to scsi_execute_cmd */ +struct scsi_exec_args { + unsigned char *sense; /* sense buffer */ + unsigned int sense_len; /* sense buffer len */ + struct scsi_sense_hdr *sshdr; /* decoded sense header */ + blk_mq_req_flags_t req_flags; /* BLK_MQ_REQ flags */ + int *resid; /* residual length */ +}; + +int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd, + blk_opf_t opf, void *buffer, unsigned int bufflen, + int timeout, int retries, + const struct scsi_exec_args *args); + /* Make sure any sense buffer is the correct size. */ -#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense, \ - sshdr, timeout, retries, flags, rq_flags, resid) \ +#define scsi_execute(_sdev, _cmd, _data_dir, _buffer, _bufflen, _sense, \ + _sshdr, _timeout, _retries, _flags, _rq_flags, \ + _resid) \ ({ \ - BUILD_BUG_ON((sense) != NULL && \ - sizeof(sense) != SCSI_SENSE_BUFFERSIZE); \ - __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, \ - sense, sshdr, timeout, retries, flags, rq_flags, \ - resid); \ + scsi_execute_cmd(_sdev, _cmd, (_data_dir == DMA_TO_DEVICE ? \ + REQ_OP_DRV_OUT : REQ_OP_DRV_IN) | _flags, \ + _buffer, _bufflen, _timeout, _retries, \ + &(struct scsi_exec_args) { \ + .sense = _sense, \ + .sshdr = _sshdr, \ + .req_flags = _rq_flags & RQF_PM ? \ + BLK_MQ_REQ_PM : 0, \ + .resid = _resid, \ + }); \ }) + static inline int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, int retries, int *resid) { - return scsi_execute(sdev, cmd, data_direction, buffer, - bufflen, NULL, sshdr, timeout, retries, 0, 0, resid); + return scsi_execute_cmd(sdev, cmd, + data_direction == DMA_TO_DEVICE ? + REQ_OP_DRV_OUT : REQ_OP_DRV_IN, buffer, + bufflen, timeout, retries, + &(struct scsi_exec_args) { + .sshdr = sshdr, + .resid = resid, + }); } extern void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); From b73dd5f9997279715cd450ee8ca599aaff2eabb9 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 13 Feb 2024 09:33:06 -0500 Subject: [PATCH 005/216] scsi: sd: usb_storage: uas: Access media prior to querying device properties [ Upstream commit 321da3dc1f3c92a12e3c5da934090d2992a8814c ] It has been observed that some USB/UAS devices return generic properties hardcoded in firmware for mode pages for a period of time after a device has been discovered. The reported properties are either garbage or they do not accurately reflect the characteristics of the physical storage device attached in the case of a bridge. Prior to commit 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to avoid calling revalidate twice") we would call revalidate several times during device discovery. As a result, incorrect values would eventually get replaced with ones accurately describing the attached storage. When we did away with the redundant revalidate pass, several cases were reported where devices reported nonsensical values or would end up in write-protected state. An initial attempt at addressing this issue involved introducing a delayed second revalidate invocation. However, this approach still left some devices reporting incorrect characteristics. Tasos Sahanidis debugged the problem further and identified that introducing a READ operation prior to MODE SENSE fixed the problem and that it wasn't a timing issue. Issuing a READ appears to cause the devices to update their state to reflect the actual properties of the storage media. Device properties like vendor, model, and storage capacity appear to be correctly reported from the get-go. It is unclear why these devices defer populating the remaining characteristics. Match the behavior of a well known commercial operating system and trigger a READ operation prior to querying device characteristics to force the device to populate the mode pages. The additional READ is triggered by a flag set in the USB storage and UAS drivers. We avoid issuing the READ for other transport classes since some storage devices identify Linux through our particular discovery command sequence. Link: https://lore.kernel.org/r/20240213143306.2194237-1-martin.petersen@oracle.com Fixes: 1e029397d12f ("scsi: sd: Reorganize DIF/DIX code to avoid calling revalidate twice") Cc: stable@vger.kernel.org Reported-by: Tasos Sahanidis Reviewed-by: Ewan D. Milne Reviewed-by: Bart Van Assche Tested-by: Tasos Sahanidis Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/sd.c | 26 +++++++++++++++++++++++++- drivers/usb/storage/scsiglue.c | 7 +++++++ drivers/usb/storage/uas.c | 7 +++++++ include/scsi/scsi_device.h | 1 + 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 31b5273f43a7..4433b02c8935 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3284,6 +3284,24 @@ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, return true; } +static void sd_read_block_zero(struct scsi_disk *sdkp) +{ + unsigned int buf_len = sdkp->device->sector_size; + char *buffer, cmd[10] = { }; + + buffer = kmalloc(buf_len, GFP_KERNEL); + if (!buffer) + return; + + cmd[0] = READ_10; + put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ + put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ + + scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, + SD_TIMEOUT, sdkp->max_retries, NULL); + kfree(buffer); +} + /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. @@ -3323,7 +3341,13 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->media_present) { sd_read_capacity(sdkp, buffer); - + /* + * Some USB/UAS devices return generic values for mode pages + * until the media has been accessed. Trigger a READ operation + * to force the device to populate mode pages. + */ + if (sdp->read_before_ms) + sd_read_block_zero(sdkp); /* * set the default to rotational. All non-rotational devices * support the block characteristics VPD page, which will diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index c54e9805da53..12cf9940e5b6 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -179,6 +179,13 @@ static int slave_configure(struct scsi_device *sdev) */ sdev->use_192_bytes_for_3f = 1; + /* + * Some devices report generic values until the media has been + * accessed. Force a READ(10) prior to querying device + * characteristics. + */ + sdev->read_before_ms = 1; + /* * Some devices don't like MODE SENSE with page=0x3f, * which is the command used for checking if a device diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index de3836412bf3..ed22053b3252 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -878,6 +878,13 @@ static int uas_slave_configure(struct scsi_device *sdev) if (devinfo->flags & US_FL_CAPACITY_HEURISTICS) sdev->guess_capacity = 1; + /* + * Some devices report generic values until the media has been + * accessed. Force a READ(10) prior to querying device + * characteristics. + */ + sdev->read_before_ms = 1; + /* * Some devices don't like MODE SENSE with page=0x3f, * which is the command used for checking if a device diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index b407807cc669..a64713fe5264 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -204,6 +204,7 @@ struct scsi_device { unsigned use_10_for_rw:1; /* first try 10-byte read / write */ unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */ unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */ + unsigned read_before_ms:1; /* perform a READ before MODE SENSE */ unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */ unsigned no_write_same:1; /* no WRITE SAME command */ unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */ From 2a3d40b4025fcfe51b04924979f1653993b17669 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 9 Feb 2024 14:04:53 -0800 Subject: [PATCH 006/216] af_unix: Fix task hung while purging oob_skb in GC. [ Upstream commit 25236c91b5ab4a26a56ba2e79b8060cf4e047839 ] syzbot reported a task hung; at the same time, GC was looping infinitely in list_for_each_entry_safe() for OOB skb. [0] syzbot demonstrated that the list_for_each_entry_safe() was not actually safe in this case. A single skb could have references for multiple sockets. If we free such a skb in the list_for_each_entry_safe(), the current and next sockets could be unlinked in a single iteration. unix_notinflight() uses list_del_init() to unlink the socket, so the prefetched next socket forms a loop itself and list_for_each_entry_safe() never stops. Here, we must use while() and make sure we always fetch the first socket. [0]: Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 5065 Comm: syz-executor236 Not tainted 6.8.0-rc3-syzkaller-00136-g1f719a2f3fa6 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 RIP: 0010:preempt_count arch/x86/include/asm/preempt.h:26 [inline] RIP: 0010:check_kcov_mode kernel/kcov.c:173 [inline] RIP: 0010:__sanitizer_cov_trace_pc+0xd/0x60 kernel/kcov.c:207 Code: cc cc cc cc 66 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 65 48 8b 14 25 40 c2 03 00 <65> 8b 05 b4 7c 78 7e a9 00 01 ff 00 48 8b 34 24 74 0f f6 c4 01 74 RSP: 0018:ffffc900033efa58 EFLAGS: 00000283 RAX: ffff88807b077800 RBX: ffff88807b077800 RCX: 1ffffffff27b1189 RDX: ffff88802a5a3b80 RSI: ffffffff8968488d RDI: ffff88807b077f70 RBP: ffffc900033efbb0 R08: 0000000000000001 R09: fffffbfff27a900c R10: ffffffff93d48067 R11: ffffffff8ae000eb R12: ffff88807b077800 R13: dffffc0000000000 R14: ffff88807b077e40 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564f4fc1e3a8 CR3: 000000000d57a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: unix_gc+0x563/0x13b0 net/unix/garbage.c:319 unix_release_sock+0xa93/0xf80 net/unix/af_unix.c:683 unix_release+0x91/0xf0 net/unix/af_unix.c:1064 __sock_release+0xb0/0x270 net/socket.c:659 sock_close+0x1c/0x30 net/socket.c:1421 __fput+0x270/0xb80 fs/file_table.c:376 task_work_run+0x14f/0x250 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xa8a/0x2ad0 kernel/exit.c:871 do_group_exit+0xd4/0x2a0 kernel/exit.c:1020 __do_sys_exit_group kernel/exit.c:1031 [inline] __se_sys_exit_group kernel/exit.c:1029 [inline] __x64_sys_exit_group+0x3e/0x50 kernel/exit.c:1029 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xd5/0x270 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6f/0x77 RIP: 0033:0x7f9d6cbdac09 Code: Unable to access opcode bytes at 0x7f9d6cbdabdf. RSP: 002b:00007fff5952feb8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9d6cbdac09 RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000 RBP: 00007f9d6cc552b0 R08: ffffffffffffffb8 R09: 0000000000000006 R10: 0000000000000006 R11: 0000000000000246 R12: 00007f9d6cc552b0 R13: 0000000000000000 R14: 00007f9d6cc55d00 R15: 00007f9d6cbabe70 Reported-by: syzbot+4fa4a2d1f5a5ee06f006@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4fa4a2d1f5a5ee06f006 Fixes: 1279f9d9dec2 ("af_unix: Call kfree_skb() for dead unix_(sk)->oob_skb in GC.") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240209220453.96053-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/unix/garbage.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 767b338a7a2d..9e1bab97c05b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -315,10 +315,11 @@ void unix_gc(void) __skb_queue_purge(&hitlist); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) - list_for_each_entry_safe(u, next, &gc_candidates, link) { - struct sk_buff *skb = u->oob_skb; + while (!list_empty(&gc_candidates)) { + u = list_entry(gc_candidates.next, struct unix_sock, link); + if (u->oob_skb) { + struct sk_buff *skb = u->oob_skb; - if (skb) { u->oob_skb = NULL; kfree_skb(skb); } From a76072bc73c77cbdc6c77e5893376939894e6f73 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 28 Sep 2023 11:35:39 +0200 Subject: [PATCH 007/216] of: overlay: Reorder struct fragment fields kerneldoc [ Upstream commit 5d007ffdf6025fe83e497c44ed7c8aa8f150c4d1 ] The fields of the fragment structure were reordered, but the kerneldoc was not updated. Fixes: 81225ea682f45629 ("of: overlay: reorder fields in struct fragment") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/cfa36d2bb95e3c399c415dbf58057302c70ef375.1695893695.git.geert+renesas@glider.be Signed-off-by: Rob Herring Signed-off-by: Sasha Levin --- drivers/of/overlay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 4402871b5c0c..e663d5585a05 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -45,8 +45,8 @@ struct target { /** * struct fragment - info about fragment nodes in overlay expanded device tree - * @target: target of the overlay operation * @overlay: pointer to the __overlay__ node + * @target: target of the overlay operation */ struct fragment { struct device_node *overlay; From 00459ae532d6f1e7c720b5a331f40f72cf158dca Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 7 Mar 2023 16:10:51 -0600 Subject: [PATCH 008/216] net: restore alpha order to Ethernet devices in config [ Upstream commit a1331535aeb41b08fe0c2c78af51885edc93615b ] The filename "wangxun" sorts between "intel" and "xscale", but xscale/Kconfig contains "Intel XScale" prompts, so Wangxun ends up in the wrong place in the config front-ends. Move wangxun/Kconfig so the Wangxun devices appear in order in the user interface. Fixes: 3ce7547e5b71 ("net: txgbe: Add build support for txgbe") Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20230307221051.890135-1-helgaas@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index 1917da784191..5a274b99f299 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -84,7 +84,6 @@ source "drivers/net/ethernet/huawei/Kconfig" source "drivers/net/ethernet/i825xx/Kconfig" source "drivers/net/ethernet/ibm/Kconfig" source "drivers/net/ethernet/intel/Kconfig" -source "drivers/net/ethernet/wangxun/Kconfig" source "drivers/net/ethernet/xscale/Kconfig" config JME @@ -189,6 +188,7 @@ source "drivers/net/ethernet/toshiba/Kconfig" source "drivers/net/ethernet/tundra/Kconfig" source "drivers/net/ethernet/vertexcom/Kconfig" source "drivers/net/ethernet/via/Kconfig" +source "drivers/net/ethernet/wangxun/Kconfig" source "drivers/net/ethernet/wiznet/Kconfig" source "drivers/net/ethernet/xilinx/Kconfig" source "drivers/net/ethernet/xircom/Kconfig" From 174ac6b53a20cc7f466eead68ccee55ab633e5a1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 6 Feb 2023 16:39:20 +0100 Subject: [PATCH 009/216] mlxsw: spectrum_acl_tcam: Make fini symmetric to init [ Upstream commit 61fe3b9102ac84ba479ab84d8f5454af2e21e468 ] Move mutex_destroy() to the end to make the function symmetric with mlxsw_sp_acl_tcam_init(). No functional changes. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index dc2e204bcd72..2107de4e9d99 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -86,10 +86,10 @@ void mlxsw_sp_acl_tcam_fini(struct mlxsw_sp *mlxsw_sp, { const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops; - mutex_destroy(&tcam->lock); ops->fini(mlxsw_sp, tcam->priv); bitmap_free(tcam->used_groups); bitmap_free(tcam->used_regions); + mutex_destroy(&tcam->lock); } int mlxsw_sp_acl_tcam_priority_get(struct mlxsw_sp *mlxsw_sp, From 5dbedec7e5cf668caa0d76e02915eef16d22e97f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 6 Feb 2023 16:39:19 +0100 Subject: [PATCH 010/216] mlxsw: spectrum_acl_tcam: Add missing mutex_destroy() [ Upstream commit 65823e07b1e4055b6278725fd92f4d7e6f8d53fd ] Pair mutex_init() with a mutex_destroy() in the error path. Found during code review. No functional changes. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index 2107de4e9d99..41eac7dfb67e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -52,8 +52,10 @@ int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, max_regions = max_tcam_regions; tcam->used_regions = bitmap_zalloc(max_regions, GFP_KERNEL); - if (!tcam->used_regions) - return -ENOMEM; + if (!tcam->used_regions) { + err = -ENOMEM; + goto err_alloc_used_regions; + } tcam->max_regions = max_regions; max_groups = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_GROUPS); @@ -78,6 +80,8 @@ err_tcam_init: bitmap_free(tcam->used_groups); err_alloc_used_groups: bitmap_free(tcam->used_regions); +err_alloc_used_regions: + mutex_destroy(&tcam->lock); return err; } From e30f82597bf64ad32f3b9718bb12791bf3926f3d Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 15 May 2023 11:10:49 -0400 Subject: [PATCH 011/216] PCI: layerscape: Add the endpoint linkup notifier support [ Upstream commit 061cbfab09fb35898f2907d42f936cf9ae271d93 ] Layerscape has PME interrupt, which can be used as linkup notifier. Set CFG_READY bit of PEX_PF0_CONFIG to enable accesses from root complex when linkup detected. Link: https://lore.kernel.org/r/20230515151049.2797105-1-Frank.Li@nxp.com Signed-off-by: Xiaowei Bao Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Manivannan Sadhasivam Signed-off-by: Sasha Levin --- .../pci/controller/dwc/pci-layerscape-ep.c | 100 +++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index ad99707b3b99..5b27554e071a 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -18,6 +18,20 @@ #include "pcie-designware.h" +#define PEX_PF0_CONFIG 0xC0014 +#define PEX_PF0_CFG_READY BIT(0) + +/* PEX PFa PCIE PME and message interrupt registers*/ +#define PEX_PF0_PME_MES_DR 0xC0020 +#define PEX_PF0_PME_MES_DR_LUD BIT(7) +#define PEX_PF0_PME_MES_DR_LDD BIT(9) +#define PEX_PF0_PME_MES_DR_HRD BIT(10) + +#define PEX_PF0_PME_MES_IER 0xC0028 +#define PEX_PF0_PME_MES_IER_LUDIE BIT(7) +#define PEX_PF0_PME_MES_IER_LDDIE BIT(9) +#define PEX_PF0_PME_MES_IER_HRDIE BIT(10) + #define to_ls_pcie_ep(x) dev_get_drvdata((x)->dev) struct ls_pcie_ep_drvdata { @@ -30,8 +44,84 @@ struct ls_pcie_ep { struct dw_pcie *pci; struct pci_epc_features *ls_epc; const struct ls_pcie_ep_drvdata *drvdata; + int irq; + bool big_endian; }; +static u32 ls_lut_readl(struct ls_pcie_ep *pcie, u32 offset) +{ + struct dw_pcie *pci = pcie->pci; + + if (pcie->big_endian) + return ioread32be(pci->dbi_base + offset); + else + return ioread32(pci->dbi_base + offset); +} + +static void ls_lut_writel(struct ls_pcie_ep *pcie, u32 offset, u32 value) +{ + struct dw_pcie *pci = pcie->pci; + + if (pcie->big_endian) + iowrite32be(value, pci->dbi_base + offset); + else + iowrite32(value, pci->dbi_base + offset); +} + +static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) +{ + struct ls_pcie_ep *pcie = dev_id; + struct dw_pcie *pci = pcie->pci; + u32 val, cfg; + + val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); + ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); + + if (!val) + return IRQ_NONE; + + if (val & PEX_PF0_PME_MES_DR_LUD) { + cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); + cfg |= PEX_PF0_CFG_READY; + ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); + dw_pcie_ep_linkup(&pci->ep); + + dev_dbg(pci->dev, "Link up\n"); + } else if (val & PEX_PF0_PME_MES_DR_LDD) { + dev_dbg(pci->dev, "Link down\n"); + } else if (val & PEX_PF0_PME_MES_DR_HRD) { + dev_dbg(pci->dev, "Hot reset\n"); + } + + return IRQ_HANDLED; +} + +static int ls_pcie_ep_interrupt_init(struct ls_pcie_ep *pcie, + struct platform_device *pdev) +{ + u32 val; + int ret; + + pcie->irq = platform_get_irq_byname(pdev, "pme"); + if (pcie->irq < 0) + return pcie->irq; + + ret = devm_request_irq(&pdev->dev, pcie->irq, ls_pcie_ep_event_handler, + IRQF_SHARED, pdev->name, pcie); + if (ret) { + dev_err(&pdev->dev, "Can't register PCIe IRQ\n"); + return ret; + } + + /* Enable interrupts */ + val = ls_lut_readl(pcie, PEX_PF0_PME_MES_IER); + val |= PEX_PF0_PME_MES_IER_LDDIE | PEX_PF0_PME_MES_IER_HRDIE | + PEX_PF0_PME_MES_IER_LUDIE; + ls_lut_writel(pcie, PEX_PF0_PME_MES_IER, val); + + return 0; +} + static const struct pci_epc_features* ls_pcie_ep_get_features(struct dw_pcie_ep *ep) { @@ -124,6 +214,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) struct ls_pcie_ep *pcie; struct pci_epc_features *ls_epc; struct resource *dbi_base; + int ret; pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); if (!pcie) @@ -143,6 +234,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) pci->ops = pcie->drvdata->dw_pcie_ops; ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4); + ls_epc->linkup_notifier = true; pcie->pci = pci; pcie->ls_epc = ls_epc; @@ -154,9 +246,15 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) pci->ep.ops = &ls_pcie_ep_ops; + pcie->big_endian = of_property_read_bool(dev->of_node, "big-endian"); + platform_set_drvdata(pdev, pcie); - return dw_pcie_ep_init(&pci->ep); + ret = dw_pcie_ep_init(&pci->ep); + if (ret) + return ret; + + return ls_pcie_ep_interrupt_init(pcie, pdev); } static struct platform_driver ls_pcie_ep_driver = { From 507eeaad4d32174640440f225a30112d8cccd374 Mon Sep 17 00:00:00 2001 From: Xiaowei Bao Date: Thu, 20 Jul 2023 09:58:34 -0400 Subject: [PATCH 012/216] PCI: layerscape: Add workaround for lost link capabilities during reset [ Upstream commit 17cf8661ee0f065c08152e611a568dd1fb0285f1 ] The endpoint controller loses the Maximum Link Width and Supported Link Speed value from the Link Capabilities Register - initially configured by the Reset Configuration Word (RCW) - during a link-down or hot reset event. Address this issue in the endpoint event handler. Link: https://lore.kernel.org/r/20230720135834.1977616-2-Frank.Li@nxp.com Fixes: a805770d8a22 ("PCI: layerscape: Add EP mode support") Signed-off-by: Xiaowei Bao Signed-off-by: Hou Zhiqiang Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi Acked-by: Manivannan Sadhasivam Signed-off-by: Sasha Levin --- .../pci/controller/dwc/pci-layerscape-ep.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index 5b27554e071a..dd7d74fecc48 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -45,6 +45,7 @@ struct ls_pcie_ep { struct pci_epc_features *ls_epc; const struct ls_pcie_ep_drvdata *drvdata; int irq; + u32 lnkcap; bool big_endian; }; @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) struct ls_pcie_ep *pcie = dev_id; struct dw_pcie *pci = pcie->pci; u32 val, cfg; + u8 offset; val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); @@ -81,6 +83,19 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) return IRQ_NONE; if (val & PEX_PF0_PME_MES_DR_LUD) { + + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + + /* + * The values of the Maximum Link Width and Supported Link + * Speed from the Link Capabilities Register will be lost + * during link down or hot reset. Restore initial value + * that configured by the Reset Configuration Word (RCW). + */ + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap); + dw_pcie_dbi_ro_wr_dis(pci); + cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); cfg |= PEX_PF0_CFG_READY; ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); @@ -214,6 +229,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) struct ls_pcie_ep *pcie; struct pci_epc_features *ls_epc; struct resource *dbi_base; + u8 offset; int ret; pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); @@ -250,6 +266,9 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pcie); + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); + ret = dw_pcie_ep_init(&pci->ep); if (ret) return ret; From 0cea0c330a11461d0fbad5347a5d68d499db56fd Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 14 Apr 2023 11:19:46 +0200 Subject: [PATCH 013/216] ARM: dts: imx: Adjust dma-apbh node name [ Upstream commit e9f5cd85f1f931bb7b64031492f7051187ccaac7 ] Currently the dtbs_check generates warnings like this: $nodename:0: 'dma-apbh@110000' does not match '^dma-controller(@.*)?$' So fix all affected dma-apbh node names. Signed-off-by: Stefan Wahren Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/boot/dts/imx23.dtsi | 2 +- arch/arm/boot/dts/imx28.dtsi | 2 +- arch/arm/boot/dts/imx6qdl.dtsi | 2 +- arch/arm/boot/dts/imx6sx.dtsi | 2 +- arch/arm/boot/dts/imx6ul.dtsi | 2 +- arch/arm/boot/dts/imx7s.dtsi | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/imx23.dtsi b/arch/arm/boot/dts/imx23.dtsi index ec476b159649..b236d23f8071 100644 --- a/arch/arm/boot/dts/imx23.dtsi +++ b/arch/arm/boot/dts/imx23.dtsi @@ -59,7 +59,7 @@ reg = <0x80000000 0x2000>; }; - dma_apbh: dma-apbh@80004000 { + dma_apbh: dma-controller@80004000 { compatible = "fsl,imx23-dma-apbh"; reg = <0x80004000 0x2000>; interrupts = <0 14 20 0 diff --git a/arch/arm/boot/dts/imx28.dtsi b/arch/arm/boot/dts/imx28.dtsi index b15df16ecb01..b81592a61311 100644 --- a/arch/arm/boot/dts/imx28.dtsi +++ b/arch/arm/boot/dts/imx28.dtsi @@ -78,7 +78,7 @@ status = "disabled"; }; - dma_apbh: dma-apbh@80004000 { + dma_apbh: dma-controller@80004000 { compatible = "fsl,imx28-dma-apbh"; reg = <0x80004000 0x2000>; interrupts = <82 83 84 85 diff --git a/arch/arm/boot/dts/imx6qdl.dtsi b/arch/arm/boot/dts/imx6qdl.dtsi index ff1e0173b39b..2c6eada01d79 100644 --- a/arch/arm/boot/dts/imx6qdl.dtsi +++ b/arch/arm/boot/dts/imx6qdl.dtsi @@ -150,7 +150,7 @@ interrupt-parent = <&gpc>; ranges; - dma_apbh: dma-apbh@110000 { + dma_apbh: dma-controller@110000 { compatible = "fsl,imx6q-dma-apbh", "fsl,imx28-dma-apbh"; reg = <0x00110000 0x2000>; interrupts = <0 13 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm/boot/dts/imx6sx.dtsi b/arch/arm/boot/dts/imx6sx.dtsi index 1f1053a898fb..67d344ae76b5 100644 --- a/arch/arm/boot/dts/imx6sx.dtsi +++ b/arch/arm/boot/dts/imx6sx.dtsi @@ -209,7 +209,7 @@ power-domains = <&pd_pu>; }; - dma_apbh: dma-apbh@1804000 { + dma_apbh: dma-controller@1804000 { compatible = "fsl,imx6sx-dma-apbh", "fsl,imx28-dma-apbh"; reg = <0x01804000 0x2000>; interrupts = , diff --git a/arch/arm/boot/dts/imx6ul.dtsi b/arch/arm/boot/dts/imx6ul.dtsi index 2b5996395701..aac081b6daaa 100644 --- a/arch/arm/boot/dts/imx6ul.dtsi +++ b/arch/arm/boot/dts/imx6ul.dtsi @@ -164,7 +164,7 @@ <0x00a06000 0x2000>; }; - dma_apbh: dma-apbh@1804000 { + dma_apbh: dma-controller@1804000 { compatible = "fsl,imx6q-dma-apbh", "fsl,imx28-dma-apbh"; reg = <0x01804000 0x2000>; interrupts = <0 13 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm/boot/dts/imx7s.dtsi b/arch/arm/boot/dts/imx7s.dtsi index 4b23630fc738..2940dacaa56f 100644 --- a/arch/arm/boot/dts/imx7s.dtsi +++ b/arch/arm/boot/dts/imx7s.dtsi @@ -1267,7 +1267,7 @@ }; }; - dma_apbh: dma-apbh@33000000 { + dma_apbh: dma-controller@33000000 { compatible = "fsl,imx7d-dma-apbh", "fsl,imx28-dma-apbh"; reg = <0x33000000 0x2000>; interrupts = , From 49e734926a4b07308d98dc9d3c8f05eb77f1da00 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 17 Dec 2022 02:08:53 +0100 Subject: [PATCH 014/216] ARM: dts: imx7s: Drop dma-apb interrupt-names [ Upstream commit 9928f0a9e7c0cee3360ca1442b4001d34ad67556 ] Drop "interrupt-names" property, since it is broken. The drivers/dma/mxs-dma.c in Linux kernel does not use it, the property contains duplicate array entries in existing DTs, and even malformed entries (gmpi, should have been gpmi). Get rid of that optional property altogether. Signed-off-by: Marek Vasut Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/boot/dts/imx7s.dtsi | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/boot/dts/imx7s.dtsi b/arch/arm/boot/dts/imx7s.dtsi index 2940dacaa56f..69aebc691526 100644 --- a/arch/arm/boot/dts/imx7s.dtsi +++ b/arch/arm/boot/dts/imx7s.dtsi @@ -1274,7 +1274,6 @@ , , ; - interrupt-names = "gpmi0", "gpmi1", "gpmi2", "gpmi3"; #dma-cells = <1>; dma-channels = <4>; clocks = <&clks IMX7D_NAND_USDHC_BUS_RAWNAND_CLK>; From ed9fdc82cafbcf8a46b55d315219bf9464621bca Mon Sep 17 00:00:00 2001 From: Elson Roy Serrao Date: Fri, 24 Mar 2023 14:47:57 -0700 Subject: [PATCH 015/216] usb: gadget: Properly configure the device for remote wakeup [ Upstream commit b93c2a68f3d9dc98ec30dcb342ae47c1c8d09d18 ] The wakeup bit in the bmAttributes field indicates whether the device is configured for remote wakeup. But this field should be allowed to set only if the UDC supports such wakeup mechanism. So configure this field based on UDC capability. Also inform the UDC whether the device is configured for remote wakeup by implementing a gadget op. Reviewed-by: Thinh Nguyen Signed-off-by: Elson Roy Serrao Link: https://lore.kernel.org/r/1679694482-16430-2-git-send-email-quic_eserrao@quicinc.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/composite.c | 18 ++++++++++++++++++ drivers/usb/gadget/configfs.c | 3 +++ drivers/usb/gadget/udc/core.c | 27 +++++++++++++++++++++++++++ drivers/usb/gadget/udc/trace.h | 5 +++++ include/linux/usb/composite.h | 2 ++ include/linux/usb/gadget.h | 8 ++++++++ 6 files changed, 63 insertions(+) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index cb0a4e2cdbb7..247cca46cdfa 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -511,6 +511,19 @@ static u8 encode_bMaxPower(enum usb_device_speed speed, return min(val, 900U) / 8; } +void check_remote_wakeup_config(struct usb_gadget *g, + struct usb_configuration *c) +{ + if (USB_CONFIG_ATT_WAKEUP & c->bmAttributes) { + /* Reset the rw bit if gadget is not capable of it */ + if (!g->wakeup_capable && g->ops->set_remote_wakeup) { + WARN(c->cdev, "Clearing wakeup bit for config c.%d\n", + c->bConfigurationValue); + c->bmAttributes &= ~USB_CONFIG_ATT_WAKEUP; + } + } +} + static int config_buf(struct usb_configuration *config, enum usb_device_speed speed, void *buf, u8 type) { @@ -959,6 +972,11 @@ static int set_config(struct usb_composite_dev *cdev, power = min(power, 500U); else power = min(power, 900U); + + if (USB_CONFIG_ATT_WAKEUP & c->bmAttributes) + usb_gadget_set_remote_wakeup(gadget, 1); + else + usb_gadget_set_remote_wakeup(gadget, 0); done: if (power <= USB_SELF_POWER_VBUS_MAX_DRAW) usb_gadget_set_selfpowered(gadget); diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 4dcf29577f8f..b94aec6227c5 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1376,6 +1376,9 @@ static int configfs_composite_bind(struct usb_gadget *gadget, if (gadget_is_otg(gadget)) c->descriptors = otg_desc; + /* Properly configure the bmAttributes wakeup bit */ + check_remote_wakeup_config(gadget, c); + cfg = container_of(c, struct config_usb_cfg, c); if (!list_empty(&cfg->string_list)) { i = 0; diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index c40f2ecbe1b8..0edd9e53fc5a 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -525,6 +525,33 @@ out: } EXPORT_SYMBOL_GPL(usb_gadget_wakeup); +/** + * usb_gadget_set_remote_wakeup - configures the device remote wakeup feature. + * @gadget:the device being configured for remote wakeup + * @set:value to be configured. + * + * set to one to enable remote wakeup feature and zero to disable it. + * + * returns zero on success, else negative errno. + */ +int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set) +{ + int ret = 0; + + if (!gadget->ops->set_remote_wakeup) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = gadget->ops->set_remote_wakeup(gadget, set); + +out: + trace_usb_gadget_set_remote_wakeup(gadget, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(usb_gadget_set_remote_wakeup); + /** * usb_gadget_set_selfpowered - sets the device selfpowered feature. * @gadget:the device being declared as self-powered diff --git a/drivers/usb/gadget/udc/trace.h b/drivers/usb/gadget/udc/trace.h index abdbcb1bacb0..a5ed26fbc2da 100644 --- a/drivers/usb/gadget/udc/trace.h +++ b/drivers/usb/gadget/udc/trace.h @@ -91,6 +91,11 @@ DEFINE_EVENT(udc_log_gadget, usb_gadget_wakeup, TP_ARGS(g, ret) ); +DEFINE_EVENT(udc_log_gadget, usb_gadget_set_remote_wakeup, + TP_PROTO(struct usb_gadget *g, int ret), + TP_ARGS(g, ret) +); + DEFINE_EVENT(udc_log_gadget, usb_gadget_set_selfpowered, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 43ac3fa760db..9783b9107d76 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -412,6 +412,8 @@ extern int composite_dev_prepare(struct usb_composite_driver *composite, extern int composite_os_desc_req_prepare(struct usb_composite_dev *cdev, struct usb_ep *ep0); void composite_dev_cleanup(struct usb_composite_dev *cdev); +void check_remote_wakeup_config(struct usb_gadget *g, + struct usb_configuration *c); static inline struct usb_composite_driver *to_cdriver( struct usb_gadget_driver *gdrv) diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index dc3092cea99e..5bec668b41dc 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -309,6 +309,7 @@ struct usb_udc; struct usb_gadget_ops { int (*get_frame)(struct usb_gadget *); int (*wakeup)(struct usb_gadget *); + int (*set_remote_wakeup)(struct usb_gadget *, int set); int (*set_selfpowered) (struct usb_gadget *, int is_selfpowered); int (*vbus_session) (struct usb_gadget *, int is_active); int (*vbus_draw) (struct usb_gadget *, unsigned mA); @@ -383,6 +384,8 @@ struct usb_gadget_ops { * @connected: True if gadget is connected. * @lpm_capable: If the gadget max_speed is FULL or HIGH, this flag * indicates that it supports LPM as per the LPM ECN & errata. + * @wakeup_capable: True if gadget is capable of sending remote wakeup. + * @wakeup_armed: True if gadget is armed by the host for remote wakeup. * @irq: the interrupt number for device controller. * @id_number: a unique ID number for ensuring that gadget names are distinct * @@ -444,6 +447,8 @@ struct usb_gadget { unsigned deactivated:1; unsigned connected:1; unsigned lpm_capable:1; + unsigned wakeup_capable:1; + unsigned wakeup_armed:1; int irq; int id_number; }; @@ -600,6 +605,7 @@ static inline int gadget_is_otg(struct usb_gadget *g) #if IS_ENABLED(CONFIG_USB_GADGET) int usb_gadget_frame_number(struct usb_gadget *gadget); int usb_gadget_wakeup(struct usb_gadget *gadget); +int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set); int usb_gadget_set_selfpowered(struct usb_gadget *gadget); int usb_gadget_clear_selfpowered(struct usb_gadget *gadget); int usb_gadget_vbus_connect(struct usb_gadget *gadget); @@ -615,6 +621,8 @@ static inline int usb_gadget_frame_number(struct usb_gadget *gadget) { return 0; } static inline int usb_gadget_wakeup(struct usb_gadget *gadget) { return 0; } +static inline int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set) +{ return 0; } static inline int usb_gadget_set_selfpowered(struct usb_gadget *gadget) { return 0; } static inline int usb_gadget_clear_selfpowered(struct usb_gadget *gadget) From f8faa536370ec9db460bac96460e16801f62325e Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Thu, 13 Apr 2023 23:57:42 -0700 Subject: [PATCH 016/216] Input: xpad - add constants for GIP interface numbers [ Upstream commit f9b2e603c6216824e34dc9a67205d98ccc9a41ca ] Wired GIP devices present multiple interfaces with the same USB identification other than the interface number. This adds constants for differentiating two of them and uses them where appropriate Signed-off-by: Vicki Pfau Link: https://lore.kernel.org/r/20230411031650.960322-2-vi@endrift.com Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- drivers/input/joystick/xpad.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 02f3bc4e4895..13c36f51b935 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -564,6 +564,9 @@ struct xboxone_init_packet { #define GIP_MOTOR_LT BIT(3) #define GIP_MOTOR_ALL (GIP_MOTOR_R | GIP_MOTOR_L | GIP_MOTOR_RT | GIP_MOTOR_LT) +#define GIP_WIRED_INTF_DATA 0 +#define GIP_WIRED_INTF_AUDIO 1 + /* * This packet is required for all Xbox One pads with 2015 * or later firmware installed (or present from the factory). @@ -2008,7 +2011,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id } if (xpad->xtype == XTYPE_XBOXONE && - intf->cur_altsetting->desc.bInterfaceNumber != 0) { + intf->cur_altsetting->desc.bInterfaceNumber != GIP_WIRED_INTF_DATA) { /* * The Xbox One controller lists three interfaces all with the * same interface class, subclass and protocol. Differentiate by From 8745f3592ee4a7b49ede16ddd3f12a41ecaa23c9 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Fri, 31 Mar 2023 11:31:23 +0800 Subject: [PATCH 017/216] iommu/sprd: Release dma buffer to avoid memory leak [ Upstream commit 9afea57384d4ae7b2034593eac7fa76c7122762a ] When attaching to a domain, the driver would alloc a DMA buffer which is used to store address mapping table, and it need to be released when the IOMMU domain is freed. Signed-off-by: Chunyan Zhang Link: https://lore.kernel.org/r/20230331033124.864691-2-zhang.lyra@gmail.com Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin --- drivers/iommu/sprd-iommu.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index 8261066de07d..e4358393fe37 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -152,13 +152,6 @@ static struct iommu_domain *sprd_iommu_domain_alloc(unsigned int domain_type) return &dom->domain; } -static void sprd_iommu_domain_free(struct iommu_domain *domain) -{ - struct sprd_iommu_domain *dom = to_sprd_domain(domain); - - kfree(dom); -} - static void sprd_iommu_first_vpn(struct sprd_iommu_domain *dom) { struct sprd_iommu_device *sdev = dom->sdev; @@ -231,6 +224,28 @@ static void sprd_iommu_hw_en(struct sprd_iommu_device *sdev, bool en) sprd_iommu_update_bits(sdev, reg_cfg, mask, 0, val); } +static void sprd_iommu_cleanup(struct sprd_iommu_domain *dom) +{ + size_t pgt_size; + + /* Nothing need to do if the domain hasn't been attached */ + if (!dom->sdev) + return; + + pgt_size = sprd_iommu_pgt_size(&dom->domain); + dma_free_coherent(dom->sdev->dev, pgt_size, dom->pgt_va, dom->pgt_pa); + dom->sdev = NULL; + sprd_iommu_hw_en(dom->sdev, false); +} + +static void sprd_iommu_domain_free(struct iommu_domain *domain) +{ + struct sprd_iommu_domain *dom = to_sprd_domain(domain); + + sprd_iommu_cleanup(dom); + kfree(dom); +} + static int sprd_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { From e89c84422f35ce9fcb0fe9e3f3f60506586a7bae Mon Sep 17 00:00:00 2001 From: Tomas Krcka Date: Wed, 29 Mar 2023 12:34:19 +0000 Subject: [PATCH 018/216] iommu/arm-smmu-v3: Acknowledge pri/event queue overflow if any [ Upstream commit 67ea0b7ce41844eae7c10bb04dfe66a23318c224 ] When an overflow occurs in the PRI queue, the SMMU toggles the overflow flag in the PROD register. To exit the overflow condition, the PRI thread is supposed to acknowledge it by toggling this flag in the CONS register. Unacknowledged overflow causes the queue to stop adding anything new. Currently, the priq thread always writes the CONS register back to the SMMU after clearing the queue. The writeback is not necessary if the OVFLG in the PROD register has not been changed, no overflow has occured. This commit checks the difference of the overflow flag between CONS and PROD register. If it's different, toggles the OVACKFLG flag in the CONS register and write it to the SMMU. The situation is similar for the event queue. The acknowledge register is also toggled after clearing the event queue but never propagated to the hardware. This would only be done the next time when executing evtq thread. Unacknowledged event queue overflow doesn't affect the event queue, because the SMMU still adds elements to that queue when the overflow condition is active. But it feel nicer to keep SMMU in sync when possible, so use the same way here as well. Signed-off-by: Tomas Krcka Link: https://lore.kernel.org/r/20230329123420.34641-1-tomas.krcka@gmail.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8966f7d5aab6..82f100e591b5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -152,6 +152,18 @@ static void queue_inc_cons(struct arm_smmu_ll_queue *q) q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons); } +static void queue_sync_cons_ovf(struct arm_smmu_queue *q) +{ + struct arm_smmu_ll_queue *llq = &q->llq; + + if (likely(Q_OVF(llq->prod) == Q_OVF(llq->cons))) + return; + + llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | + Q_IDX(llq, llq->cons); + queue_sync_cons_out(q); +} + static int queue_sync_prod_in(struct arm_smmu_queue *q) { u32 prod; @@ -1583,8 +1595,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) } while (!queue_empty(llq)); /* Sync our overflow flag, as we believe we're up to speed */ - llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | - Q_IDX(llq, llq->cons); + queue_sync_cons_ovf(q); return IRQ_HANDLED; } @@ -1642,9 +1653,7 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev) } while (!queue_empty(llq)); /* Sync our overflow flag, as we believe we're up to speed */ - llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | - Q_IDX(llq, llq->cons); - queue_sync_cons_out(q); + queue_sync_cons_ovf(q); return IRQ_HANDLED; } From 39c6312009574ca73865354133ca222e7753a71b Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 11 Jan 2023 16:59:43 +0800 Subject: [PATCH 019/216] fs/ntfs3: Fix a possible null-pointer dereference in ni_clear() [ Upstream commit ec275bf9693d19cc0fdce8436f4c425ced86f6e7 ] In a previous commit c1006bd13146, ni->mi.mrec in ni_write_inode() could be NULL, and thus a NULL check is added for this variable. However, in the same call stack, ni->mi.mrec can be also dereferenced in ni_clear(): ntfs_evict_inode(inode) ni_write_inode(inode, ...) ni = ntfs_i(inode); is_rec_inuse(ni->mi.mrec) -> Add a NULL check by previous commit ni_clear(ntfs_i(inode)) is_rec_inuse(ni->mi.mrec) -> No check Thus, a possible null-pointer dereference may exist in ni_clear(). To fix it, a NULL check is added in this function. Signed-off-by: Jia-Ju Bai Reported-by: TOTE Robot Signed-off-by: Konstantin Komarov Signed-off-by: Sasha Levin --- fs/ntfs3/frecord.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index bb7e33c24073..1f0e230ec9e2 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -102,7 +102,7 @@ void ni_clear(struct ntfs_inode *ni) { struct rb_node *node; - if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec)) + if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec)) ni_delete_all(ni); al_destroy(ni); From 976126f2def45f4075f18372bc4e97bb5da3757a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Feb 2023 09:59:10 +0100 Subject: [PATCH 020/216] clk: tegra20: fix gcc-7 constant overflow warning [ Upstream commit b4a2adbf3586efa12fe78b9dec047423e01f3010 ] Older gcc versions get confused by comparing a u32 value to a negative constant in a switch()/case block: drivers/clk/tegra/clk-tegra20.c: In function 'tegra20_clk_measure_input_freq': drivers/clk/tegra/clk-tegra20.c:581:2: error: case label does not reduce to an integer constant case OSC_CTRL_OSC_FREQ_12MHZ: ^~~~ drivers/clk/tegra/clk-tegra20.c:593:2: error: case label does not reduce to an integer constant case OSC_CTRL_OSC_FREQ_26MHZ: Make the constants unsigned instead. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230227085914.2560984-1-arnd@kernel.org Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/tegra/clk-tegra20.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/clk/tegra/clk-tegra20.c b/drivers/clk/tegra/clk-tegra20.c index 422d78247553..dcacc5064d33 100644 --- a/drivers/clk/tegra/clk-tegra20.c +++ b/drivers/clk/tegra/clk-tegra20.c @@ -21,24 +21,24 @@ #define MISC_CLK_ENB 0x48 #define OSC_CTRL 0x50 -#define OSC_CTRL_OSC_FREQ_MASK (3<<30) -#define OSC_CTRL_OSC_FREQ_13MHZ (0<<30) -#define OSC_CTRL_OSC_FREQ_19_2MHZ (1<<30) -#define OSC_CTRL_OSC_FREQ_12MHZ (2<<30) -#define OSC_CTRL_OSC_FREQ_26MHZ (3<<30) -#define OSC_CTRL_MASK (0x3f2 | OSC_CTRL_OSC_FREQ_MASK) +#define OSC_CTRL_OSC_FREQ_MASK (3u<<30) +#define OSC_CTRL_OSC_FREQ_13MHZ (0u<<30) +#define OSC_CTRL_OSC_FREQ_19_2MHZ (1u<<30) +#define OSC_CTRL_OSC_FREQ_12MHZ (2u<<30) +#define OSC_CTRL_OSC_FREQ_26MHZ (3u<<30) +#define OSC_CTRL_MASK (0x3f2u | OSC_CTRL_OSC_FREQ_MASK) -#define OSC_CTRL_PLL_REF_DIV_MASK (3<<28) -#define OSC_CTRL_PLL_REF_DIV_1 (0<<28) -#define OSC_CTRL_PLL_REF_DIV_2 (1<<28) -#define OSC_CTRL_PLL_REF_DIV_4 (2<<28) +#define OSC_CTRL_PLL_REF_DIV_MASK (3u<<28) +#define OSC_CTRL_PLL_REF_DIV_1 (0u<<28) +#define OSC_CTRL_PLL_REF_DIV_2 (1u<<28) +#define OSC_CTRL_PLL_REF_DIV_4 (2u<<28) #define OSC_FREQ_DET 0x58 -#define OSC_FREQ_DET_TRIG (1<<31) +#define OSC_FREQ_DET_TRIG (1u<<31) #define OSC_FREQ_DET_STATUS 0x5c -#define OSC_FREQ_DET_BUSY (1<<31) -#define OSC_FREQ_DET_CNT_MASK 0xFFFF +#define OSC_FREQ_DET_BUSYu (1<<31) +#define OSC_FREQ_DET_CNT_MASK 0xFFFFu #define TEGRA20_CLK_PERIPH_BANKS 3 From 0d04e45c65f0785e558b93d2631d58680f263e10 Mon Sep 17 00:00:00 2001 From: Edward Lo Date: Tue, 4 Oct 2022 23:15:06 +0800 Subject: [PATCH 021/216] fs/ntfs3: Add length check in indx_get_root [ Upstream commit 08e8cf5f2d9ec383a2e339a2711b62a54ff3fba0 ] This adds a length check to guarantee the retrieved index root is legit. [ 162.459513] BUG: KASAN: use-after-free in hdr_find_e.isra.0+0x10c/0x320 [ 162.460176] Read of size 2 at addr ffff8880037bca99 by task mount/243 [ 162.460851] [ 162.461252] CPU: 0 PID: 243 Comm: mount Not tainted 6.0.0-rc7 #42 [ 162.461744] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 162.462609] Call Trace: [ 162.462954] [ 162.463276] dump_stack_lvl+0x49/0x63 [ 162.463822] print_report.cold+0xf5/0x689 [ 162.464608] ? unwind_get_return_address+0x3a/0x60 [ 162.465766] ? hdr_find_e.isra.0+0x10c/0x320 [ 162.466975] kasan_report+0xa7/0x130 [ 162.467506] ? _raw_spin_lock_irq+0xc0/0xf0 [ 162.467998] ? hdr_find_e.isra.0+0x10c/0x320 [ 162.468536] __asan_load2+0x68/0x90 [ 162.468923] hdr_find_e.isra.0+0x10c/0x320 [ 162.469282] ? cmp_uints+0xe0/0xe0 [ 162.469557] ? cmp_sdh+0x90/0x90 [ 162.469864] ? ni_find_attr+0x214/0x300 [ 162.470217] ? ni_load_mi+0x80/0x80 [ 162.470479] ? entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 162.470931] ? ntfs_bread_run+0x190/0x190 [ 162.471307] ? indx_get_root+0xe4/0x190 [ 162.471556] ? indx_get_root+0x140/0x190 [ 162.471833] ? indx_init+0x1e0/0x1e0 [ 162.472069] ? fnd_clear+0x115/0x140 [ 162.472363] ? _raw_spin_lock_irqsave+0x100/0x100 [ 162.472731] indx_find+0x184/0x470 [ 162.473461] ? sysvec_apic_timer_interrupt+0x57/0xc0 [ 162.474429] ? indx_find_buffer+0x2d0/0x2d0 [ 162.474704] ? do_syscall_64+0x3b/0x90 [ 162.474962] dir_search_u+0x196/0x2f0 [ 162.475381] ? ntfs_nls_to_utf16+0x450/0x450 [ 162.475661] ? ntfs_security_init+0x3d6/0x440 [ 162.475906] ? is_sd_valid+0x180/0x180 [ 162.476191] ntfs_extend_init+0x13f/0x2c0 [ 162.476496] ? ntfs_fix_post_read+0x130/0x130 [ 162.476861] ? iput.part.0+0x286/0x320 [ 162.477325] ntfs_fill_super+0x11e0/0x1b50 [ 162.477709] ? put_ntfs+0x1d0/0x1d0 [ 162.477970] ? vsprintf+0x20/0x20 [ 162.478258] ? set_blocksize+0x95/0x150 [ 162.478538] get_tree_bdev+0x232/0x370 [ 162.478789] ? put_ntfs+0x1d0/0x1d0 [ 162.479038] ntfs_fs_get_tree+0x15/0x20 [ 162.479374] vfs_get_tree+0x4c/0x130 [ 162.479729] path_mount+0x654/0xfe0 [ 162.480124] ? putname+0x80/0xa0 [ 162.480484] ? finish_automount+0x2e0/0x2e0 [ 162.480894] ? putname+0x80/0xa0 [ 162.481467] ? kmem_cache_free+0x1c4/0x440 [ 162.482280] ? putname+0x80/0xa0 [ 162.482714] do_mount+0xd6/0xf0 [ 162.483264] ? path_mount+0xfe0/0xfe0 [ 162.484782] ? __kasan_check_write+0x14/0x20 [ 162.485593] __x64_sys_mount+0xca/0x110 [ 162.486024] do_syscall_64+0x3b/0x90 [ 162.486543] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 162.487141] RIP: 0033:0x7f9d374e948a [ 162.488324] Code: 48 8b 0d 11 fa 2a 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 008 [ 162.489728] RSP: 002b:00007ffe30e73d18 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 162.490971] RAX: ffffffffffffffda RBX: 0000561cdb43a060 RCX: 00007f9d374e948a [ 162.491669] RDX: 0000561cdb43a260 RSI: 0000561cdb43a2e0 RDI: 0000561cdb442af0 [ 162.492050] RBP: 0000000000000000 R08: 0000561cdb43a280 R09: 0000000000000020 [ 162.492459] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000561cdb442af0 [ 162.493183] R13: 0000561cdb43a260 R14: 0000000000000000 R15: 00000000ffffffff [ 162.493644] [ 162.493908] [ 162.494214] The buggy address belongs to the physical page: [ 162.494761] page:000000003e38a3d5 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x37bc [ 162.496064] flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) [ 162.497278] raw: 000fffffc0000000 ffffea00000df1c8 ffffea00000df008 0000000000000000 [ 162.498928] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000 [ 162.500542] page dumped because: kasan: bad access detected [ 162.501057] [ 162.501242] Memory state around the buggy address: [ 162.502230] ffff8880037bc980: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 162.502977] ffff8880037bca00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 162.503522] >ffff8880037bca80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 162.503963] ^ [ 162.504370] ffff8880037bcb00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 162.504766] ffff8880037bcb80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Signed-off-by: Edward Lo Signed-off-by: Konstantin Komarov Signed-off-by: Sasha Levin --- fs/ntfs3/index.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 7371f7855e4c..eee01db6e0cc 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -998,6 +998,7 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *a; const struct INDEX_NAMES *in = &s_index_names[indx->type]; + struct INDEX_ROOT *root = NULL; a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL, mi); @@ -1007,7 +1008,15 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, if (attr) *attr = a; - return resident_data_ex(a, sizeof(struct INDEX_ROOT)); + root = resident_data_ex(a, sizeof(struct INDEX_ROOT)); + + /* length check */ + if (root && offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) > + le32_to_cpu(a->res.data_size)) { + return NULL; + } + + return root; } static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni, From b3152afc0eb864f7c6ecad134a15b577ef7aec77 Mon Sep 17 00:00:00 2001 From: Abdun Nihaal Date: Sun, 30 Oct 2022 12:32:51 +0530 Subject: [PATCH 022/216] fs/ntfs3: Fix NULL dereference in ni_write_inode [ Upstream commit 8dae4f6341e335a09575be60b4fdf697c732a470 ] Syzbot reports a NULL dereference in ni_write_inode. When creating a new inode, if allocation fails in mi_init function (called in mi_format_new function), mi->mrec is set to NULL. In the error path of this inode creation, mi->mrec is later dereferenced in ni_write_inode. Add a NULL check to prevent NULL dereference. Link: https://syzkaller.appspot.com/bug?extid=f45957555ed4a808cc7a Reported-and-tested-by: syzbot+f45957555ed4a808cc7a@syzkaller.appspotmail.com Signed-off-by: Abdun Nihaal Signed-off-by: Konstantin Komarov Signed-off-by: Sasha Levin --- fs/ntfs3/frecord.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 1f0e230ec9e2..d26026090024 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3255,6 +3255,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) return 0; } + if (!ni->mi.mrec) + goto out; + if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; From 329fc4d3f73d865b25f2ee4eafafb040ace37ad5 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 17 Nov 2022 17:19:12 +0800 Subject: [PATCH 023/216] fs/ntfs3: Fix NULL pointer dereference in 'ni_write_inode' [ Upstream commit db2a3cc6a3481076da6344cc62a80a4e2525f36f ] Syzbot found the following issue: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000016 Mem abort info: ESR = 0x0000000096000006 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=000000010af56000 [0000000000000016] pgd=08000001090da003, p4d=08000001090da003, pud=08000001090ce003, pmd=0000000000000000 Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 3036 Comm: syz-executor206 Not tainted 6.0.0-rc6-syzkaller-17739-g16c9f284e746 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : is_rec_inuse fs/ntfs3/ntfs.h:313 [inline] pc : ni_write_inode+0xac/0x798 fs/ntfs3/frecord.c:3232 lr : ni_write_inode+0xa0/0x798 fs/ntfs3/frecord.c:3226 sp : ffff8000126c3800 x29: ffff8000126c3860 x28: 0000000000000000 x27: ffff0000c8b02000 x26: ffff0000c7502320 x25: ffff0000c7502288 x24: 0000000000000000 x23: ffff80000cbec91c x22: ffff0000c8b03000 x21: ffff0000c8b02000 x20: 0000000000000001 x19: ffff0000c75024d8 x18: 00000000000000c0 x17: ffff80000dd1b198 x16: ffff80000db59158 x15: ffff0000c4b6b500 x14: 00000000000000b8 x13: 0000000000000000 x12: ffff0000c4b6b500 x11: ff80800008be1b60 x10: 0000000000000000 x9 : ffff0000c4b6b500 x8 : 0000000000000000 x7 : ffff800008be1b50 x6 : 0000000000000000 x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000 x2 : 0000000000000008 x1 : 0000000000000001 x0 : 0000000000000000 Call trace: is_rec_inuse fs/ntfs3/ntfs.h:313 [inline] ni_write_inode+0xac/0x798 fs/ntfs3/frecord.c:3232 ntfs_evict_inode+0x54/0x84 fs/ntfs3/inode.c:1744 evict+0xec/0x334 fs/inode.c:665 iput_final fs/inode.c:1748 [inline] iput+0x2c4/0x324 fs/inode.c:1774 ntfs_new_inode+0x7c/0xe0 fs/ntfs3/fsntfs.c:1660 ntfs_create_inode+0x20c/0xe78 fs/ntfs3/inode.c:1278 ntfs_create+0x54/0x74 fs/ntfs3/namei.c:100 lookup_open fs/namei.c:3413 [inline] open_last_lookups fs/namei.c:3481 [inline] path_openat+0x804/0x11c4 fs/namei.c:3688 do_filp_open+0xdc/0x1b8 fs/namei.c:3718 do_sys_openat2+0xb8/0x22c fs/open.c:1311 do_sys_open fs/open.c:1327 [inline] __do_sys_openat fs/open.c:1343 [inline] __se_sys_openat fs/open.c:1338 [inline] __arm64_sys_openat+0xb0/0xe0 fs/open.c:1338 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 Code: 97dafee4 340001b4 f9401328 2a1f03e0 (79402d14) ---[ end trace 0000000000000000 ]--- Above issue may happens as follows: ntfs_new_inode mi_init mi->mrec = kmalloc(sbi->record_size, GFP_NOFS); -->failed to allocate memory if (!mi->mrec) return -ENOMEM; iput iput_final evict ntfs_evict_inode ni_write_inode is_rec_inuse(ni->mi.mrec)-> As 'ni->mi.mrec' is NULL trigger NULL-ptr-deref To solve above issue if new inode failed make inode bad before call 'iput()' in 'ntfs_new_inode()'. Reported-by: syzbot+f45957555ed4a808cc7a@syzkaller.appspotmail.com Signed-off-by: Ye Bin Signed-off-by: Konstantin Komarov Signed-off-by: Sasha Levin --- fs/ntfs3/fsntfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 1eac80d55b55..4c2d079b3d49 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1674,6 +1674,7 @@ struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir) out: if (err) { + make_bad_inode(inode); iput(inode); ni = ERR_PTR(err); } From e5f488993bc1893b84d93e9915155fab66a070d2 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 27 Mar 2023 13:30:29 +0530 Subject: [PATCH 024/216] iommu/arm-smmu-qcom: Limit the SMR groups to 128 [ Upstream commit 12261134732689b7e30c59db9978f81230965181 ] Some platforms support more than 128 stream matching groups than what is defined by the ARM SMMU architecture specification. But due to some unknown reasons, those additional groups don't exhibit the same behavior as the architecture supported ones. For instance, the additional groups will not detect the quirky behavior of some firmware versions intercepting writes to S2CR register, thus skipping the quirk implemented in the driver and causing boot crash. So let's limit the groups to 128 for now until the issue with those groups are fixed and issue a notice to users in that case. Reviewed-by: Johan Hovold Tested-by: Johan Hovold Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20230327080029.11584-1-manivannan.sadhasivam@linaro.org [will: Reworded the comment slightly] Signed-off-by: Will Deacon Signed-off-by: Sasha Levin --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index d80065c8105a..f15dcb9e4175 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -267,12 +267,26 @@ static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { - unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + unsigned int last_s2cr; u32 reg; u32 smr; int i; + /* + * Some platforms support more than the Arm SMMU architected maximum of + * 128 stream matching groups. For unknown reasons, the additional + * groups don't exhibit the same behavior as the architected registers, + * so limit the groups to 128 until the behavior is fixed for the other + * groups. + */ + if (smmu->num_mapping_groups > 128) { + dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); + smmu->num_mapping_groups = 128; + } + + last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); + /* * With some firmware versions writes to S2CR of type FAULT are * ignored, and writing BYPASS will end up written as FAULT in the From afbf1a5cef46427241e76704991cc83c9b1a463b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 21 Mar 2023 17:47:03 -0600 Subject: [PATCH 025/216] RDMA/core: Fix multiple -Warray-bounds warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit aa4d540b4150052ae3b36d286b9c833a961ce291 ] GCC-13 (and Clang)[1] does not like to access a partially allocated object, since it cannot reason about it for bounds checking. In this case 140 bytes are allocated for an object of type struct ib_umad_packet: packet = kzalloc(sizeof(*packet) + IB_MGMT_RMPP_HDR, GFP_KERNEL); However, notice that sizeof(*packet) is only 104 bytes: struct ib_umad_packet { struct ib_mad_send_buf * msg; /* 0 8 */ struct ib_mad_recv_wc * recv_wc; /* 8 8 */ struct list_head list; /* 16 16 */ int length; /* 32 4 */ /* XXX 4 bytes hole, try to pack */ struct ib_user_mad mad __attribute__((__aligned__(8))); /* 40 64 */ /* size: 104, cachelines: 2, members: 5 */ /* sum members: 100, holes: 1, sum holes: 4 */ /* forced alignments: 1, forced holes: 1, sum forced holes: 4 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); and 36 bytes extra bytes are allocated for a flexible-array member in struct ib_user_mad: include/rdma/ib_mad.h: 120 enum { ... 123 IB_MGMT_RMPP_HDR = 36, ... } struct ib_user_mad { struct ib_user_mad_hdr hdr; /* 0 64 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u64 data[] __attribute__((__aligned__(8))); /* 64 0 */ /* size: 64, cachelines: 1, members: 2 */ /* forced alignments: 1 */ } __attribute__((__aligned__(8))); So we have sizeof(*packet) + IB_MGMT_RMPP_HDR == 140 bytes Then the address of the flex-array member (for which only 36 bytes were allocated) is casted and copied into a pointer to struct ib_rmpp_mad, which, in turn, is of size 256 bytes: rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; struct ib_rmpp_mad { struct ib_mad_hdr mad_hdr; /* 0 24 */ struct ib_rmpp_hdr rmpp_hdr; /* 24 12 */ u8 data[220]; /* 36 220 */ /* size: 256, cachelines: 4, members: 3 */ }; The thing is that those 36 bytes allocated for flex-array member data in struct ib_user_mad onlly account for the size of both struct ib_mad_hdr and struct ib_rmpp_hdr, but nothing is left for array u8 data[220]. So, the compiler is legitimately complaining about accessing an object for which not enough memory was allocated. Apparently, the only members of struct ib_rmpp_mad that are relevant (that are actually being used) in function ib_umad_write() are mad_hdr and rmpp_hdr. So, instead of casting packet->mad.data to (struct ib_rmpp_mad *) create a new structure struct ib_rmpp_mad_hdr { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; } __packed; and cast packet->mad.data to (struct ib_rmpp_mad_hdr *). Notice that IB_MGMT_RMPP_HDR == sizeof(struct ib_rmpp_mad_hdr) == 36 bytes Refactor the rest of the code, accordingly. Fix the following warnings seen under GCC-13 and -Warray-bounds: drivers/infiniband/core/user_mad.c:564:50: warning: array subscript ‘struct ib_rmpp_mad[0]’ is partly outside array bounds of ‘unsigned char[140]’ [-Warray-bounds=] drivers/infiniband/core/user_mad.c:566:42: warning: array subscript ‘struct ib_rmpp_mad[0]’ is partly outside array bounds of ‘unsigned char[140]’ [-Warray-bounds=] drivers/infiniband/core/user_mad.c:618:25: warning: array subscript ‘struct ib_rmpp_mad[0]’ is partly outside array bounds of ‘unsigned char[140]’ [-Warray-bounds=] drivers/infiniband/core/user_mad.c:622:44: warning: array subscript ‘struct ib_rmpp_mad[0]’ is partly outside array bounds of ‘unsigned char[140]’ [-Warray-bounds=] Link: https://github.com/KSPP/linux/issues/273 Link: https://godbolt.org/z/oYWaGM4Yb [1] Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/ZBpB91qQcB10m3Fw@work Signed-off-by: Leon Romanovsky Signed-off-by: Sasha Levin --- drivers/infiniband/core/user_mad.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index d96c78e436f9..5c284dfbe692 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -131,6 +131,11 @@ struct ib_umad_packet { struct ib_user_mad mad; }; +struct ib_rmpp_mad_hdr { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; +} __packed; + #define CREATE_TRACE_POINTS #include @@ -494,11 +499,11 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_umad_file *file = filp->private_data; + struct ib_rmpp_mad_hdr *rmpp_mad_hdr; struct ib_umad_packet *packet; struct ib_mad_agent *agent; struct rdma_ah_attr ah_attr; struct ib_ah *ah; - struct ib_rmpp_mad *rmpp_mad; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; u8 base_version; @@ -506,7 +511,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) return -EINVAL; - packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); + packet = kzalloc(sizeof(*packet) + IB_MGMT_RMPP_HDR, GFP_KERNEL); if (!packet) return -ENOMEM; @@ -560,13 +565,13 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err_up; } - rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; - hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + rmpp_mad_hdr = (struct ib_rmpp_mad_hdr *)packet->mad.data; + hdr_len = ib_get_mad_data_offset(rmpp_mad_hdr->mad_hdr.mgmt_class); - if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + if (ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class) && ib_mad_kernel_rmpp_agent(agent)) { copy_offset = IB_MGMT_RMPP_HDR; - rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + rmpp_active = ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE; } else { copy_offset = IB_MGMT_MAD_HDR; @@ -615,12 +620,12 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | (be64_to_cpup(tid) & 0xffffffff)); - rmpp_mad->mad_hdr.tid = *tid; + rmpp_mad_hdr->mad_hdr.tid = *tid; } if (!ib_mad_kernel_rmpp_agent(agent) - && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) - && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + && ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { spin_lock_irq(&file->send_lock); list_add_tail(&packet->list, &file->send_list); spin_unlock_irq(&file->send_lock); From 87632bc9ecff5ded93433bc0fca428019bdd1cfe Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 18 Jan 2024 10:05:05 -0800 Subject: [PATCH 026/216] mm: huge_memory: don't force huge page alignment on 32 bit commit 4ef9ad19e17676b9ef071309bc62020e2373705d upstream. commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") caused two issues [1] [2] reported on 32 bit system or compat userspace. It doesn't make too much sense to force huge page alignment on 32 bit system due to the constrained virtual address space. [1] https://lore.kernel.org/linux-mm/d0a136a0-4a31-46bc-adf4-2db109a61672@kernel.org/ [2] https://lore.kernel.org/linux-mm/CAJuCfpHXLdQy1a2B6xN2d7quTYwg2OoZseYPZTRpU0eHHKD-sQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240118180505.2914778-1-shy828301@gmail.com Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") Signed-off-by: Yang Shi Reported-by: Jiri Slaby Reported-by: Suren Baghdasaryan Tested-by: Jiri Slaby Tested-by: Suren Baghdasaryan Reviewed-by: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Christopher Lameter Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 59577946735b..9736e762184b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -607,6 +608,9 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, loff_t off_align = round_up(off, size); unsigned long len_pad, ret; + if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) + return 0; + if (off_end <= off_align || (off_end - off_align) < size) return 0; From 65a389ef979b5ca96bc08aa165d6710fe8f1e890 Mon Sep 17 00:00:00 2001 From: Han Xu Date: Wed, 8 Nov 2023 09:07:01 -0600 Subject: [PATCH 027/216] mtd: spinand: gigadevice: Fix the get ecc status issue [ Upstream commit 59950610c0c00c7a06d8a75d2ee5d73dba4274cf ] Some GigaDevice ecc_get_status functions use on-stack buffer for spi_mem_op causes spi_mem_check_op failing, fix the issue by using spinand scratchbuf. Fixes: c40c7a990a46 ("mtd: spinand: Add support for GigaDevice GD5F1GQ4UExxG") Signed-off-by: Han Xu Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231108150701.593912-1-han.xu@nxp.com Signed-off-by: Sasha Levin --- drivers/mtd/nand/spi/gigadevice.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/spi/gigadevice.c b/drivers/mtd/nand/spi/gigadevice.c index 6b043e24855f..9116ee7f023e 100644 --- a/drivers/mtd/nand/spi/gigadevice.c +++ b/drivers/mtd/nand/spi/gigadevice.c @@ -186,7 +186,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand, { u8 status2; struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2, - &status2); + spinand->scratchbuf); int ret; switch (status & STATUS_ECC_MASK) { @@ -207,6 +207,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand, * report the maximum of 4 in this case */ /* bits sorted this way (3...0): ECCS1,ECCS0,ECCSE1,ECCSE0 */ + status2 = *(spinand->scratchbuf); return ((status & STATUS_ECC_MASK) >> 2) | ((status2 & STATUS_ECC_MASK) >> 4); @@ -228,7 +229,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand, { u8 status2; struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2, - &status2); + spinand->scratchbuf); int ret; switch (status & STATUS_ECC_MASK) { @@ -248,6 +249,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand, * 1 ... 4 bits are flipped (and corrected) */ /* bits sorted this way (1...0): ECCSE1, ECCSE0 */ + status2 = *(spinand->scratchbuf); return ((status2 & STATUS_ECC_MASK) >> 4) + 1; case STATUS_ECC_UNCOR_ERROR: From 0b27bf4c494d61e5663baa34c3edd7ccebf0ea44 Mon Sep 17 00:00:00 2001 From: Ryosuke Yasuoka Date: Wed, 21 Feb 2024 16:40:48 +0900 Subject: [PATCH 028/216] netlink: Fix kernel-infoleak-after-free in __skb_datagram_iter [ Upstream commit 661779e1fcafe1b74b3f3fe8e980c1e207fea1fd ] syzbot reported the following uninit-value access issue [1]: netlink_to_full_skb() creates a new `skb` and puts the `skb->data` passed as a 1st arg of netlink_to_full_skb() onto new `skb`. The data size is specified as `len` and passed to skb_put_data(). This `len` is based on `skb->end` that is not data offset but buffer offset. The `skb->end` contains data and tailroom. Since the tailroom is not initialized when the new `skb` created, KMSAN detects uninitialized memory area when copying the data. This patch resolved this issue by correct the len from `skb->end` to `skb->len`, which is the actual data offset. BUG: KMSAN: kernel-infoleak-after-free in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak-after-free in copy_to_user_iter lib/iov_iter.c:24 [inline] BUG: KMSAN: kernel-infoleak-after-free in iterate_ubuf include/linux/iov_iter.h:29 [inline] BUG: KMSAN: kernel-infoleak-after-free in iterate_and_advance2 include/linux/iov_iter.h:245 [inline] BUG: KMSAN: kernel-infoleak-after-free in iterate_and_advance include/linux/iov_iter.h:271 [inline] BUG: KMSAN: kernel-infoleak-after-free in _copy_to_iter+0x364/0x2520 lib/iov_iter.c:186 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copy_to_user_iter lib/iov_iter.c:24 [inline] iterate_ubuf include/linux/iov_iter.h:29 [inline] iterate_and_advance2 include/linux/iov_iter.h:245 [inline] iterate_and_advance include/linux/iov_iter.h:271 [inline] _copy_to_iter+0x364/0x2520 lib/iov_iter.c:186 copy_to_iter include/linux/uio.h:197 [inline] simple_copy_to_iter+0x68/0xa0 net/core/datagram.c:532 __skb_datagram_iter+0x123/0xdc0 net/core/datagram.c:420 skb_copy_datagram_iter+0x5c/0x200 net/core/datagram.c:546 skb_copy_datagram_msg include/linux/skbuff.h:3960 [inline] packet_recvmsg+0xd9c/0x2000 net/packet/af_packet.c:3482 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg net/socket.c:1066 [inline] sock_read_iter+0x467/0x580 net/socket.c:1136 call_read_iter include/linux/fs.h:2014 [inline] new_sync_read fs/read_write.c:389 [inline] vfs_read+0x8f6/0xe00 fs/read_write.c:470 ksys_read+0x20f/0x4c0 fs/read_write.c:613 __do_sys_read fs/read_write.c:623 [inline] __se_sys_read fs/read_write.c:621 [inline] __x64_sys_read+0x93/0xd0 fs/read_write.c:621 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was stored to memory at: skb_put_data include/linux/skbuff.h:2622 [inline] netlink_to_full_skb net/netlink/af_netlink.c:181 [inline] __netlink_deliver_tap_skb net/netlink/af_netlink.c:298 [inline] __netlink_deliver_tap+0x5be/0xc90 net/netlink/af_netlink.c:325 netlink_deliver_tap net/netlink/af_netlink.c:338 [inline] netlink_deliver_tap_kernel net/netlink/af_netlink.c:347 [inline] netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline] netlink_unicast+0x10f1/0x1250 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x1238/0x13d0 net/netlink/af_netlink.c:1910 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: free_pages_prepare mm/page_alloc.c:1087 [inline] free_unref_page_prepare+0xb0/0xa40 mm/page_alloc.c:2347 free_unref_page_list+0xeb/0x1100 mm/page_alloc.c:2533 release_pages+0x23d3/0x2410 mm/swap.c:1042 free_pages_and_swap_cache+0xd9/0xf0 mm/swap_state.c:316 tlb_batch_pages_flush mm/mmu_gather.c:98 [inline] tlb_flush_mmu_free mm/mmu_gather.c:293 [inline] tlb_flush_mmu+0x6f5/0x980 mm/mmu_gather.c:300 tlb_finish_mmu+0x101/0x260 mm/mmu_gather.c:392 exit_mmap+0x49e/0xd30 mm/mmap.c:3321 __mmput+0x13f/0x530 kernel/fork.c:1349 mmput+0x8a/0xa0 kernel/fork.c:1371 exit_mm+0x1b8/0x360 kernel/exit.c:567 do_exit+0xd57/0x4080 kernel/exit.c:858 do_group_exit+0x2fd/0x390 kernel/exit.c:1021 __do_sys_exit_group kernel/exit.c:1032 [inline] __se_sys_exit_group kernel/exit.c:1030 [inline] __x64_sys_exit_group+0x3c/0x50 kernel/exit.c:1030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Bytes 3852-3903 of 3904 are uninitialized Memory access of size 3904 starts at ffff88812ea1e000 Data copied to user address 0000000020003280 CPU: 1 PID: 5043 Comm: syz-executor297 Not tainted 6.7.0-rc5-syzkaller-00047-g5bd7ef53ffe5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023 Fixes: 1853c9496460 ("netlink, mmap: transform mmap skb into full skb on taps") Reported-and-tested-by: syzbot+34ad5fab48f7bf510349@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=34ad5fab48f7bf510349 [1] Signed-off-by: Ryosuke Yasuoka Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240221074053.1794118-1-ryasuoka@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6857a4965fe8..e9b81cba1e2b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -167,7 +167,7 @@ static inline u32 netlink_group_mask(u32 group) static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { - unsigned int len = skb_end_offset(skb); + unsigned int len = skb->len; struct sk_buff *new; new = alloc_skb(len, gfp_mask); From 0ac219c4c3ab253f3981f346903458d20bacab32 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Feb 2024 18:27:33 +0100 Subject: [PATCH 029/216] netlink: add nla be16/32 types to minlen array [ Upstream commit 9a0d18853c280f6a0ee99f91619f2442a17a323a ] BUG: KMSAN: uninit-value in nla_validate_range_unsigned lib/nlattr.c:222 [inline] BUG: KMSAN: uninit-value in nla_validate_int_range lib/nlattr.c:336 [inline] BUG: KMSAN: uninit-value in validate_nla lib/nlattr.c:575 [inline] BUG: KMSAN: uninit-value in __nla_validate_parse+0x2e20/0x45c0 lib/nlattr.c:631 nla_validate_range_unsigned lib/nlattr.c:222 [inline] nla_validate_int_range lib/nlattr.c:336 [inline] validate_nla lib/nlattr.c:575 [inline] ... The message in question matches this policy: [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), but because NLA_BE32 size in minlen array is 0, the validation code will read past the malformed (too small) attribute. Note: Other attributes, e.g. BITFIELD32, SINT, UINT.. are also missing: those likely should be added too. Reported-by: syzbot+3f497b07aa3baf2fb4d0@syzkaller.appspotmail.com Reported-by: xingwei lee Closes: https://lore.kernel.org/all/CABOYnLzFYHSnvTyS6zGa-udNX55+izqkOt2sB9WDqUcEGW6n8w@mail.gmail.com/raw Fixes: ecaf75ffd5f5 ("netlink: introduce bigendian integer types") Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240221172740.5092-1-fw@strlen.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- lib/nlattr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/nlattr.c b/lib/nlattr.c index dffd60e4065f..86344df0ccf7 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -30,6 +30,8 @@ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), + [NLA_BE16] = sizeof(__be16), + [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { @@ -43,6 +45,8 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), + [NLA_BE16] = sizeof(__be16), + [NLA_BE32] = sizeof(__be32), }; /* From ab63de24ebea36fe73ac7121738595d704b66d96 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 20 Feb 2024 14:56:02 +0100 Subject: [PATCH 030/216] net: ip_tunnel: prevent perpetual headroom growth [ Upstream commit 5ae1e9922bbdbaeb9cfbe91085ab75927488ac0f ] syzkaller triggered following kasan splat: BUG: KASAN: use-after-free in __skb_flow_dissect+0x19d1/0x7a50 net/core/flow_dissector.c:1170 Read of size 1 at addr ffff88812fb4000e by task syz-executor183/5191 [..] kasan_report+0xda/0x110 mm/kasan/report.c:588 __skb_flow_dissect+0x19d1/0x7a50 net/core/flow_dissector.c:1170 skb_flow_dissect_flow_keys include/linux/skbuff.h:1514 [inline] ___skb_get_hash net/core/flow_dissector.c:1791 [inline] __skb_get_hash+0xc7/0x540 net/core/flow_dissector.c:1856 skb_get_hash include/linux/skbuff.h:1556 [inline] ip_tunnel_xmit+0x1855/0x33c0 net/ipv4/ip_tunnel.c:748 ipip_tunnel_xmit+0x3cc/0x4e0 net/ipv4/ipip.c:308 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x13d/0x6d0 net/core/dev.c:3564 __dev_queue_xmit+0x7c1/0x3d60 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] neigh_connected_output+0x42c/0x5d0 net/core/neighbour.c:1592 ... ip_finish_output2+0x833/0x2550 net/ipv4/ip_output.c:235 ip_finish_output+0x31/0x310 net/ipv4/ip_output.c:323 .. iptunnel_xmit+0x5b4/0x9b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1dbc/0x33c0 net/ipv4/ip_tunnel.c:831 ipgre_xmit+0x4a1/0x980 net/ipv4/ip_gre.c:665 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x13d/0x6d0 net/core/dev.c:3564 ... The splat occurs because skb->data points past skb->head allocated area. This is because neigh layer does: __skb_pull(skb, skb_network_offset(skb)); ... but skb_network_offset() returns a negative offset and __skb_pull() arg is unsigned. IOW, we skb->data gets "adjusted" by a huge value. The negative value is returned because skb->head and skb->data distance is more than 64k and skb->network_header (u16) has wrapped around. The bug is in the ip_tunnel infrastructure, which can cause dev->needed_headroom to increment ad infinitum. The syzkaller reproducer consists of packets getting routed via a gre tunnel, and route of gre encapsulated packets pointing at another (ipip) tunnel. The ipip encapsulation finds gre0 as next output device. This results in the following pattern: 1). First packet is to be sent out via gre0. Route lookup found an output device, ipip0. 2). ip_tunnel_xmit for gre0 bumps gre0->needed_headroom based on the future output device, rt.dev->needed_headroom (ipip0). 3). ip output / start_xmit moves skb on to ipip0. which runs the same code path again (xmit recursion). 4). Routing step for the post-gre0-encap packet finds gre0 as output device to use for ipip0 encapsulated packet. tunl0->needed_headroom is then incremented based on the (already bumped) gre0 device headroom. This repeats for every future packet: gre0->needed_headroom gets inflated because previous packets' ipip0 step incremented rt->dev (gre0) headroom, and ipip0 incremented because gre0 needed_headroom was increased. For each subsequent packet, gre/ipip0->needed_headroom grows until post-expand-head reallocations result in a skb->head/data distance of more than 64k. Once that happens, skb->network_header (u16) wraps around when pskb_expand_head tries to make sure that skb_network_offset() is unchanged after the headroom expansion/reallocation. After this skb_network_offset(skb) returns a different (and negative) result post headroom expansion. The next trip to neigh layer (or anything else that would __skb_pull the network header) makes skb->data point to a memory location outside skb->head area. v2: Cap the needed_headroom update to an arbitarily chosen upperlimit to prevent perpetual increase instead of dropping the headroom increment completely. Reported-and-tested-by: syzbot+bfde3bef047a81b8fde6@syzkaller.appspotmail.com Closes: https://groups.google.com/g/syzkaller-bugs/c/fL9G6GtWskY/m/VKk_PR5FBAAJ Fixes: 243aad830e8a ("ip_gre: include route header_len in max_headroom calculation") Signed-off-by: Florian Westphal Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240220135606.4939-1-fw@strlen.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/ip_tunnel.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 24961b304dad..328f9068c6a4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -540,6 +540,20 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } +static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) +{ + /* we must cap headroom to some upperlimit, else pskb_expand_head + * will overflow header offsets in skb_headers_offset_update(). + */ + static const unsigned int max_allowed = 512; + + if (headroom > max_allowed) + headroom = max_allowed; + + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); +} + void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { @@ -614,13 +628,13 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; - if (headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, headroom); - - if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { + if (skb_cow_head(skb, headroom)) { ip_rt_put(rt); goto tx_dropped; } + + ip_tunnel_adj_headroom(dev, headroom); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return; @@ -800,16 +814,16 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); - if (max_headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, max_headroom); - if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { + if (skb_cow_head(skb, max_headroom)) { ip_rt_put(rt); dev->stats.tx_dropped++; kfree_skb(skb); return; } + ip_tunnel_adj_headroom(dev, max_headroom); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return; From a3c8fa54e904b0ddb52a08cc2d8ac239054f61fd Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 20 Feb 2024 16:10:53 +0800 Subject: [PATCH 031/216] net: mctp: take ownership of skb in mctp_local_output [ Upstream commit 3773d65ae5154ed7df404b050fd7387a36ab5ef3 ] Currently, mctp_local_output only takes ownership of skb on success, and we may leak an skb if mctp_local_output fails in specific states; the skb ownership isn't transferred until the actual output routing occurs. Instead, make mctp_local_output free the skb on all error paths up to the route action, so it always consumes the passed skb. Fixes: 833ef3b91de6 ("mctp: Populate socket implementation") Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240220081053.1439104-1-jk@codeconstruct.com.au Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/net/mctp.h | 1 + net/mctp/route.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/mctp.h b/include/net/mctp.h index 82800d521c3d..7ed84054f462 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -249,6 +249,7 @@ struct mctp_route { struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr); +/* always takes ownership of skb */ int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); diff --git a/net/mctp/route.c b/net/mctp/route.c index 256bf0b89e6c..0144d8ebdaef 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -888,7 +888,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex); if (!dev) { rcu_read_unlock(); - return rc; + goto out_free; } rt->dev = __mctp_dev_get(dev); rcu_read_unlock(); @@ -903,7 +903,8 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, rt->mtu = 0; } else { - return -EINVAL; + rc = -EINVAL; + goto out_free; } spin_lock_irqsave(&rt->dev->addrs_lock, flags); @@ -966,12 +967,17 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, rc = mctp_do_fragment_route(rt, skb, mtu, tag); } + /* route output functions consume the skb, even on error */ + skb = NULL; + out_release: if (!ext_rt) mctp_route_release(rt); mctp_dev_put(tmp_rt.dev); +out_free: + kfree_skb(skb); return rc; } From 29360fd3288f3978ccde2f8f7eba22282c4a08a3 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Tue, 20 Feb 2024 11:12:07 +0800 Subject: [PATCH 032/216] tun: Fix xdp_rxq_info's queue_index when detaching [ Upstream commit 2a770cdc4382b457ca3d43d03f0f0064f905a0d0 ] When a queue(tfile) is detached, we only update tfile's queue_index, but do not update xdp_rxq_info's queue_index. This patch fixes it. Fixes: 8bf5c4ee1889 ("tun: setup xdp_rxq_info") Signed-off-by: Yunjian Wang Link: https://lore.kernel.org/r/1708398727-46308-1-git-send-email-wangyunjian@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/tun.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 367255bb44cd..922d6f16d99d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -653,6 +653,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; + ntfile->xdp_rxq.queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); From e85b3c15398f6fa1f3941be8acbef79ae114744d Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Sat, 17 Feb 2024 13:30:10 -0800 Subject: [PATCH 033/216] cpufreq: intel_pstate: fix pstate limits enforcement for adjust_perf call back [ Upstream commit f0a0fc10abb062d122db5ac4ed42f6d1ca342649 ] There is a loophole in pstate limit clamping for the intel_cpufreq CPU frequency scaling driver (intel_pstate in passive mode), schedutil CPU frequency scaling governor, HWP (HardWare Pstate) control enabled, when the adjust_perf call back path is used. Fix it. Fixes: a365ab6b9dfb cpufreq: intel_pstate: Implement the ->adjust_perf() callback Signed-off-by: Doug Smythies Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/cpufreq/intel_pstate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index abdd26f7d04c..5771f3fc6115 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2952,6 +2952,9 @@ static void intel_cpufreq_adjust_perf(unsigned int cpunum, if (min_pstate < cpu->min_perf_ratio) min_pstate = cpu->min_perf_ratio; + if (min_pstate > cpu->max_perf_ratio) + min_pstate = cpu->max_perf_ratio; + max_pstate = min(cap_pstate, cpu->max_perf_ratio); if (max_pstate < min_pstate) max_pstate = min_pstate; From 7985d73961bbb4e726c1be7b9cd26becc7be8325 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Feb 2024 15:12:10 -0800 Subject: [PATCH 034/216] net: veth: clear GRO when clearing XDP even when down MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit fe9f801355f0b47668419f30f1fac1cf4539e736 ] veth sets NETIF_F_GRO automatically when XDP is enabled, because both features use the same NAPI machinery. The logic to clear NETIF_F_GRO sits in veth_disable_xdp() which is called both on ndo_stop and when XDP is turned off. To avoid the flag from being cleared when the device is brought down, the clearing is skipped when IFF_UP is not set. Bringing the device down should indeed not modify its features. Unfortunately, this means that clearing is also skipped when XDP is disabled _while_ the device is down. And there's nothing on the open path to bring the device features back into sync. IOW if user enables XDP, disables it and then brings the device up we'll end up with a stray GRO flag set but no NAPI instances. We don't depend on the GRO flag on the datapath, so the datapath won't crash. We will crash (or hang), however, next time features are sync'ed (either by user via ethtool or peer changing its config). The GRO flag will go away, and veth will try to disable the NAPIs. But the open path never created them since XDP was off, the GRO flag was a stray. If NAPI was initialized before we'll hang in napi_disable(). If it never was we'll crash trying to stop uninitialized hrtimer. Move the GRO flag updates to the XDP enable / disable paths, instead of mixing them with the ndo_open / ndo_close paths. Fixes: d3256efd8e8b ("veth: allow enabling NAPI even without XDP") Reported-by: Thomas Gleixner Reported-by: syzbot+039399a9b96297ddedca@syzkaller.appspotmail.com Signed-off-by: Jakub Kicinski Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/veth.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 36c5a41f84e4..dea9cc8c39f7 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1135,14 +1135,6 @@ static int veth_enable_xdp(struct net_device *dev) veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); return err; } - - if (!veth_gro_requested(dev)) { - /* user-space did not require GRO, but adding XDP - * is supposed to get GRO working - */ - dev->features |= NETIF_F_GRO; - netdev_features_change(dev); - } } } @@ -1162,18 +1154,9 @@ static void veth_disable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); - if (!netif_running(dev) || !veth_gro_requested(dev)) { + if (!netif_running(dev) || !veth_gro_requested(dev)) veth_napi_del(dev); - /* if user-space did not require GRO, since adding XDP - * enabled it, clear it now - */ - if (!veth_gro_requested(dev) && netif_running(dev)) { - dev->features &= ~NETIF_F_GRO; - netdev_features_change(dev); - } - } - veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); } @@ -1558,6 +1541,14 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, } if (!old_prog) { + if (!veth_gro_requested(dev)) { + /* user-space did not require GRO, but adding + * XDP is supposed to get GRO working + */ + dev->features |= NETIF_F_GRO; + netdev_features_change(dev); + } + peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; peer->max_mtu = max_mtu; } @@ -1568,6 +1559,14 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, if (dev->flags & IFF_UP) veth_disable_xdp(dev); + /* if user-space did not require GRO, since adding XDP + * enabled it, clear it now + */ + if (!veth_gro_requested(dev)) { + dev->features &= ~NETIF_F_GRO; + netdev_features_change(dev); + } + if (peer) { peer->hw_features |= NETIF_F_GSO_SOFTWARE; peer->max_mtu = ETH_MAX_MTU; From 1b0998fdd85776775d975d0024bca227597e836a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Feb 2024 12:17:47 +0000 Subject: [PATCH 035/216] ipv6: fix potential "struct net" leak in inet6_rtm_getaddr() [ Upstream commit 10bfd453da64a057bcfd1a49fb6b271c48653cdb ] It seems that if userspace provides a correct IFA_TARGET_NETNSID value but no IFA_ADDRESS and IFA_LOCAL attributes, inet6_rtm_getaddr() returns -EINVAL with an elevated "struct net" refcount. Fixes: 6ecf4c37eb3e ("ipv6: enable IFA_TARGET_NETNSID for RTM_GETADDR") Signed-off-by: Eric Dumazet Cc: Christian Brauner Cc: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv6/addrconf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 46527b5cc8f0..1648373692a9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5473,9 +5473,10 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, } addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (!addr) - return -EINVAL; - + if (!addr) { + err = -EINVAL; + goto errout; + } ifm = nlmsg_data(nlh); if (ifm->ifa_index) dev = dev_get_by_index(tgt_net, ifm->ifa_index); From c41548fede3d4b0305be2237ba7dbf657e9ff30b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 22 Feb 2024 13:38:38 +0100 Subject: [PATCH 036/216] lan78xx: enable auto speed configuration for LAN7850 if no EEPROM is detected [ Upstream commit 0e67899abfbfdea0c3c0ed3fd263ffc601c5c157 ] Same as LAN7800, LAN7850 can be used without EEPROM. If EEPROM is not present or not flashed, LAN7850 will fail to sync the speed detected by the PHY with the MAC. In case link speed is 100Mbit, it will accidentally work, otherwise no data can be transferred. Better way would be to implement link_up callback, or set auto speed configuration unconditionally. But this changes would be more intrusive. So, for now, set it only if no EEPROM is found. Fixes: e69647a19c87 ("lan78xx: Set ASD in MAC_CR when EEE is enabled.") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240222123839.2816561-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/usb/lan78xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index c458c030fadf..7b9d480e44fe 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -3035,7 +3035,8 @@ static int lan78xx_reset(struct lan78xx_net *dev) if (dev->chipid == ID_REV_CHIP_ID_7801_) buf &= ~MAC_CR_GMII_EN_; - if (dev->chipid == ID_REV_CHIP_ID_7800_) { + if (dev->chipid == ID_REV_CHIP_ID_7800_ || + dev->chipid == ID_REV_CHIP_ID_7850_) { ret = lan78xx_read_raw_eeprom(dev, 0, 1, &sig); if (!ret && sig != EEPROM_INDICATOR) { /* Implies there is no external eeprom. Set mac speed */ From 548ab66730848c8ed105e1d7caf9f4e3f68cdc94 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 23 Feb 2024 15:59:08 -0800 Subject: [PATCH 037/216] veth: try harder when allocating queue memory [ Upstream commit 1ce7d306ea63f3e379557c79abd88052e0483813 ] struct veth_rq is pretty large, 832B total without debug options enabled. Since commit under Fixes we try to pre-allocate enough queues for every possible CPU. Miao Wang reports that this may lead to order-5 allocations which will fail in production. Let the allocation fallback to vmalloc() and try harder. These are the same flags we pass to netdev queue allocation. Reported-and-tested-by: Miao Wang Fixes: 9d3684c24a52 ("veth: create by default nr_possible_cpus queues") Link: https://lore.kernel.org/all/5F52CAE2-2FB7-4712-95F1-3312FBBFA8DD@gmail.com/ Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240223235908.693010-1-kuba@kernel.org Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/veth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index dea9cc8c39f7..dd9f5f146192 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1359,7 +1359,8 @@ static int veth_alloc_queues(struct net_device *dev) struct veth_priv *priv = netdev_priv(dev); int i; - priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT); + priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!priv->rq) return -ENOMEM; @@ -1375,7 +1376,7 @@ static void veth_free_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); - kfree(priv->rq); + kvfree(priv->rq); } static int veth_dev_init(struct net_device *dev) From d77ab053fb2f97ff366118d4dbffd8fe48168541 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Sun, 25 Feb 2024 00:20:06 +0100 Subject: [PATCH 038/216] net: usb: dm9601: fix wrong return value in dm9601_mdio_read [ Upstream commit c68b2c9eba38ec3f60f4894b189090febf4d8d22 ] The MII code does not check the return value of mdio_read (among others), and therefore no error code should be sent. A previous fix to the use of an uninitialized variable propagates negative error codes, that might lead to wrong operations by the MII library. An example of such issues is the use of mii_nway_restart by the dm9601 driver. The mii_nway_restart function does not check the value returned by mdio_read, which in this case might be a negative number which could contain the exact bit the function checks (BMCR_ANENABLE = 0x1000). Return zero in case of error, as it is common practice in users of mdio_read to avoid wrong uses of the return value. Fixes: 8f8abb863fa5 ("net: usb: dm9601: fix uninitialized variable use in dm9601_mdio_read") Signed-off-by: Javier Carrasco Reviewed-by: Simon Horman Reviewed-by: Peter Korsgaard Link: https://lore.kernel.org/r/20240225-dm9601_ret_err-v1-1-02c1d959ea59@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/usb/dm9601.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c index 99ec1d4a972d..8b6d6a1b3c2e 100644 --- a/drivers/net/usb/dm9601.c +++ b/drivers/net/usb/dm9601.c @@ -232,7 +232,7 @@ static int dm9601_mdio_read(struct net_device *netdev, int phy_id, int loc) err = dm_read_shared_word(dev, 1, loc, &res); if (err < 0) { netdev_err(dev->net, "MDIO read error: %d\n", err); - return err; + return 0; } netdev_dbg(dev->net, From 1b4223e807fa17bc53062e922e4e7266450e304f Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 26 Feb 2024 12:08:20 +0100 Subject: [PATCH 039/216] net: lan78xx: fix "softirq work is pending" error [ Upstream commit e3d5d70cb483df8296dd44e9ae3b6355ef86494c ] Disable BH around the call to napi_schedule() to avoid following error: NOHZ tick-stop error: local softirq work is pending, handler #08!!! Fixes: ec4c7e12396b ("lan78xx: Introduce NAPI polling support") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240226110820.2113584-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/usb/lan78xx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 7b9d480e44fe..4fd456381129 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1501,7 +1501,9 @@ static int lan78xx_link_reset(struct lan78xx_net *dev) lan78xx_rx_urb_submit_all(dev); + local_bh_disable(); napi_schedule(&dev->napi); + local_bh_enable(); } return 0; From aa5897232682c27ff731b083b40c879b0eb2c994 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Mon, 26 Feb 2024 13:49:21 +0100 Subject: [PATCH 040/216] uapi: in6: replace temporary label with rfc9486 [ Upstream commit 6a2008641920a9c6fe1abbeb9acbec463215d505 ] Not really a fix per se, but IPV6_TLV_IOAM is still tagged as "TEMPORARY IANA allocation for IOAM", while RFC 9486 is available for some time now. Just update the reference. Fixes: 9ee11f0fff20 ("ipv6: ioam: Data plane support for Pre-allocated Trace") Signed-off-by: Justin Iurman Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240226124921.9097-1-justin.iurman@uliege.be Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/uapi/linux/in6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index c4c53a9ab959..ff8d21f9e95b 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -145,7 +145,7 @@ struct in6_flowlabel_req { #define IPV6_TLV_PADN 1 #define IPV6_TLV_ROUTERALERT 5 #define IPV6_TLV_CALIPSO 7 /* RFC 5570 */ -#define IPV6_TLV_IOAM 49 /* TEMPORARY IANA allocation for IOAM */ +#define IPV6_TLV_IOAM 49 /* RFC 9486 */ #define IPV6_TLV_JUMBO 194 #define IPV6_TLV_HAO 201 /* home address option */ From 17ccd9798fe0beda3db212cfa3ebe373f605cbd6 Mon Sep 17 00:00:00 2001 From: Jakub Raczynski Date: Mon, 26 Feb 2024 17:42:32 +0100 Subject: [PATCH 041/216] stmmac: Clear variable when destroying workqueue [ Upstream commit 8af411bbba1f457c33734795f024d0ef26d0963f ] Currently when suspending driver and stopping workqueue it is checked whether workqueue is not NULL and if so, it is destroyed. Function destroy_workqueue() does drain queue and does clear variable, but it does not set workqueue variable to NULL. This can cause kernel/module panic if code attempts to clear workqueue that was not initialized. This scenario is possible when resuming suspended driver in stmmac_resume(), because there is no handling for failed stmmac_hw_setup(), which can fail and return if DMA engine has failed to initialize, and workqueue is initialized after DMA engine. Should DMA engine fail to initialize, resume will proceed normally, but interface won't work and TX queue will eventually timeout, causing 'Reset adapter' error. This then does destroy workqueue during reset process. And since workqueue is initialized after DMA engine and can be skipped, it will cause kernel/module panic. To secure against this possible crash, set workqueue variable to NULL when destroying workqueue. Log/backtrace from crash goes as follows: [88.031977]------------[ cut here ]------------ [88.031985]NETDEV WATCHDOG: eth0 (sxgmac): transmit queue 1 timed out [88.032017]WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:477 dev_watchdog+0x390/0x398 [88.032251]---[ end trace e70de432e4d5c2c0 ]--- [88.032282]sxgmac 16d88000.ethernet eth0: Reset adapter. [88.036359]------------[ cut here ]------------ [88.036519]Call trace: [88.036523] flush_workqueue+0x3e4/0x430 [88.036528] drain_workqueue+0xc4/0x160 [88.036533] destroy_workqueue+0x40/0x270 [88.036537] stmmac_fpe_stop_wq+0x4c/0x70 [88.036541] stmmac_release+0x278/0x280 [88.036546] __dev_close_many+0xcc/0x158 [88.036551] dev_close_many+0xbc/0x190 [88.036555] dev_close.part.0+0x70/0xc0 [88.036560] dev_close+0x24/0x30 [88.036564] stmmac_service_task+0x110/0x140 [88.036569] process_one_work+0x1d8/0x4a0 [88.036573] worker_thread+0x54/0x408 [88.036578] kthread+0x164/0x170 [88.036583] ret_from_fork+0x10/0x20 [88.036588]---[ end trace e70de432e4d5c2c1 ]--- [88.036597]Unable to handle kernel NULL pointer dereference at virtual address 0000000000000004 Fixes: 5a5586112b929 ("net: stmmac: support FPE link partner hand-shaking procedure") Signed-off-by: Jakub Raczynski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 91b2aa81914b..e2d51014ab4b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3900,8 +3900,10 @@ static void stmmac_fpe_stop_wq(struct stmmac_priv *priv) { set_bit(__FPE_REMOVING, &priv->fpe_task_state); - if (priv->fpe_wq) + if (priv->fpe_wq) { destroy_workqueue(priv->fpe_wq); + priv->fpe_wq = NULL; + } netdev_info(priv->dev, "FPE workqueue stop"); } From cad078914b628737fa0946de02169e80fba721cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Tue, 2 Jan 2024 19:08:08 +0100 Subject: [PATCH 042/216] Bluetooth: hci_sync: Check the correct flag before starting a scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6b3899be24b16ff8ee0cb25f0bd59b01b15ba1d1 ] There's a very confusing mistake in the code starting a HCI inquiry: We're calling hci_dev_test_flag() to test for HCI_INQUIRY, but hci_dev_test_flag() checks hdev->dev_flags instead of hdev->flags. HCI_INQUIRY is a bit that's set on hdev->flags, not on hdev->dev_flags though. HCI_INQUIRY equals the integer 7, and in hdev->dev_flags, 7 means HCI_BONDABLE, so we were actually checking for HCI_BONDABLE here. The mistake is only present in the synchronous code for starting an inquiry, not in the async one. Also devices are typically bondable while doing an inquiry, so that might be the reason why nobody noticed it so far. Fixes: abfeea476c68 ("Bluetooth: hci_sync: Convert MGMT_OP_START_DISCOVERY") Signed-off-by: Jonas Dreßler Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/hci_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 45d19294aa77..13ed6cbfade3 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5482,7 +5482,7 @@ static int hci_inquiry_sync(struct hci_dev *hdev, u8 length) bt_dev_dbg(hdev, ""); - if (hci_dev_test_flag(hdev, HCI_INQUIRY)) + if (test_bit(HCI_INQUIRY, &hdev->flags)) return 0; hci_dev_lock(hdev); From 45085686b9559bfbe3a4f41d3d695a520668f5e1 Mon Sep 17 00:00:00 2001 From: Ying Hsu Date: Thu, 4 Jan 2024 11:56:32 +0000 Subject: [PATCH 043/216] Bluetooth: Avoid potential use-after-free in hci_error_reset [ Upstream commit 2449007d3f73b2842c9734f45f0aadb522daf592 ] While handling the HCI_EV_HARDWARE_ERROR event, if the underlying BT controller is not responding, the GPIO reset mechanism would free the hci_dev and lead to a use-after-free in hci_error_reset. Here's the call trace observed on a ChromeOS device with Intel AX201: queue_work_on+0x3e/0x6c __hci_cmd_sync_sk+0x2ee/0x4c0 [bluetooth ] ? init_wait_entry+0x31/0x31 __hci_cmd_sync+0x16/0x20 [bluetooth ] hci_error_reset+0x4f/0xa4 [bluetooth ] process_one_work+0x1d8/0x33f worker_thread+0x21b/0x373 kthread+0x13a/0x152 ? pr_cont_work+0x54/0x54 ? kthread_blkcg+0x31/0x31 ret_from_fork+0x1f/0x30 This patch holds the reference count on the hci_dev while processing a HCI_EV_HARDWARE_ERROR event to avoid potential crash. Fixes: c7741d16a57c ("Bluetooth: Perform a power cycle when receiving hardware error event") Signed-off-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/hci_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 6a1db678d032..a8932d449eb6 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1049,6 +1049,7 @@ static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); + hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) @@ -1056,10 +1057,10 @@ static void hci_error_reset(struct work_struct *work) else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); - if (hci_dev_do_close(hdev)) - return; + if (!hci_dev_do_close(hdev)) + hci_dev_do_open(hdev); - hci_dev_do_open(hdev); + hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) From 926405765f25809602c52e037827d0d5a9f62692 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Jan 2024 10:43:26 -0500 Subject: [PATCH 044/216] Bluetooth: hci_sync: Fix accept_list when attempting to suspend [ Upstream commit e5469adb2a7e930d96813316592302d9f8f1df4e ] During suspend, only wakeable devices can be in acceptlist, so if the device was previously added it needs to be removed otherwise the device can end up waking up the system prematurely. Fixes: 3b42055388c3 ("Bluetooth: hci_sync: Fix attempting to suspend with unfiltered passive scan") Signed-off-by: Clancy Shang Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Paul Menzel Signed-off-by: Sasha Levin --- net/bluetooth/hci_sync.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 13ed6cbfade3..a33734046456 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2251,8 +2251,11 @@ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, /* During suspend, only wakeable devices can be in acceptlist */ if (hdev->suspended && - !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) + !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) { + hci_le_del_accept_list_sync(hdev, ¶ms->addr, + params->addr_type); return 0; + } /* Select filter policy to accept all advertising */ if (*num_entries >= hdev->le_accept_list_size) From 0b056a52b3adfe5fedf20cd64addbe4e1d226c95 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 9 Jan 2024 19:03:23 +0800 Subject: [PATCH 045/216] Bluetooth: hci_event: Fix wrongly recorded wakeup BD_ADDR [ Upstream commit 61a5ab72edea7ebc3ad2c6beea29d966f528ebfb ] hci_store_wake_reason() wrongly parses event HCI_Connection_Request as HCI_Connection_Complete and HCI_Connection_Complete as HCI_Connection_Request, so causes recording wakeup BD_ADDR error and potential stability issue, fix it by using the correct field. Fixes: 2f20216c1d6f ("Bluetooth: Emit controller suspend and resume events") Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/hci_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 56ecc5f97b91..b18f5e5df8ad 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7245,10 +7245,10 @@ static void hci_store_wake_reason(struct hci_dev *hdev, u8 event, * keep track of the bdaddr of the connection event that woke us up. */ if (event == HCI_EV_CONN_REQUEST) { - bacpy(&hdev->wake_addr, &conn_complete->bdaddr); + bacpy(&hdev->wake_addr, &conn_request->bdaddr); hdev->wake_addr_type = BDADDR_BREDR; } else if (event == HCI_EV_CONN_COMPLETE) { - bacpy(&hdev->wake_addr, &conn_request->bdaddr); + bacpy(&hdev->wake_addr, &conn_complete->bdaddr); hdev->wake_addr_type = BDADDR_BREDR; } else if (event == HCI_EV_LE_META) { struct hci_ev_le_meta *le_ev = (void *)skb->data; From 30a5e812f78e3d1cced90e1ed750bf027599205f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 22 Jan 2024 09:02:47 -0500 Subject: [PATCH 046/216] Bluetooth: hci_event: Fix handling of HCI_EV_IO_CAPA_REQUEST [ Upstream commit 7e74aa53a68bf60f6019bd5d9a9a1406ec4d4865 ] If we received HCI_EV_IO_CAPA_REQUEST while HCI_OP_READ_REMOTE_EXT_FEATURES is yet to be responded assume the remote does support SSP since otherwise this event shouldn't be generated. Link: https://lore.kernel.org/linux-bluetooth/CABBYNZ+9UdG1cMZVmdtN3U2aS16AKMCyTARZZyFX7xTEDWcMOw@mail.gmail.com/T/#t Fixes: c7f59461f5a7 ("Bluetooth: Fix a refcnt underflow problem for hci_conn") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/hci_event.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b18f5e5df8ad..f79aaef5a276 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5282,9 +5282,12 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!conn || !hci_conn_ssp_enabled(conn)) + if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) goto unlock; + /* Assume remote supports SSP since it has triggered this event */ + set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); + hci_conn_hold(conn); if (!hci_dev_test_flag(hdev, HCI_MGMT)) From 2dc94c160ef0292d7da7ea2d4c3087c852c97fcc Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 25 Jan 2024 14:50:28 +0800 Subject: [PATCH 047/216] Bluetooth: Enforce validation on max value of connection interval [ Upstream commit e4b019515f950b4e6e5b74b2e1bb03a90cb33039 ] Right now Linux BT stack cannot pass test case "GAP/CONN/CPUP/BV-05-C 'Connection Parameter Update Procedure Invalid Parameters Central Responder'" in Bluetooth Test Suite revision GAP.TS.p44. [0] That was revoled by commit c49a8682fc5d ("Bluetooth: validate BLE connection interval updates"), but later got reverted due to devices like keyboards and mice may require low connection interval. So only validate the max value connection interval to pass the Test Suite, and let devices to request low connection interval if needed. [0] https://www.bluetooth.org/docman/handlers/DownloadDoc.ashx?doc_id=229869 Fixes: 68d19d7d9957 ("Revert "Bluetooth: validate BLE connection interval updates"") Signed-off-by: Kai-Heng Feng Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/hci_event.c | 4 ++++ net/bluetooth/l2cap_core.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f79aaef5a276..452d839c152f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6719,6 +6719,10 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data, return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_UNKNOWN_CONN_ID); + if (max > hcon->le_conn_max_interval) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + if (hci_check_conn_params(min, max, latency, timeout)) return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_INVALID_LL_PARAMS); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 81f5974e5eb5..b4cba55be5ad 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5614,7 +5614,13 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - err = hci_check_conn_params(min, max, latency, to_multiplier); + if (max > hcon->le_conn_max_interval) { + BT_DBG("requested connection interval exceeds current bounds."); + err = -EINVAL; + } else { + err = hci_check_conn_params(min, max, latency, to_multiplier); + } + if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else From 7b410226d9eff7f64857a75d65e149440bff2b2f Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 19 Jan 2024 17:45:30 +0800 Subject: [PATCH 048/216] Bluetooth: qca: Fix wrong event type for patch config command [ Upstream commit c0dbc56077ae759f2dd602c7561480bc2b1b712c ] Vendor-specific command patch config has HCI_Command_Complete event as response, but qca_send_patch_config_cmd() wrongly expects vendor-specific event for the command, fixed by using right event type. Btmon log for the vendor-specific command are shown below: < HCI Command: Vendor (0x3f|0x0000) plen 5 28 01 00 00 00 > HCI Event: Command Complete (0x0e) plen 5 Vendor (0x3f|0x0000) ncmd 1 Status: Success (0x00) 28 Fixes: 4fac8a7ac80b ("Bluetooth: btqca: sequential validation") Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- drivers/bluetooth/btqca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index c9064d34d830..d7d0c9de3dc3 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -152,7 +152,7 @@ static int qca_send_patch_config_cmd(struct hci_dev *hdev) bt_dev_dbg(hdev, "QCA Patch config"); skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, sizeof(cmd), - cmd, HCI_EV_VENDOR, HCI_INIT_TIMEOUT); + cmd, 0, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "Sending QCA Patch config failed (%d)", err); From eb7b5777d3c7f5dbbb0736f638068f50006d81b0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 11 Mar 2023 12:13:53 +0100 Subject: [PATCH 049/216] Bluetooth: hci_qca: mark OF related data as maybe unused MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 44fac8a2fd2f72ee98ee41e6bc9ecc7765b5d3cc ] The driver can be compile tested with !CONFIG_OF making certain data unused: drivers/bluetooth/hci_qca.c:1869:37: error: ‘qca_soc_data_wcn6750’ defined but not used [-Werror=unused-const-variable=] drivers/bluetooth/hci_qca.c:1853:37: error: ‘qca_soc_data_wcn3998’ defined but not used [-Werror=unused-const-variable=] drivers/bluetooth/hci_qca.c:1841:37: error: ‘qca_soc_data_wcn3991’ defined but not used [-Werror=unused-const-variable=] drivers/bluetooth/hci_qca.c:1830:37: error: ‘qca_soc_data_wcn3990’ defined but not used [-Werror=unused-const-variable=] Signed-off-by: Krzysztof Kozlowski Signed-off-by: Luiz Augusto von Dentz Stable-dep-of: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Signed-off-by: Sasha Levin --- drivers/bluetooth/hci_qca.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 76ceb8a0183d..0e908a337e53 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1824,7 +1824,7 @@ static const struct hci_uart_proto qca_proto = { .dequeue = qca_dequeue, }; -static const struct qca_device_data qca_soc_data_wcn3990 = { +static const struct qca_device_data qca_soc_data_wcn3990 __maybe_unused = { .soc_type = QCA_WCN3990, .vregs = (struct qca_vreg []) { { "vddio", 15000 }, @@ -1835,7 +1835,7 @@ static const struct qca_device_data qca_soc_data_wcn3990 = { .num_vregs = 4, }; -static const struct qca_device_data qca_soc_data_wcn3991 = { +static const struct qca_device_data qca_soc_data_wcn3991 __maybe_unused = { .soc_type = QCA_WCN3991, .vregs = (struct qca_vreg []) { { "vddio", 15000 }, @@ -1847,7 +1847,7 @@ static const struct qca_device_data qca_soc_data_wcn3991 = { .capabilities = QCA_CAP_WIDEBAND_SPEECH | QCA_CAP_VALID_LE_STATES, }; -static const struct qca_device_data qca_soc_data_wcn3998 = { +static const struct qca_device_data qca_soc_data_wcn3998 __maybe_unused = { .soc_type = QCA_WCN3998, .vregs = (struct qca_vreg []) { { "vddio", 10000 }, @@ -1858,13 +1858,13 @@ static const struct qca_device_data qca_soc_data_wcn3998 = { .num_vregs = 4, }; -static const struct qca_device_data qca_soc_data_qca6390 = { +static const struct qca_device_data qca_soc_data_qca6390 __maybe_unused = { .soc_type = QCA_QCA6390, .num_vregs = 0, .capabilities = QCA_CAP_WIDEBAND_SPEECH | QCA_CAP_VALID_LE_STATES, }; -static const struct qca_device_data qca_soc_data_wcn6750 = { +static const struct qca_device_data qca_soc_data_wcn6750 __maybe_unused = { .soc_type = QCA_WCN6750, .vregs = (struct qca_vreg []) { { "vddio", 5000 }, From e5383662fd02ad9516ae2c27f85cd56296372ba9 Mon Sep 17 00:00:00 2001 From: Steev Klimaszewski Date: Sun, 26 Mar 2023 18:38:10 -0500 Subject: [PATCH 050/216] Bluetooth: hci_qca: Add support for QTI Bluetooth chip wcn6855 [ Upstream commit 095327fede005f4b14d40b2183b2f7965c739dbd ] Add regulators, GPIOs and changes required to power on/off wcn6855. Add support for firmware download for wcn6855 which is in the linux-firmware repository as hpbtfw21.tlv and hpnv21.bin. Based on the assumption that this is similar to the wcn6750 Tested-on: BTFW.HSP.2.1.0-00538-VER_PATCHZ-1 Signed-off-by: Steev Klimaszewski Reviewed-by: Bjorn Andersson Tested-by: Bjorn Andersson Signed-off-by: Luiz Augusto von Dentz Stable-dep-of: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Signed-off-by: Sasha Levin --- drivers/bluetooth/btqca.c | 14 ++++++++- drivers/bluetooth/btqca.h | 10 +++++++ drivers/bluetooth/hci_qca.c | 57 ++++++++++++++++++++++++++++--------- 3 files changed, 66 insertions(+), 15 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index d7d0c9de3dc3..4cb541096b93 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -614,6 +614,9 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, config.type = ELF_TYPE_PATCH; snprintf(config.fwname, sizeof(config.fwname), "qca/msbtfw%02x.mbn", rom_ver); + } else if (soc_type == QCA_WCN6855) { + snprintf(config.fwname, sizeof(config.fwname), + "qca/hpbtfw%02x.tlv", rom_ver); } else { snprintf(config.fwname, sizeof(config.fwname), "qca/rampatch_%08x.bin", soc_ver); @@ -648,6 +651,9 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, else if (soc_type == QCA_WCN6750) snprintf(config.fwname, sizeof(config.fwname), "qca/msnv%02x.bin", rom_ver); + else if (soc_type == QCA_WCN6855) + snprintf(config.fwname, sizeof(config.fwname), + "qca/hpnv%02x.bin", rom_ver); else snprintf(config.fwname, sizeof(config.fwname), "qca/nvm_%08x.bin", soc_ver); @@ -685,11 +691,17 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, return err; } - if (soc_type == QCA_WCN3991 || soc_type == QCA_WCN6750) { + switch (soc_type) { + case QCA_WCN3991: + case QCA_WCN6750: + case QCA_WCN6855: /* get fw build info */ err = qca_read_fw_build_info(hdev); if (err < 0) return err; + break; + default: + break; } bt_dev_info(hdev, "QCA setup on UART is completed"); diff --git a/drivers/bluetooth/btqca.h b/drivers/bluetooth/btqca.h index 61e9a50e66ae..b884095bcd9d 100644 --- a/drivers/bluetooth/btqca.h +++ b/drivers/bluetooth/btqca.h @@ -147,6 +147,7 @@ enum qca_btsoc_type { QCA_WCN3991, QCA_QCA6390, QCA_WCN6750, + QCA_WCN6855, }; #if IS_ENABLED(CONFIG_BT_QCA) @@ -168,6 +169,10 @@ static inline bool qca_is_wcn6750(enum qca_btsoc_type soc_type) { return soc_type == QCA_WCN6750; } +static inline bool qca_is_wcn6855(enum qca_btsoc_type soc_type) +{ + return soc_type == QCA_WCN6855; +} #else @@ -206,6 +211,11 @@ static inline bool qca_is_wcn6750(enum qca_btsoc_type soc_type) return false; } +static inline bool qca_is_wcn6855(enum qca_btsoc_type soc_type) +{ + return false; +} + static inline int qca_send_pre_shutdown_cmd(struct hci_dev *hdev) { return -EOPNOTSUPP; diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 0e908a337e53..f217c2821b9f 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1315,7 +1315,8 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate) /* Give the controller time to process the request */ if (qca_is_wcn399x(qca_soc_type(hu)) || - qca_is_wcn6750(qca_soc_type(hu))) + qca_is_wcn6750(qca_soc_type(hu)) || + qca_is_wcn6855(qca_soc_type(hu))) usleep_range(1000, 10000); else msleep(300); @@ -1392,7 +1393,8 @@ static unsigned int qca_get_speed(struct hci_uart *hu, static int qca_check_speeds(struct hci_uart *hu) { if (qca_is_wcn399x(qca_soc_type(hu)) || - qca_is_wcn6750(qca_soc_type(hu))) { + qca_is_wcn6750(qca_soc_type(hu)) || + qca_is_wcn6855(qca_soc_type(hu))) { if (!qca_get_speed(hu, QCA_INIT_SPEED) && !qca_get_speed(hu, QCA_OPER_SPEED)) return -EINVAL; @@ -1426,7 +1428,8 @@ static int qca_set_speed(struct hci_uart *hu, enum qca_speed_type speed_type) * changing the baudrate of chip and host. */ if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type)) + qca_is_wcn6750(soc_type) || + qca_is_wcn6855(soc_type)) hci_uart_set_flow_control(hu, true); if (soc_type == QCA_WCN3990) { @@ -1444,7 +1447,8 @@ static int qca_set_speed(struct hci_uart *hu, enum qca_speed_type speed_type) error: if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type)) + qca_is_wcn6750(soc_type) || + qca_is_wcn6855(soc_type)) hci_uart_set_flow_control(hu, false); if (soc_type == QCA_WCN3990) { @@ -1680,7 +1684,8 @@ static int qca_power_on(struct hci_dev *hdev) return 0; if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type)) { + qca_is_wcn6750(soc_type) || + qca_is_wcn6855(soc_type)) { ret = qca_regulator_init(hu); } else { qcadev = serdev_device_get_drvdata(hu->serdev); @@ -1721,7 +1726,8 @@ static int qca_setup(struct hci_uart *hu) bt_dev_info(hdev, "setting up %s", qca_is_wcn399x(soc_type) ? "wcn399x" : - (soc_type == QCA_WCN6750) ? "wcn6750" : "ROME/QCA6390"); + (soc_type == QCA_WCN6750) ? "wcn6750" : + (soc_type == QCA_WCN6855) ? "wcn6855" : "ROME/QCA6390"); qca->memdump_state = QCA_MEMDUMP_IDLE; @@ -1733,7 +1739,8 @@ retry: clear_bit(QCA_SSR_TRIGGERED, &qca->flags); if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type)) { + qca_is_wcn6750(soc_type) || + qca_is_wcn6855(soc_type)) { set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); hci_set_aosp_capable(hdev); @@ -1755,7 +1762,8 @@ retry: } if (!(qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type))) { + qca_is_wcn6750(soc_type) || + qca_is_wcn6855(soc_type))) { /* Get QCA version information */ ret = qca_read_soc_version(hdev, &ver, soc_type); if (ret) @@ -1881,6 +1889,20 @@ static const struct qca_device_data qca_soc_data_wcn6750 __maybe_unused = { .capabilities = QCA_CAP_WIDEBAND_SPEECH | QCA_CAP_VALID_LE_STATES, }; +static const struct qca_device_data qca_soc_data_wcn6855 = { + .soc_type = QCA_WCN6855, + .vregs = (struct qca_vreg []) { + { "vddio", 5000 }, + { "vddbtcxmx", 126000 }, + { "vddrfacmn", 12500 }, + { "vddrfa0p8", 102000 }, + { "vddrfa1p7", 302000 }, + { "vddrfa1p2", 257000 }, + }, + .num_vregs = 6, + .capabilities = QCA_CAP_WIDEBAND_SPEECH | QCA_CAP_VALID_LE_STATES, +}; + static void qca_power_shutdown(struct hci_uart *hu) { struct qca_serdev *qcadev; @@ -1910,7 +1932,7 @@ static void qca_power_shutdown(struct hci_uart *hu) host_set_baudrate(hu, 2400); qca_send_power_pulse(hu, false); qca_regulator_disable(qcadev); - } else if (soc_type == QCA_WCN6750) { + } else if (soc_type == QCA_WCN6750 || soc_type == QCA_WCN6855) { gpiod_set_value_cansleep(qcadev->bt_en, 0); msleep(100); qca_regulator_disable(qcadev); @@ -2045,7 +2067,8 @@ static int qca_serdev_probe(struct serdev_device *serdev) if (data && (qca_is_wcn399x(data->soc_type) || - qca_is_wcn6750(data->soc_type))) { + qca_is_wcn6750(data->soc_type) || + qca_is_wcn6855(data->soc_type))) { qcadev->btsoc_type = data->soc_type; qcadev->bt_power = devm_kzalloc(&serdev->dev, sizeof(struct qca_power), @@ -2065,14 +2088,18 @@ static int qca_serdev_probe(struct serdev_device *serdev) qcadev->bt_en = devm_gpiod_get_optional(&serdev->dev, "enable", GPIOD_OUT_LOW); - if (IS_ERR_OR_NULL(qcadev->bt_en) && data->soc_type == QCA_WCN6750) { + if (IS_ERR_OR_NULL(qcadev->bt_en) && + (data->soc_type == QCA_WCN6750 || + data->soc_type == QCA_WCN6855)) { dev_err(&serdev->dev, "failed to acquire BT_EN gpio\n"); power_ctrl_enabled = false; } qcadev->sw_ctrl = devm_gpiod_get_optional(&serdev->dev, "swctrl", GPIOD_IN); - if (IS_ERR_OR_NULL(qcadev->sw_ctrl) && data->soc_type == QCA_WCN6750) + if (IS_ERR_OR_NULL(qcadev->sw_ctrl) && + (data->soc_type == QCA_WCN6750 || + data->soc_type == QCA_WCN6855)) dev_warn(&serdev->dev, "failed to acquire SW_CTRL gpio\n"); qcadev->susclk = devm_clk_get_optional(&serdev->dev, NULL); @@ -2148,8 +2175,9 @@ static void qca_serdev_remove(struct serdev_device *serdev) struct qca_power *power = qcadev->bt_power; if ((qca_is_wcn399x(qcadev->btsoc_type) || - qca_is_wcn6750(qcadev->btsoc_type)) && - power->vregs_on) + qca_is_wcn6750(qcadev->btsoc_type) || + qca_is_wcn6855(qcadev->btsoc_type)) && + power->vregs_on) qca_power_shutdown(&qcadev->serdev_hu); else if (qcadev->susclk) clk_disable_unprepare(qcadev->susclk); @@ -2333,6 +2361,7 @@ static const struct of_device_id qca_bluetooth_of_match[] = { { .compatible = "qcom,wcn3991-bt", .data = &qca_soc_data_wcn3991}, { .compatible = "qcom,wcn3998-bt", .data = &qca_soc_data_wcn3998}, { .compatible = "qcom,wcn6750-bt", .data = &qca_soc_data_wcn6750}, + { .compatible = "qcom,wcn6855-bt", .data = &qca_soc_data_wcn6855}, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, qca_bluetooth_of_match); From 29059d0f3bc21f76db5a375a70a449ba86f3d6cc Mon Sep 17 00:00:00 2001 From: Min-Hua Chen Date: Fri, 19 May 2023 18:43:23 +0800 Subject: [PATCH 051/216] Bluetooth: btqca: use le32_to_cpu for ver.soc_id [ Upstream commit 8153b738bc547878a017889d2b1cf8dd2de0e0c6 ] Use le32_to_cpu for ver.soc_id to fix the following sparse warning. drivers/bluetooth/btqca.c:640:24: sparse: warning: restricted __le32 degrades to integer Signed-off-by: Min-Hua Chen Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Jakub Kicinski Stable-dep-of: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Signed-off-by: Sasha Levin --- drivers/bluetooth/btqca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index 4cb541096b93..d40a6041c48c 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -637,7 +637,7 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, snprintf(config.fwname, sizeof(config.fwname), "qca/%s", firmware_name); else if (qca_is_wcn399x(soc_type)) { - if (ver.soc_id == QCA_WCN3991_SOC_ID) { + if (le32_to_cpu(ver.soc_id) == QCA_WCN3991_SOC_ID) { snprintf(config.fwname, sizeof(config.fwname), "qca/crnv%02xu.bin", rom_ver); } else { From 940963613275a39fd693fb1969c1c6fbc0798a21 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Wed, 2 Aug 2023 08:56:29 +0200 Subject: [PATCH 052/216] Bluetooth: btqca: Add WCN3988 support [ Upstream commit f904feefe60c28b6852d5625adc4a2c39426a2d9 ] Add support for the Bluetooth chip codenamed APACHE which is part of WCN3988. The firmware for this chip has a slightly different naming scheme compared to most others. For ROM Version 0x0200 we need to use apbtfw10.tlv + apnv10.bin and for ROM version 0x201 apbtfw11.tlv + apnv11.bin Signed-off-by: Luca Weiss Signed-off-by: Luiz Augusto von Dentz Stable-dep-of: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Signed-off-by: Sasha Levin --- drivers/bluetooth/btqca.c | 13 +++++++++++-- drivers/bluetooth/btqca.h | 12 ++++++++++-- drivers/bluetooth/hci_qca.c | 12 ++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index d40a6041c48c..d775402b33df 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -594,14 +594,20 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, /* Firmware files to download are based on ROM version. * ROM version is derived from last two bytes of soc_ver. */ - rom_ver = ((soc_ver & 0x00000f00) >> 0x04) | (soc_ver & 0x0000000f); + if (soc_type == QCA_WCN3988) + rom_ver = ((soc_ver & 0x00000f00) >> 0x05) | (soc_ver & 0x0000000f); + else + rom_ver = ((soc_ver & 0x00000f00) >> 0x04) | (soc_ver & 0x0000000f); if (soc_type == QCA_WCN6750) qca_send_patch_config_cmd(hdev); /* Download rampatch file */ config.type = TLV_TYPE_PATCH; - if (qca_is_wcn399x(soc_type)) { + if (soc_type == QCA_WCN3988) { + snprintf(config.fwname, sizeof(config.fwname), + "qca/apbtfw%02x.tlv", rom_ver); + } else if (qca_is_wcn399x(soc_type)) { snprintf(config.fwname, sizeof(config.fwname), "qca/crbtfw%02x.tlv", rom_ver); } else if (soc_type == QCA_QCA6390) { @@ -636,6 +642,9 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, if (firmware_name) snprintf(config.fwname, sizeof(config.fwname), "qca/%s", firmware_name); + else if (soc_type == QCA_WCN3988) + snprintf(config.fwname, sizeof(config.fwname), + "qca/apnv%02x.bin", rom_ver); else if (qca_is_wcn399x(soc_type)) { if (le32_to_cpu(ver.soc_id) == QCA_WCN3991_SOC_ID) { snprintf(config.fwname, sizeof(config.fwname), diff --git a/drivers/bluetooth/btqca.h b/drivers/bluetooth/btqca.h index b884095bcd9d..fc6cf314eb0e 100644 --- a/drivers/bluetooth/btqca.h +++ b/drivers/bluetooth/btqca.h @@ -142,6 +142,7 @@ enum qca_btsoc_type { QCA_INVALID = -1, QCA_AR3002, QCA_ROME, + QCA_WCN3988, QCA_WCN3990, QCA_WCN3998, QCA_WCN3991, @@ -162,8 +163,15 @@ int qca_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr); int qca_send_pre_shutdown_cmd(struct hci_dev *hdev); static inline bool qca_is_wcn399x(enum qca_btsoc_type soc_type) { - return soc_type == QCA_WCN3990 || soc_type == QCA_WCN3991 || - soc_type == QCA_WCN3998; + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + return true; + default: + return false; + } } static inline bool qca_is_wcn6750(enum qca_btsoc_type soc_type) { diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index f217c2821b9f..746eb096c037 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1832,6 +1832,17 @@ static const struct hci_uart_proto qca_proto = { .dequeue = qca_dequeue, }; +static const struct qca_device_data qca_soc_data_wcn3988 __maybe_unused = { + .soc_type = QCA_WCN3988, + .vregs = (struct qca_vreg []) { + { "vddio", 15000 }, + { "vddxo", 80000 }, + { "vddrf", 300000 }, + { "vddch0", 450000 }, + }, + .num_vregs = 4, +}; + static const struct qca_device_data qca_soc_data_wcn3990 __maybe_unused = { .soc_type = QCA_WCN3990, .vregs = (struct qca_vreg []) { @@ -2357,6 +2368,7 @@ static const struct of_device_id qca_bluetooth_of_match[] = { { .compatible = "qcom,qca6174-bt" }, { .compatible = "qcom,qca6390-bt", .data = &qca_soc_data_qca6390}, { .compatible = "qcom,qca9377-bt" }, + { .compatible = "qcom,wcn3988-bt", .data = &qca_soc_data_wcn3988}, { .compatible = "qcom,wcn3990-bt", .data = &qca_soc_data_wcn3990}, { .compatible = "qcom,wcn3991-bt", .data = &qca_soc_data_wcn3991}, { .compatible = "qcom,wcn3998-bt", .data = &qca_soc_data_wcn3998}, From fc47ed389a884ee4a5b01f62ecae8137b41f63d7 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 16 Aug 2023 10:06:47 +0200 Subject: [PATCH 053/216] Bluetooth: qca: use switch case for soc type behavior [ Upstream commit 691d54d0f7cb14baac1ff4af210d13c0e4897e27 ] Use switch/case to handle soc type specific behaviour, the permit dropping the qca_is_xxx() inline functions and make the code clearer and easier to update for new SoCs. Suggested-by: Konrad Dybcio Suggested-by: Luiz Augusto von Dentz Signed-off-by: Neil Armstrong Signed-off-by: Luiz Augusto von Dentz Stable-dep-of: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Signed-off-by: Sasha Levin --- drivers/bluetooth/btqca.c | 87 +++++++++----- drivers/bluetooth/btqca.h | 36 ------ drivers/bluetooth/hci_qca.c | 233 +++++++++++++++++++++++++++--------- 3 files changed, 236 insertions(+), 120 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index d775402b33df..8331090af86e 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -604,26 +604,34 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, /* Download rampatch file */ config.type = TLV_TYPE_PATCH; - if (soc_type == QCA_WCN3988) { - snprintf(config.fwname, sizeof(config.fwname), - "qca/apbtfw%02x.tlv", rom_ver); - } else if (qca_is_wcn399x(soc_type)) { + switch (soc_type) { + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: snprintf(config.fwname, sizeof(config.fwname), "qca/crbtfw%02x.tlv", rom_ver); - } else if (soc_type == QCA_QCA6390) { + break; + case QCA_WCN3988: + snprintf(config.fwname, sizeof(config.fwname), + "qca/apbtfw%02x.tlv", rom_ver); + break; + case QCA_QCA6390: snprintf(config.fwname, sizeof(config.fwname), "qca/htbtfw%02x.tlv", rom_ver); - } else if (soc_type == QCA_WCN6750) { + break; + case QCA_WCN6750: /* Choose mbn file by default.If mbn file is not found * then choose tlv file */ config.type = ELF_TYPE_PATCH; snprintf(config.fwname, sizeof(config.fwname), "qca/msbtfw%02x.mbn", rom_ver); - } else if (soc_type == QCA_WCN6855) { + break; + case QCA_WCN6855: snprintf(config.fwname, sizeof(config.fwname), "qca/hpbtfw%02x.tlv", rom_ver); - } else { + break; + default: snprintf(config.fwname, sizeof(config.fwname), "qca/rampatch_%08x.bin", soc_ver); } @@ -639,33 +647,44 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, /* Download NVM configuration */ config.type = TLV_TYPE_NVM; - if (firmware_name) + if (firmware_name) { snprintf(config.fwname, sizeof(config.fwname), "qca/%s", firmware_name); - else if (soc_type == QCA_WCN3988) - snprintf(config.fwname, sizeof(config.fwname), - "qca/apnv%02x.bin", rom_ver); - else if (qca_is_wcn399x(soc_type)) { - if (le32_to_cpu(ver.soc_id) == QCA_WCN3991_SOC_ID) { + } else { + switch (soc_type) { + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + if (le32_to_cpu(ver.soc_id) == QCA_WCN3991_SOC_ID) { + snprintf(config.fwname, sizeof(config.fwname), + "qca/crnv%02xu.bin", rom_ver); + } else { + snprintf(config.fwname, sizeof(config.fwname), + "qca/crnv%02x.bin", rom_ver); + } + break; + case QCA_WCN3988: snprintf(config.fwname, sizeof(config.fwname), - "qca/crnv%02xu.bin", rom_ver); - } else { + "qca/apnv%02x.bin", rom_ver); + break; + case QCA_QCA6390: snprintf(config.fwname, sizeof(config.fwname), - "qca/crnv%02x.bin", rom_ver); + "qca/htnv%02x.bin", rom_ver); + break; + case QCA_WCN6750: + snprintf(config.fwname, sizeof(config.fwname), + "qca/msnv%02x.bin", rom_ver); + break; + case QCA_WCN6855: + snprintf(config.fwname, sizeof(config.fwname), + "qca/hpnv%02x.bin", rom_ver); + break; + + default: + snprintf(config.fwname, sizeof(config.fwname), + "qca/nvm_%08x.bin", soc_ver); } } - else if (soc_type == QCA_QCA6390) - snprintf(config.fwname, sizeof(config.fwname), - "qca/htnv%02x.bin", rom_ver); - else if (soc_type == QCA_WCN6750) - snprintf(config.fwname, sizeof(config.fwname), - "qca/msnv%02x.bin", rom_ver); - else if (soc_type == QCA_WCN6855) - snprintf(config.fwname, sizeof(config.fwname), - "qca/hpnv%02x.bin", rom_ver); - else - snprintf(config.fwname, sizeof(config.fwname), - "qca/nvm_%08x.bin", soc_ver); err = qca_download_firmware(hdev, &config, soc_type, rom_ver); if (err < 0) { @@ -673,16 +692,24 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, return err; } - if (soc_type >= QCA_WCN3991) { + switch (soc_type) { + case QCA_WCN3991: + case QCA_QCA6390: + case QCA_WCN6750: + case QCA_WCN6855: err = qca_disable_soc_logging(hdev); if (err < 0) return err; + break; + default: + break; } /* WCN399x and WCN6750 supports the Microsoft vendor extension with 0xFD70 as the * VsMsftOpCode. */ switch (soc_type) { + case QCA_WCN3988: case QCA_WCN3990: case QCA_WCN3991: case QCA_WCN3998: diff --git a/drivers/bluetooth/btqca.h b/drivers/bluetooth/btqca.h index fc6cf314eb0e..fe51c632d772 100644 --- a/drivers/bluetooth/btqca.h +++ b/drivers/bluetooth/btqca.h @@ -161,27 +161,6 @@ int qca_read_soc_version(struct hci_dev *hdev, struct qca_btsoc_version *ver, enum qca_btsoc_type); int qca_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr); int qca_send_pre_shutdown_cmd(struct hci_dev *hdev); -static inline bool qca_is_wcn399x(enum qca_btsoc_type soc_type) -{ - switch (soc_type) { - case QCA_WCN3988: - case QCA_WCN3990: - case QCA_WCN3991: - case QCA_WCN3998: - return true; - default: - return false; - } -} -static inline bool qca_is_wcn6750(enum qca_btsoc_type soc_type) -{ - return soc_type == QCA_WCN6750; -} -static inline bool qca_is_wcn6855(enum qca_btsoc_type soc_type) -{ - return soc_type == QCA_WCN6855; -} - #else static inline int qca_set_bdaddr_rome(struct hci_dev *hdev, const bdaddr_t *bdaddr) @@ -209,21 +188,6 @@ static inline int qca_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr) return -EOPNOTSUPP; } -static inline bool qca_is_wcn399x(enum qca_btsoc_type soc_type) -{ - return false; -} - -static inline bool qca_is_wcn6750(enum qca_btsoc_type soc_type) -{ - return false; -} - -static inline bool qca_is_wcn6855(enum qca_btsoc_type soc_type) -{ - return false; -} - static inline int qca_send_pre_shutdown_cmd(struct hci_dev *hdev) { return -EOPNOTSUPP; diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 746eb096c037..e6ead996948a 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -606,9 +606,18 @@ static int qca_open(struct hci_uart *hu) if (hu->serdev) { qcadev = serdev_device_get_drvdata(hu->serdev); - if (qca_is_wcn399x(qcadev->btsoc_type) || - qca_is_wcn6750(qcadev->btsoc_type)) + switch (qcadev->btsoc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: hu->init_speed = qcadev->init_speed; + break; + + default: + break; + } if (qcadev->oper_speed) hu->oper_speed = qcadev->oper_speed; @@ -1314,12 +1323,19 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate) msecs_to_jiffies(CMD_TRANS_TIMEOUT_MS)); /* Give the controller time to process the request */ - if (qca_is_wcn399x(qca_soc_type(hu)) || - qca_is_wcn6750(qca_soc_type(hu)) || - qca_is_wcn6855(qca_soc_type(hu))) + switch (qca_soc_type(hu)) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: usleep_range(1000, 10000); - else + break; + + default: msleep(300); + } return 0; } @@ -1392,13 +1408,19 @@ static unsigned int qca_get_speed(struct hci_uart *hu, static int qca_check_speeds(struct hci_uart *hu) { - if (qca_is_wcn399x(qca_soc_type(hu)) || - qca_is_wcn6750(qca_soc_type(hu)) || - qca_is_wcn6855(qca_soc_type(hu))) { + switch (qca_soc_type(hu)) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: if (!qca_get_speed(hu, QCA_INIT_SPEED) && !qca_get_speed(hu, QCA_OPER_SPEED)) return -EINVAL; - } else { + break; + + default: if (!qca_get_speed(hu, QCA_INIT_SPEED) || !qca_get_speed(hu, QCA_OPER_SPEED)) return -EINVAL; @@ -1427,14 +1449,28 @@ static int qca_set_speed(struct hci_uart *hu, enum qca_speed_type speed_type) /* Disable flow control for wcn3990 to deassert RTS while * changing the baudrate of chip and host. */ - if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type) || - qca_is_wcn6855(soc_type)) + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: hci_uart_set_flow_control(hu, true); + break; - if (soc_type == QCA_WCN3990) { + default: + break; + } + + switch (soc_type) { + case QCA_WCN3990: reinit_completion(&qca->drop_ev_comp); set_bit(QCA_DROP_VENDOR_EVENT, &qca->flags); + break; + + default: + break; } qca_baudrate = qca_get_baudrate_value(speed); @@ -1446,12 +1482,22 @@ static int qca_set_speed(struct hci_uart *hu, enum qca_speed_type speed_type) host_set_baudrate(hu, speed); error: - if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type) || - qca_is_wcn6855(soc_type)) + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: hci_uart_set_flow_control(hu, false); + break; - if (soc_type == QCA_WCN3990) { + default: + break; + } + + switch (soc_type) { + case QCA_WCN3990: /* Wait for the controller to send the vendor event * for the baudrate change command. */ @@ -1463,6 +1509,10 @@ error: } clear_bit(QCA_DROP_VENDOR_EVENT, &qca->flags); + break; + + default: + break; } } @@ -1624,12 +1674,20 @@ static int qca_regulator_init(struct hci_uart *hu) } } - if (qca_is_wcn399x(soc_type)) { + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: /* Forcefully enable wcn399x to enter in to boot mode. */ host_set_baudrate(hu, 2400); ret = qca_send_power_pulse(hu, false); if (ret) return ret; + break; + + default: + break; } /* For wcn6750 need to enable gpio bt_en */ @@ -1646,10 +1704,18 @@ static int qca_regulator_init(struct hci_uart *hu) qca_set_speed(hu, QCA_INIT_SPEED); - if (qca_is_wcn399x(soc_type)) { + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: ret = qca_send_power_pulse(hu, true); if (ret) return ret; + break; + + default: + break; } /* Now the device is in ready state to communicate with host. @@ -1683,11 +1749,17 @@ static int qca_power_on(struct hci_dev *hdev) if (!hu->serdev) return 0; - if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type) || - qca_is_wcn6855(soc_type)) { + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: ret = qca_regulator_init(hu); - } else { + break; + + default: qcadev = serdev_device_get_drvdata(hu->serdev); if (qcadev->bt_en) { gpiod_set_value_cansleep(qcadev->bt_en, 1); @@ -1710,6 +1782,7 @@ static int qca_setup(struct hci_uart *hu) const char *firmware_name = qca_get_firmware_name(hu); int ret; struct qca_btsoc_version ver; + const char *soc_name; ret = qca_check_speeds(hu); if (ret) @@ -1724,10 +1797,26 @@ static int qca_setup(struct hci_uart *hu) */ set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); - bt_dev_info(hdev, "setting up %s", - qca_is_wcn399x(soc_type) ? "wcn399x" : - (soc_type == QCA_WCN6750) ? "wcn6750" : - (soc_type == QCA_WCN6855) ? "wcn6855" : "ROME/QCA6390"); + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + soc_name = "wcn399x"; + break; + + case QCA_WCN6750: + soc_name = "wcn6750"; + break; + + case QCA_WCN6855: + soc_name = "wcn6855"; + break; + + default: + soc_name = "ROME/QCA6390"; + } + bt_dev_info(hdev, "setting up %s", soc_name); qca->memdump_state = QCA_MEMDUMP_IDLE; @@ -1738,16 +1827,22 @@ retry: clear_bit(QCA_SSR_TRIGGERED, &qca->flags); - if (qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type) || - qca_is_wcn6855(soc_type)) { + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); if (ret) goto out; - } else { + break; + + default: qca_set_speed(hu, QCA_INIT_SPEED); } @@ -1761,9 +1856,16 @@ retry: qca_baudrate = qca_get_baudrate_value(speed); } - if (!(qca_is_wcn399x(soc_type) || - qca_is_wcn6750(soc_type) || - qca_is_wcn6855(soc_type))) { + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: + break; + + default: /* Get QCA version information */ ret = qca_read_soc_version(hdev, &ver, soc_type); if (ret) @@ -1939,11 +2041,18 @@ static void qca_power_shutdown(struct hci_uart *hu) qcadev = serdev_device_get_drvdata(hu->serdev); - if (qca_is_wcn399x(soc_type)) { + switch (soc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: host_set_baudrate(hu, 2400); qca_send_power_pulse(hu, false); qca_regulator_disable(qcadev); - } else if (soc_type == QCA_WCN6750 || soc_type == QCA_WCN6855) { + break; + + case QCA_WCN6750: + case QCA_WCN6855: gpiod_set_value_cansleep(qcadev->bt_en, 0); msleep(100); qca_regulator_disable(qcadev); @@ -1951,7 +2060,9 @@ static void qca_power_shutdown(struct hci_uart *hu) sw_ctrl_state = gpiod_get_value_cansleep(qcadev->sw_ctrl); bt_dev_dbg(hu->hdev, "SW_CTRL is %d", sw_ctrl_state); } - } else if (qcadev->bt_en) { + break; + + default: gpiod_set_value_cansleep(qcadev->bt_en, 0); } @@ -2076,11 +2187,18 @@ static int qca_serdev_probe(struct serdev_device *serdev) if (!qcadev->oper_speed) BT_DBG("UART will pick default operating speed"); - if (data && - (qca_is_wcn399x(data->soc_type) || - qca_is_wcn6750(data->soc_type) || - qca_is_wcn6855(data->soc_type))) { + if (data) qcadev->btsoc_type = data->soc_type; + else + qcadev->btsoc_type = QCA_ROME; + + switch (qcadev->btsoc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: qcadev->bt_power = devm_kzalloc(&serdev->dev, sizeof(struct qca_power), GFP_KERNEL); @@ -2124,12 +2242,9 @@ static int qca_serdev_probe(struct serdev_device *serdev) BT_ERR("wcn3990 serdev registration failed"); return err; } - } else { - if (data) - qcadev->btsoc_type = data->soc_type; - else - qcadev->btsoc_type = QCA_ROME; + break; + default: qcadev->bt_en = devm_gpiod_get_optional(&serdev->dev, "enable", GPIOD_OUT_LOW); if (IS_ERR_OR_NULL(qcadev->bt_en)) { @@ -2185,13 +2300,23 @@ static void qca_serdev_remove(struct serdev_device *serdev) struct qca_serdev *qcadev = serdev_device_get_drvdata(serdev); struct qca_power *power = qcadev->bt_power; - if ((qca_is_wcn399x(qcadev->btsoc_type) || - qca_is_wcn6750(qcadev->btsoc_type) || - qca_is_wcn6855(qcadev->btsoc_type)) && - power->vregs_on) - qca_power_shutdown(&qcadev->serdev_hu); - else if (qcadev->susclk) - clk_disable_unprepare(qcadev->susclk); + switch (qcadev->btsoc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: + if (power->vregs_on) { + qca_power_shutdown(&qcadev->serdev_hu); + break; + } + fallthrough; + + default: + if (qcadev->susclk) + clk_disable_unprepare(qcadev->susclk); + } hci_uart_unregister_device(&qcadev->serdev_hu); } From 67ffc334b92a96e65b627b6b3349c25946ff69f6 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 16 Aug 2023 10:06:48 +0200 Subject: [PATCH 054/216] Bluetooth: qca: add support for WCN7850 [ Upstream commit e0c1278ac89b0390fe9a74f673b6f25172292db2 ] Add support for the WCN7850 Bluetooth chipset. Tested on the SM8550 QRD platform. Signed-off-by: Neil Armstrong Signed-off-by: Luiz Augusto von Dentz Stable-dep-of: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Signed-off-by: Sasha Levin --- drivers/bluetooth/btqca.c | 10 ++++++++++ drivers/bluetooth/btqca.h | 1 + drivers/bluetooth/hci_qca.c | 31 ++++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index 8331090af86e..0211f704a358 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -631,6 +631,10 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, snprintf(config.fwname, sizeof(config.fwname), "qca/hpbtfw%02x.tlv", rom_ver); break; + case QCA_WCN7850: + snprintf(config.fwname, sizeof(config.fwname), + "qca/hmtbtfw%02x.tlv", rom_ver); + break; default: snprintf(config.fwname, sizeof(config.fwname), "qca/rampatch_%08x.bin", soc_ver); @@ -679,6 +683,10 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, snprintf(config.fwname, sizeof(config.fwname), "qca/hpnv%02x.bin", rom_ver); break; + case QCA_WCN7850: + snprintf(config.fwname, sizeof(config.fwname), + "qca/hmtnv%02x.bin", rom_ver); + break; default: snprintf(config.fwname, sizeof(config.fwname), @@ -697,6 +705,7 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, case QCA_QCA6390: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: err = qca_disable_soc_logging(hdev); if (err < 0) return err; @@ -731,6 +740,7 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, case QCA_WCN3991: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: /* get fw build info */ err = qca_read_fw_build_info(hdev); if (err < 0) diff --git a/drivers/bluetooth/btqca.h b/drivers/bluetooth/btqca.h index fe51c632d772..03bff5c0059d 100644 --- a/drivers/bluetooth/btqca.h +++ b/drivers/bluetooth/btqca.h @@ -149,6 +149,7 @@ enum qca_btsoc_type { QCA_QCA6390, QCA_WCN6750, QCA_WCN6855, + QCA_WCN7850, }; #if IS_ENABLED(CONFIG_BT_QCA) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index e6ead996948a..43abdaf92a0e 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1330,6 +1330,7 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate) case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: usleep_range(1000, 10000); break; @@ -1415,6 +1416,7 @@ static int qca_check_speeds(struct hci_uart *hu) case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: if (!qca_get_speed(hu, QCA_INIT_SPEED) && !qca_get_speed(hu, QCA_OPER_SPEED)) return -EINVAL; @@ -1456,6 +1458,7 @@ static int qca_set_speed(struct hci_uart *hu, enum qca_speed_type speed_type) case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: hci_uart_set_flow_control(hu, true); break; @@ -1489,6 +1492,7 @@ error: case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: hci_uart_set_flow_control(hu, false); break; @@ -1756,6 +1760,7 @@ static int qca_power_on(struct hci_dev *hdev) case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: ret = qca_regulator_init(hu); break; @@ -1813,6 +1818,10 @@ static int qca_setup(struct hci_uart *hu) soc_name = "wcn6855"; break; + case QCA_WCN7850: + soc_name = "wcn7850"; + break; + default: soc_name = "ROME/QCA6390"; } @@ -1834,6 +1843,7 @@ retry: case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); hci_set_aosp_capable(hdev); @@ -1863,6 +1873,7 @@ retry: case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: break; default: @@ -2016,6 +2027,20 @@ static const struct qca_device_data qca_soc_data_wcn6855 = { .capabilities = QCA_CAP_WIDEBAND_SPEECH | QCA_CAP_VALID_LE_STATES, }; +static const struct qca_device_data qca_soc_data_wcn7850 __maybe_unused = { + .soc_type = QCA_WCN7850, + .vregs = (struct qca_vreg []) { + { "vddio", 5000 }, + { "vddaon", 26000 }, + { "vdddig", 126000 }, + { "vddrfa0p8", 102000 }, + { "vddrfa1p2", 257000 }, + { "vddrfa1p9", 302000 }, + }, + .num_vregs = 6, + .capabilities = QCA_CAP_WIDEBAND_SPEECH | QCA_CAP_VALID_LE_STATES, +}; + static void qca_power_shutdown(struct hci_uart *hu) { struct qca_serdev *qcadev; @@ -2199,6 +2224,7 @@ static int qca_serdev_probe(struct serdev_device *serdev) case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: qcadev->bt_power = devm_kzalloc(&serdev->dev, sizeof(struct qca_power), GFP_KERNEL); @@ -2228,7 +2254,8 @@ static int qca_serdev_probe(struct serdev_device *serdev) GPIOD_IN); if (IS_ERR_OR_NULL(qcadev->sw_ctrl) && (data->soc_type == QCA_WCN6750 || - data->soc_type == QCA_WCN6855)) + data->soc_type == QCA_WCN6855 || + data->soc_type == QCA_WCN7850)) dev_warn(&serdev->dev, "failed to acquire SW_CTRL gpio\n"); qcadev->susclk = devm_clk_get_optional(&serdev->dev, NULL); @@ -2307,6 +2334,7 @@ static void qca_serdev_remove(struct serdev_device *serdev) case QCA_WCN3998: case QCA_WCN6750: case QCA_WCN6855: + case QCA_WCN7850: if (power->vregs_on) { qca_power_shutdown(&qcadev->serdev_hu); break; @@ -2499,6 +2527,7 @@ static const struct of_device_id qca_bluetooth_of_match[] = { { .compatible = "qcom,wcn3998-bt", .data = &qca_soc_data_wcn3998}, { .compatible = "qcom,wcn6750-bt", .data = &qca_soc_data_wcn6750}, { .compatible = "qcom,wcn6855-bt", .data = &qca_soc_data_wcn6855}, + { .compatible = "qcom,wcn7850-bt", .data = &qca_soc_data_wcn7850}, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, qca_bluetooth_of_match); From 92b8a3273f3812ba441d2a842d602ff7d33362b3 Mon Sep 17 00:00:00 2001 From: Janaki Ramaiah Thota Date: Wed, 24 Jan 2024 20:00:42 +0530 Subject: [PATCH 055/216] Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT [ Upstream commit 7dcd3e014aa7faeeaf4047190b22d8a19a0db696 ] BT adapter going into UNCONFIGURED state during BT turn ON when devicetree has no local-bd-address node. Bluetooth will not work out of the box on such devices, to avoid this problem, added check to set HCI_QUIRK_USE_BDADDR_PROPERTY based on local-bd-address node entry. When this quirk is not set, the public Bluetooth address read by host from controller though HCI Read BD Address command is considered as valid. Fixes: e668eb1e1578 ("Bluetooth: hci_core: Don't stop BT if the BD address missing in dts") Signed-off-by: Janaki Ramaiah Thota Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- drivers/bluetooth/hci_qca.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 43abdaf92a0e..8bfef7f81b41 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -7,6 +7,7 @@ * * Copyright (C) 2007 Texas Instruments, Inc. * Copyright (c) 2010, 2012, 2018 The Linux Foundation. All rights reserved. + * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. * * Acknowledgements: * This file is based on hci_ll.c, which was... @@ -1844,7 +1845,17 @@ retry: case QCA_WCN6750: case QCA_WCN6855: case QCA_WCN7850: - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + + /* Set BDA quirk bit for reading BDA value from fwnode property + * only if that property exist in DT. + */ + if (fwnode_property_present(dev_fwnode(hdev->dev.parent), "local-bd-address")) { + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + bt_dev_info(hdev, "setting quirk bit to read BDA from fwnode later"); + } else { + bt_dev_dbg(hdev, "local-bd-address` is not present in the devicetree so not setting quirk bit for BDA"); + } + hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); From ddf6ee3df30b694ac0a66b243245e5b89b6162e2 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Thu, 22 Feb 2024 10:33:08 +0000 Subject: [PATCH 056/216] netfilter: nf_tables: allow NFPROTO_INET in nft_(match/target)_validate() [ Upstream commit 7e0f122c65912740327e4c54472acaa5f85868cb ] Commit d0009effa886 ("netfilter: nf_tables: validate NFPROTO_* family") added some validation of NFPROTO_* families in the nft_compat module, but it broke the ability to use legacy iptables modules in dual-stack nftables. While with legacy iptables one had to independently manage IPv4 and IPv6 tables, with nftables it is possible to have dual-stack tables sharing the rules. Moreover, it was possible to use rules based on legacy iptables match/target modules in dual-stack nftables. As an example, the program from [2] creates an INET dual-stack family table using an xt_bpf based rule, which looks like the following (the actual output was generated with a patched nft tool as the current nft tool does not parse dual stack tables with legacy match rules, so consider it for illustrative purposes only): table inet testfw { chain input { type filter hook prerouting priority filter; policy accept; bytecode counter packets 0 bytes 0 accept } } After d0009effa886 ("netfilter: nf_tables: validate NFPROTO_* family") we get EOPNOTSUPP for the above program. Fix this by allowing NFPROTO_INET for nft_(match/target)_validate(), but also restrict the functions to classic iptables hooks. Changes in v3: * clarify that upstream nft will not display such configuration properly and that the output was generated with a patched nft tool * remove example program from commit description and link to it instead * no code changes otherwise Changes in v2: * restrict nft_(match/target)_validate() to classic iptables hooks * rewrite example program to use unmodified libnftnl Fixes: d0009effa886 ("netfilter: nf_tables: validate NFPROTO_* family") Link: https://lore.kernel.org/all/Zc1PfoWN38UuFJRI@calendula/T/#mc947262582c90fec044c7a3398cc92fac7afea72 [1] Link: https://lore.kernel.org/all/20240220145509.53357-1-ignat@cloudflare.com/ [2] Reported-by: Jordan Griege Signed-off-by: Ignat Korchagin Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_compat.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index e1623fbf3654..e4b8c02c5e6a 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -358,10 +358,20 @@ static int nft_target_validate(const struct nft_ctx *ctx, if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -607,10 +617,20 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); From b8afc22a1160121d108d7ea8496f133804d69b93 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 1 Feb 2023 14:45:22 +0100 Subject: [PATCH 057/216] netfilter: let reset rules clean out conntrack entries [ Upstream commit 2954fe60e33da0f4de4d81a4c95c7dddb517d00c ] iptables/nftables support responding to tcp packets with tcp resets. The generated tcp reset packet passes through both output and postrouting netfilter hooks, but conntrack will never see them because the generated skb has its ->nfct pointer copied over from the packet that triggered the reset rule. If the reset rule is used for established connections, this may result in the conntrack entry to be around for a very long time (default timeout is 5 days). One way to avoid this would be to not copy the nf_conn pointer so that the rest packet passes through conntrack too. Problem is that output rules might not have the same conntrack zone setup as the prerouting ones, so its possible that the reset skb won't find the correct entry. Generating a template entry for the skb seems error prone as well. Add an explicit "closing" function that switches a confirmed conntrack entry to closed state and wire this up for tcp. If the entry isn't confirmed, no action is needed because the conntrack entry will never be committed to the table. Reported-by: Russel King Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack") Signed-off-by: Sasha Levin --- include/linux/netfilter.h | 3 +++ include/net/netfilter/nf_conntrack.h | 8 ++++++ net/ipv4/netfilter/nf_reject_ipv4.c | 1 + net/ipv6/netfilter/nf_reject_ipv6.c | 1 + net/netfilter/core.c | 16 ++++++++++++ net/netfilter/nf_conntrack_core.c | 12 +++++++++ net/netfilter/nf_conntrack_proto_tcp.c | 35 ++++++++++++++++++++++++++ 7 files changed, 76 insertions(+) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index bef8db9d6c08..c8e03bcaecaa 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -437,11 +437,13 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #include void nf_ct_attach(struct sk_buff *, const struct sk_buff *); +void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} +static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) @@ -459,6 +461,7 @@ struct nf_ct_hook { bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); + void (*set_closing)(struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 6a2019aaa464..3dbf947285be 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -125,6 +125,12 @@ struct nf_conn { union nf_conntrack_proto proto; }; +static inline struct nf_conn * +nf_ct_to_nf_conn(const struct nf_conntrack *nfct) +{ + return container_of(nfct, struct nf_conn, ct_general); +} + static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { @@ -175,6 +181,8 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) void nf_ct_destroy(struct nf_conntrack *nfct); +void nf_conntrack_tcp_set_closing(struct nf_conn *ct); + /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 4073762996e2..fc761915c5f6 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -279,6 +279,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, goto free_nskb; nf_ct_attach(nskb, oldskb); + nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip_local_out for bridged traffic, the MAC source on diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 433d98bbe33f..71d692728230 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -344,6 +344,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); + nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 55a7f72d547c..edf92074221e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -707,6 +707,22 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); +void nf_ct_set_closing(struct nf_conntrack *nfct) +{ + const struct nf_ct_hook *ct_hook; + + if (!nfct) + return; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ct_hook->set_closing(nfct); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_set_closing); + bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 796026296609..6d30c64a5fe8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2772,11 +2772,23 @@ err_cachep: return ret; } +static void nf_conntrack_set_closing(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = nf_ct_to_nf_conn(nfct); + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + nf_conntrack_tcp_set_closing(ct); + break; + } +} + static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, + .set_closing = nf_conntrack_set_closing, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index e0092bf273fd..9480e638e5d1 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -913,6 +913,41 @@ static bool tcp_can_early_drop(const struct nf_conn *ct) return false; } +void nf_conntrack_tcp_set_closing(struct nf_conn *ct) +{ + enum tcp_conntrack old_state; + const unsigned int *timeouts; + u32 timeout; + + if (!nf_ct_is_confirmed(ct)) + return; + + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; + ct->proto.tcp.state = TCP_CONNTRACK_CLOSE; + + if (old_state == TCP_CONNTRACK_CLOSE || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_unlock_bh(&ct->lock); + return; + } + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + const struct nf_tcp_net *tn; + + tn = nf_tcp_pernet(nf_ct_net(ct)); + timeouts = tn->timeouts; + } + + timeout = timeouts[TCP_CONNTRACK_CLOSE]; + WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); + + spin_unlock_bh(&ct->lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); +} + static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; From 2b1414d5e94e477edff1d2c79030f1d742625ea0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2024 16:17:51 +0100 Subject: [PATCH 058/216] netfilter: bridge: confirm multicast packets before passing them up the stack [ Upstream commit 62e7151ae3eb465e0ab52a20c941ff33bb6332e9 ] conntrack nf_confirm logic cannot handle cloned skbs referencing the same nf_conn entry, which will happen for multicast (broadcast) frames on bridges. Example: macvlan0 | br0 / \ ethX ethY ethX (or Y) receives a L2 multicast or broadcast packet containing an IP packet, flow is not yet in conntrack table. 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting. -> skb->_nfct now references a unconfirmed entry 2. skb is broad/mcast packet. bridge now passes clones out on each bridge interface. 3. skb gets passed up the stack. 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb and schedules a work queue to send them out on the lower devices. The clone skb->_nfct is not a copy, it is the same entry as the original skb. The macvlan rx handler then returns RX_HANDLER_PASS. 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb. The Macvlan broadcast worker and normal confirm path will race. This race will not happen if step 2 already confirmed a clone. In that case later steps perform skb_clone() with skb->_nfct already confirmed (in hash table). This works fine. But such confirmation won't happen when eb/ip/nftables rules dropped the packets before they reached the nf_confirm step in postrouting. Pablo points out that nf_conntrack_bridge doesn't allow use of stateful nat, so we can safely discard the nf_conn entry and let inet call conntrack again. This doesn't work for bridge netfilter: skb could have a nat transformation. Also bridge nf prevents re-invocation of inet prerouting via 'sabotage_in' hook. Work around this problem by explicit confirmation of the entry at LOCAL_IN time, before upper layer has a chance to clone the unconfirmed entry. The downside is that this disables NAT and conntrack helpers. Alternative fix would be to add locking to all code parts that deal with unconfirmed packets, but even if that could be done in a sane way this opens up other problems, for example: -m physdev --physdev-out eth0 -j SNAT --snat-to 1.2.3.4 -m physdev --physdev-out eth1 -j SNAT --snat-to 1.2.3.5 For multicast case, only one of such conflicting mappings will be created, conntrack only handles 1:1 NAT mappings. Users should set create a setup that explicitly marks such traffic NOTRACK (conntrack bypass) to avoid this, but we cannot auto-bypass them, ruleset might have accept rules for untracked traffic already, so user-visible behaviour would change. Suggested-by: Pablo Neira Ayuso Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217777 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/linux/netfilter.h | 1 + net/bridge/br_netfilter_hooks.c | 96 ++++++++++++++++++++++ net/bridge/netfilter/nf_conntrack_bridge.c | 30 +++++++ net/netfilter/nf_conntrack_core.c | 1 + 4 files changed, 128 insertions(+) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c8e03bcaecaa..e5f4b6f8d1c0 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -462,6 +462,7 @@ struct nf_ct_hook { const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); + int (*confirm)(struct sk_buff *skb); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 202ad43e35d6..bff48d576363 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -43,6 +43,10 @@ #include #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + static unsigned int brnf_net_id __read_mostly; struct brnf_net { @@ -553,6 +557,90 @@ static unsigned int br_nf_pre_routing(void *priv, return NF_STOLEN; } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +/* conntracks' nf_confirm logic cannot handle cloned skbs referencing + * the same nf_conn entry, which will happen for multicast (broadcast) + * Frames on bridges. + * + * Example: + * macvlan0 + * br0 + * ethX ethY + * + * ethX (or Y) receives multicast or broadcast packet containing + * an IP packet, not yet in conntrack table. + * + * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting. + * -> skb->_nfct now references a unconfirmed entry + * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge + * interface. + * 3. skb gets passed up the stack. + * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb + * and schedules a work queue to send them out on the lower devices. + * + * The clone skb->_nfct is not a copy, it is the same entry as the + * original skb. The macvlan rx handler then returns RX_HANDLER_PASS. + * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb. + * + * The Macvlan broadcast worker and normal confirm path will race. + * + * This race will not happen if step 2 already confirmed a clone. In that + * case later steps perform skb_clone() with skb->_nfct already confirmed (in + * hash table). This works fine. + * + * But such confirmation won't happen when eb/ip/nftables rules dropped the + * packets before they reached the nf_confirm step in postrouting. + * + * Work around this problem by explicit confirmation of the entry at + * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed + * entry. + * + */ +static unsigned int br_nf_local_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conntrack *nfct = skb_nfct(skb); + const struct nf_ct_hook *ct_hook; + struct nf_conn *ct; + int ret; + + if (!nfct || skb->pkt_type == PACKET_HOST) + return NF_ACCEPT; + + ct = container_of(nfct, struct nf_conn, ct_general); + if (likely(nf_ct_is_confirmed(ct))) + return NF_ACCEPT; + + WARN_ON_ONCE(skb_shared(skb)); + WARN_ON_ONCE(refcount_read(&nfct->use) != 1); + + /* We can't call nf_confirm here, it would create a dependency + * on nf_conntrack module. + */ + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) { + skb->_nfct = 0ul; + nf_conntrack_put(nfct); + return NF_ACCEPT; + } + + nf_bridge_pull_encap_header(skb); + ret = ct_hook->confirm(skb); + switch (ret & NF_VERDICT_MASK) { + case NF_STOLEN: + return NF_STOLEN; + default: + nf_bridge_push_encap_header(skb); + break; + } + + ct = container_of(nfct, struct nf_conn, ct_general); + WARN_ON_ONCE(!nf_ct_is_confirmed(ct)); + + return ret; +} +#endif /* PF_BRIDGE/FORWARD *************************************************/ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -962,6 +1050,14 @@ static const struct nf_hook_ops br_nf_ops[] = { .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_BRNF, }, +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + { + .hook = br_nf_local_in, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_LAST, + }, +#endif { .hook = br_nf_forward_ip, .pf = NFPROTO_BRIDGE, diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 06d94b2c6b5d..c7c27ada6704 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -291,6 +291,30 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, return nf_conntrack_in(skb, &bridge_state); } +static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (skb->pkt_type == PACKET_HOST) + return NF_ACCEPT; + + /* nf_conntrack_confirm() cannot handle concurrent clones, + * this happens for broad/multicast frames with e.g. macvlan on top + * of the bridge device. + */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) + return NF_ACCEPT; + + /* let inet prerouting call conntrack again */ + skb->_nfct = 0; + nf_ct_put(ct); + + return NF_ACCEPT; +} + static void nf_ct_bridge_frag_save(struct sk_buff *skb, struct nf_bridge_frag_data *data) { @@ -415,6 +439,12 @@ static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = { .hooknum = NF_BR_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, + { + .hook = nf_ct_bridge_in, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, { .hook = nf_ct_bridge_post, .pf = NFPROTO_BRIDGE, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6d30c64a5fe8..024f93fc8c0b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2789,6 +2789,7 @@ static const struct nf_ct_hook nf_conntrack_hook = { .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, + .confirm = __nf_conntrack_confirm, }; void nf_conntrack_init_end(void) From f2261eb994aa5757c1da046b78e3229a3ece0ad9 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 27 Feb 2024 20:11:28 +0800 Subject: [PATCH 059/216] rtnetlink: fix error logic of IFLA_BRIDGE_FLAGS writing back [ Upstream commit 743ad091fb46e622f1b690385bb15e3cd3daf874 ] In the commit d73ef2d69c0d ("rtnetlink: let rtnl_bridge_setlink checks IFLA_BRIDGE_MODE length"), an adjustment was made to the old loop logic in the function `rtnl_bridge_setlink` to enable the loop to also check the length of the IFLA_BRIDGE_MODE attribute. However, this adjustment removed the `break` statement and led to an error logic of the flags writing back at the end of this function. if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); // attr should point to IFLA_BRIDGE_FLAGS NLA !!! Before the mentioned commit, the `attr` is granted to be IFLA_BRIDGE_FLAGS. However, this is not necessarily true fow now as the updated loop will let the attr point to the last NLA, even an invalid NLA which could cause overflow writes. This patch introduces a new variable `br_flag` to save the NLA pointer that points to IFLA_BRIDGE_FLAGS and uses it to resolve the mentioned error logic. Fixes: d73ef2d69c0d ("rtnetlink: let rtnl_bridge_setlink checks IFLA_BRIDGE_MODE length") Signed-off-by: Lin Ma Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20240227121128.608110-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/core/rtnetlink.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7cf1e42d7f93..ac379e4590f8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5026,10 +5026,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; - struct nlattr *br_spec, *attr = NULL; + struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; - bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; @@ -5047,11 +5046,11 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !have_flags) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; - have_flags = true; + br_flags_attr = attr; flags = nla_get_u16(attr); } @@ -5095,8 +5094,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } - if (have_flags) - memcpy(nla_data(attr), &flags, sizeof(flags)); + if (br_flags_attr) + memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } From a0222b48175709b8a66e5f373d17a10ca5659cc8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 27 Feb 2024 10:49:41 -0800 Subject: [PATCH 060/216] igb: extend PTP timestamp adjustments to i211 [ Upstream commit 0bb7b09392eb74b152719ae87b1ba5e4bf910ef0 ] The i211 requires the same PTP timestamp adjustments as the i210, according to its datasheet. To ensure consistent timestamping across different platforms, this change extends the existing adjustments to include the i211. The adjustment result are tested and comparable for i210 and i211 based systems. Fixes: 3f544d2a4d5c ("igb: adjust PTP timestamps for Tx/Rx latency") Signed-off-by: Oleksij Rempel Reviewed-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240227184942.362710-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igb/igb_ptp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 07171e574e7d..36e62197fba0 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -976,7 +976,7 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter) igb_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval); /* adjust timestamp for the TX latency based on link speed */ - if (adapter->hw.mac.type == e1000_i210) { + if (hw->mac.type == e1000_i210 || hw->mac.type == e1000_i211) { switch (adapter->link_speed) { case SPEED_10: adjust = IGB_I210_TX_LATENCY_10; @@ -1022,6 +1022,7 @@ int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, ktime_t *timestamp) { struct igb_adapter *adapter = q_vector->adapter; + struct e1000_hw *hw = &adapter->hw; struct skb_shared_hwtstamps ts; __le64 *regval = (__le64 *)va; int adjust = 0; @@ -1041,7 +1042,7 @@ int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, igb_ptp_systim_to_hwtstamp(adapter, &ts, le64_to_cpu(regval[1])); /* adjust timestamp for the RX latency based on link speed */ - if (adapter->hw.mac.type == e1000_i210) { + if (hw->mac.type == e1000_i210 || hw->mac.type == e1000_i211) { switch (adapter->link_speed) { case SPEED_10: adjust = IGB_I210_RX_LATENCY_10; From 7d4121b40149aed0698c7b82384c5c069da91836 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Wed, 28 Feb 2024 09:56:44 +0100 Subject: [PATCH 061/216] net: hsr: Use correct offset for HSR TLV values in supervisory HSR frames [ Upstream commit 51dd4ee0372228ffb0f7709fa7aa0678d4199d06 ] Current HSR implementation uses following supervisory frame (even for HSRv1 the HSR tag is not is not present): 00000000: 01 15 4e 00 01 2d XX YY ZZ 94 77 10 88 fb 00 01 00000010: 7e 1c 17 06 XX YY ZZ 94 77 10 1e 06 XX YY ZZ 94 00000020: 77 10 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 The current code adds extra two bytes (i.e. sizeof(struct hsr_sup_tlv)) when offset for skb_pull() is calculated. This is wrong, as both 'struct hsrv1_ethhdr_sp' and 'hsrv0_ethhdr_sp' already have 'struct hsr_sup_tag' defined in them, so there is no need for adding extra two bytes. This code was working correctly as with no RedBox support, the check for HSR_TLV_EOT (0x00) was off by two bytes, which were corresponding to zeroed padded bytes for minimal packet size. Fixes: eafaa88b3eb7 ("net: hsr: Add support for redbox supervision frames") Signed-off-by: Lukasz Majewski Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240228085644.3618044-1-lukma@denx.de Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/hsr/hsr_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 80cdc6f6b34c..0323ab5023c6 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -83,7 +83,7 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) return false; /* Get next tlv */ - total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tag->tlv.HSR_TLV_length; + total_length += hsr_sup_tag->tlv.HSR_TLV_length; if (!pskb_may_pull(skb, total_length)) return false; skb_pull(skb, total_length); From 40f0f326cfe6847faaa409f4883b94fcdda468ab Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 28 Feb 2024 23:43:57 +0100 Subject: [PATCH 062/216] tls: decrement decrypt_pending if no async completion will be called [ Upstream commit f7fa16d49837f947ee59492958f9e6f0e51d9a78 ] With mixed sync/async decryption, or failures of crypto_aead_decrypt, we increment decrypt_pending but we never do the corresponding decrement since tls_decrypt_done will not be called. In this case, we should decrement decrypt_pending immediately to avoid getting stuck. For example, the prequeue prequeue test gets stuck with mixed modes (one async decrypt + one sync decrypt). Fixes: 94524d8fc965 ("net/tls: Add support for async decryption of tls records") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/c56d5fc35543891d5319f834f25622360e1bfbec.1709132643.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/tls/tls_sw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 93e1bfa72d79..c6ad435a4421 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -273,6 +273,8 @@ static int tls_do_decryption(struct sock *sk, return 0; ret = crypto_wait_req(ret, &ctx->async_wait); + } else if (darg->async) { + atomic_dec(&ctx->decrypt_pending); } darg->async = false; From 08562ca971ff6d4d30ef7eb3fe932f8bf9dcd841 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 28 Feb 2024 23:43:58 +0100 Subject: [PATCH 063/216] tls: fix peeking with sync+async decryption [ Upstream commit 6caaf104423d809b49a67ee6500191d063b40dc6 ] If we peek from 2 records with a currently empty rx_list, and the first record is decrypted synchronously but the second record is decrypted async, the following happens: 1. decrypt record 1 (sync) 2. copy from record 1 to the userspace's msg 3. queue the decrypted record to rx_list for future read(!PEEK) 4. decrypt record 2 (async) 5. queue record 2 to rx_list 6. call process_rx_list to copy data from the 2nd record We currently pass copied=0 as skip offset to process_rx_list, so we end up copying once again from the first record. We should skip over the data we've already copied. Seen with selftest tls.12_aes_gcm.recv_peek_large_buf_mult_recs Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/1b132d2b2b99296bfde54e8a67672d90d6d16e71.1709132643.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/tls/tls_sw.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index c6ad435a4421..2bd27b77769c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2023,6 +2023,7 @@ int tls_sw_recvmsg(struct sock *sk, struct strp_msg *rxm; struct tls_msg *tlm; ssize_t copied = 0; + ssize_t peeked = 0; bool async = false; int target, err; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); @@ -2170,8 +2171,10 @@ put_on_rx_list: if (err < 0) goto put_on_rx_list_err; - if (is_peek) + if (is_peek) { + peeked += chunk; goto put_on_rx_list; + } if (partially_consumed) { rxm->offset += chunk; @@ -2210,8 +2213,8 @@ recv_end: /* Drain records from the rx_list & copy if required */ if (is_peek || is_kvec) - err = process_rx_list(ctx, msg, &control, copied, - decrypted, is_peek, NULL); + err = process_rx_list(ctx, msg, &control, copied + peeked, + decrypted - peeked, is_peek, NULL); else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek, NULL); From ddc547dd05a46720866c32022300f7376c40119f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Feb 2024 12:24:40 +0100 Subject: [PATCH 064/216] efi/capsule-loader: fix incorrect allocation size [ Upstream commit fccfa646ef3628097d59f7d9c1a3e84d4b6bb45e ] gcc-14 notices that the allocation with sizeof(void) on 32-bit architectures is not enough for a 64-bit phys_addr_t: drivers/firmware/efi/capsule-loader.c: In function 'efi_capsule_open': drivers/firmware/efi/capsule-loader.c:295:24: error: allocation of insufficient size '4' for type 'phys_addr_t' {aka 'long long unsigned int'} with size '8' [-Werror=alloc-size] 295 | cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL); | ^ Use the correct type instead here. Fixes: f24c4d478013 ("efi/capsule-loader: Reinstate virtual capsule mapping") Signed-off-by: Arnd Bergmann Signed-off-by: Ard Biesheuvel Signed-off-by: Sasha Levin --- drivers/firmware/efi/capsule-loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c index 3e8d4b51a814..97bafb5f7038 100644 --- a/drivers/firmware/efi/capsule-loader.c +++ b/drivers/firmware/efi/capsule-loader.c @@ -292,7 +292,7 @@ static int efi_capsule_open(struct inode *inode, struct file *file) return -ENOMEM; } - cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL); + cap_info->phys = kzalloc(sizeof(phys_addr_t), GFP_KERNEL); if (!cap_info->phys) { kfree(cap_info->pages); kfree(cap_info); From cefe18e9ec84f8fe3e198ccebb815cc996eb9797 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 15 Feb 2024 16:51:33 +0100 Subject: [PATCH 065/216] power: supply: bq27xxx-i2c: Do not free non existing IRQ [ Upstream commit 2df70149e73e79783bcbc7db4fa51ecef0e2022c ] The bq27xxx i2c-client may not have an IRQ, in which case client->irq will be 0. bq27xxx_battery_i2c_probe() already has an if (client->irq) check wrapping the request_threaded_irq(). But bq27xxx_battery_i2c_remove() unconditionally calls free_irq(client->irq) leading to: [ 190.310742] ------------[ cut here ]------------ [ 190.310843] Trying to free already-free IRQ 0 [ 190.310861] WARNING: CPU: 2 PID: 1304 at kernel/irq/manage.c:1893 free_irq+0x1b8/0x310 Followed by a backtrace when unbinding the driver. Add an if (client->irq) to bq27xxx_battery_i2c_remove() mirroring probe() to fix this. Fixes: 444ff00734f3 ("power: supply: bq27xxx: Fix I2C IRQ race on remove") Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240215155133.70537-1-hdegoede@redhat.com Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin --- drivers/power/supply/bq27xxx_battery_i2c.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c index 0713a52a2510..17b37354e32c 100644 --- a/drivers/power/supply/bq27xxx_battery_i2c.c +++ b/drivers/power/supply/bq27xxx_battery_i2c.c @@ -209,7 +209,9 @@ static void bq27xxx_battery_i2c_remove(struct i2c_client *client) { struct bq27xxx_device_info *di = i2c_get_clientdata(client); - free_irq(client->irq, di); + if (client->irq) + free_irq(client->irq, di); + bq27xxx_battery_teardown(di); mutex_lock(&battery_mutex); From 7f8644b6a86d45c9f8240734b161896a09069fe5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 21 Feb 2024 10:21:56 +0100 Subject: [PATCH 066/216] ALSA: Drop leftover snd-rtctimer stuff from Makefile [ Upstream commit 4df49712eb54141be00a9312547436d55677f092 ] We forgot to remove the line for snd-rtctimer from Makefile while dropping the functionality. Get rid of the stale line. Fixes: 34ce71a96dcb ("ALSA: timer: remove legacy rtctimer") Link: https://lore.kernel.org/r/20240221092156.28695-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/core/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/core/Makefile b/sound/core/Makefile index 2762f03d9b7b..a7a1590b2952 100644 --- a/sound/core/Makefile +++ b/sound/core/Makefile @@ -30,7 +30,6 @@ snd-ctl-led-objs := control_led.o snd-rawmidi-objs := rawmidi.o snd-timer-objs := timer.o snd-hrtimer-objs := hrtimer.o -snd-rtctimer-objs := rtctimer.o snd-hwdep-objs := hwdep.o snd-seq-device-objs := seq_device.o From d36b9a1b4e5214abaf864afde5617b021b5cb588 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 23 Feb 2024 16:03:33 +0100 Subject: [PATCH 067/216] drm/tegra: Remove existing framebuffer only if we support display [ Upstream commit 86bf8cfda6d2a6720fa2e6e676c98f0882c9d3d7 ] Tegra DRM doesn't support display on Tegra234 and later, so make sure not to remove any existing framebuffers in that case. v2: - add comments explaining how this situation can come about - clear DRIVER_MODESET and DRIVER_ATOMIC feature bits Fixes: 6848c291a54f ("drm/aperture: Convert drivers to aperture interfaces") Signed-off-by: Thierry Reding Reviewed-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Signed-off-by: Robert Foss Link: https://patchwork.freedesktop.org/patch/msgid/20240223150333.1401582-1-thierry.reding@gmail.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/tegra/drm.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index 5fc55b9777cb..6806779f8ecc 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -1252,9 +1252,26 @@ static int host1x_drm_probe(struct host1x_device *dev) drm_mode_config_reset(drm); - err = drm_aperture_remove_framebuffers(&tegra_drm_driver); - if (err < 0) - goto hub; + /* + * Only take over from a potential firmware framebuffer if any CRTCs + * have been registered. This must not be a fatal error because there + * are other accelerators that are exposed via this driver. + * + * Another case where this happens is on Tegra234 where the display + * hardware is no longer part of the host1x complex, so this driver + * will not expose any modesetting features. + */ + if (drm->mode_config.num_crtc > 0) { + err = drm_aperture_remove_framebuffers(&tegra_drm_driver); + if (err < 0) + goto hub; + } else { + /* + * Indicate to userspace that this doesn't expose any display + * capabilities. + */ + drm->driver_features &= ~(DRIVER_MODESET | DRIVER_ATOMIC); + } err = tegra_drm_fb_init(drm); if (err < 0) From 2f91a96b892fab2f2543b4a55740c5bee36b1a6b Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 8 Feb 2024 12:44:11 +0100 Subject: [PATCH 068/216] fbcon: always restore the old font data in fbcon_do_set_font() [ Upstream commit 00d6a284fcf3fad1b7e1b5bc3cd87cbfb60ce03f ] Commit a5a923038d70 (fbdev: fbcon: Properly revert changes when vc_resize() failed) started restoring old font data upon failure (of vc_resize()). But it performs so only for user fonts. It means that the "system"/internal fonts are not restored at all. So in result, the very first call to fbcon_do_set_font() performs no restore at all upon failing vc_resize(). This can be reproduced by Syzkaller to crash the system on the next invocation of font_get(). It's rather hard to hit the allocation failure in vc_resize() on the first font_set(), but not impossible. Esp. if fault injection is used to aid the execution/failure. It was demonstrated by Sirius: BUG: unable to handle page fault for address: fffffffffffffff8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD cb7b067 P4D cb7b067 PUD cb7d067 PMD 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8007 Comm: poc Not tainted 6.7.0-g9d1694dc91ce #20 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:fbcon_get_font+0x229/0x800 drivers/video/fbdev/core/fbcon.c:2286 Call Trace: con_font_get drivers/tty/vt/vt.c:4558 [inline] con_font_op+0x1fc/0xf20 drivers/tty/vt/vt.c:4673 vt_k_ioctl drivers/tty/vt/vt_ioctl.c:474 [inline] vt_ioctl+0x632/0x2ec0 drivers/tty/vt/vt_ioctl.c:752 tty_ioctl+0x6f8/0x1570 drivers/tty/tty_io.c:2803 vfs_ioctl fs/ioctl.c:51 [inline] ... So restore the font data in any case, not only for user fonts. Note the later 'if' is now protected by 'old_userfont' and not 'old_data' as the latter is always set now. (And it is supposed to be non-NULL. Otherwise we would see the bug above again.) Signed-off-by: Jiri Slaby (SUSE) Fixes: a5a923038d70 ("fbdev: fbcon: Properly revert changes when vc_resize() failed") Reported-and-tested-by: Ubisectech Sirius Cc: Ubisectech Sirius Cc: Daniel Vetter Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20240208114411.14604-1-jirislaby@kernel.org Signed-off-by: Sasha Levin --- drivers/video/fbdev/core/fbcon.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index fa205be94a4b..14498a0d13e0 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2397,11 +2397,9 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, struct fbcon_ops *ops = info->fbcon_par; struct fbcon_display *p = &fb_display[vc->vc_num]; int resize, ret, old_userfont, old_width, old_height, old_charcount; - char *old_data = NULL; + u8 *old_data = vc->vc_font.data; resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); - if (p->userfont) - old_data = vc->vc_font.data; vc->vc_font.data = (void *)(p->fontdata = data); old_userfont = p->userfont; if ((p->userfont = userfont)) @@ -2435,13 +2433,13 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, update_screen(vc); } - if (old_data && (--REFCOUNT(old_data) == 0)) + if (old_userfont && (--REFCOUNT(old_data) == 0)) kfree(old_data - FONT_EXTRA_WORDS * sizeof(int)); return 0; err_out: p->fontdata = old_data; - vc->vc_font.data = (void *)old_data; + vc->vc_font.data = old_data; if (userfont) { p->userfont = old_userfont; From 058ed71e0f7aa3b6694ca357e23d084e5d3f2470 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Feb 2024 13:15:02 +0000 Subject: [PATCH 069/216] afs: Fix endless loop in directory parsing [ Upstream commit 5f7a07646655fb4108da527565dcdc80124b14c4 ] If a directory has a block with only ".__afsXXXX" files in it (from uncompleted silly-rename), these .__afsXXXX files are skipped but without advancing the file position in the dir_context. This leads to afs_dir_iterate() repeating the block again and again. Fix this by making the code that skips the .__afsXXXX file also manually advance the file position. The symptoms are a soft lookup: watchdog: BUG: soft lockup - CPU#3 stuck for 52s! [check:5737] ... RIP: 0010:afs_dir_iterate_block+0x39/0x1fd ... ? watchdog_timer_fn+0x1a6/0x213 ... ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? afs_dir_iterate_block+0x39/0x1fd afs_dir_iterate+0x10a/0x148 afs_readdir+0x30/0x4a iterate_dir+0x93/0xd3 __do_sys_getdents64+0x6b/0xd4 This is almost certainly the actual fix for: https://bugzilla.kernel.org/show_bug.cgi?id=218496 Fixes: 57e9d49c5452 ("afs: Hide silly-rename files from userspace") Signed-off-by: David Howells Link: https://lore.kernel.org/r/786185.1708694102@warthog.procyon.org.uk Reviewed-by: Marc Dionne cc: Marc Dionne cc: Markus Suvanto cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/afs/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index cf811b77ee67..6e2c967fae6f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -478,8 +478,10 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, dire->u.name[0] == '.' && ctx->actor != afs_lookup_filldir && ctx->actor != afs_lookup_one_filldir && - memcmp(dire->u.name, ".__afs", 6) == 0) + memcmp(dire->u.name, ".__afs", 6) == 0) { + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); continue; + } /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, From 8310080799b40fd9f2a8b808c657269678c149af Mon Sep 17 00:00:00 2001 From: Dimitris Vlachos Date: Thu, 29 Feb 2024 21:17:23 +0200 Subject: [PATCH 070/216] riscv: Sparse-Memory/vmemmap out-of-bounds fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a11dd49dcb9376776193e15641f84fcc1e5980c9 ] Offset vmemmap so that the first page of vmemmap will be mapped to the first page of physical memory in order to ensure that vmemmap’s bounds will be respected during pfn_to_page()/page_to_pfn() operations. The conversion macros will produce correct SV39/48/57 addresses for every possible/valid DRAM_BASE inside the physical memory limits. v2:Address Alex's comments Suggested-by: Alexandre Ghiti Signed-off-by: Dimitris Vlachos Reported-by: Dimitris Vlachos Closes: https://lore.kernel.org/linux-riscv/20240202135030.42265-1-csd4492@csd.uoc.gr Fixes: d95f1a542c3d ("RISC-V: Implement sparsemem") Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240229191723.32779-1-dvlachos@ics.forth.gr Signed-off-by: Palmer Dabbelt Signed-off-by: Sasha Levin --- arch/riscv/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 59bb53da473d..63055c6ad2c2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -79,7 +79,7 @@ * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled. */ -#define vmemmap ((struct page *)VMEMMAP_START) +#define vmemmap ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT)) #define PCI_IO_SIZE SZ_16M #define PCI_IO_END VMEMMAP_START From 8f626221e5fa89134515d358e7d614609b612a5c Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 23 Feb 2024 21:24:35 -0800 Subject: [PATCH 071/216] of: property: fw_devlink: Fix stupid bug in remote-endpoint parsing [ Upstream commit 7cb50f6c9fbaa1c0b80100b8971bf13db5d75d06 ] Introduced a stupid bug in commit 782bfd03c3ae ("of: property: Improve finding the supplier of a remote-endpoint property") due to a last minute incorrect edit of "index !=0" into "!index". This patch fixes it to be "index > 0" to match the comment right next to it. Reported-by: Luca Ceresoli Link: https://lore.kernel.org/lkml/20240223171849.10f9901d@booty/ Fixes: 782bfd03c3ae ("of: property: Improve finding the supplier of a remote-endpoint property") Signed-off-by: Saravana Kannan Reviewed-by: Herve Codina Reviewed-by: Luca Ceresoli Tested-by: Luca Ceresoli Link: https://lore.kernel.org/r/20240224052436.3552333-1-saravanak@google.com Signed-off-by: Rob Herring Signed-off-by: Sasha Levin --- drivers/of/property.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/property.c b/drivers/of/property.c index 33d5f16c8120..da5d71219770 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1332,7 +1332,7 @@ static struct device_node *parse_remote_endpoint(struct device_node *np, int index) { /* Return NULL for index > 0 to signify end of remote-endpoints. */ - if (!index || strcmp(prop_name, "remote-endpoint")) + if (index > 0 || strcmp(prop_name, "remote-endpoint")) return NULL; return of_graph_get_remote_port_parent(np); From 3bfe04c1273d30b866f4c7c238331ed3b08e5824 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 1 Mar 2024 22:04:06 +0900 Subject: [PATCH 072/216] tomoyo: fix UAF write bug in tomoyo_write_control() commit 2f03fc340cac9ea1dc63cbf8c93dd2eb0f227815 upstream. Since tomoyo_write_control() updates head->write_buf when write() of long lines is requested, we need to fetch head->write_buf after head->io_sem is held. Otherwise, concurrent write() requests can cause use-after-free-write and double-free problems. Reported-by: Sam Sun Closes: https://lkml.kernel.org/r/CAEkJfYNDspuGxYx5kym8Lvp--D36CMDUErg4rxfWFJuPbbji8g@mail.gmail.com Fixes: bd03a3e4c9a9 ("TOMOYO: Add policy namespace support.") Cc: # Linux 3.1+ Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- security/tomoyo/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index f4cd9b58b205..a7af085550b2 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2648,13 +2648,14 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, { int error = buffer_len; size_t avail_len = buffer_len; - char *cp0 = head->write_buf; + char *cp0; int idx; if (!head->write) return -EINVAL; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; + cp0 = head->write_buf; head->read_user_buf_avail = 0; idx = tomoyo_read_lock(); /* Read a line and dispatch it to the policy handler. */ From 8cec41a35065dcfcca5a2337f4edd56dadd1425c Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 18 Feb 2024 12:30:26 +0900 Subject: [PATCH 073/216] ALSA: firewire-lib: fix to check cycle continuity commit 77ce96543b03f437c6b45f286d8110db2b6622a3 upstream. The local helper function to compare the given pair of cycle count evaluates them. If the left value is less than the right value, the function returns negative value. If the safe cycle is less than the current cycle, it is the case of cycle lost. However, it is not currently handled properly. This commit fixes the bug. Cc: Fixes: 705794c53b00 ("ALSA: firewire-lib: check cycle continuity") Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20240218033026.72577-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/firewire/amdtp-stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index 9be2260e4ca2..f8b644cb9157 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -934,7 +934,7 @@ static int generate_device_pkt_descs(struct amdtp_stream *s, // to the reason. unsigned int safe_cycle = increment_ohci_cycle_count(next_cycle, IR_JUMBO_PAYLOAD_MAX_SKIP_CYCLES); - lost = (compare_ohci_cycle_count(safe_cycle, cycle) > 0); + lost = (compare_ohci_cycle_count(safe_cycle, cycle) < 0); } if (lost) { dev_err(&s->unit->device, "Detect discontinuity of cycle: %d %d\n", From 4cbbc2f0dbe22498e290997c52f088413d6b9ad5 Mon Sep 17 00:00:00 2001 From: Hans Peter Date: Mon, 19 Feb 2024 17:38:49 +0100 Subject: [PATCH 074/216] ALSA: hda/realtek: Enable Mute LED on HP 840 G8 (MB 8AB8) commit 1fdf4e8be7059e7784fec11d30cd32784f0bdc83 upstream. On my EliteBook 840 G8 Notebook PC (ProdId 5S7R6EC#ABD; built 2022 for german market) the Mute LED is always on. The mute button itself works as expected. alsa-info.sh shows a different subsystem-id 0x8ab9 for Realtek ALC285 Codec, thus the existing quirks for HP 840 G8 don't work. Therefore, add a new quirk for this type of EliteBook. Signed-off-by: Hans Peter Cc: Link: https://lore.kernel.org/r/20240219164518.4099-1-flurry123@gmx.ch Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 92a656fb5321..53fe38a1b5da 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9687,6 +9687,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8aa3, "HP ProBook 450 G9 (MB 8AA1)", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8aa8, "HP EliteBook 640 G9 (MB 8AA6)", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8aab, "HP EliteBook 650 G9 (MB 8AA9)", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ab9, "HP EliteBook 840 G8 (MB 8AB8)", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), From fd3289ab8ed1f8a2f6e3593adf39bb610fbc17a5 Mon Sep 17 00:00:00 2001 From: Eniac Zhang Date: Tue, 20 Feb 2024 17:58:12 +0000 Subject: [PATCH 075/216] ALSA: hda/realtek: fix mute/micmute LED For HP mt440 commit 67c3d7717efbd46092f217b1f811df1b205cce06 upstream. The HP mt440 Thin Client uses an ALC236 codec and needs the ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make the mute and micmute LEDs work. There are two variants of the USB-C PD chip on this device. Each uses a different BIOS and board ID, hence the two entries. Signed-off-by: Eniac Zhang Signed-off-by: Alexandru Gagniuc Cc: Link: https://lore.kernel.org/r/20240220175812.782687-1-alexandru.gagniuc@hp.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 53fe38a1b5da..75bd7b2fa4ee 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9662,6 +9662,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8973, "HP EliteBook 860 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8974, "HP EliteBook 840 Aero G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8975, "HP EliteBook x360 840 Aero G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x897d, "HP mt440 Mobile Thin Client U74", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8981, "HP Elite Dragonfly G3", ALC245_FIXUP_CS35L41_SPI_4), SND_PCI_QUIRK(0x103c, 0x898e, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x898f, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), @@ -9693,6 +9694,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b0f, "HP Elite mt645 G7 Mobile Thin Client U81", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b2f, "HP 255 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), + SND_PCI_QUIRK(0x103c, 0x8b3f, "HP mt440 Mobile Thin Client U91", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b42, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b43, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b44, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), From 59ed284c7bff4da0f6cafd05ca15de1c0ae1d087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 19 Feb 2024 20:03:45 +0100 Subject: [PATCH 076/216] landlock: Fix asymmetric private inodes referring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d9818b3e906a0ee1ab02ea79e74a2f755fc5461a upstream. When linking or renaming a file, if only one of the source or destination directory is backed by an S_PRIVATE inode, then the related set of layer masks would be used as uninitialized by is_access_to_paths_allowed(). This would result to indeterministic access for one side instead of always being allowed. This bug could only be triggered with a mounted filesystem containing both S_PRIVATE and !S_PRIVATE inodes, which doesn't seem possible. The collect_domain_accesses() calls return early if is_nouser_or_private() returns false, which means that the directory's superblock has SB_NOUSER or its inode has S_PRIVATE. Because rename or link actions are only allowed on the same mounted filesystem, the superblock is always the same for both source and destination directories. However, it might be possible in theory to have an S_PRIVATE parent source inode with an !S_PRIVATE parent destination inode, or vice versa. To make sure this case is not an issue, explicitly initialized both set of layer masks to 0, which means to allow all actions on the related side. If at least on side has !S_PRIVATE, then collect_domain_accesses() and is_access_to_paths_allowed() check for the required access rights. Cc: Arnd Bergmann Cc: Christian Brauner Cc: Günther Noack Cc: Jann Horn Cc: Shervin Oloumi Cc: stable@vger.kernel.org Fixes: b91c3e4ea756 ("landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER") Link: https://lore.kernel.org/r/20240219190345.2928627-1-mic@digikod.net Signed-off-by: Mickaël Salaün Signed-off-by: Greg Kroah-Hartman --- security/landlock/fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 64ed7665455f..d328965f32f7 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -824,8 +824,8 @@ static int current_check_refer_path(struct dentry *const old_dentry, bool allow_parent1, allow_parent2; access_mask_t access_request_parent1, access_request_parent2; struct path mnt_dir; - layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS], - layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS]; + layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {}, + layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {}; if (!dom) return 0; From abd32d7f5c0294c1b2454c5a3b13b18446bac627 Mon Sep 17 00:00:00 2001 From: Alexander Ofitserov Date: Wed, 28 Feb 2024 14:47:03 +0300 Subject: [PATCH 077/216] gtp: fix use-after-free and null-ptr-deref in gtp_newlink() commit 616d82c3cfa2a2146dd7e3ae47bda7e877ee549e upstream. The gtp_link_ops operations structure for the subsystem must be registered after registering the gtp_net_ops pernet operations structure. Syzkaller hit 'general protection fault in gtp_genl_dump_pdp' bug: [ 1010.702740] gtp: GTP module unloaded [ 1010.715877] general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] SMP KASAN NOPTI [ 1010.715888] KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] [ 1010.715895] CPU: 1 PID: 128616 Comm: a.out Not tainted 6.8.0-rc6-std-def-alt1 #1 [ 1010.715899] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-alt1 04/01/2014 [ 1010.715908] RIP: 0010:gtp_newlink+0x4d7/0x9c0 [gtp] [ 1010.715915] Code: 80 3c 02 00 0f 85 41 04 00 00 48 8b bb d8 05 00 00 e8 ed f6 ff ff 48 89 c2 48 89 c5 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 4f 04 00 00 4c 89 e2 4c 8b 6d 00 48 b8 00 00 00 [ 1010.715920] RSP: 0018:ffff888020fbf180 EFLAGS: 00010203 [ 1010.715929] RAX: dffffc0000000000 RBX: ffff88800399c000 RCX: 0000000000000000 [ 1010.715933] RDX: 0000000000000001 RSI: ffffffff84805280 RDI: 0000000000000282 [ 1010.715938] RBP: 000000000000000d R08: 0000000000000001 R09: 0000000000000000 [ 1010.715942] R10: 0000000000000001 R11: 0000000000000001 R12: ffff88800399cc80 [ 1010.715947] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000400 [ 1010.715953] FS: 00007fd1509ab5c0(0000) GS:ffff88805b300000(0000) knlGS:0000000000000000 [ 1010.715958] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1010.715962] CR2: 0000000000000000 CR3: 000000001c07a000 CR4: 0000000000750ee0 [ 1010.715968] PKRU: 55555554 [ 1010.715972] Call Trace: [ 1010.715985] ? __die_body.cold+0x1a/0x1f [ 1010.715995] ? die_addr+0x43/0x70 [ 1010.716002] ? exc_general_protection+0x199/0x2f0 [ 1010.716016] ? asm_exc_general_protection+0x1e/0x30 [ 1010.716026] ? gtp_newlink+0x4d7/0x9c0 [gtp] [ 1010.716034] ? gtp_net_exit+0x150/0x150 [gtp] [ 1010.716042] __rtnl_newlink+0x1063/0x1700 [ 1010.716051] ? rtnl_setlink+0x3c0/0x3c0 [ 1010.716063] ? is_bpf_text_address+0xc0/0x1f0 [ 1010.716070] ? kernel_text_address.part.0+0xbb/0xd0 [ 1010.716076] ? __kernel_text_address+0x56/0xa0 [ 1010.716084] ? unwind_get_return_address+0x5a/0xa0 [ 1010.716091] ? create_prof_cpu_mask+0x30/0x30 [ 1010.716098] ? arch_stack_walk+0x9e/0xf0 [ 1010.716106] ? stack_trace_save+0x91/0xd0 [ 1010.716113] ? stack_trace_consume_entry+0x170/0x170 [ 1010.716121] ? __lock_acquire+0x15c5/0x5380 [ 1010.716139] ? mark_held_locks+0x9e/0xe0 [ 1010.716148] ? kmem_cache_alloc_trace+0x35f/0x3c0 [ 1010.716155] ? __rtnl_newlink+0x1700/0x1700 [ 1010.716160] rtnl_newlink+0x69/0xa0 [ 1010.716166] rtnetlink_rcv_msg+0x43b/0xc50 [ 1010.716172] ? rtnl_fdb_dump+0x9f0/0x9f0 [ 1010.716179] ? lock_acquire+0x1fe/0x560 [ 1010.716188] ? netlink_deliver_tap+0x12f/0xd50 [ 1010.716196] netlink_rcv_skb+0x14d/0x440 [ 1010.716202] ? rtnl_fdb_dump+0x9f0/0x9f0 [ 1010.716208] ? netlink_ack+0xab0/0xab0 [ 1010.716213] ? netlink_deliver_tap+0x202/0xd50 [ 1010.716220] ? netlink_deliver_tap+0x218/0xd50 [ 1010.716226] ? __virt_addr_valid+0x30b/0x590 [ 1010.716233] netlink_unicast+0x54b/0x800 [ 1010.716240] ? netlink_attachskb+0x870/0x870 [ 1010.716248] ? __check_object_size+0x2de/0x3b0 [ 1010.716254] netlink_sendmsg+0x938/0xe40 [ 1010.716261] ? netlink_unicast+0x800/0x800 [ 1010.716269] ? __import_iovec+0x292/0x510 [ 1010.716276] ? netlink_unicast+0x800/0x800 [ 1010.716284] __sock_sendmsg+0x159/0x190 [ 1010.716290] ____sys_sendmsg+0x712/0x880 [ 1010.716297] ? sock_write_iter+0x3d0/0x3d0 [ 1010.716304] ? __ia32_sys_recvmmsg+0x270/0x270 [ 1010.716309] ? lock_acquire+0x1fe/0x560 [ 1010.716315] ? drain_array_locked+0x90/0x90 [ 1010.716324] ___sys_sendmsg+0xf8/0x170 [ 1010.716331] ? sendmsg_copy_msghdr+0x170/0x170 [ 1010.716337] ? lockdep_init_map_type+0x2c7/0x860 [ 1010.716343] ? lockdep_hardirqs_on_prepare+0x430/0x430 [ 1010.716350] ? debug_mutex_init+0x33/0x70 [ 1010.716360] ? percpu_counter_add_batch+0x8b/0x140 [ 1010.716367] ? lock_acquire+0x1fe/0x560 [ 1010.716373] ? find_held_lock+0x2c/0x110 [ 1010.716384] ? __fd_install+0x1b6/0x6f0 [ 1010.716389] ? lock_downgrade+0x810/0x810 [ 1010.716396] ? __fget_light+0x222/0x290 [ 1010.716403] __sys_sendmsg+0xea/0x1b0 [ 1010.716409] ? __sys_sendmsg_sock+0x40/0x40 [ 1010.716419] ? lockdep_hardirqs_on_prepare+0x2b3/0x430 [ 1010.716425] ? syscall_enter_from_user_mode+0x1d/0x60 [ 1010.716432] do_syscall_64+0x30/0x40 [ 1010.716438] entry_SYSCALL_64_after_hwframe+0x62/0xc7 [ 1010.716444] RIP: 0033:0x7fd1508cbd49 [ 1010.716452] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ef 70 0d 00 f7 d8 64 89 01 48 [ 1010.716456] RSP: 002b:00007fff18872348 EFLAGS: 00000202 ORIG_RAX: 000000000000002e [ 1010.716463] RAX: ffffffffffffffda RBX: 000055f72bf0eac0 RCX: 00007fd1508cbd49 [ 1010.716468] RDX: 0000000000000000 RSI: 0000000020000280 RDI: 0000000000000006 [ 1010.716473] RBP: 00007fff18872360 R08: 00007fff18872360 R09: 00007fff18872360 [ 1010.716478] R10: 00007fff18872360 R11: 0000000000000202 R12: 000055f72bf0e1b0 [ 1010.716482] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 1010.716491] Modules linked in: gtp(+) udp_tunnel ib_core uinput af_packet rfkill qrtr joydev hid_generic usbhid hid kvm_intel iTCO_wdt intel_pmc_bxt iTCO_vendor_support kvm snd_hda_codec_generic ledtrig_audio irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel snd_hda_intel nls_utf8 snd_intel_dspcfg nls_cp866 psmouse aesni_intel vfat crypto_simd fat cryptd glue_helper snd_hda_codec pcspkr snd_hda_core i2c_i801 snd_hwdep i2c_smbus xhci_pci snd_pcm lpc_ich xhci_pci_renesas xhci_hcd qemu_fw_cfg tiny_power_button button sch_fq_codel vboxvideo drm_vram_helper drm_ttm_helper ttm vboxsf vboxguest snd_seq_midi snd_seq_midi_event snd_seq snd_rawmidi snd_seq_device snd_timer snd soundcore msr fuse efi_pstore dm_mod ip_tables x_tables autofs4 virtio_gpu virtio_dma_buf drm_kms_helper cec rc_core drm virtio_rng virtio_scsi rng_core virtio_balloon virtio_blk virtio_net virtio_console net_failover failover ahci libahci libata evdev scsi_mod input_leds serio_raw virtio_pci intel_agp [ 1010.716674] virtio_ring intel_gtt virtio [last unloaded: gtp] [ 1010.716693] ---[ end trace 04990a4ce61e174b ]--- Cc: stable@vger.kernel.org Signed-off-by: Alexander Ofitserov Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240228114703.465107-1-oficerovas@altlinux.org Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- drivers/net/gtp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 937dd9cf4fba..7086acfed5b9 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1902,26 +1902,26 @@ static int __init gtp_init(void) get_random_bytes(>p_h_initval, sizeof(gtp_h_initval)); - err = rtnl_link_register(>p_link_ops); + err = register_pernet_subsys(>p_net_ops); if (err < 0) goto error_out; - err = register_pernet_subsys(>p_net_ops); + err = rtnl_link_register(>p_link_ops); if (err < 0) - goto unreg_rtnl_link; + goto unreg_pernet_subsys; err = genl_register_family(>p_genl_family); if (err < 0) - goto unreg_pernet_subsys; + goto unreg_rtnl_link; pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", sizeof(struct pdp_ctx)); return 0; -unreg_pernet_subsys: - unregister_pernet_subsys(>p_net_ops); unreg_rtnl_link: rtnl_link_unregister(>p_link_ops); +unreg_pernet_subsys: + unregister_pernet_subsys(>p_net_ops); error_out: pr_err("error loading GTP module loaded\n"); return err; From 930e826962d9f01dcd2220176134427358d112f2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Feb 2024 20:08:35 +0100 Subject: [PATCH 078/216] wifi: nl80211: reject iftype change with mesh ID change commit f78c1375339a291cba492a70eaf12ec501d28a8e upstream. It's currently possible to change the mesh ID when the interface isn't yet in mesh mode, at the same time as changing it into mesh mode. This leads to an overwrite of data in the wdev->u union for the interface type it currently has, causing cfg80211_change_iface() to do wrong things when switching. We could probably allow setting an interface to mesh while setting the mesh ID at the same time by doing a different order of operations here, but realistically there's no userspace that's going to do this, so just disallow changes in iftype when setting mesh ID. Cc: stable@vger.kernel.org Fixes: 29cbe68c516a ("cfg80211/mac80211: add mesh join/leave commands") Reported-by: syzbot+dd4779978217b1973180@syzkaller.appspotmail.com Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c259d3227a9e..1a3bd554e258 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4137,6 +4137,8 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (ntype != NL80211_IFTYPE_MESH_POINT) return -EINVAL; + if (otype != NL80211_IFTYPE_MESH_POINT) + return -EINVAL; if (netif_running(dev)) return -EBUSY; From c34adc20b91a8e55e048b18d63f4f4ae003ecf8f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 23 Feb 2024 16:38:43 +0000 Subject: [PATCH 079/216] btrfs: fix double free of anonymous device after snapshot creation failure commit e2b54eaf28df0c978626c9736b94f003b523b451 upstream. When creating a snapshot we may do a double free of an anonymous device in case there's an error committing the transaction. The second free may result in freeing an anonymous device number that was allocated by some other subsystem in the kernel or another btrfs filesystem. The steps that lead to this: 1) At ioctl.c:create_snapshot() we allocate an anonymous device number and assign it to pending_snapshot->anon_dev; 2) Then we call btrfs_commit_transaction() and end up at transaction.c:create_pending_snapshot(); 3) There we call btrfs_get_new_fs_root() and pass it the anonymous device number stored in pending_snapshot->anon_dev; 4) btrfs_get_new_fs_root() frees that anonymous device number because btrfs_lookup_fs_root() returned a root - someone else did a lookup of the new root already, which could some task doing backref walking; 5) After that some error happens in the transaction commit path, and at ioctl.c:create_snapshot() we jump to the 'fail' label, and after that we free again the same anonymous device number, which in the meanwhile may have been reallocated somewhere else, because pending_snapshot->anon_dev still has the same value as in step 1. Recently syzbot ran into this and reported the following trace: ------------[ cut here ]------------ ida_free called for id=51 which is not allocated. WARNING: CPU: 1 PID: 31038 at lib/idr.c:525 ida_free+0x370/0x420 lib/idr.c:525 Modules linked in: CPU: 1 PID: 31038 Comm: syz-executor.2 Not tainted 6.8.0-rc4-syzkaller-00410-gc02197fc9076 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 RIP: 0010:ida_free+0x370/0x420 lib/idr.c:525 Code: 10 42 80 3c 28 (...) RSP: 0018:ffffc90015a67300 EFLAGS: 00010246 RAX: be5130472f5dd000 RBX: 0000000000000033 RCX: 0000000000040000 RDX: ffffc90009a7a000 RSI: 000000000003ffff RDI: 0000000000040000 RBP: ffffc90015a673f0 R08: ffffffff81577992 R09: 1ffff92002b4cdb4 R10: dffffc0000000000 R11: fffff52002b4cdb5 R12: 0000000000000246 R13: dffffc0000000000 R14: ffffffff8e256b80 R15: 0000000000000246 FS: 00007fca3f4b46c0(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f167a17b978 CR3: 000000001ed26000 CR4: 0000000000350ef0 Call Trace: btrfs_get_root_ref+0xa48/0xaf0 fs/btrfs/disk-io.c:1346 create_pending_snapshot+0xff2/0x2bc0 fs/btrfs/transaction.c:1837 create_pending_snapshots+0x195/0x1d0 fs/btrfs/transaction.c:1931 btrfs_commit_transaction+0xf1c/0x3740 fs/btrfs/transaction.c:2404 create_snapshot+0x507/0x880 fs/btrfs/ioctl.c:848 btrfs_mksubvol+0x5d0/0x750 fs/btrfs/ioctl.c:998 btrfs_mksnapshot+0xb5/0xf0 fs/btrfs/ioctl.c:1044 __btrfs_ioctl_snap_create+0x387/0x4b0 fs/btrfs/ioctl.c:1306 btrfs_ioctl_snap_create_v2+0x1ca/0x400 fs/btrfs/ioctl.c:1393 btrfs_ioctl+0xa74/0xd40 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0xfe/0x170 fs/ioctl.c:857 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 RIP: 0033:0x7fca3e67dda9 Code: 28 00 00 00 (...) RSP: 002b:00007fca3f4b40c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fca3e7abf80 RCX: 00007fca3e67dda9 RDX: 00000000200005c0 RSI: 0000000050009417 RDI: 0000000000000003 RBP: 00007fca3e6ca47a R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fca3e7abf80 R15: 00007fff6bf95658 Where we get an explicit message where we attempt to free an anonymous device number that is not currently allocated. It happens in a different code path from the example below, at btrfs_get_root_ref(), so this change may not fix the case triggered by syzbot. To fix at least the code path from the example above, change btrfs_get_root_ref() and its callers to receive a dev_t pointer argument for the anonymous device number, so that in case it frees the number, it also resets it to 0, so that up in the call chain we don't attempt to do the double free. CC: stable@vger.kernel.org # 5.10+ Link: https://lore.kernel.org/linux-btrfs/000000000000f673a1061202f630@google.com/ Fixes: e03ee2fe873e ("btrfs: do not ASSERT() if the newly created subvolume already got read") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 22 +++++++++++----------- fs/btrfs/disk-io.h | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/transaction.c | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0d1b05ded1e3..5756edb37c61 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1643,12 +1643,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) * * @objectid: root id * @anon_dev: preallocated anonymous block device number for new roots, - * pass 0 for new allocation. + * pass NULL for a new allocation. * @check_ref: whether to check root item references, If true, return -ENOENT * for orphan roots */ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev, + u64 objectid, dev_t *anon_dev, bool check_ref) { struct btrfs_root *root; @@ -1668,9 +1668,9 @@ again: * that common but still possible. In that case, we just need * to free the anon_dev. */ - if (unlikely(anon_dev)) { - free_anon_bdev(anon_dev); - anon_dev = 0; + if (unlikely(anon_dev && *anon_dev)) { + free_anon_bdev(*anon_dev); + *anon_dev = 0; } if (check_ref && btrfs_root_refs(&root->root_item) == 0) { @@ -1692,7 +1692,7 @@ again: goto fail; } - ret = btrfs_init_fs_root(root, anon_dev); + ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0); if (ret) goto fail; @@ -1728,7 +1728,7 @@ fail: * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. */ - if (anon_dev) + if (anon_dev && *anon_dev) root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); @@ -1744,7 +1744,7 @@ fail: struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref) { - return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); + return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref); } /* @@ -1752,11 +1752,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, * the anonymous block device id * * @objectid: tree objectid - * @anon_dev: if zero, allocate a new anonymous block device or use the - * parameter value + * @anon_dev: if NULL, allocate a new anonymous block device or use the + * parameter value if not NULL */ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev) + u64 objectid, dev_t *anon_dev) { return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 7322af63c0cc..24bddca86e9c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -65,7 +65,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev); + u64 objectid, dev_t *anon_dev); struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 196e222749cc..64b37afb7c87 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -708,7 +708,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, free_extent_buffer(leaf); leaf = NULL; - new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); + new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 60db4c3b82fa..b172091f4261 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1809,7 +1809,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); pending->snap = NULL; From f590040ce2b712177306b03c2a63b16f7d48d3c8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 14 Feb 2024 16:19:24 +0100 Subject: [PATCH 080/216] btrfs: dev-replace: properly validate device names commit 9845664b9ee47ce7ee7ea93caf47d39a9d4552c4 upstream. There's a syzbot report that device name buffers passed to device replace are not properly checked for string termination which could lead to a read out of bounds in getname_kernel(). Add a helper that validates both source and target device name buffers. For devid as the source initialize the buffer to empty string in case something tries to read it later. This was originally analyzed and fixed in a different way by Edward Adam Davis (see links). Link: https://lore.kernel.org/linux-btrfs/000000000000d1a1d1060cc9c5e7@google.com/ Link: https://lore.kernel.org/linux-btrfs/tencent_44CA0665C9836EF9EEC80CB9E7E206DF5206@qq.com/ CC: stable@vger.kernel.org # 4.19+ CC: Edward Adam Davis Reported-and-tested-by: syzbot+33f23b49ac24f986c9e8@syzkaller.appspotmail.com Reviewed-by: Boris Burkov Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/dev-replace.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61e58066b5fd..9c856a73d533 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -740,6 +740,23 @@ leave: return ret; } +static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args) +{ + if (args->start.srcdevid == 0) { + if (memchr(args->start.srcdev_name, 0, + sizeof(args->start.srcdev_name)) == NULL) + return -ENAMETOOLONG; + } else { + args->start.srcdev_name[0] = 0; + } + + if (memchr(args->start.tgtdev_name, 0, + sizeof(args->start.tgtdev_name)) == NULL) + return -ENAMETOOLONG; + + return 0; +} + int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { @@ -752,10 +769,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, default: return -EINVAL; } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; + ret = btrfs_check_replace_dev_names(args); + if (ret < 0) + return ret; ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, args->start.srcdevid, From 444d70889d199b7f74eec45f14768a83c0b04d73 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 16 Feb 2024 22:17:10 +0000 Subject: [PATCH 081/216] btrfs: send: don't issue unnecessary zero writes for trailing hole commit 5897710b28cabab04ea6c7547f27b7989de646ae upstream. If we have a sparse file with a trailing hole (from the last extent's end to i_size) and then create an extent in the file that ends before the file's i_size, then when doing an incremental send we will issue a write full of zeroes for the range that starts immediately after the new extent ends up to i_size. While this isn't incorrect because the file ends up with exactly the same data, it unnecessarily results in using extra space at the destination with one or more extents full of zeroes instead of having a hole. In same cases this results in using megabytes or even gigabytes of unnecessary space. Example, reproducer: $ cat test.sh #!/bin/bash DEV=/dev/sdh MNT=/mnt/sdh mkfs.btrfs -f $DEV mount $DEV $MNT # Create 1G sparse file. xfs_io -f -c "truncate 1G" $MNT/foobar # Create base snapshot. btrfs subvolume snapshot -r $MNT $MNT/mysnap1 # Create send stream (full send) for the base snapshot. btrfs send -f /tmp/1.snap $MNT/mysnap1 # Now write one extent at the beginning of the file and one somewhere # in the middle, leaving a gap between the end of this second extent # and the file's size. xfs_io -c "pwrite -S 0xab 0 128K" \ -c "pwrite -S 0xcd 512M 128K" \ $MNT/foobar # Now create a second snapshot which is going to be used for an # incremental send operation. btrfs subvolume snapshot -r $MNT $MNT/mysnap2 # Create send stream (incremental send) for the second snapshot. btrfs send -p $MNT/mysnap1 -f /tmp/2.snap $MNT/mysnap2 # Now recreate the filesystem by receiving both send streams and # verify we get the same content that the original filesystem had # and file foobar has only two extents with a size of 128K each. umount $MNT mkfs.btrfs -f $DEV mount $DEV $MNT btrfs receive -f /tmp/1.snap $MNT btrfs receive -f /tmp/2.snap $MNT echo -e "\nFile fiemap in the second snapshot:" # Should have: # # 128K extent at file range [0, 128K[ # hole at file range [128K, 512M[ # 128K extent file range [512M, 512M + 128K[ # hole at file range [512M + 128K, 1G[ xfs_io -r -c "fiemap -v" $MNT/mysnap2/foobar # File should be using 256K of data (two 128K extents). echo -e "\nSpace used by the file: $(du -h $MNT/mysnap2/foobar | cut -f 1)" umount $MNT Running the test, we can see with fiemap that we get an extent for the range [512M, 1G[, while in the source filesystem we have an extent for the range [512M, 512M + 128K[ and a hole for the rest of the file (the range [512M + 128K, 1G[): $ ./test.sh (...) File fiemap in the second snapshot: /mnt/sdh/mysnap2/foobar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..255]: 26624..26879 256 0x0 1: [256..1048575]: hole 1048320 2: [1048576..2097151]: 2156544..3205119 1048576 0x1 Space used by the file: 513M This happens because once we finish processing an inode, at finish_inode_if_needed(), we always issue a hole (write operations full of zeros) if there's a gap between the end of the last processed extent and the file's size, even if that range is already a hole in the parent snapshot. Fix this by issuing the hole only if the range is not already a hole. After this change, running the test above, we get the expected layout: $ ./test.sh (...) File fiemap in the second snapshot: /mnt/sdh/mysnap2/foobar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..255]: 26624..26879 256 0x0 1: [256..1048575]: hole 1048320 2: [1048576..1048831]: 26880..27135 256 0x1 3: [1048832..2097151]: hole 1048320 Space used by the file: 256K A test case for fstests will follow soon. CC: stable@vger.kernel.org # 6.1+ Reported-by: Dorai Ashok S A Link: https://lore.kernel.org/linux-btrfs/c0bf7818-9c45-46a8-b3d3-513230d0c86e@inix.me/ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/send.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a75669972dc7..9f7ffd9ef6fd 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6462,11 +6462,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret) goto out; } - if (sctx->cur_inode_last_extent < - sctx->cur_inode_size) { - ret = send_hole(sctx, sctx->cur_inode_size); - if (ret) + if (sctx->cur_inode_last_extent < sctx->cur_inode_size) { + ret = range_is_hole_in_parent(sctx, + sctx->cur_inode_last_extent, + sctx->cur_inode_size); + if (ret < 0) { goto out; + } else if (ret == 0) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret < 0) + goto out; + } else { + /* Range is already a hole, skip. */ + ret = 0; + } } } if (need_truncate) { From 2e443ed55fe3ffb08327b331a9f45e9382413c94 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 9 Aug 2023 15:06:00 -0400 Subject: [PATCH 082/216] Revert "drm/amd/pm: resolve reboot exception for si oland" commit 955558030954b9637b41c97b730f9b38c92ac488 upstream. This reverts commit e490d60a2f76bff636c68ce4fe34c1b6c34bbd86. This causes hangs on SI when DC is enabled and errors on driver reboot and power off cycles. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3216 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2755 Reviewed-by: Yang Wang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c index dc0a6fba7050..ff1032de4f76 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c @@ -6925,6 +6925,23 @@ static int si_dpm_enable(struct amdgpu_device *adev) return 0; } +static int si_set_temperature_range(struct amdgpu_device *adev) +{ + int ret; + + ret = si_thermal_enable_alert(adev, false); + if (ret) + return ret; + ret = si_thermal_set_temperature_range(adev, R600_TEMP_RANGE_MIN, R600_TEMP_RANGE_MAX); + if (ret) + return ret; + ret = si_thermal_enable_alert(adev, true); + if (ret) + return ret; + + return ret; +} + static void si_dpm_disable(struct amdgpu_device *adev) { struct rv7xx_power_info *pi = rv770_get_pi(adev); @@ -7608,6 +7625,18 @@ static int si_dpm_process_interrupt(struct amdgpu_device *adev, static int si_dpm_late_init(void *handle) { + int ret; + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + + if (!adev->pm.dpm_enabled) + return 0; + + ret = si_set_temperature_range(adev); + if (ret) + return ret; +#if 0 //TODO ? + si_dpm_powergate_uvd(adev, true); +#endif return 0; } From 8dafc066c54669384ce01b4bbdfe9708a085afb9 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 19 Feb 2024 12:18:52 +0000 Subject: [PATCH 083/216] drm/buddy: fix range bias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f41900e4a6ef019d64a70394b0e0c3bd048d4ec8 upstream. There is a corner case here where start/end is after/before the block range we are currently checking. If so we need to be sure that splitting the block will eventually give use the block size we need. To do that we should adjust the block range to account for the start/end, and only continue with the split if the size/alignment will fit the requested size. Not doing so can result in leaving split blocks unmerged when it eventually fails. Fixes: afea229fe102 ("drm: improve drm_buddy_alloc function") Signed-off-by: Matthew Auld Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: # v5.18+ Reviewed-by: Arunpravin Paneer Selvam Link: https://patchwork.freedesktop.org/patch/msgid/20240219121851.25774-4-matthew.auld@intel.com Signed-off-by: Christian König Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_buddy.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c index 7098f125b54a..fd32041f8226 100644 --- a/drivers/gpu/drm/drm_buddy.c +++ b/drivers/gpu/drm/drm_buddy.c @@ -332,6 +332,7 @@ alloc_range_bias(struct drm_buddy *mm, u64 start, u64 end, unsigned int order) { + u64 req_size = mm->chunk_size << order; struct drm_buddy_block *block; struct drm_buddy_block *buddy; LIST_HEAD(dfs); @@ -367,6 +368,15 @@ alloc_range_bias(struct drm_buddy *mm, if (drm_buddy_block_is_allocated(block)) continue; + if (block_start < start || block_end > end) { + u64 adjusted_start = max(block_start, start); + u64 adjusted_end = min(block_end, end); + + if (round_down(adjusted_end + 1, req_size) <= + round_up(adjusted_start, req_size)) + continue; + } + if (contains(start, end, block_start, block_end) && order == drm_buddy_block_order(block)) { /* From 237ecf1afe6c22534fa43abdf2bf0b0f52de0aaa Mon Sep 17 00:00:00 2001 From: Peng Ma Date: Thu, 1 Feb 2024 16:50:07 -0500 Subject: [PATCH 084/216] dmaengine: fsl-qdma: fix SoC may hang on 16 byte unaligned read commit 9d739bccf261dd93ec1babf82f5c5d71dd4caa3e upstream. There is chip (ls1028a) errata: The SoC may hang on 16 byte unaligned read transactions by QDMA. Unaligned read transactions initiated by QDMA may stall in the NOC (Network On-Chip), causing a deadlock condition. Stalled transactions will trigger completion timeouts in PCIe controller. Workaround: Enable prefetch by setting the source descriptor prefetchable bit ( SD[PF] = 1 ). Implement this workaround. Cc: stable@vger.kernel.org Fixes: b092529e0aa0 ("dmaengine: fsl-qdma: Add qDMA controller driver for Layerscape SoCs") Signed-off-by: Peng Ma Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20240201215007.439503-1-Frank.Li@nxp.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/fsl-qdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/fsl-qdma.c b/drivers/dma/fsl-qdma.c index f383f219ed00..915fa0cbb553 100644 --- a/drivers/dma/fsl-qdma.c +++ b/drivers/dma/fsl-qdma.c @@ -109,6 +109,7 @@ #define FSL_QDMA_CMD_WTHROTL_OFFSET 20 #define FSL_QDMA_CMD_DSEN_OFFSET 19 #define FSL_QDMA_CMD_LWC_OFFSET 16 +#define FSL_QDMA_CMD_PF BIT(17) /* Field definition for Descriptor status */ #define QDMA_CCDF_STATUS_RTE BIT(5) @@ -384,7 +385,8 @@ static void fsl_qdma_comp_fill_memcpy(struct fsl_qdma_comp *fsl_comp, qdma_csgf_set_f(csgf_dest, len); /* Descriptor Buffer */ cmd = cpu_to_le32(FSL_QDMA_CMD_RWTTYPE << - FSL_QDMA_CMD_RWTTYPE_OFFSET); + FSL_QDMA_CMD_RWTTYPE_OFFSET) | + FSL_QDMA_CMD_PF; sdf->data = QDMA_SDDF_CMD(cmd); cmd = cpu_to_le32(FSL_QDMA_CMD_RWTTYPE << From 034e2d70b5c7f578200ad09955aeb2aa65d1164a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 23 Feb 2024 14:20:35 +0100 Subject: [PATCH 085/216] crypto: arm64/neonbs - fix out-of-bounds access on short input commit 1c0cf6d19690141002889d72622b90fc01562ce4 upstream. The bit-sliced implementation of AES-CTR operates on blocks of 128 bytes, and will fall back to the plain NEON version for tail blocks or inputs that are shorter than 128 bytes to begin with. It will call straight into the plain NEON asm helper, which performs all memory accesses in granules of 16 bytes (the size of a NEON register). For this reason, the associated plain NEON glue code will copy inputs shorter than 16 bytes into a temporary buffer, given that this is a rare occurrence and it is not worth the effort to work around this in the asm code. The fallback from the bit-sliced NEON version fails to take this into account, potentially resulting in out-of-bounds accesses. So clone the same workaround, and use a temp buffer for short in/outputs. Fixes: fc074e130051 ("crypto: arm64/aes-neonbs-ctr - fallback to plain NEON for final chunk") Cc: Reported-by: syzbot+f1ceaa1a09ab891e1934@syzkaller.appspotmail.com Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/arm64/crypto/aes-neonbs-glue.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index bac4cabef607..467ac2f768ac 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -227,8 +227,19 @@ static int ctr_encrypt(struct skcipher_request *req) src += blocks * AES_BLOCK_SIZE; } if (nbytes && walk.nbytes == walk.total) { + u8 buf[AES_BLOCK_SIZE]; + u8 *d = dst; + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds, nbytes, walk.iv); + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(d, dst, nbytes); + nbytes = 0; } kernel_neon_end(); From 300111cd9042d133d1edd0255f50556211125ce9 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 22 Feb 2024 17:30:53 +0100 Subject: [PATCH 086/216] dmaengine: ptdma: use consistent DMA masks commit df2515a17914ecfc2a0594509deaf7fcb8d191ac upstream. The PTDMA driver sets DMA masks in two different places for the same device inconsistently. First call is in pt_pci_probe(), where it uses 48bit mask. The second call is in pt_dmaengine_register(), where it uses a 64bit mask. Using 64bit dma mask causes IO_PAGE_FAULT errors on DMA transfers between main memory and other devices. Without the extra call it works fine. Additionally the second call doesn't check the return value so it can silently fail. Remove the superfluous dma_set_mask() call and only use 48bit mask. Cc: stable@vger.kernel.org Fixes: b0b4a6b10577 ("dmaengine: ptdma: register PTDMA controller as a DMA resource") Reviewed-by: Basavaraj Natikar Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20240222163053.13842-1-tstruk@gigaio.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/ptdma/ptdma-dmaengine.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/dma/ptdma/ptdma-dmaengine.c b/drivers/dma/ptdma/ptdma-dmaengine.c index 1aa65e5de0f3..f79240734807 100644 --- a/drivers/dma/ptdma/ptdma-dmaengine.c +++ b/drivers/dma/ptdma/ptdma-dmaengine.c @@ -385,8 +385,6 @@ int pt_dmaengine_register(struct pt_device *pt) chan->vc.desc_free = pt_do_cleanup; vchan_init(&chan->vc, dma_dev); - dma_set_mask_and_coherent(pt->dev, DMA_BIT_MASK(64)); - ret = dma_async_device_register(dma_dev); if (ret) goto err_reg; From 474d521da890b3e3585335fb80a6044cb2553d99 Mon Sep 17 00:00:00 2001 From: Curtis Klein Date: Thu, 1 Feb 2024 17:04:06 -0500 Subject: [PATCH 087/216] dmaengine: fsl-qdma: init irq after reg initialization commit 87a39071e0b639f45e05d296cc0538eef44ec0bd upstream. Initialize the qDMA irqs after the registers are configured so that interrupts that may have been pending from a primary kernel don't get processed by the irq handler before it is ready to and cause panic with the following trace: Call trace: fsl_qdma_queue_handler+0xf8/0x3e8 __handle_irq_event_percpu+0x78/0x2b0 handle_irq_event_percpu+0x1c/0x68 handle_irq_event+0x44/0x78 handle_fasteoi_irq+0xc8/0x178 generic_handle_irq+0x24/0x38 __handle_domain_irq+0x90/0x100 gic_handle_irq+0x5c/0xb8 el1_irq+0xb8/0x180 _raw_spin_unlock_irqrestore+0x14/0x40 __setup_irq+0x4bc/0x798 request_threaded_irq+0xd8/0x190 devm_request_threaded_irq+0x74/0xe8 fsl_qdma_probe+0x4d4/0xca8 platform_drv_probe+0x50/0xa0 really_probe+0xe0/0x3f8 driver_probe_device+0x64/0x130 device_driver_attach+0x6c/0x78 __driver_attach+0xbc/0x158 bus_for_each_dev+0x5c/0x98 driver_attach+0x20/0x28 bus_add_driver+0x158/0x220 driver_register+0x60/0x110 __platform_driver_register+0x44/0x50 fsl_qdma_driver_init+0x18/0x20 do_one_initcall+0x48/0x258 kernel_init_freeable+0x1a4/0x23c kernel_init+0x10/0xf8 ret_from_fork+0x10/0x18 Cc: stable@vger.kernel.org Fixes: b092529e0aa0 ("dmaengine: fsl-qdma: Add qDMA controller driver for Layerscape SoCs") Signed-off-by: Curtis Klein Signed-off-by: Yi Zhao Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20240201220406.440145-1-Frank.Li@nxp.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/fsl-qdma.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/dma/fsl-qdma.c b/drivers/dma/fsl-qdma.c index 915fa0cbb553..7082a5a6814a 100644 --- a/drivers/dma/fsl-qdma.c +++ b/drivers/dma/fsl-qdma.c @@ -1203,10 +1203,6 @@ static int fsl_qdma_probe(struct platform_device *pdev) if (!fsl_qdma->queue) return -ENOMEM; - ret = fsl_qdma_irq_init(pdev, fsl_qdma); - if (ret) - return ret; - fsl_qdma->irq_base = platform_get_irq_byname(pdev, "qdma-queue0"); if (fsl_qdma->irq_base < 0) return fsl_qdma->irq_base; @@ -1245,19 +1241,22 @@ static int fsl_qdma_probe(struct platform_device *pdev) platform_set_drvdata(pdev, fsl_qdma); - ret = dma_async_device_register(&fsl_qdma->dma_dev); - if (ret) { - dev_err(&pdev->dev, - "Can't register NXP Layerscape qDMA engine.\n"); - return ret; - } - ret = fsl_qdma_reg_init(fsl_qdma); if (ret) { dev_err(&pdev->dev, "Can't Initialize the qDMA engine.\n"); return ret; } + ret = fsl_qdma_irq_init(pdev, fsl_qdma); + if (ret) + return ret; + + ret = dma_async_device_register(&fsl_qdma->dma_dev); + if (ret) { + dev_err(&pdev->dev, "Can't register NXP Layerscape qDMA engine.\n"); + return ret; + } + return 0; } From 70af82bb9c897faa25a44e4181f36c60312b71ef Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Wed, 7 Feb 2024 15:39:51 +0100 Subject: [PATCH 088/216] mmc: mmci: stm32: fix DMA API overlapping mappings warning commit 6b1ba3f9040be5efc4396d86c9752cdc564730be upstream. Turning on CONFIG_DMA_API_DEBUG_SG results in the following warning: DMA-API: mmci-pl18x 48220000.mmc: cacheline tracking EEXIST, overlapping mappings aren't supported WARNING: CPU: 1 PID: 51 at kernel/dma/debug.c:568 add_dma_entry+0x234/0x2f4 Modules linked in: CPU: 1 PID: 51 Comm: kworker/1:2 Not tainted 6.1.28 #1 Hardware name: STMicroelectronics STM32MP257F-EV1 Evaluation Board (DT) Workqueue: events_freezable mmc_rescan Call trace: add_dma_entry+0x234/0x2f4 debug_dma_map_sg+0x198/0x350 __dma_map_sg_attrs+0xa0/0x110 dma_map_sg_attrs+0x10/0x2c sdmmc_idma_prep_data+0x80/0xc0 mmci_prep_data+0x38/0x84 mmci_start_data+0x108/0x2dc mmci_request+0xe4/0x190 __mmc_start_request+0x68/0x140 mmc_start_request+0x94/0xc0 mmc_wait_for_req+0x70/0x100 mmc_send_tuning+0x108/0x1ac sdmmc_execute_tuning+0x14c/0x210 mmc_execute_tuning+0x48/0xec mmc_sd_init_uhs_card.part.0+0x208/0x464 mmc_sd_init_card+0x318/0x89c mmc_attach_sd+0xe4/0x180 mmc_rescan+0x244/0x320 DMA API debug brings to light leaking dma-mappings as dma_map_sg and dma_unmap_sg are not correctly balanced. If an error occurs in mmci_cmd_irq function, only mmci_dma_error function is called and as this API is not managed on stm32 variant, dma_unmap_sg is never called in this error path. Signed-off-by: Christophe Kerello Fixes: 46b723dd867d ("mmc: mmci: add stm32 sdmmc variant") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240207143951.938144-1-christophe.kerello@foss.st.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/mmci_stm32_sdmmc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/mmc/host/mmci_stm32_sdmmc.c b/drivers/mmc/host/mmci_stm32_sdmmc.c index 60bca78a72b1..0511583ffa76 100644 --- a/drivers/mmc/host/mmci_stm32_sdmmc.c +++ b/drivers/mmc/host/mmci_stm32_sdmmc.c @@ -200,6 +200,8 @@ static int sdmmc_idma_start(struct mmci_host *host, unsigned int *datactrl) struct scatterlist *sg; int i; + host->dma_in_progress = true; + if (!host->variant->dma_lli || data->sg_len == 1 || idma->use_bounce_buffer) { u32 dma_addr; @@ -238,9 +240,30 @@ static int sdmmc_idma_start(struct mmci_host *host, unsigned int *datactrl) return 0; } +static void sdmmc_idma_error(struct mmci_host *host) +{ + struct mmc_data *data = host->data; + struct sdmmc_idma *idma = host->dma_priv; + + if (!dma_inprogress(host)) + return; + + writel_relaxed(0, host->base + MMCI_STM32_IDMACTRLR); + host->dma_in_progress = false; + data->host_cookie = 0; + + if (!idma->use_bounce_buffer) + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, + mmc_get_dma_dir(data)); +} + static void sdmmc_idma_finalize(struct mmci_host *host, struct mmc_data *data) { + if (!dma_inprogress(host)) + return; + writel_relaxed(0, host->base + MMCI_STM32_IDMACTRLR); + host->dma_in_progress = false; if (!data->host_cookie) sdmmc_idma_unprep_data(host, data, 0); @@ -567,6 +590,7 @@ static struct mmci_host_ops sdmmc_variant_ops = { .dma_setup = sdmmc_idma_setup, .dma_start = sdmmc_idma_start, .dma_finalize = sdmmc_idma_finalize, + .dma_error = sdmmc_idma_error, .set_clkreg = mmci_sdmmc_set_clkreg, .set_pwrreg = mmci_sdmmc_set_pwrreg, .busy_complete = sdmmc_busy_complete, From bc9f87a41d185d7678c5742a4f5952df04bf2375 Mon Sep 17 00:00:00 2001 From: Ivan Semenov Date: Tue, 6 Feb 2024 19:28:45 +0200 Subject: [PATCH 089/216] mmc: core: Fix eMMC initialization with 1-bit bus connection commit ff3206d2186d84e4f77e1378ba1d225633f17b9b upstream. Initializing an eMMC that's connected via a 1-bit bus is current failing, if the HW (DT) informs that 4-bit bus is supported. In fact this is a regression, as we were earlier capable of falling back to 1-bit mode, when switching to 4/8-bit bus failed. Therefore, let's restore the behaviour. Log for Samsung eMMC 5.1 chip connected via 1bit bus (only D0 pin) Before patch: [134509.044225] mmc0: switch to bus width 4 failed [134509.044509] mmc0: new high speed MMC card at address 0001 [134509.054594] mmcblk0: mmc0:0001 BGUF4R 29.1 GiB [134509.281602] mmc0: switch to bus width 4 failed [134509.282638] I/O error, dev mmcblk0, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2 [134509.282657] Buffer I/O error on dev mmcblk0, logical block 0, async page read [134509.284598] I/O error, dev mmcblk0, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2 [134509.284602] Buffer I/O error on dev mmcblk0, logical block 0, async page read [134509.284609] ldm_validate_partition_table(): Disk read failed. [134509.286495] I/O error, dev mmcblk0, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2 [134509.286500] Buffer I/O error on dev mmcblk0, logical block 0, async page read [134509.288303] I/O error, dev mmcblk0, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2 [134509.288308] Buffer I/O error on dev mmcblk0, logical block 0, async page read [134509.289540] I/O error, dev mmcblk0, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2 [134509.289544] Buffer I/O error on dev mmcblk0, logical block 0, async page read [134509.289553] mmcblk0: unable to read partition table [134509.289728] mmcblk0boot0: mmc0:0001 BGUF4R 31.9 MiB [134509.290283] mmcblk0boot1: mmc0:0001 BGUF4R 31.9 MiB [134509.294577] I/O error, dev mmcblk0, sector 0 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 2 [134509.295835] I/O error, dev mmcblk0, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2 [134509.295841] Buffer I/O error on dev mmcblk0, logical block 0, async page read After patch: [134551.089613] mmc0: switch to bus width 4 failed [134551.090377] mmc0: new high speed MMC card at address 0001 [134551.102271] mmcblk0: mmc0:0001 BGUF4R 29.1 GiB [134551.113365] mmcblk0: p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17 p18 p19 p20 p21 [134551.114262] mmcblk0boot0: mmc0:0001 BGUF4R 31.9 MiB [134551.114925] mmcblk0boot1: mmc0:0001 BGUF4R 31.9 MiB Fixes: 577fb13199b1 ("mmc: rework selection of bus speed mode") Cc: stable@vger.kernel.org Signed-off-by: Ivan Semenov Link: https://lore.kernel.org/r/20240206172845.34316-1-ivan@semenov.dev Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/mmc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index a46ce0868fe1..3a927452a650 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1007,10 +1007,12 @@ static int mmc_select_bus_width(struct mmc_card *card) static unsigned ext_csd_bits[] = { EXT_CSD_BUS_WIDTH_8, EXT_CSD_BUS_WIDTH_4, + EXT_CSD_BUS_WIDTH_1, }; static unsigned bus_widths[] = { MMC_BUS_WIDTH_8, MMC_BUS_WIDTH_4, + MMC_BUS_WIDTH_1, }; struct mmc_host *host = card->host; unsigned idx, bus_width = 0; From c65c475560851291ad64272d4b85b55e70c4adbb Mon Sep 17 00:00:00 2001 From: Elad Nachman Date: Thu, 22 Feb 2024 21:17:14 +0200 Subject: [PATCH 090/216] mmc: sdhci-xenon: add timeout for PHY init complete commit 09e23823ae9a3e2d5d20f2e1efe0d6e48cef9129 upstream. AC5X spec says PHY init complete bit must be polled until zero. We see cases in which timeout can take longer than the standard calculation on AC5X, which is expected following the spec comment above. According to the spec, we must wait as long as it takes for that bit to toggle on AC5X. Cap that with 100 delay loops so we won't get stuck forever. Fixes: 06c8b667ff5b ("mmc: sdhci-xenon: Add support to PHYs of Marvell Xenon SDHC") Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Elad Nachman Link: https://lore.kernel.org/r/20240222191714.1216470-3-enachman@marvell.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-xenon-phy.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/mmc/host/sdhci-xenon-phy.c b/drivers/mmc/host/sdhci-xenon-phy.c index 8cf3a375de65..52a8e1217124 100644 --- a/drivers/mmc/host/sdhci-xenon-phy.c +++ b/drivers/mmc/host/sdhci-xenon-phy.c @@ -109,6 +109,8 @@ #define XENON_EMMC_PHY_LOGIC_TIMING_ADJUST (XENON_EMMC_PHY_REG_BASE + 0x18) #define XENON_LOGIC_TIMING_VALUE 0x00AA8977 +#define XENON_MAX_PHY_TIMEOUT_LOOPS 100 + /* * List offset of PHY registers and some special register values * in eMMC PHY 5.0 or eMMC PHY 5.1 @@ -259,18 +261,27 @@ static int xenon_emmc_phy_init(struct sdhci_host *host) /* get the wait time */ wait /= clock; wait++; - /* wait for host eMMC PHY init completes */ - udelay(wait); - reg = sdhci_readl(host, phy_regs->timing_adj); - reg &= XENON_PHY_INITIALIZAION; - if (reg) { + /* + * AC5X spec says bit must be polled until zero. + * We see cases in which timeout can take longer + * than the standard calculation on AC5X, which is + * expected following the spec comment above. + * According to the spec, we must wait as long as + * it takes for that bit to toggle on AC5X. + * Cap that with 100 delay loops so we won't get + * stuck here forever: + */ + + ret = read_poll_timeout(sdhci_readl, reg, + !(reg & XENON_PHY_INITIALIZAION), + wait, XENON_MAX_PHY_TIMEOUT_LOOPS * wait, + false, host, phy_regs->timing_adj); + if (ret) dev_err(mmc_dev(host->mmc), "eMMC PHY init cannot complete after %d us\n", - wait); - return -ETIMEDOUT; - } + wait * XENON_MAX_PHY_TIMEOUT_LOOPS); - return 0; + return ret; } #define ARMADA_3700_SOC_PAD_1_8V 0x1 From 4974d928d5e3909bd8cfe4b0bca2509636a8ebf2 Mon Sep 17 00:00:00 2001 From: Elad Nachman Date: Thu, 22 Feb 2024 22:09:30 +0200 Subject: [PATCH 091/216] mmc: sdhci-xenon: fix PHY init clock stability commit 8e9f25a290ae0016353c9ea13314c95fb3207812 upstream. Each time SD/mmc phy is initialized, at times, in some of the attempts, phy fails to completes its initialization which results into timeout error. Per the HW spec, it is a pre-requisite to ensure a stable SD clock before a phy initialization is attempted. Fixes: 06c8b667ff5b ("mmc: sdhci-xenon: Add support to PHYs of Marvell Xenon SDHC") Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Elad Nachman Link: https://lore.kernel.org/r/20240222200930.1277665-1-enachman@marvell.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-xenon-phy.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/mmc/host/sdhci-xenon-phy.c b/drivers/mmc/host/sdhci-xenon-phy.c index 52a8e1217124..cc9d28b75eb9 100644 --- a/drivers/mmc/host/sdhci-xenon-phy.c +++ b/drivers/mmc/host/sdhci-xenon-phy.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "sdhci-pltfm.h" @@ -218,6 +219,19 @@ static int xenon_alloc_emmc_phy(struct sdhci_host *host) return 0; } +static int xenon_check_stability_internal_clk(struct sdhci_host *host) +{ + u32 reg; + int err; + + err = read_poll_timeout(sdhci_readw, reg, reg & SDHCI_CLOCK_INT_STABLE, + 1100, 20000, false, host, SDHCI_CLOCK_CONTROL); + if (err) + dev_err(mmc_dev(host->mmc), "phy_init: Internal clock never stabilized.\n"); + + return err; +} + /* * eMMC 5.0/5.1 PHY init/re-init. * eMMC PHY init should be executed after: @@ -234,6 +248,11 @@ static int xenon_emmc_phy_init(struct sdhci_host *host) struct xenon_priv *priv = sdhci_pltfm_priv(pltfm_host); struct xenon_emmc_phy_regs *phy_regs = priv->emmc_phy_regs; + int ret = xenon_check_stability_internal_clk(host); + + if (ret) + return ret; + reg = sdhci_readl(host, phy_regs->timing_adj); reg |= XENON_PHY_INITIALIZAION; sdhci_writel(host, reg, phy_regs->timing_adj); From 76109a226a39aea5d621b9b0af04ba23fc9cf7de Mon Sep 17 00:00:00 2001 From: Zong Li Date: Fri, 2 Feb 2024 01:51:02 +0000 Subject: [PATCH 092/216] riscv: add CALLER_ADDRx support commit 680341382da56bd192ebfa4e58eaf4fec2e5bca7 upstream. CALLER_ADDRx returns caller's address at specified level, they are used for several tracers. These macros eventually use __builtin_return_address(n) to get the caller's address if arch doesn't define their own implementation. In RISC-V, __builtin_return_address(n) only works when n == 0, we need to walk the stack frame to get the caller's address at specified level. data.level started from 'level + 3' due to the call flow of getting caller's address in RISC-V implementation. If we don't have additional three iteration, the level is corresponding to follows: callsite -> return_address -> arch_stack_walk -> walk_stackframe | | | | level 3 level 2 level 1 level 0 Fixes: 10626c32e382 ("riscv/ftrace: Add basic support") Cc: stable@vger.kernel.org Reviewed-by: Alexandre Ghiti Signed-off-by: Zong Li Link: https://lore.kernel.org/r/20240202015102.26251-1-zong.li@sifive.com Signed-off-by: Palmer Dabbelt Signed-off-by: Greg Kroah-Hartman --- arch/riscv/include/asm/ftrace.h | 5 ++++ arch/riscv/kernel/Makefile | 2 ++ arch/riscv/kernel/return_address.c | 48 ++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 arch/riscv/kernel/return_address.c diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index d47d87c2d7e3..dcf1bc9de584 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -25,6 +25,11 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #ifndef __ASSEMBLY__ + +extern void *return_address(unsigned int level); + +#define ftrace_return_address(n) return_address(n) + void MCOUNT_NAME(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index ab333cb792fd..4c0805d264ca 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -7,6 +7,7 @@ ifdef CONFIG_FTRACE CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) endif CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) CFLAGS_compat_syscall_table.o += $(call cc-option,-Wno-override-init,) @@ -41,6 +42,7 @@ obj-y += irq.o obj-y += process.o obj-y += ptrace.o obj-y += reset.o +obj-y += return_address.o obj-y += setup.o obj-y += signal.o obj-y += syscall_table.o diff --git a/arch/riscv/kernel/return_address.c b/arch/riscv/kernel/return_address.c new file mode 100644 index 000000000000..c8115ec8fb30 --- /dev/null +++ b/arch/riscv/kernel/return_address.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This code come from arch/arm64/kernel/return_address.c + * + * Copyright (C) 2023 SiFive. + */ + +#include +#include +#include + +struct return_address_data { + unsigned int level; + void *addr; +}; + +static bool save_return_addr(void *d, unsigned long pc) +{ + struct return_address_data *data = d; + + if (!data->level) { + data->addr = (void *)pc; + return false; + } + + --data->level; + + return true; +} +NOKPROBE_SYMBOL(save_return_addr); + +noinline void *return_address(unsigned int level) +{ + struct return_address_data data; + + data.level = level + 3; + data.addr = NULL; + + arch_stack_walk(save_return_addr, &data, current, NULL); + + if (!data.level) + return data.addr; + else + return NULL; + +} +EXPORT_SYMBOL_GPL(return_address); +NOKPROBE_SYMBOL(return_address); From 249d6ca4ff0022a4b51a8eb9fac6d7bff2c94d1b Mon Sep 17 00:00:00 2001 From: Tim Schumacher Date: Fri, 26 Jan 2024 17:25:23 +0100 Subject: [PATCH 093/216] efivarfs: Request at most 512 bytes for variable names commit f45812cc23fb74bef62d4eb8a69fe7218f4b9f2a upstream. Work around a quirk in a few old (2011-ish) UEFI implementations, where a call to `GetNextVariableName` with a buffer size larger than 512 bytes will always return EFI_INVALID_PARAMETER. There is some lore around EFI variable names being up to 1024 bytes in size, but this has no basis in the UEFI specification, and the upper bounds are typically platform specific, and apply to the entire variable (name plus payload). Given that Linux does not permit creating files with names longer than NAME_MAX (255) bytes, 512 bytes (== 256 UTF-16 characters) is a reasonable limit. Cc: # 6.1+ Signed-off-by: Tim Schumacher Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- fs/efivarfs/vars.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 9e4f47808bd5..13bc60698955 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -372,7 +372,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), void *data, bool duplicates, struct list_head *head) { - unsigned long variable_name_size = 1024; + unsigned long variable_name_size = 512; efi_char16_t *variable_name; efi_status_t status; efi_guid_t vendor_guid; @@ -389,12 +389,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), goto free; /* - * Per EFI spec, the maximum storage allocated for both - * the variable name and variable data is 1024 bytes. + * A small set of old UEFI implementations reject sizes + * above a certain threshold, the lowest seen in the wild + * is 512. */ do { - variable_name_size = 1024; + variable_name_size = 512; status = efivar_get_next_variable(&variable_name_size, variable_name, @@ -431,9 +432,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), break; case EFI_NOT_FOUND: break; + case EFI_BUFFER_TOO_SMALL: + pr_warn("efivars: Variable name size exceeds maximum (%lu > 512)\n", + variable_name_size); + status = EFI_NOT_FOUND; + break; default: - printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n", - status); + pr_warn("efivars: get_next_variable: status=%lx\n", status); status = EFI_NOT_FOUND; break; } From 396a4120011d8d574eb57793efb0eec5f271a2c8 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 26 Feb 2024 17:49:57 -0800 Subject: [PATCH 094/216] pmdomain: qcom: rpmhpd: Fix enabled_corner aggregation commit 2a93c6cbd5a703d44c414a3c3945a87ce11430ba upstream. Commit 'e3e56c050ab6 ("soc: qcom: rpmhpd: Make power_on actually enable the domain")' aimed to make sure that a power-domain that is being enabled without any particular performance-state requested will at least turn the rail on, to avoid filling DeviceTree with otherwise unnecessary required-opps properties. But in the event that aggregation happens on a disabled power-domain, with an enabled peer without performance-state, both the local and peer corner are 0. The peer's enabled_corner is not considered, with the result that the underlying (shared) resource is disabled. One case where this can be observed is when the display stack keeps mmcx enabled (but without a particular performance-state vote) in order to access registers and sync_state happens in the rpmhpd driver. As mmcx_ao is flushed the state of the peer (mmcx) is not considered and mmcx_ao ends up turning off "mmcx.lvl" underneath mmcx. This has been observed several times, but has been painted over in DeviceTree by adding an explicit vote for the lowest non-disabled performance-state. Fixes: e3e56c050ab6 ("soc: qcom: rpmhpd: Make power_on actually enable the domain") Reported-by: Johan Hovold Closes: https://lore.kernel.org/linux-arm-msm/ZdMwZa98L23mu3u6@hovoldconsulting.com/ Cc: Signed-off-by: Bjorn Andersson Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Tested-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Reviewed-by: Stephen Boyd Tested-by: Johan Hovold Link: https://lore.kernel.org/r/20240226-rpmhpd-enable-corner-fix-v1-1-68c004cec48c@quicinc.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/soc/qcom/rpmhpd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/soc/qcom/rpmhpd.c b/drivers/soc/qcom/rpmhpd.c index 092f6ab09acf..9a90f241bb97 100644 --- a/drivers/soc/qcom/rpmhpd.c +++ b/drivers/soc/qcom/rpmhpd.c @@ -492,12 +492,15 @@ static int rpmhpd_aggregate_corner(struct rpmhpd *pd, unsigned int corner) unsigned int active_corner, sleep_corner; unsigned int this_active_corner = 0, this_sleep_corner = 0; unsigned int peer_active_corner = 0, peer_sleep_corner = 0; + unsigned int peer_enabled_corner; to_active_sleep(pd, corner, &this_active_corner, &this_sleep_corner); - if (peer && peer->enabled) - to_active_sleep(peer, peer->corner, &peer_active_corner, + if (peer && peer->enabled) { + peer_enabled_corner = max(peer->corner, peer->enable_corner); + to_active_sleep(peer, peer_enabled_corner, &peer_active_corner, &peer_sleep_corner); + } active_corner = max(this_active_corner, peer_active_corner); From c9fa51d4c434fa7bdafd0c7a9e19cf9023787fd4 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 31 Jan 2024 01:04:28 +0100 Subject: [PATCH 095/216] x86/e820: Don't reserve SETUP_RNG_SEED in e820 commit 7fd817c906503b6813ea3b41f5fdf4192449a707 upstream. SETUP_RNG_SEED in setup_data is supplied by kexec and should not be reserved in the e820 map. Doing so reserves 16 bytes of RAM when booting with kexec. (16 bytes because data->len is zeroed by parse_setup_data so only sizeof(setup_data) is reserved.) When kexec is used repeatedly, each boot adds two entries in the kexec-provided e820 map as the 16-byte range splits a larger range of usable memory. Eventually all of the 128 available entries get used up. The next split will result in losing usable memory as the new entries cannot be added to the e820 map. Fixes: 68b8e9713c8e ("x86/setup: Use rng seeds from setup_data") Signed-off-by: Jiri Bohac Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Dave Hansen Cc: Link: https://lore.kernel.org/r/ZbmOjKnARGiaYBd5@dwarf.suse.cz Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/e820.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9dac24680ff8..993734e96615 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1017,10 +1017,12 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); /* - * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need - * to be reserved. + * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by + * kexec and do not need to be reserved. */ - if (data->type != SETUP_EFI && data->type != SETUP_IMA) + if (data->type != SETUP_EFI && + data->type != SETUP_IMA && + data->type != SETUP_RNG_SEED) e820__range_update_kexec(pa_data, sizeof(*data) + data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); From 65742f4bb1f919caa564b8a20b15b8cdd6eca2ef Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Feb 2024 00:09:02 +0100 Subject: [PATCH 096/216] x86/cpu/intel: Detect TME keyid bits before setting MTRR mask registers commit 6890cb1ace350b4386c8aee1343dc3b3ddd214da upstream. MKTME repurposes the high bit of physical address to key id for encryption key and, even though MAXPHYADDR in CPUID[0x80000008] remains the same, the valid bits in the MTRR mask register are based on the reduced number of physical address bits. detect_tme() in arch/x86/kernel/cpu/intel.c detects TME and subtracts it from the total usable physical bits, but it is called too late. Move the call to early_init_intel() so that it is called in setup_arch(), before MTRRs are setup. This fixes boot on TDX-enabled systems, which until now only worked with "disable_mtrr_cleanup". Without the patch, the values written to the MTRRs mask registers were 52-bit wide (e.g. 0x000fffff_80000800) and the writes failed; with the patch, the values are 46-bit wide, which matches the reduced MAXPHYADDR that is shown in /proc/cpuinfo. Reported-by: Zixi Chen Signed-off-by: Paolo Bonzini Signed-off-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20240131230902.1867092-3-pbonzini%40redhat.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel.c | 178 ++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 87 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 427899650483..32bd64017047 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -216,6 +216,90 @@ int intel_cpu_collect_info(struct ucode_cpu_info *uci) } EXPORT_SYMBOL_GPL(intel_cpu_collect_info); +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme_early(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -367,6 +451,13 @@ static void early_init_intel(struct cpuinfo_x86 *c) */ if (detect_extended_topology_early(c) < 0) detect_ht_early(c); + + /* + * Adjust the number of physical bits early because it affects the + * valid bits of the MTRR mask registers. + */ + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme_early(c); } static void bsp_init_intel(struct cpuinfo_x86 *c) @@ -527,90 +618,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -#define MSR_IA32_TME_ACTIVATE 0x982 - -/* Helpers to access TME_ACTIVATE MSR */ -#define TME_ACTIVATE_LOCKED(x) (x & 0x1) -#define TME_ACTIVATE_ENABLED(x) (x & 0x2) - -#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ -#define TME_ACTIVATE_POLICY_AES_XTS_128 0 - -#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ - -#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ -#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 - -/* Values for mktme_status (SW only construct) */ -#define MKTME_ENABLED 0 -#define MKTME_DISABLED 1 -#define MKTME_UNINITIALIZED 2 -static int mktme_status = MKTME_UNINITIALIZED; - -static void detect_tme(struct cpuinfo_x86 *c) -{ - u64 tme_activate, tme_policy, tme_crypto_algs; - int keyid_bits = 0, nr_keyids = 0; - static u64 tme_activate_cpu0 = 0; - - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); - - if (mktme_status != MKTME_UNINITIALIZED) { - if (tme_activate != tme_activate_cpu0) { - /* Broken BIOS? */ - pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); - pr_err_once("x86/tme: MKTME is not usable\n"); - mktme_status = MKTME_DISABLED; - - /* Proceed. We may need to exclude bits from x86_phys_bits. */ - } - } else { - tme_activate_cpu0 = tme_activate; - } - - if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { - pr_info_once("x86/tme: not enabled by BIOS\n"); - mktme_status = MKTME_DISABLED; - return; - } - - if (mktme_status != MKTME_UNINITIALIZED) - goto detect_keyid_bits; - - pr_info("x86/tme: enabled by BIOS\n"); - - tme_policy = TME_ACTIVATE_POLICY(tme_activate); - if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) - pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); - - tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); - if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { - pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", - tme_crypto_algs); - mktme_status = MKTME_DISABLED; - } -detect_keyid_bits: - keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); - nr_keyids = (1UL << keyid_bits) - 1; - if (nr_keyids) { - pr_info_once("x86/mktme: enabled by BIOS\n"); - pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); - } else { - pr_info_once("x86/mktme: disabled by BIOS\n"); - } - - if (mktme_status == MKTME_UNINITIALIZED) { - /* MKTME is usable */ - mktme_status = MKTME_ENABLED; - } - - /* - * KeyID bits effectively lower the number of physical address - * bits. Update cpuinfo_x86::x86_phys_bits accordingly. - */ - c->x86_phys_bits -= keyid_bits; -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -747,9 +754,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_ia32_feat_ctl(c); - if (cpu_has(c, X86_FEATURE_TME)) - detect_tme(c); - init_intel_misc_features(c); split_lock_init(); From e6e04845c2e8af9fef7d58439e9f62a3ed93f33b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2024 19:25:31 +0100 Subject: [PATCH 097/216] mptcp: fix data races on local_id commit a7cfe776637004a4c938fde78be4bd608c32c3ef upstream. The local address id is accessed lockless by the NL PM, add all the required ONCE annotation. There is a caveat: the local id can be initialized late in the subflow life-cycle, and its validity is controlled by the local_id_valid flag. Remove such flag and encode the validity in the local_id field itself with negative value before initialization. That allows accessing the field consistently with a single read operation. Fixes: 0ee4261a3681 ("mptcp: implement mptcp_pm_remove_subflow") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/mptcp/diag.c | 2 +- net/mptcp/pm_netlink.c | 6 +++--- net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 13 +++++++++++-- net/mptcp/subflow.c | 9 +++++---- 6 files changed, 22 insertions(+), 12 deletions(-) diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index e57c5f47f035..6ff6f14674aa 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -65,7 +65,7 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) sf->map_data_len) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || - nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) { + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, subflow_get_local_id(sf))) { err = -EMSGSIZE; goto nla_failure; } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 70a1025f093c..3632f4830420 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -799,7 +799,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - u8 id = subflow->local_id; + u8 id = subflow_get_local_id(subflow); if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id) continue; @@ -808,7 +808,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", - i, rm_id, subflow->local_id, subflow->remote_id, + i, rm_id, id, subflow->remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); @@ -2028,7 +2028,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) if (WARN_ON_ONCE(!sf)) return -EINVAL; - if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id)) + if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf))) return -EMSGSIZE; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 631fa104617c..67eccc141a6c 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -233,7 +233,7 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { - if (subflow->local_id == 0) { + if (READ_ONCE(subflow->local_id) == 0) { has_id_0 = true; break; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 859b18cb8e4f..cdabb00648bd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -119,7 +119,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) subflow->request_mptcp = 1; /* This is the first subflow, always with id 0 */ - subflow->local_id_valid = 1; + WRITE_ONCE(subflow->local_id, 0); mptcp_sock_graft(msk->first, sk->sk_socket); return 0; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b09220521323..2bc37773e780 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -475,7 +475,6 @@ struct mptcp_subflow_context { can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ - local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; u32 remote_nonce; @@ -483,7 +482,7 @@ struct mptcp_subflow_context { u32 local_nonce; u32 remote_token; u8 hmac[MPTCPOPT_HMAC_LEN]; - u8 local_id; + s16 local_id; /* if negative not initialized yet */ u8 remote_id; u8 reset_seen:1; u8 reset_transient:1; @@ -529,6 +528,7 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; + WRITE_ONCE(subflow->local_id, -1); } static inline u64 @@ -909,6 +909,15 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) +{ + int local_id = READ_ONCE(subflow->local_id); + + if (local_id < 0) + return 0; + return local_id; +} + void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 45d20e20cfc0..83bc438b9825 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -489,8 +489,8 @@ do_reset: static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) { - subflow->local_id = local_id; - subflow->local_id_valid = 1; + WARN_ON_ONCE(local_id < 0 || local_id > 255); + WRITE_ONCE(subflow->local_id, local_id); } static int subflow_chk_local_id(struct sock *sk) @@ -499,7 +499,7 @@ static int subflow_chk_local_id(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(subflow->conn); int err; - if (likely(subflow->local_id_valid)) + if (likely(subflow->local_id >= 0)) return 0; err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); @@ -1630,6 +1630,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, pr_debug("subflow=%p", ctx); ctx->tcp_sock = sk; + WRITE_ONCE(ctx->local_id, -1); return ctx; } @@ -1867,7 +1868,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->idsn = subflow_req->idsn; /* this is the first subflow, id is always 0 */ - new_ctx->local_id_valid = 1; + subflow_set_local_id(new_ctx, 0); } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; From e64148635509bf13eea851986f5a0b150e5bd066 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2024 19:25:32 +0100 Subject: [PATCH 098/216] mptcp: fix data races on remote_id commit 967d3c27127e71a10ff5c083583a038606431b61 upstream. Similar to the previous patch, address the data race on remote_id, adding the suitable ONCE annotations. Fixes: bedee0b56113 ("mptcp: address lookup improvements") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_netlink.c | 8 ++++---- net/mptcp/subflow.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 3632f4830420..582d0c641ed1 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -449,7 +449,7 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = subflow->remote_id; + addrs[i].id = READ_ONCE(subflow->remote_id); if (deny_id0 && !addrs[i].id) continue; @@ -798,18 +798,18 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 remote_id = READ_ONCE(subflow->remote_id); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow_get_local_id(subflow); - if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id) + if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id)) continue; pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", - i, rm_id, id, subflow->remote_id, - msk->mpc_endpoint_id); + i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 83bc438b9825..891c2f4fed08 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -446,7 +446,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; - subflow->remote_id = mp_opt.join_id; + WRITE_ONCE(subflow->remote_id, mp_opt.join_id); pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); @@ -1477,7 +1477,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; - subflow->remote_id = remote_id; + WRITE_ONCE(subflow->remote_id, remote_id); subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr, ssk->sk_family); @@ -1874,7 +1874,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->mp_join = 1; new_ctx->fully_established = 1; new_ctx->backup = subflow_req->backup; - new_ctx->remote_id = subflow_req->remote_id; + WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; From fbccc5eb1652b6c4ff446f34eb5a18869b7f4f3b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2024 19:25:33 +0100 Subject: [PATCH 099/216] mptcp: fix duplicate subflow creation commit 045e9d812868a2d80b7a57b224ce8009444b7bbc upstream. Fullmesh endpoints could end-up unexpectedly generating duplicate subflows - same local and remote addresses - when multiple incoming ADD_ADDR are processed before the PM creates the subflow for the local endpoints. Address the issue explicitly checking for duplicates at subflow creation time. To avoid a quadratic computational complexity, track the unavailable remote address ids in a temporary bitmap and initialize such bitmap with the remote ids of all the existing subflows matching the local address currently processed. The above allows additionally replacing the existing code checking for duplicate entry in the current set with a simple bit test operation. Fixes: 2843ff6f36db ("mptcp: remote addresses fullmesh") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/435 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_netlink.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 582d0c641ed1..3328870b0c1f 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -407,23 +407,12 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) } } -static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr, - const struct mptcp_addr_info *addr) -{ - int i; - - for (i = 0; i < nr; i++) { - if (addrs[i].id == addr->id) - return true; - } - - return false; -} - /* Fill all the remote addresses into the array addrs[], * and return the array size. */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + bool fullmesh, struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); @@ -446,6 +435,16 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm msk->pm.subflows++; addrs[i++] = remote; } else { + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + + /* Forbid creation of new subflows matching existing + * ones, possibly already created by incoming ADD_ADDR + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); @@ -453,8 +452,11 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm if (deny_id0 && !addrs[i].id) continue; - if (!lookup_address_in_vec(addrs, i, &addrs[i]) && - msk->pm.subflows < subflows_max) { + if (msk->pm.subflows < subflows_max) { + /* forbid creating multiple address towards + * this id + */ + __set_bit(addrs[i].id, unavail_id); msk->pm.subflows++; i++; } @@ -603,7 +605,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); msk->pm.local_addr_used++; - nr = fill_remote_addresses_vec(msk, fullmesh, addrs); + nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs); if (nr) __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); spin_unlock_bh(&msk->pm.lock); From 53e3f2ee8a0ce7fb33488325a74b678ddf74632a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Feb 2024 18:21:21 +0100 Subject: [PATCH 100/216] mptcp: continue marking the first subflow as UNCONNECTED After the 'Fixes' commit mentioned below, which is a partial backport, the MPTCP worker was no longer marking the first subflow as "UNCONNECTED" when the socket was transitioning to TCP_CLOSE state. As a result, in v6.1, it was no longer possible to reconnect to the just disconnected socket. Continue to do that like before, only for the first subflow. A few refactoring have been done around the 'msk->subflow' in later versions, and it looks like this is not needed to do that there, but still needed in v6.1. Without that, the 'disconnect' tests from the mptcp_connect.sh selftest fail: they repeat the transfer 3 times by reconnecting to the server each time. Fixes: 7857e35ef10e ("mptcp: get rid of msk->subflow") Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/protocol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index cdabb00648bd..125825db642c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2440,6 +2440,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { __mptcp_subflow_disconnect(ssk, subflow, flags); + if (msk->subflow && ssk == msk->subflow->sk) + msk->subflow->state = SS_UNCONNECTED; release_sock(ssk); goto out; From fb7be5e5ec265a47f6763b6d772873da78bb09d0 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 23 Feb 2024 17:14:11 +0100 Subject: [PATCH 101/216] mptcp: map v4 address to v6 when destroying subflow commit 535d620ea5ff1a033dc64ee3d912acadc7470619 upstream. Address family of server side mismatches with that of client side, like in "userspace pm add & remove address" test: userspace_pm_add_addr $ns1 10.0.2.1 10 userspace_pm_rm_sf $ns1 "::ffff:10.0.2.1" $SUB_ESTABLISHED That's because on the server side, the family is set to AF_INET6 and the v4 address is mapped in a v6 one. This patch fixes this issue. In mptcp_pm_nl_subflow_destroy_doit(), before checking local address family with remote address family, map an IPv4 address to an IPv6 address if the pair is a v4-mapped address. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/387 Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240223-upstream-net-20240223-misc-fixes-v1-1-162e87e48497@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_userspace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 67eccc141a6c..414ed70e7ba2 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -489,6 +489,16 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) goto destroy_err; } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (addr_l.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { + ipv6_addr_set_v4mapped(addr_l.addr.s_addr, &addr_l.addr6); + addr_l.family = AF_INET6; + } + if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr6)) { + ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6); + addr_r.family = AF_INET6; + } +#endif if (addr_l.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); err = -EINVAL; From 84a3c10a0c79ede027b030f61a89b6ab7cf98226 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 23 Feb 2024 17:14:14 +0100 Subject: [PATCH 102/216] mptcp: push at DSS boundaries commit b9cd26f640a308ea314ad23532de9a8592cd09d2 upstream. when inserting not contiguous data in the subflow write queue, the protocol creates a new skb and prevent the TCP stack from merging it later with already queued skbs by setting the EOR marker. Still no push flag is explicitly set at the end of previous GSO packet, making the aggregation on the receiver side sub-optimal - and packetdrill self-tests less predictable. Explicitly mark the end of not contiguous DSS with the push flag. Fixes: 6d0060f600ad ("mptcp: Write MPTCP DSS headers to outgoing data packets") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240223-upstream-net-20240223-misc-fixes-v1-4-162e87e48497@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 125825db642c..be53e5460962 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1319,6 +1319,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext = skb_ext_find(skb, SKB_EXT_MPTCP); if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; + tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } From 03ad085eb14db2ddc4de5d9474426d258dc53954 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 23 Feb 2024 17:14:17 +0100 Subject: [PATCH 103/216] selftests: mptcp: join: add ss mptcp support check commit 9480f388a2ef54fba911d9325372abd69a328601 upstream. Commands 'ss -M' are used in script mptcp_join.sh to display only MPTCP sockets. So it must be checked if ss tool supports MPTCP in this script. Fixes: e274f7154008 ("selftests: mptcp: add subflow limits test-cases") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240223-upstream-net-20240223-misc-fixes-v1-7-162e87e48497@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 2107579e2939..a20dca9d26d6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -144,6 +144,11 @@ check_tools() exit $ksft_skip fi + if ! ss -h | grep -q MPTCP; then + echo "SKIP: ss tool does not support MPTCP" + exit $ksft_skip + fi + # Use the legacy version if available to support old kernel versions if iptables-legacy -V &> /dev/null; then iptables="iptables-legacy" From a8722cece375838f7067aa929d89a819f7d1ae96 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 23 Feb 2024 17:14:15 +0100 Subject: [PATCH 104/216] mptcp: fix snd_wnd initialization for passive socket commit adf1bb78dab55e36d4d557aa2fb446ebcfe9e5ce upstream. Such value should be inherited from the first subflow, but passive sockets always used 'rsk_rcv_wnd'. Fixes: 6f8a612a33e4 ("mptcp: keep track of advertised windows right edge") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240223-upstream-net-20240223-misc-fixes-v1-5-162e87e48497@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index be53e5460962..1fc5f6649e32 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3203,7 +3203,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; msk->snd_una = msk->write_seq; - msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + msk->wnd_end = msk->snd_nxt + tcp_sk(ssk)->snd_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) { From d93fd40c62397326046902a2c5cb75af50882a85 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 23 Feb 2024 17:14:18 +0100 Subject: [PATCH 105/216] mptcp: fix double-free on socket dismantle commit 10048689def7e40a4405acda16fdc6477d4ecc5c upstream. when MPTCP server accepts an incoming connection, it clones its listener socket. However, the pointer to 'inet_opt' for the new socket has the same value as the original one: as a consequence, on program exit it's possible to observe the following splat: BUG: KASAN: double-free in inet_sock_destruct+0x54f/0x8b0 Free of addr ffff888485950880 by task swapper/25/0 CPU: 25 PID: 0 Comm: swapper/25 Kdump: loaded Not tainted 6.8.0-rc1+ #609 Hardware name: Supermicro SYS-6027R-72RF/X9DRH-7TF/7F/iTF/iF, BIOS 3.0 07/26/2013 Call Trace: dump_stack_lvl+0x32/0x50 print_report+0xca/0x620 kasan_report_invalid_free+0x64/0x90 __kasan_slab_free+0x1aa/0x1f0 kfree+0xed/0x2e0 inet_sock_destruct+0x54f/0x8b0 __sk_destruct+0x48/0x5b0 rcu_do_batch+0x34e/0xd90 rcu_core+0x559/0xac0 __do_softirq+0x183/0x5a4 irq_exit_rcu+0x12d/0x170 sysvec_apic_timer_interrupt+0x6b/0x80 asm_sysvec_apic_timer_interrupt+0x16/0x20 RIP: 0010:cpuidle_enter_state+0x175/0x300 Code: 30 00 0f 84 1f 01 00 00 83 e8 01 83 f8 ff 75 e5 48 83 c4 18 44 89 e8 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc fb 45 85 ed <0f> 89 60 ff ff ff 48 c1 e5 06 48 c7 43 18 00 00 00 00 48 83 44 2b RSP: 0018:ffff888481cf7d90 EFLAGS: 00000202 RAX: 0000000000000000 RBX: ffff88887facddc8 RCX: 0000000000000000 RDX: 1ffff1110ff588b1 RSI: 0000000000000019 RDI: ffff88887fac4588 RBP: 0000000000000004 R08: 0000000000000002 R09: 0000000000043080 R10: 0009b02ea273363f R11: ffff88887fabf42b R12: ffffffff932592e0 R13: 0000000000000004 R14: 0000000000000000 R15: 00000022c880ec80 cpuidle_enter+0x4a/0xa0 do_idle+0x310/0x410 cpu_startup_entry+0x51/0x60 start_secondary+0x211/0x270 secondary_startup_64_no_verify+0x184/0x18b Allocated by task 6853: kasan_save_stack+0x1c/0x40 kasan_save_track+0x10/0x30 __kasan_kmalloc+0xa6/0xb0 __kmalloc+0x1eb/0x450 cipso_v4_sock_setattr+0x96/0x360 netlbl_sock_setattr+0x132/0x1f0 selinux_netlbl_socket_post_create+0x6c/0x110 selinux_socket_post_create+0x37b/0x7f0 security_socket_post_create+0x63/0xb0 __sock_create+0x305/0x450 __sys_socket_create.part.23+0xbd/0x130 __sys_socket+0x37/0xb0 __x64_sys_socket+0x6f/0xb0 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Freed by task 6858: kasan_save_stack+0x1c/0x40 kasan_save_track+0x10/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x12c/0x1f0 kfree+0xed/0x2e0 inet_sock_destruct+0x54f/0x8b0 __sk_destruct+0x48/0x5b0 subflow_ulp_release+0x1f0/0x250 tcp_cleanup_ulp+0x6e/0x110 tcp_v4_destroy_sock+0x5a/0x3a0 inet_csk_destroy_sock+0x135/0x390 tcp_fin+0x416/0x5c0 tcp_data_queue+0x1bc8/0x4310 tcp_rcv_state_process+0x15a3/0x47b0 tcp_v4_do_rcv+0x2c1/0x990 tcp_v4_rcv+0x41fb/0x5ed0 ip_protocol_deliver_rcu+0x6d/0x9f0 ip_local_deliver_finish+0x278/0x360 ip_local_deliver+0x182/0x2c0 ip_rcv+0xb5/0x1c0 __netif_receive_skb_one_core+0x16e/0x1b0 process_backlog+0x1e3/0x650 __napi_poll+0xa6/0x500 net_rx_action+0x740/0xbb0 __do_softirq+0x183/0x5a4 The buggy address belongs to the object at ffff888485950880 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 0 bytes inside of 64-byte region [ffff888485950880, ffff8884859508c0) The buggy address belongs to the physical page: page:0000000056d1e95e refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888485950700 pfn:0x485950 flags: 0x57ffffc0000800(slab|node=1|zone=2|lastcpupid=0x1fffff) page_type: 0xffffffff() raw: 0057ffffc0000800 ffff88810004c640 ffffea00121b8ac0 dead000000000006 raw: ffff888485950700 0000000000200019 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888485950780: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888485950800: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff888485950880: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ^ ffff888485950900: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888485950980: 00 00 00 00 00 01 fc fc fc fc fc fc fc fc fc fc Something similar (a refcount underflow) happens with CALIPSO/IPv6. Fix this by duplicating IP / IPv6 options after clone, so that ip{,6}_sock_destruct() doesn't end up freeing the same memory area twice. Fixes: cf7da0d66cc1 ("mptcp: Create SUBFLOW socket for incoming connections") Cc: stable@vger.kernel.org Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240223-upstream-net-20240223-misc-fixes-v1-8-162e87e48497@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/mptcp/protocol.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1fc5f6649e32..3bc21581486a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3169,8 +3169,50 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } + +static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) +{ + const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; + struct ipv6_pinfo *newnp; + + newnp = inet6_sk(newsk); + + rcu_read_lock(); + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + if (!opt) + net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__); + } + RCU_INIT_POINTER(newnp->opt, opt); + rcu_read_unlock(); +} #endif +static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) +{ + struct ip_options_rcu *inet_opt, *newopt = NULL; + const struct inet_sock *inet = inet_sk(sk); + struct inet_sock *newinet; + + newinet = inet_sk(newsk); + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + inet_opt->opt.optlen, GFP_ATOMIC); + if (newopt) + memcpy(newopt, inet_opt, sizeof(*inet_opt) + + inet_opt->opt.optlen); + else + net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); + } + RCU_INIT_POINTER(newinet->inet_opt, newopt); + rcu_read_unlock(); +} + struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, @@ -3191,6 +3233,13 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, __mptcp_init_sock(nsk); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (nsk->sk_family == AF_INET6) + mptcp_copy_ip6_options(nsk, sk); + else +#endif + mptcp_copy_ip_options(nsk, sk); + msk = mptcp_sk(nsk); msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; From f27d319df055629480b84b9288a502337b6f2a2e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 23 Feb 2024 17:14:19 +0100 Subject: [PATCH 106/216] mptcp: fix possible deadlock in subflow diag commit d6a9608af9a75d13243d217f6ce1e30e57d56ffe upstream. Syzbot and Eric reported a lockdep splat in the subflow diag: WARNING: possible circular locking dependency detected 6.8.0-rc4-syzkaller-00212-g40b9385dd8e6 #0 Not tainted syz-executor.2/24141 is trying to acquire lock: ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at: tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline] ffff888045870130 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at: tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137 but task is already holding lock: ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffffc9000135e488 (&h->lhash2[i].lock){+.+.}-{2:2}, at: inet_diag_dump_icsk+0x39f/0x1f80 net/ipv4/inet_diag.c:1038 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&h->lhash2[i].lock){+.+.}-{2:2}: lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] __inet_hash+0x335/0xbe0 net/ipv4/inet_hashtables.c:743 inet_csk_listen_start+0x23a/0x320 net/ipv4/inet_connection_sock.c:1261 __inet_listen_sk+0x2a2/0x770 net/ipv4/af_inet.c:217 inet_listen+0xa3/0x110 net/ipv4/af_inet.c:239 rds_tcp_listen_init+0x3fd/0x5a0 net/rds/tcp_listen.c:316 rds_tcp_init_net+0x141/0x320 net/rds/tcp.c:577 ops_init+0x352/0x610 net/core/net_namespace.c:136 __register_pernet_operations net/core/net_namespace.c:1214 [inline] register_pernet_operations+0x2cb/0x660 net/core/net_namespace.c:1283 register_pernet_device+0x33/0x80 net/core/net_namespace.c:1370 rds_tcp_init+0x62/0xd0 net/rds/tcp.c:735 do_one_initcall+0x238/0x830 init/main.c:1236 do_initcall_level+0x157/0x210 init/main.c:1298 do_initcalls+0x3f/0x80 init/main.c:1314 kernel_init_freeable+0x42f/0x5d0 init/main.c:1551 kernel_init+0x1d/0x2a0 init/main.c:1441 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242 -> #0 (k-sk_lock-AF_INET6){+.+.}-{0:0}: check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754 lock_sock_fast include/net/sock.h:1723 [inline] subflow_get_info+0x166/0xd20 net/mptcp/diag.c:28 tcp_diag_put_ulp net/ipv4/tcp_diag.c:100 [inline] tcp_diag_get_aux+0x738/0x830 net/ipv4/tcp_diag.c:137 inet_sk_diag_fill+0x10ed/0x1e00 net/ipv4/inet_diag.c:345 inet_diag_dump_icsk+0x55b/0x1f80 net/ipv4/inet_diag.c:1061 __inet_diag_dump+0x211/0x3a0 net/ipv4/inet_diag.c:1263 inet_diag_dump_compat+0x1c1/0x2d0 net/ipv4/inet_diag.c:1371 netlink_dump+0x59b/0xc80 net/netlink/af_netlink.c:2264 __netlink_dump_start+0x5df/0x790 net/netlink/af_netlink.c:2370 netlink_dump_start include/linux/netlink.h:338 [inline] inet_diag_rcv_msg_compat+0x209/0x4c0 net/ipv4/inet_diag.c:1405 sock_diag_rcv_msg+0xe7/0x410 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543 sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:280 netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367 netlink_sendmsg+0xa3b/0xd70 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2667 do_syscall_64+0xf9/0x240 entry_SYSCALL_64_after_hwframe+0x6f/0x77 As noted by Eric we can break the lock dependency chain avoid dumping any extended info for the mptcp subflow listener: nothing actually useful is presented there. Fixes: b8adb69a7d29 ("mptcp: fix lockless access in subflow ULP diag") Cc: stable@vger.kernel.org Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iJ=Oecw6OZDwmSYc9HJKQ_G32uN11L+oUcMu+TOD5Xiaw@mail.gmail.com/ Suggested-by: Eric Dumazet Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240223-upstream-net-20240223-misc-fixes-v1-9-162e87e48497@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/mptcp/diag.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 6ff6f14674aa..7017dd60659d 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -21,6 +21,9 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) bool slow; int err; + if (inet_sk_state_load(sk) == TCP_LISTEN) + return 0; + start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP); if (!start) return -EMSGSIZE; From 88067197e97af3fcb104dd86030f788ec1b32fdb Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 4 Jan 2023 10:01:38 +0200 Subject: [PATCH 107/216] RDMA/core: Refactor rdma_bind_addr commit 8d037973d48c026224ab285e6a06985ccac6f7bf upstream. Refactor rdma_bind_addr function so that it doesn't require that the cma destination address be changed before calling it. So now it will update the destination address internally only when it is really needed and after passing all the required checks. Which in turn results in a cleaner and more sensible call and error handling flows for the functions that call it directly or indirectly. Signed-off-by: Patrisious Haddad Reported-by: Wei Chen Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/3d0e9a2fd62bc10ba02fed1c7c48a48638952320.1672819273.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/cma.c | 253 +++++++++++++++++----------------- 1 file changed, 130 insertions(+), 123 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0773ca7ace24..950c8422aec2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3547,121 +3547,6 @@ err: return ret; } -static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr) -{ - struct sockaddr_storage zero_sock = {}; - - if (src_addr && src_addr->sa_family) - return rdma_bind_addr(id, src_addr); - - /* - * When the src_addr is not specified, automatically supply an any addr - */ - zero_sock.ss_family = dst_addr->sa_family; - if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { - struct sockaddr_in6 *src_addr6 = - (struct sockaddr_in6 *)&zero_sock; - struct sockaddr_in6 *dst_addr6 = - (struct sockaddr_in6 *)dst_addr; - - src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; - if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - id->route.addr.dev_addr.bound_dev_if = - dst_addr6->sin6_scope_id; - } else if (dst_addr->sa_family == AF_IB) { - ((struct sockaddr_ib *)&zero_sock)->sib_pkey = - ((struct sockaddr_ib *)dst_addr)->sib_pkey; - } - return rdma_bind_addr(id, (struct sockaddr *)&zero_sock); -} - -/* - * If required, resolve the source address for bind and leave the id_priv in - * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior - * calls made by ULP, a previously bound ID will not be re-bound and src_addr is - * ignored. - */ -static int resolve_prepare_src(struct rdma_id_private *id_priv, - struct sockaddr *src_addr, - const struct sockaddr *dst_addr) -{ - int ret; - - memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { - /* For a well behaved ULP state will be RDMA_CM_IDLE */ - ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); - if (ret) - goto err_dst; - if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, - RDMA_CM_ADDR_QUERY))) { - ret = -EINVAL; - goto err_dst; - } - } - - if (cma_family(id_priv) != dst_addr->sa_family) { - ret = -EINVAL; - goto err_state; - } - return 0; - -err_state: - cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); -err_dst: - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return ret; -} - -int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, unsigned long timeout_ms) -{ - struct rdma_id_private *id_priv = - container_of(id, struct rdma_id_private, id); - int ret; - - ret = resolve_prepare_src(id_priv, src_addr, dst_addr); - if (ret) - return ret; - - if (cma_any_addr(dst_addr)) { - ret = cma_resolve_loopback(id_priv); - } else { - if (dst_addr->sa_family == AF_IB) { - ret = cma_resolve_ib_addr(id_priv); - } else { - /* - * The FSM can return back to RDMA_CM_ADDR_BOUND after - * rdma_resolve_ip() is called, eg through the error - * path in addr_handler(). If this happens the existing - * request must be canceled before issuing a new one. - * Since canceling a request is a bit slow and this - * oddball path is rare, keep track once a request has - * been issued. The track turns out to be a permanent - * state since this is the only cancel as it is - * immediately before rdma_resolve_ip(). - */ - if (id_priv->used_resolve_ip) - rdma_addr_cancel(&id->route.addr.dev_addr); - else - id_priv->used_resolve_ip = 1; - ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, - &id->route.addr.dev_addr, - timeout_ms, addr_handler, - false, id_priv); - } - } - if (ret) - goto err; - - return 0; -err: - cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - return ret; -} -EXPORT_SYMBOL(rdma_resolve_addr); - int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; @@ -4064,27 +3949,26 @@ err: } EXPORT_SYMBOL(rdma_listen); -int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, + struct sockaddr *addr, const struct sockaddr *daddr) { - struct rdma_id_private *id_priv; + struct sockaddr *id_daddr; int ret; - struct sockaddr *daddr; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; - id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; - ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = cma_translate_addr(addr, &id->route.addr.dev_addr); + ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; @@ -4104,8 +3988,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) } #endif } - daddr = cma_dst_addr(id_priv); - daddr->sa_family = addr->sa_family; + id_daddr = cma_dst_addr(id_priv); + if (daddr != id_daddr) + memcpy(id_daddr, daddr, rdma_addr_size(addr)); + id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) @@ -4121,6 +4007,127 @@ err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + struct sockaddr_storage zero_sock = {}; + + if (src_addr && src_addr->sa_family) + return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); + + /* + * When the src_addr is not specified, automatically supply an any addr + */ + zero_sock.ss_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&zero_sock; + struct sockaddr_in6 *dst_addr6 = + (struct sockaddr_in6 *)dst_addr; + + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = + dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *)&zero_sock)->sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; + } + return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); +} + +/* + * If required, resolve the source address for bind and leave the id_priv in + * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior + * calls made by ULP, a previously bound ID will not be re-bound and src_addr is + * ignored. + */ +static int resolve_prepare_src(struct rdma_id_private *id_priv, + struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + int ret; + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); + if (ret) + return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_ADDR_QUERY))) + return -EINVAL; + + } + + if (cma_family(id_priv) != dst_addr->sa_family) { + ret = -EINVAL; + goto err_state; + } + return 0; + +err_state: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + ret = resolve_prepare_src(id_priv, src_addr, dst_addr); + if (ret) + return ret; + + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). + */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); + } + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); +} EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) From 2d9b3e1ae1bed1f20621d5cc95e74746a4afbe7d Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Wed, 12 Jul 2023 18:41:33 -0500 Subject: [PATCH 108/216] RDMA/core: Update CMA destination address on rdma_resolve_addr commit 0e15863015d97c1ee2cc29d599abcc7fa2dc3e95 upstream. 8d037973d48c ("RDMA/core: Refactor rdma_bind_addr") intoduces as regression on irdma devices on certain tests which uses rdma CM, such as cmtime. No connections can be established with the MAD QP experiences a fatal error on the active side. The cma destination address is not updated with the dst_addr when ULP on active side calls rdma_bind_addr followed by rdma_resolve_addr. The id_priv state is 'bound' in resolve_prepare_src and update is skipped. This leaves the dgid passed into irdma driver to create an Address Handle (AH) for the MAD QP at 0. The create AH descriptor as well as the ARP cache entry is invalid and HW throws an asynchronous events as result. [ 1207.656888] resolve_prepare_src caller: ucma_resolve_addr+0xff/0x170 [rdma_ucm] daddr=200.0.4.28 id_priv->state=7 [....] [ 1207.680362] ice 0000:07:00.1 rocep7s0f1: caller: irdma_create_ah+0x3e/0x70 [irdma] ah_id=0 arp_idx=0 dest_ip=0.0.0.0 destMAC=00:00:64:ca:b7:52 ipvalid=1 raw=0000:0000:0000:0000:0000:ffff:0000:0000 [ 1207.682077] ice 0000:07:00.1 rocep7s0f1: abnormal ae_id = 0x401 bool qp=1 qp_id = 1, ae_src=5 [ 1207.691657] infiniband rocep7s0f1: Fatal error (1) on MAD QP (1) Fix this by updating the CMA destination address when the ULP calls a resolve address with the CM state already bound. Fixes: 8d037973d48c ("RDMA/core: Refactor rdma_bind_addr") Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20230712234133.1343-1-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/cma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 950c8422aec2..067d7f42871f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4060,6 +4060,8 @@ static int resolve_prepare_src(struct rdma_id_private *id_priv, RDMA_CM_ADDR_QUERY))) return -EINVAL; + } else { + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); } if (cma_family(id_priv) != dst_addr->sa_family) { From e7945d93fece3ae43d2ed47d5ef4e254c8a3b712 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Aug 2022 11:00:16 +0200 Subject: [PATCH 109/216] efi: libstub: use EFI_LOADER_CODE region when moving the kernel in memory commit 9cf42bca30e98a1c6c9e8abf876940a551eaa3d1 upstream. The EFI spec is not very clear about which permissions are being given when allocating pages of a certain type. However, it is quite obvious that EFI_LOADER_CODE is more likely to permit execution than EFI_LOADER_DATA, which becomes relevant once we permit booting the kernel proper with the firmware's 1:1 mapping still active. Ostensibly, recent systems such as the Surface Pro X grant executable permissions to EFI_LOADER_CODE regions but not EFI_LOADER_DATA regions. Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/alignedmem.c | 5 +++-- drivers/firmware/efi/libstub/arm64-stub.c | 6 ++++-- drivers/firmware/efi/libstub/efistub.h | 6 ++++-- drivers/firmware/efi/libstub/mem.c | 3 ++- drivers/firmware/efi/libstub/randomalloc.c | 5 +++-- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/firmware/efi/libstub/alignedmem.c b/drivers/firmware/efi/libstub/alignedmem.c index 1de9878ddd3a..174832661251 100644 --- a/drivers/firmware/efi/libstub/alignedmem.c +++ b/drivers/firmware/efi/libstub/alignedmem.c @@ -22,7 +22,8 @@ * Return: status code */ efi_status_t efi_allocate_pages_aligned(unsigned long size, unsigned long *addr, - unsigned long max, unsigned long align) + unsigned long max, unsigned long align, + int memory_type) { efi_physical_addr_t alloc_addr; efi_status_t status; @@ -36,7 +37,7 @@ efi_status_t efi_allocate_pages_aligned(unsigned long size, unsigned long *addr, slack = align / EFI_PAGE_SIZE - 1; status = efi_bs_call(allocate_pages, EFI_ALLOCATE_MAX_ADDRESS, - EFI_LOADER_DATA, size / EFI_PAGE_SIZE + slack, + memory_type, size / EFI_PAGE_SIZE + slack, &alloc_addr); if (status != EFI_SUCCESS) return status; diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index e2f90566b291..08f46c072da5 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -180,7 +180,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, * locate the kernel at a randomized offset in physical memory. */ status = efi_random_alloc(*reserve_size, min_kimg_align, - reserve_addr, phys_seed); + reserve_addr, phys_seed, + EFI_LOADER_CODE); if (status != EFI_SUCCESS) efi_warn("efi_random_alloc() failed: 0x%lx\n", status); } else { @@ -201,7 +202,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, } status = efi_allocate_pages_aligned(*reserve_size, reserve_addr, - ULONG_MAX, min_kimg_align); + ULONG_MAX, min_kimg_align, + EFI_LOADER_CODE); if (status != EFI_SUCCESS) { efi_err("Failed to relocate kernel\n"); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 970e86e3aab0..ab505b07e626 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -880,7 +880,8 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, efi_status_t efi_get_random_bytes(unsigned long size, u8 *out); efi_status_t efi_random_alloc(unsigned long size, unsigned long align, - unsigned long *addr, unsigned long random_seed); + unsigned long *addr, unsigned long random_seed, + int memory_type); efi_status_t efi_random_get_seed(void); @@ -907,7 +908,8 @@ efi_status_t efi_allocate_pages(unsigned long size, unsigned long *addr, unsigned long max); efi_status_t efi_allocate_pages_aligned(unsigned long size, unsigned long *addr, - unsigned long max, unsigned long align); + unsigned long max, unsigned long align, + int memory_type); efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, unsigned long *addr, unsigned long min); diff --git a/drivers/firmware/efi/libstub/mem.c b/drivers/firmware/efi/libstub/mem.c index 45841ef55a9f..03d147f17185 100644 --- a/drivers/firmware/efi/libstub/mem.c +++ b/drivers/firmware/efi/libstub/mem.c @@ -91,7 +91,8 @@ efi_status_t efi_allocate_pages(unsigned long size, unsigned long *addr, if (EFI_ALLOC_ALIGN > EFI_PAGE_SIZE) return efi_allocate_pages_aligned(size, addr, max, - EFI_ALLOC_ALIGN); + EFI_ALLOC_ALIGN, + EFI_LOADER_DATA); alloc_addr = ALIGN_DOWN(max + 1, EFI_ALLOC_ALIGN) - 1; status = efi_bs_call(allocate_pages, EFI_ALLOCATE_MAX_ADDRESS, diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 9fb5869896be..ec44bb7e092f 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -53,7 +53,8 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, efi_status_t efi_random_alloc(unsigned long size, unsigned long align, unsigned long *addr, - unsigned long random_seed) + unsigned long random_seed, + int memory_type) { unsigned long total_slots = 0, target_slot; unsigned long total_mirrored_slots = 0; @@ -118,7 +119,7 @@ efi_status_t efi_random_alloc(unsigned long size, pages = size / EFI_PAGE_SIZE; status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, pages, &target); + memory_type, pages, &target); if (status == EFI_SUCCESS) *addr = target; break; From bad6e66d0701d88a1b7018ca0334b551fb71d74a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:01 +0100 Subject: [PATCH 110/216] x86/boot/compressed: Rename efi_thunk_64.S to efi-mixed.S commit cb8bda8ad4438b4bcfcf89697fc84803fb210017 upstream. In preparation for moving the mixed mode specific code out of head_64.S, rename the existing file to clarify that it contains more than just the mixed mode thunk. While at it, clean up the Makefile rules that add it to the build. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-2-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/Makefile | 6 +++--- arch/x86/boot/compressed/{efi_thunk_64.S => efi_mixed.S} | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename arch/x86/boot/compressed/{efi_thunk_64.S => efi_mixed.S} (100%) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 15b7b403a4bd..27c82c78ac26 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -108,11 +108,11 @@ endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o -vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o -efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o +vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a -$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE +$(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_mixed.S similarity index 100% rename from arch/x86/boot/compressed/efi_thunk_64.S rename to arch/x86/boot/compressed/efi_mixed.S From 3bad8dc0ae8db10540290c69bbb3a3f8e6e5aff4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:02 +0100 Subject: [PATCH 111/216] x86/boot/compressed: Move 32-bit entrypoint code into .text section commit e2ab9eab324cdf240de89741e4a1aa79919f0196 upstream. Move the code that stores the arguments passed to the EFI entrypoint into the .text section, so that it can be moved into a separate compilation unit in a subsequent patch. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-3-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 48 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index b4bd6df29116..61b1be41867c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -303,24 +303,41 @@ SYM_FUNC_START(efi32_stub_entry) popl %ecx popl %edx popl %esi + jmp efi32_entry +SYM_FUNC_END(efi32_stub_entry) + .text +/* + * This is the common EFI stub entry point for mixed mode. + * + * Arguments: %ecx image handle + * %edx EFI system table pointer + * %esi struct bootparams pointer (or NULL when not using + * the EFI handover protocol) + * + * Since this is the point of no return for ordinary execution, no registers + * are considered live except for the function parameters. [Note that the EFI + * stub may still exit and return to the firmware using the Exit() EFI boot + * service.] + */ +SYM_FUNC_START_LOCAL(efi32_entry) call 1f -1: pop %ebp - subl $ rva(1b), %ebp - - movl %esi, rva(efi32_boot_args+8)(%ebp) -SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL) - movl %ecx, rva(efi32_boot_args)(%ebp) - movl %edx, rva(efi32_boot_args+4)(%ebp) - movb $0, rva(efi_is64)(%ebp) +1: pop %ebx /* Save firmware GDTR and code/data selectors */ - sgdtl rva(efi32_boot_gdt)(%ebp) - movw %cs, rva(efi32_boot_cs)(%ebp) - movw %ds, rva(efi32_boot_ds)(%ebp) + sgdtl (efi32_boot_gdt - 1b)(%ebx) + movw %cs, (efi32_boot_cs - 1b)(%ebx) + movw %ds, (efi32_boot_ds - 1b)(%ebx) /* Store firmware IDT descriptor */ - sidtl rva(efi32_boot_idt)(%ebp) + sidtl (efi32_boot_idt - 1b)(%ebx) + + /* Store boot arguments */ + leal (efi32_boot_args - 1b)(%ebx), %ebx + movl %ecx, 0(%ebx) + movl %edx, 4(%ebx) + movl %esi, 8(%ebx) + movb $0x0, 12(%ebx) // efi_is64 /* Disable paging */ movl %cr0, %eax @@ -328,7 +345,8 @@ SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL) movl %eax, %cr0 jmp startup_32 -SYM_FUNC_END(efi32_stub_entry) +SYM_FUNC_END(efi32_entry) + __HEAD #endif .code64 @@ -847,7 +865,9 @@ SYM_FUNC_START(efi32_pe_entry) */ subl %esi, %ebx movl %ebx, rva(image_offset)(%ebp) // save image_offset - jmp efi32_pe_stub_entry + xorl %esi, %esi + jmp efi32_entry // pass %ecx, %edx, %esi + // no other registers remain live 2: popl %edi // restore callee-save registers popl %ebx From d8950e8e20e006c8cbc4cc1ff81c35921053a8a2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:03 +0100 Subject: [PATCH 112/216] x86/boot/compressed: Move bootargs parsing out of 32-bit startup code commit 5c3a85f35b583259cf5ca0344cd79c8899ba1bb7 upstream. Move the logic that chooses between the different EFI entrypoints out of the 32-bit boot path, and into a 64-bit helper that can perform the same task much more cleanly. While at it, document the mixed mode boot flow in a code comment. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-4-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/efi_mixed.S | 43 ++++++++++++++++++++++++++++ arch/x86/boot/compressed/head_64.S | 24 +++------------- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index 67e7edcdfea8..58ab2e1ffd92 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -22,6 +22,49 @@ .code64 .text +/* + * When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode() + * is the first thing that runs after switching to long mode. Depending on + * whether the EFI handover protocol or the compat entry point was used to + * enter the kernel, it will either branch to the 64-bit EFI handover + * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF + * entrypoint efi_pe_entry(). In the former case, the bootloader must provide a + * struct bootparams pointer as the third argument, so the presence of such a + * pointer is used to disambiguate. + * + * +--------------+ + * +------------------+ +------------+ +------>| efi_pe_entry | + * | efi32_pe_entry |---->| | | +-----------+--+ + * +------------------+ | | +------+----------------+ | + * | startup_32 |---->| startup_64_mixed_mode | | + * +------------------+ | | +------+----------------+ V + * | efi32_stub_entry |---->| | | +------------------+ + * +------------------+ +------------+ +---->| efi64_stub_entry | + * +-------------+----+ + * +------------+ +----------+ | + * | startup_64 |<----| efi_main |<--------------+ + * +------------+ +----------+ + */ +SYM_FUNC_START(startup_64_mixed_mode) + lea efi32_boot_args(%rip), %rdx + mov 0(%rdx), %edi + mov 4(%rdx), %esi + mov 8(%rdx), %edx // saved bootparams pointer + test %edx, %edx + jnz efi64_stub_entry + /* + * efi_pe_entry uses MS calling convention, which requires 32 bytes of + * shadow space on the stack even if all arguments are passed in + * registers. We also need an additional 8 bytes for the space that + * would be occupied by the return address, and this also results in + * the correct stack alignment for entry. + */ + sub $40, %rsp + mov %rdi, %rcx // MS calling convention + mov %rsi, %rdx + jmp efi_pe_entry +SYM_FUNC_END(startup_64_mixed_mode) + SYM_FUNC_START(__efi64_thunk) push %rbp push %rbx diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 61b1be41867c..71542630f00f 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -261,25 +261,9 @@ SYM_FUNC_START(startup_32) */ leal rva(startup_64)(%ebp), %eax #ifdef CONFIG_EFI_MIXED - movl rva(efi32_boot_args)(%ebp), %edi - testl %edi, %edi - jz 1f - leal rva(efi64_stub_entry)(%ebp), %eax - movl rva(efi32_boot_args+4)(%ebp), %esi - movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer - testl %edx, %edx - jnz 1f - /* - * efi_pe_entry uses MS calling convention, which requires 32 bytes of - * shadow space on the stack even if all arguments are passed in - * registers. We also need an additional 8 bytes for the space that - * would be occupied by the return address, and this also results in - * the correct stack alignment for entry. - */ - subl $40, %esp - leal rva(efi_pe_entry)(%ebp), %eax - movl %edi, %ecx // MS calling convention - movl %esi, %edx + cmpb $1, rva(efi_is64)(%ebp) + je 1f + leal rva(startup_64_mixed_mode)(%ebp), %eax 1: #endif /* Check if the C-bit position is correct when SEV is active */ @@ -795,7 +779,7 @@ SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) SYM_DATA(image_offset, .long 0) #endif #ifdef CONFIG_EFI_MIXED -SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) +SYM_DATA(efi32_boot_args, .long 0, 0, 0) SYM_DATA(efi_is64, .byte 1) #define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) From c577208f81c9ddbc5ab1418bfe810680a671fa84 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:04 +0100 Subject: [PATCH 113/216] x86/boot/compressed: Move efi32_pe_entry into .text section commit 91592b5c0c2f076ff9d8cc0c14aa563448ac9fc4 upstream. Move efi32_pe_entry() into the .text section, so that it can be moved out of head_64.S and into a separate compilation unit in a subsequent patch. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-5-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 71542630f00f..14cd51d39719 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -786,7 +786,7 @@ SYM_DATA(efi_is64, .byte 1) #define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) #define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - __HEAD + .text .code32 SYM_FUNC_START(efi32_pe_entry) /* @@ -808,12 +808,11 @@ SYM_FUNC_START(efi32_pe_entry) call 1f 1: pop %ebx - subl $ rva(1b), %ebx /* Get the loaded image protocol pointer from the image handle */ leal -4(%ebp), %eax pushl %eax // &loaded_image - leal rva(loaded_image_proto)(%ebx), %eax + leal (loaded_image_proto - 1b)(%ebx), %eax pushl %eax // pass the GUID address pushl 8(%ebp) // pass the image handle @@ -842,13 +841,13 @@ SYM_FUNC_START(efi32_pe_entry) movl 12(%ebp), %edx // sys_table movl -4(%ebp), %esi // loaded_image movl LI32_image_base(%esi), %esi // loaded_image->image_base - movl %ebx, %ebp // startup_32 for efi32_pe_stub_entry + leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32 /* * We need to set the image_offset variable here since startup_32() will * use it before we get to the 64-bit efi_pe_entry() in C code. */ - subl %esi, %ebx - movl %ebx, rva(image_offset)(%ebp) // save image_offset + subl %esi, %ebp // calculate image_offset + movl %ebp, (image_offset - 1b)(%ebx) // save image_offset xorl %esi, %esi jmp efi32_entry // pass %ecx, %edx, %esi // no other registers remain live From 469b84516cc456fdf003dc99d9a9b0e7eab27c24 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:05 +0100 Subject: [PATCH 114/216] x86/boot/compressed: Move efi32_entry out of head_64.S commit 73a6dec80e2acedaef3ca603d4b5799049f6e9f8 upstream. Move the efi32_entry() routine out of head_64.S and into efi-mixed.S, which reduces clutter in the complicated startup routines. It also permits linkage of some symbols used by code to be made local. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-6-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/efi_mixed.S | 57 +++++++++++++++++++++++----- arch/x86/boot/compressed/head_64.S | 45 ---------------------- 2 files changed, 47 insertions(+), 55 deletions(-) diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index 58ab2e1ffd92..3487484ac1fd 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -105,7 +105,7 @@ SYM_FUNC_START(__efi64_thunk) /* * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT * and IDT that was installed when the kernel started executing. The - * pointers were saved at the EFI stub entry point in head_64.S. + * pointers were saved by the efi32_entry() routine below. * * Pass the saved DS selector to the 32-bit code, and use far return to * restore the saved CS selector. @@ -217,22 +217,59 @@ SYM_FUNC_START_LOCAL(efi_enter32) lret SYM_FUNC_END(efi_enter32) +/* + * This is the common EFI stub entry point for mixed mode. + * + * Arguments: %ecx image handle + * %edx EFI system table pointer + * %esi struct bootparams pointer (or NULL when not using + * the EFI handover protocol) + * + * Since this is the point of no return for ordinary execution, no registers + * are considered live except for the function parameters. [Note that the EFI + * stub may still exit and return to the firmware using the Exit() EFI boot + * service.] + */ +SYM_FUNC_START(efi32_entry) + call 1f +1: pop %ebx + + /* Save firmware GDTR and code/data selectors */ + sgdtl (efi32_boot_gdt - 1b)(%ebx) + movw %cs, (efi32_boot_cs - 1b)(%ebx) + movw %ds, (efi32_boot_ds - 1b)(%ebx) + + /* Store firmware IDT descriptor */ + sidtl (efi32_boot_idt - 1b)(%ebx) + + /* Store boot arguments */ + leal (efi32_boot_args - 1b)(%ebx), %ebx + movl %ecx, 0(%ebx) + movl %edx, 4(%ebx) + movl %esi, 8(%ebx) + movb $0x0, 12(%ebx) // efi_is64 + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + jmp startup_32 +SYM_FUNC_END(efi32_entry) + .data .balign 8 -SYM_DATA_START(efi32_boot_gdt) +SYM_DATA_START_LOCAL(efi32_boot_gdt) .word 0 .quad 0 SYM_DATA_END(efi32_boot_gdt) -SYM_DATA_START(efi32_boot_idt) +SYM_DATA_START_LOCAL(efi32_boot_idt) .word 0 .quad 0 SYM_DATA_END(efi32_boot_idt) -SYM_DATA_START(efi32_boot_cs) - .word 0 -SYM_DATA_END(efi32_boot_cs) - -SYM_DATA_START(efi32_boot_ds) - .word 0 -SYM_DATA_END(efi32_boot_ds) +SYM_DATA_LOCAL(efi32_boot_cs, .word 0) +SYM_DATA_LOCAL(efi32_boot_ds, .word 0) +SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) +SYM_DATA(efi_is64, .byte 1) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 14cd51d39719..2b812028fb2f 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -289,48 +289,6 @@ SYM_FUNC_START(efi32_stub_entry) popl %esi jmp efi32_entry SYM_FUNC_END(efi32_stub_entry) - - .text -/* - * This is the common EFI stub entry point for mixed mode. - * - * Arguments: %ecx image handle - * %edx EFI system table pointer - * %esi struct bootparams pointer (or NULL when not using - * the EFI handover protocol) - * - * Since this is the point of no return for ordinary execution, no registers - * are considered live except for the function parameters. [Note that the EFI - * stub may still exit and return to the firmware using the Exit() EFI boot - * service.] - */ -SYM_FUNC_START_LOCAL(efi32_entry) - call 1f -1: pop %ebx - - /* Save firmware GDTR and code/data selectors */ - sgdtl (efi32_boot_gdt - 1b)(%ebx) - movw %cs, (efi32_boot_cs - 1b)(%ebx) - movw %ds, (efi32_boot_ds - 1b)(%ebx) - - /* Store firmware IDT descriptor */ - sidtl (efi32_boot_idt - 1b)(%ebx) - - /* Store boot arguments */ - leal (efi32_boot_args - 1b)(%ebx), %ebx - movl %ecx, 0(%ebx) - movl %edx, 4(%ebx) - movl %esi, 8(%ebx) - movb $0x0, 12(%ebx) // efi_is64 - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - jmp startup_32 -SYM_FUNC_END(efi32_entry) - __HEAD #endif .code64 @@ -779,9 +737,6 @@ SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) SYM_DATA(image_offset, .long 0) #endif #ifdef CONFIG_EFI_MIXED -SYM_DATA(efi32_boot_args, .long 0, 0, 0) -SYM_DATA(efi_is64, .byte 1) - #define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) #define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) #define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) From beeeb4655db99ed0eb70f6756518fd202fd12e1a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:06 +0100 Subject: [PATCH 115/216] x86/boot/compressed: Move efi32_pe_entry() out of head_64.S commit 7f22ca396778fea9332d83ec2359dbe8396e9a06 upstream. Move the implementation of efi32_pe_entry() into efi-mixed.S, which is a more suitable location that only gets built if EFI mixed mode is actually enabled. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-7-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/efi_mixed.S | 82 ++++++++++++++++++++++++++ arch/x86/boot/compressed/head_64.S | 87 +--------------------------- 2 files changed, 83 insertions(+), 86 deletions(-) diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index 3487484ac1fd..8844d8ed4b1c 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -257,6 +257,88 @@ SYM_FUNC_START(efi32_entry) jmp startup_32 SYM_FUNC_END(efi32_entry) +#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) +#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) +#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) + +/* + * efi_status_t efi32_pe_entry(efi_handle_t image_handle, + * efi_system_table_32_t *sys_table) + */ +SYM_FUNC_START(efi32_pe_entry) + pushl %ebp + movl %esp, %ebp + pushl %eax // dummy push to allocate loaded_image + + pushl %ebx // save callee-save registers + pushl %edi + + call verify_cpu // check for long mode support + testl %eax, %eax + movl $0x80000003, %eax // EFI_UNSUPPORTED + jnz 2f + + call 1f +1: pop %ebx + + /* Get the loaded image protocol pointer from the image handle */ + leal -4(%ebp), %eax + pushl %eax // &loaded_image + leal (loaded_image_proto - 1b)(%ebx), %eax + pushl %eax // pass the GUID address + pushl 8(%ebp) // pass the image handle + + /* + * Note the alignment of the stack frame. + * sys_table + * handle <-- 16-byte aligned on entry by ABI + * return address + * frame pointer + * loaded_image <-- local variable + * saved %ebx <-- 16-byte aligned here + * saved %edi + * &loaded_image + * &loaded_image_proto + * handle <-- 16-byte aligned for call to handle_protocol + */ + + movl 12(%ebp), %eax // sys_table + movl ST32_boottime(%eax), %eax // sys_table->boottime + call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol + addl $12, %esp // restore argument space + testl %eax, %eax + jnz 2f + + movl 8(%ebp), %ecx // image_handle + movl 12(%ebp), %edx // sys_table + movl -4(%ebp), %esi // loaded_image + movl LI32_image_base(%esi), %esi // loaded_image->image_base + leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32 + /* + * We need to set the image_offset variable here since startup_32() will + * use it before we get to the 64-bit efi_pe_entry() in C code. + */ + subl %esi, %ebp // calculate image_offset + movl %ebp, (image_offset - 1b)(%ebx) // save image_offset + xorl %esi, %esi + jmp efi32_entry // pass %ecx, %edx, %esi + // no other registers remain live + +2: popl %edi // restore callee-save registers + popl %ebx + leave + RET +SYM_FUNC_END(efi32_pe_entry) + + .section ".rodata" + /* EFI loaded image protocol GUID */ + .balign 4 +SYM_DATA_START_LOCAL(loaded_image_proto) + .long 0x5b1b31a1 + .word 0x9562, 0x11d2 + .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b +SYM_DATA_END(loaded_image_proto) + .data .balign 8 SYM_DATA_START_LOCAL(efi32_boot_gdt) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2b812028fb2f..d04d981c2b9b 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -689,6 +689,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) jmp 1b SYM_FUNC_END(.Lno_longmode) + .globl verify_cpu #include "../../kernel/verify_cpu.S" .data @@ -736,92 +737,6 @@ SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) #ifdef CONFIG_EFI_STUB SYM_DATA(image_offset, .long 0) #endif -#ifdef CONFIG_EFI_MIXED -#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) -#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) -#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - - .text - .code32 -SYM_FUNC_START(efi32_pe_entry) -/* - * efi_status_t efi32_pe_entry(efi_handle_t image_handle, - * efi_system_table_32_t *sys_table) - */ - - pushl %ebp - movl %esp, %ebp - pushl %eax // dummy push to allocate loaded_image - - pushl %ebx // save callee-save registers - pushl %edi - - call verify_cpu // check for long mode support - testl %eax, %eax - movl $0x80000003, %eax // EFI_UNSUPPORTED - jnz 2f - - call 1f -1: pop %ebx - - /* Get the loaded image protocol pointer from the image handle */ - leal -4(%ebp), %eax - pushl %eax // &loaded_image - leal (loaded_image_proto - 1b)(%ebx), %eax - pushl %eax // pass the GUID address - pushl 8(%ebp) // pass the image handle - - /* - * Note the alignment of the stack frame. - * sys_table - * handle <-- 16-byte aligned on entry by ABI - * return address - * frame pointer - * loaded_image <-- local variable - * saved %ebx <-- 16-byte aligned here - * saved %edi - * &loaded_image - * &loaded_image_proto - * handle <-- 16-byte aligned for call to handle_protocol - */ - - movl 12(%ebp), %eax // sys_table - movl ST32_boottime(%eax), %eax // sys_table->boottime - call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol - addl $12, %esp // restore argument space - testl %eax, %eax - jnz 2f - - movl 8(%ebp), %ecx // image_handle - movl 12(%ebp), %edx // sys_table - movl -4(%ebp), %esi // loaded_image - movl LI32_image_base(%esi), %esi // loaded_image->image_base - leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32 - /* - * We need to set the image_offset variable here since startup_32() will - * use it before we get to the 64-bit efi_pe_entry() in C code. - */ - subl %esi, %ebp // calculate image_offset - movl %ebp, (image_offset - 1b)(%ebx) // save image_offset - xorl %esi, %esi - jmp efi32_entry // pass %ecx, %edx, %esi - // no other registers remain live - -2: popl %edi // restore callee-save registers - popl %ebx - leave - RET -SYM_FUNC_END(efi32_pe_entry) - - .section ".rodata" - /* EFI loaded image protocol GUID */ - .balign 4 -SYM_DATA_START_LOCAL(loaded_image_proto) - .long 0x5b1b31a1 - .word 0x9562, 0x11d2 - .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b -SYM_DATA_END(loaded_image_proto) -#endif #ifdef CONFIG_AMD_MEM_ENCRYPT __HEAD From ef12d049fa7b429a0f1842307e921da30ff2e97b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:07 +0100 Subject: [PATCH 116/216] x86/boot/compressed, efi: Merge multiple definitions of image_offset into one commit 4b52016247aeaa55ca3e3bc2e03cd91114c145c2 upstream. There is no need for head_32.S and head_64.S both declaring a copy of the global 'image_offset' variable, so drop those and make the extern C declaration the definition. When image_offset is moved to the .c file, it needs to be placed particularly in the .data section because it lands by default in the .bss section which is cleared too late, in .Lrelocated, before the first access to it and thus garbage gets read, leading to SEV guests exploding in early boot. This happens only when the SEV guest kernel is loaded through grub. If supplied with qemu's -kernel command line option, that memory is always cleared upfront by qemu and all is fine there. [ bp: Expand commit message with SEV aspect. ] Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-8-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_32.S | 4 ---- arch/x86/boot/compressed/head_64.S | 4 ---- drivers/firmware/efi/libstub/x86-stub.c | 2 +- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3b354eb9516d..6589ddd4cfaf 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -208,10 +208,6 @@ SYM_DATA_START_LOCAL(gdt) .quad 0x00cf92000000ffff /* __KERNEL_DS */ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) -#ifdef CONFIG_EFI_STUB -SYM_DATA(image_offset, .long 0) -#endif - /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d04d981c2b9b..2235c3a13f54 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -734,10 +734,6 @@ SYM_DATA_START(boot32_idt) SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) #endif -#ifdef CONFIG_EFI_STUB -SYM_DATA(image_offset, .long 0) -#endif - #ifdef CONFIG_AMD_MEM_ENCRYPT __HEAD .code32 diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 4f0152b11a89..9ae0d6d0c285 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -23,7 +23,7 @@ const efi_system_table_t *efi_system_table; const efi_dxe_services_table_t *efi_dxe_table; -extern u32 image_offset; +u32 image_offset __section(".data"); static efi_loaded_image_t *image = NULL; static efi_status_t From 88035744b91a187bcc23253a73ef3b1ccc08a2f9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:08 +0100 Subject: [PATCH 117/216] x86/boot/compressed: Simplify IDT/GDT preserve/restore in the EFI thunk commit 630f337f0c4fd80390e8600adcab31550aea33df upstream. Tweak the asm and remove some redundant instructions. While at it, fix the associated comment for style and correctness. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-9-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/efi_mixed.S | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index 8844d8ed4b1c..8b02e507d3bb 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -96,24 +96,20 @@ SYM_FUNC_START(__efi64_thunk) leaq 0x20(%rsp), %rbx sgdt (%rbx) - - addq $16, %rbx - sidt (%rbx) + sidt 16(%rbx) leaq 1f(%rip), %rbp /* - * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT - * and IDT that was installed when the kernel started executing. The - * pointers were saved by the efi32_entry() routine below. + * Switch to IDT and GDT with 32-bit segments. These are the firmware + * GDT and IDT that were installed when the kernel started executing. + * The pointers were saved by the efi32_entry() routine below. * * Pass the saved DS selector to the 32-bit code, and use far return to * restore the saved CS selector. */ - leaq efi32_boot_idt(%rip), %rax - lidt (%rax) - leaq efi32_boot_gdt(%rip), %rax - lgdt (%rax) + lidt efi32_boot_idt(%rip) + lgdt efi32_boot_gdt(%rip) movzwl efi32_boot_ds(%rip), %edx movzwq efi32_boot_cs(%rip), %rax @@ -187,9 +183,7 @@ SYM_FUNC_START_LOCAL(efi_enter32) */ cli - lidtl (%ebx) - subl $16, %ebx - + lidtl 16(%ebx) lgdtl (%ebx) movl %cr4, %eax From 530a4271b7ba5776b7f5a67015ae63a3ba3d2348 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:09 +0100 Subject: [PATCH 118/216] x86/boot/compressed: Avoid touching ECX in startup32_set_idt_entry() commit 6aac80a8da46d70f2ae7ff97c9f45a15c7c9b3ef upstream. Avoid touching register %ecx in startup32_set_idt_entry(), by folding the MOV, SHL and ORL instructions into a single ORL which no longer requires a temp register. This permits ECX to be used as a function argument in a subsequent patch. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-10-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2235c3a13f54..e90cbbb2903d 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -749,7 +749,6 @@ SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) */ SYM_FUNC_START(startup32_set_idt_entry) push %ebx - push %ecx /* IDT entry address to %ebx */ leal rva(boot32_idt)(%ebp), %ebx @@ -758,10 +757,8 @@ SYM_FUNC_START(startup32_set_idt_entry) /* Build IDT entry, lower 4 bytes */ movl %eax, %edx - andl $0x0000ffff, %edx # Target code segment offset [15:0] - movl $__KERNEL32_CS, %ecx # Target code segment selector - shl $16, %ecx - orl %ecx, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + orl $(__KERNEL32_CS << 16), %edx # Target code segment selector /* Store lower 4 bytes to IDT */ movl %edx, (%ebx) @@ -774,7 +771,6 @@ SYM_FUNC_START(startup32_set_idt_entry) /* Store upper 4 bytes to IDT */ movl %edx, 4(%ebx) - pop %ecx pop %ebx RET SYM_FUNC_END(startup32_set_idt_entry) From 29134968f72da9337dff949bba0bdb0c5134ba0a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:10 +0100 Subject: [PATCH 119/216] x86/boot/compressed: Pull global variable reference into startup32_load_idt() commit d73a257f7f86871c3aac24dc20538e3983096647 upstream. In preparation for moving startup32_load_idt() out of head_64.S and turning it into an ordinary function using the ordinary 32-bit calling convention, pull the global variable reference to boot32_idt up into startup32_load_idt() so that startup32_set_idt_entry() does not need to discover its own runtime physical address, which will no longer be correlated with startup_32 once this code is moved into .text. While at it, give startup32_set_idt_entry() static linkage. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-11-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index e90cbbb2903d..9d1e1ccab256 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -744,16 +744,11 @@ SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) * * %eax: Handler address * %edx: Vector number - * - * Physical offset is expected in %ebp + * %ecx: IDT address */ -SYM_FUNC_START(startup32_set_idt_entry) - push %ebx - - /* IDT entry address to %ebx */ - leal rva(boot32_idt)(%ebp), %ebx - shl $3, %edx - addl %edx, %ebx +SYM_FUNC_START_LOCAL(startup32_set_idt_entry) + /* IDT entry address to %ecx */ + leal (%ecx, %edx, 8), %ecx /* Build IDT entry, lower 4 bytes */ movl %eax, %edx @@ -761,7 +756,7 @@ SYM_FUNC_START(startup32_set_idt_entry) orl $(__KERNEL32_CS << 16), %edx # Target code segment selector /* Store lower 4 bytes to IDT */ - movl %edx, (%ebx) + movl %edx, (%ecx) /* Build IDT entry, upper 4 bytes */ movl %eax, %edx @@ -769,15 +764,16 @@ SYM_FUNC_START(startup32_set_idt_entry) orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate /* Store upper 4 bytes to IDT */ - movl %edx, 4(%ebx) + movl %edx, 4(%ecx) - pop %ebx RET SYM_FUNC_END(startup32_set_idt_entry) #endif SYM_FUNC_START(startup32_load_idt) #ifdef CONFIG_AMD_MEM_ENCRYPT + leal rva(boot32_idt)(%ebp), %ecx + /* #VC handler */ leal rva(startup32_vc_handler)(%ebp), %eax movl $X86_TRAP_VC, %edx From 2e47116315a08bd5fa451bbeb66cb14ffc3f0de1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:11 +0100 Subject: [PATCH 120/216] x86/boot/compressed: Move startup32_load_idt() into .text section commit c6355995ba471d7ad574174e593192ce805c7e1a upstream. Convert startup32_load_idt() into an ordinary function and move it into the .text section. This involves turning the rva() immediates into ones derived from a local label, and preserving/restoring the %ebp and %ebx as per the calling convention. Also move the #ifdef to the only existing call site. This makes it clear that the function call does nothing if support for memory encryption is not compiled in. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-12-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 31 +++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 9d1e1ccab256..53f3f01f88af 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -118,7 +118,9 @@ SYM_FUNC_START(startup_32) 1: /* Setup Exception handling for SEV-ES */ +#ifdef CONFIG_AMD_MEM_ENCRYPT call startup32_load_idt +#endif /* Make sure cpu supports long mode. */ call verify_cpu @@ -732,10 +734,8 @@ SYM_DATA_START(boot32_idt) .quad 0 .endr SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) -#endif -#ifdef CONFIG_AMD_MEM_ENCRYPT - __HEAD + .text .code32 /* * Write an IDT entry into boot32_idt @@ -768,24 +768,32 @@ SYM_FUNC_START_LOCAL(startup32_set_idt_entry) RET SYM_FUNC_END(startup32_set_idt_entry) -#endif SYM_FUNC_START(startup32_load_idt) -#ifdef CONFIG_AMD_MEM_ENCRYPT - leal rva(boot32_idt)(%ebp), %ecx + push %ebp + push %ebx + + call 1f +1: pop %ebp + + leal (boot32_idt - 1b)(%ebp), %ebx /* #VC handler */ - leal rva(startup32_vc_handler)(%ebp), %eax + leal (startup32_vc_handler - 1b)(%ebp), %eax movl $X86_TRAP_VC, %edx + movl %ebx, %ecx call startup32_set_idt_entry /* Load IDT */ - leal rva(boot32_idt)(%ebp), %eax - movl %eax, rva(boot32_idt_desc+2)(%ebp) - lidt rva(boot32_idt_desc)(%ebp) -#endif + leal (boot32_idt_desc - 1b)(%ebp), %ecx + movl %ebx, 2(%ecx) + lidt (%ecx) + + pop %ebx + pop %ebp RET SYM_FUNC_END(startup32_load_idt) +#endif /* * Check for the correct C-bit position when the startup_32 boot-path is used. @@ -804,6 +812,7 @@ SYM_FUNC_END(startup32_load_idt) * succeed. An incorrect C-bit position will map all memory unencrypted, so that * the compare will use the encrypted random data and fail. */ + __HEAD SYM_FUNC_START(startup32_check_sev_cbit) #ifdef CONFIG_AMD_MEM_ENCRYPT pushl %eax From 801873f1750aa1cc42e290d8a818e340fd7d0987 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:12 +0100 Subject: [PATCH 121/216] x86/boot/compressed: Move startup32_load_idt() out of head_64.S commit 9ea813be3d345dfb8ac5bf6fbb29e6a63647a39d upstream. Now that startup32_load_idt() has been refactored into an ordinary callable function, move it into mem-encrypt.S where it belongs. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-13-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 72 -------------------------- arch/x86/boot/compressed/mem_encrypt.S | 72 +++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 73 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 53f3f01f88af..b1d00f862af9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -723,78 +723,6 @@ SYM_DATA_START(boot_idt) .endr SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) -#ifdef CONFIG_AMD_MEM_ENCRYPT -SYM_DATA_START(boot32_idt_desc) - .word boot32_idt_end - boot32_idt - 1 - .long 0 -SYM_DATA_END(boot32_idt_desc) - .balign 8 -SYM_DATA_START(boot32_idt) - .rept 32 - .quad 0 - .endr -SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) - - .text - .code32 -/* - * Write an IDT entry into boot32_idt - * - * Parameters: - * - * %eax: Handler address - * %edx: Vector number - * %ecx: IDT address - */ -SYM_FUNC_START_LOCAL(startup32_set_idt_entry) - /* IDT entry address to %ecx */ - leal (%ecx, %edx, 8), %ecx - - /* Build IDT entry, lower 4 bytes */ - movl %eax, %edx - andl $0x0000ffff, %edx # Target code segment offset [15:0] - orl $(__KERNEL32_CS << 16), %edx # Target code segment selector - - /* Store lower 4 bytes to IDT */ - movl %edx, (%ecx) - - /* Build IDT entry, upper 4 bytes */ - movl %eax, %edx - andl $0xffff0000, %edx # Target code segment offset [31:16] - orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate - - /* Store upper 4 bytes to IDT */ - movl %edx, 4(%ecx) - - RET -SYM_FUNC_END(startup32_set_idt_entry) - -SYM_FUNC_START(startup32_load_idt) - push %ebp - push %ebx - - call 1f -1: pop %ebp - - leal (boot32_idt - 1b)(%ebp), %ebx - - /* #VC handler */ - leal (startup32_vc_handler - 1b)(%ebp), %eax - movl $X86_TRAP_VC, %edx - movl %ebx, %ecx - call startup32_set_idt_entry - - /* Load IDT */ - leal (boot32_idt_desc - 1b)(%ebp), %ecx - movl %ebx, 2(%ecx) - lidt (%ecx) - - pop %ebx - pop %ebp - RET -SYM_FUNC_END(startup32_load_idt) -#endif - /* * Check for the correct C-bit position when the startup_32 boot-path is used. * diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index a73e4d783cae..6747e5e4c696 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -12,6 +12,8 @@ #include #include #include +#include +#include .text .code32 @@ -98,7 +100,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid) jmp 1b SYM_CODE_END(sev_es_req_cpuid) -SYM_CODE_START(startup32_vc_handler) +SYM_CODE_START_LOCAL(startup32_vc_handler) pushl %eax pushl %ebx pushl %ecx @@ -184,6 +186,63 @@ SYM_CODE_START(startup32_vc_handler) jmp .Lfail SYM_CODE_END(startup32_vc_handler) +/* + * Write an IDT entry into boot32_idt + * + * Parameters: + * + * %eax: Handler address + * %edx: Vector number + * %ecx: IDT address + */ +SYM_FUNC_START_LOCAL(startup32_set_idt_entry) + /* IDT entry address to %ecx */ + leal (%ecx, %edx, 8), %ecx + + /* Build IDT entry, lower 4 bytes */ + movl %eax, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + orl $(__KERNEL32_CS << 16), %edx # Target code segment selector + + /* Store lower 4 bytes to IDT */ + movl %edx, (%ecx) + + /* Build IDT entry, upper 4 bytes */ + movl %eax, %edx + andl $0xffff0000, %edx # Target code segment offset [31:16] + orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate + + /* Store upper 4 bytes to IDT */ + movl %edx, 4(%ecx) + + RET +SYM_FUNC_END(startup32_set_idt_entry) + +SYM_FUNC_START(startup32_load_idt) + push %ebp + push %ebx + + call 1f +1: pop %ebp + + leal (boot32_idt - 1b)(%ebp), %ebx + + /* #VC handler */ + leal (startup32_vc_handler - 1b)(%ebp), %eax + movl $X86_TRAP_VC, %edx + movl %ebx, %ecx + call startup32_set_idt_entry + + /* Load IDT */ + leal (boot32_idt_desc - 1b)(%ebp), %ecx + movl %ebx, 2(%ecx) + lidt (%ecx) + + pop %ebx + pop %ebp + RET +SYM_FUNC_END(startup32_load_idt) + .code64 #include "../../kernel/sev_verify_cbit.S" @@ -195,4 +254,15 @@ SYM_CODE_END(startup32_vc_handler) SYM_DATA(sme_me_mask, .quad 0) SYM_DATA(sev_status, .quad 0) SYM_DATA(sev_check_data, .quad 0) + +SYM_DATA_START_LOCAL(boot32_idt) + .rept 32 + .quad 0 + .endr +SYM_DATA_END(boot32_idt) + +SYM_DATA_START_LOCAL(boot32_idt_desc) + .word . - boot32_idt - 1 + .long 0 +SYM_DATA_END(boot32_idt_desc) #endif From e840ae3dc277f7f4ae38f600e7f5da7f169b8d7c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:13 +0100 Subject: [PATCH 122/216] x86/boot/compressed: Move startup32_check_sev_cbit() into .text commit b5d854cd4b6a314edd6c15dabc4233b84a0f8e5e upstream. Move startup32_check_sev_cbit() into the .text section and turn it into an ordinary function using the ordinary 32-bit calling convention, instead of saving/restoring the registers that are known to be live at the only call site. This improves maintainability, and makes it possible to move this function out of head_64.S and into a separate compilation unit that is specific to memory encryption. Note that this requires the call site to be moved before the mixed mode check, as %eax will be live otherwise. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-14-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 35 ++++++++++++++++-------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index b1d00f862af9..c7655a9dfd3f 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -251,6 +251,11 @@ SYM_FUNC_START(startup_32) movl $__BOOT_TSS, %eax ltr %ax +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* Check if the C-bit position is correct when SEV is active */ + call startup32_check_sev_cbit +#endif + /* * Setup for the jump to 64bit mode * @@ -268,8 +273,6 @@ SYM_FUNC_START(startup_32) leal rva(startup_64_mixed_mode)(%ebp), %eax 1: #endif - /* Check if the C-bit position is correct when SEV is active */ - call startup32_check_sev_cbit pushl $__KERNEL_CS pushl %eax @@ -740,16 +743,17 @@ SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) * succeed. An incorrect C-bit position will map all memory unencrypted, so that * the compare will use the encrypted random data and fail. */ - __HEAD -SYM_FUNC_START(startup32_check_sev_cbit) #ifdef CONFIG_AMD_MEM_ENCRYPT - pushl %eax + .text +SYM_FUNC_START(startup32_check_sev_cbit) pushl %ebx - pushl %ecx - pushl %edx + pushl %ebp + + call 0f +0: popl %ebp /* Check for non-zero sev_status */ - movl rva(sev_status)(%ebp), %eax + movl (sev_status - 0b)(%ebp), %eax testl %eax, %eax jz 4f @@ -764,17 +768,18 @@ SYM_FUNC_START(startup32_check_sev_cbit) jnc 2b /* Store to memory and keep it in the registers */ - movl %eax, rva(sev_check_data)(%ebp) - movl %ebx, rva(sev_check_data+4)(%ebp) + leal (sev_check_data - 0b)(%ebp), %ebp + movl %eax, 0(%ebp) + movl %ebx, 4(%ebp) /* Enable paging to see if encryption is active */ movl %cr0, %edx /* Backup %cr0 in %edx */ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ movl %ecx, %cr0 - cmpl %eax, rva(sev_check_data)(%ebp) + cmpl %eax, 0(%ebp) jne 3f - cmpl %ebx, rva(sev_check_data+4)(%ebp) + cmpl %ebx, 4(%ebp) jne 3f movl %edx, %cr0 /* Restore previous %cr0 */ @@ -786,13 +791,11 @@ SYM_FUNC_START(startup32_check_sev_cbit) jmp 3b 4: - popl %edx - popl %ecx + popl %ebp popl %ebx - popl %eax -#endif RET SYM_FUNC_END(startup32_check_sev_cbit) +#endif /* * Stack and heap for uncompression From 0912dce9ed4e8a6442fb39627cd37ca5a25beec5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:14 +0100 Subject: [PATCH 123/216] x86/boot/compressed: Move startup32_check_sev_cbit() out of head_64.S commit 9d7eaae6a071ff1f718e0aa5e610bb712f8cc632 upstream. Now that the startup32_check_sev_cbit() routine can execute from anywhere and behaves like an ordinary function, it can be moved where it belongs. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-15-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 71 -------------------------- arch/x86/boot/compressed/mem_encrypt.S | 68 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 71 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index c7655a9dfd3f..43a82df0e9d6 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -726,77 +726,6 @@ SYM_DATA_START(boot_idt) .endr SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) -/* - * Check for the correct C-bit position when the startup_32 boot-path is used. - * - * The check makes use of the fact that all memory is encrypted when paging is - * disabled. The function creates 64 bits of random data using the RDRAND - * instruction. RDRAND is mandatory for SEV guests, so always available. If the - * hypervisor violates that the kernel will crash right here. - * - * The 64 bits of random data are stored to a memory location and at the same - * time kept in the %eax and %ebx registers. Since encryption is always active - * when paging is off the random data will be stored encrypted in main memory. - * - * Then paging is enabled. When the C-bit position is correct all memory is - * still mapped encrypted and comparing the register values with memory will - * succeed. An incorrect C-bit position will map all memory unencrypted, so that - * the compare will use the encrypted random data and fail. - */ -#ifdef CONFIG_AMD_MEM_ENCRYPT - .text -SYM_FUNC_START(startup32_check_sev_cbit) - pushl %ebx - pushl %ebp - - call 0f -0: popl %ebp - - /* Check for non-zero sev_status */ - movl (sev_status - 0b)(%ebp), %eax - testl %eax, %eax - jz 4f - - /* - * Get two 32-bit random values - Don't bail out if RDRAND fails - * because it is better to prevent forward progress if no random value - * can be gathered. - */ -1: rdrand %eax - jnc 1b -2: rdrand %ebx - jnc 2b - - /* Store to memory and keep it in the registers */ - leal (sev_check_data - 0b)(%ebp), %ebp - movl %eax, 0(%ebp) - movl %ebx, 4(%ebp) - - /* Enable paging to see if encryption is active */ - movl %cr0, %edx /* Backup %cr0 in %edx */ - movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ - movl %ecx, %cr0 - - cmpl %eax, 0(%ebp) - jne 3f - cmpl %ebx, 4(%ebp) - jne 3f - - movl %edx, %cr0 /* Restore previous %cr0 */ - - jmp 4f - -3: /* Check failed - hlt the machine */ - hlt - jmp 3b - -4: - popl %ebp - popl %ebx - RET -SYM_FUNC_END(startup32_check_sev_cbit) -#endif - /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 6747e5e4c696..14cf04a1ed09 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -243,6 +243,74 @@ SYM_FUNC_START(startup32_load_idt) RET SYM_FUNC_END(startup32_load_idt) +/* + * Check for the correct C-bit position when the startup_32 boot-path is used. + * + * The check makes use of the fact that all memory is encrypted when paging is + * disabled. The function creates 64 bits of random data using the RDRAND + * instruction. RDRAND is mandatory for SEV guests, so always available. If the + * hypervisor violates that the kernel will crash right here. + * + * The 64 bits of random data are stored to a memory location and at the same + * time kept in the %eax and %ebx registers. Since encryption is always active + * when paging is off the random data will be stored encrypted in main memory. + * + * Then paging is enabled. When the C-bit position is correct all memory is + * still mapped encrypted and comparing the register values with memory will + * succeed. An incorrect C-bit position will map all memory unencrypted, so that + * the compare will use the encrypted random data and fail. + */ +SYM_FUNC_START(startup32_check_sev_cbit) + pushl %ebx + pushl %ebp + + call 0f +0: popl %ebp + + /* Check for non-zero sev_status */ + movl (sev_status - 0b)(%ebp), %eax + testl %eax, %eax + jz 4f + + /* + * Get two 32-bit random values - Don't bail out if RDRAND fails + * because it is better to prevent forward progress if no random value + * can be gathered. + */ +1: rdrand %eax + jnc 1b +2: rdrand %ebx + jnc 2b + + /* Store to memory and keep it in the registers */ + leal (sev_check_data - 0b)(%ebp), %ebp + movl %eax, 0(%ebp) + movl %ebx, 4(%ebp) + + /* Enable paging to see if encryption is active */ + movl %cr0, %edx /* Backup %cr0 in %edx */ + movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ + movl %ecx, %cr0 + + cmpl %eax, 0(%ebp) + jne 3f + cmpl %ebx, 4(%ebp) + jne 3f + + movl %edx, %cr0 /* Restore previous %cr0 */ + + jmp 4f + +3: /* Check failed - hlt the machine */ + hlt + jmp 3b + +4: + popl %ebp + popl %ebx + RET +SYM_FUNC_END(startup32_check_sev_cbit) + .code64 #include "../../kernel/sev_verify_cbit.S" From cac22c9a5e661a000e734af797641375fa181dbc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:15 +0100 Subject: [PATCH 124/216] x86/boot/compressed: Adhere to calling convention in get_sev_encryption_bit() commit 30c9ca16a5271ba6f8ad9c86507ff1c789c94677 upstream. Make get_sev_encryption_bit() follow the ordinary i386 calling convention, and only call it if CONFIG_AMD_MEM_ENCRYPT is actually enabled. This clarifies the calling code, and makes it more maintainable. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-16-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 5 +++-- arch/x86/boot/compressed/mem_encrypt.S | 10 ---------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 43a82df0e9d6..cd4eb22aa84b 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -180,12 +180,13 @@ SYM_FUNC_START(startup_32) */ /* * If SEV is active then set the encryption mask in the page tables. - * This will insure that when the kernel is copied and decompressed + * This will ensure that when the kernel is copied and decompressed * it will be done so encrypted. */ - call get_sev_encryption_bit xorl %edx, %edx #ifdef CONFIG_AMD_MEM_ENCRYPT + call get_sev_encryption_bit + xorl %edx, %edx testl %eax, %eax jz 1f subl $32, %eax /* Encryption bit is always above bit 31 */ diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 14cf04a1ed09..e69674588a31 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -18,12 +18,7 @@ .text .code32 SYM_FUNC_START(get_sev_encryption_bit) - xor %eax, %eax - -#ifdef CONFIG_AMD_MEM_ENCRYPT push %ebx - push %ecx - push %edx movl $0x80000000, %eax /* CPUID to check the highest leaf */ cpuid @@ -54,12 +49,7 @@ SYM_FUNC_START(get_sev_encryption_bit) xor %eax, %eax .Lsev_exit: - pop %edx - pop %ecx pop %ebx - -#endif /* CONFIG_AMD_MEM_ENCRYPT */ - RET SYM_FUNC_END(get_sev_encryption_bit) From 71c43b714fd688ff5ee6d906e5cc38e6a8f2836f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:16 +0100 Subject: [PATCH 125/216] x86/boot/compressed: Only build mem_encrypt.S if AMD_MEM_ENCRYPT=y commit 61de13df95901bc58456bc5acdbd3c18c66cf859 upstream. Avoid building the mem_encrypt.o object if memory encryption support is not enabled to begin with. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-17-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/mem_encrypt.S | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 27c82c78ac26..0c9ebf74fac5 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -100,7 +100,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/ident_map_64.o vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o - vmlinux-objs-y += $(obj)/mem_encrypt.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o endif diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index e69674588a31..32f7cc8a8625 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -307,7 +307,6 @@ SYM_FUNC_END(startup32_check_sev_cbit) .data -#ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 SYM_DATA(sme_me_mask, .quad 0) SYM_DATA(sev_status, .quad 0) @@ -323,4 +322,3 @@ SYM_DATA_START_LOCAL(boot32_idt_desc) .word . - boot32_idt - 1 .long 0 SYM_DATA_END(boot32_idt_desc) -#endif From a8901f331b8b7f95a7315d033a22bc84c8365f35 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 19 Jan 2023 17:42:54 +0100 Subject: [PATCH 126/216] efi: verify that variable services are supported commit bad267f9e18f8e9e628abd1811d2899b1735a4e1 upstream. Current Qualcomm UEFI firmware does not implement the variable services but not all revisions clear the corresponding bits in the RT_PROP table services mask and instead the corresponding calls return EFI_UNSUPPORTED. This leads to efi core registering the generic efivar ops even when the variable services are not supported or when they are accessed through some other interface (e.g. Google SMI or the upcoming Qualcomm SCM implementation). Instead of playing games with init call levels to make sure that the custom implementations are registered after the generic one, make sure that get_next_variable() is actually supported before registering the generic ops. Signed-off-by: Johan Hovold Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/efi.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index b7c0e8cc0764..9077353d1c98 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -185,8 +185,27 @@ static const struct attribute_group efi_subsys_attr_group = { static struct efivars generic_efivars; static struct efivar_operations generic_ops; +static bool generic_ops_supported(void) +{ + unsigned long name_size; + efi_status_t status; + efi_char16_t name; + efi_guid_t guid; + + name_size = sizeof(name); + + status = efi.get_next_variable(&name_size, &name, &guid); + if (status == EFI_UNSUPPORTED) + return false; + + return true; +} + static int generic_ops_register(void) { + if (!generic_ops_supported()) + return 0; + generic_ops.get_variable = efi.get_variable; generic_ops.get_next_variable = efi.get_next_variable; generic_ops.query_variable_store = efi_query_variable_store; @@ -200,6 +219,9 @@ static int generic_ops_register(void) static void generic_ops_unregister(void) { + if (!generic_ops.get_variable) + return; + efivars_unregister(&generic_efivars); } From 7bc9533e077e2553264b447189d13f83c47770a0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Nov 2022 17:10:17 +0100 Subject: [PATCH 127/216] x86/efi: Make the deprecated EFI handover protocol optional commit cc3fdda2876e58a7e83e558ab51853cf106afb6a upstream. The EFI handover protocol permits a bootloader to invoke the kernel as a EFI PE/COFF application, while passing a bootparams struct as a third argument to the entrypoint function call. This has no basis in the UEFI specification, and there are better ways to pass additional data to a UEFI application (UEFI configuration tables, UEFI variables, UEFI protocols) than going around the StartImage() boot service and jumping to a fixed offset in the loaded image, just to call a different function that takes a third parameter. The reason for handling struct bootparams in the bootloader was that the EFI stub could only load initrd images from the EFI system partition, and so passing it via struct bootparams was needed for loaders like GRUB, which pass the initrd in memory, and may load it from anywhere, including from the network. Another motivation was EFI mixed mode, which could not use the initrd loader in the EFI stub at all due to 32/64 bit incompatibilities (which will be fixed shortly [0]), and could not invoke the ordinary PE/COFF entry point either, for the same reasons. Given that loaders such as GRUB already carried the bootparams handling in order to implement non-EFI boot, retaining that code and just passing bootparams to the EFI stub was a reasonable choice (although defining an alternate entrypoint could have been avoided.) However, the GRUB side changes never made it upstream, and are only shipped by some of the distros in their downstream versions. In the meantime, EFI support has been added to other Linux architecture ports, as well as to U-boot and systemd, including arch-agnostic methods for passing initrd images in memory [1], and for doing mixed mode boot [2], none of them requiring anything like the EFI handover protocol. So given that only out-of-tree distro GRUB relies on this, let's permit it to be omitted from the build, in preparation for retiring it completely at a later date. (Note that systemd-boot does have an implementation as well, but only uses it as a fallback for booting images that do not implement the LoadFile2 based initrd loading method, i.e., v5.8 or older) [0] https://lore.kernel.org/all/20220927085842.2860715-1-ardb@kernel.org/ [1] ec93fc371f01 ("efi/libstub: Add support for loading the initrd from a device path") [2] 97aa276579b2 ("efi/x86: Add true mixed mode entry point into .compat section") Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221122161017.2426828-18-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 17 +++++++++++++++++ arch/x86/boot/compressed/head_64.S | 4 +++- arch/x86/boot/header.S | 2 +- arch/x86/boot/tools/build.c | 2 ++ 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4c9bfc4be58d..2f7af61b49b6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1982,6 +1982,23 @@ config EFI_STUB See Documentation/admin-guide/efi-stub.rst for more information. +config EFI_HANDOVER_PROTOCOL + bool "EFI handover protocol (DEPRECATED)" + depends on EFI_STUB + default y + help + Select this in order to include support for the deprecated EFI + handover protocol, which defines alternative entry points into the + EFI stub. This is a practice that has no basis in the UEFI + specification, and requires a priori knowledge on the part of the + bootloader about Linux/x86 specific ways of passing the command line + and initrd, and where in memory those assets may be loaded. + + If in doubt, say Y. Even though the corresponding support is not + present in upstream GRUB or other bootloaders, most distros build + GRUB with numerous downstream patches applied, and may rely on the + handover protocol as as result. + config EFI_MIXED bool "EFI mixed-mode support" depends on EFI_STUB && X86_64 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index cd4eb22aa84b..96c61c9883c3 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -286,7 +286,7 @@ SYM_FUNC_START(startup_32) lret SYM_FUNC_END(startup_32) -#ifdef CONFIG_EFI_MIXED +#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL) .org 0x190 SYM_FUNC_START(efi32_stub_entry) add $0x4, %esp /* Discard return address */ @@ -535,7 +535,9 @@ trampoline_return: SYM_CODE_END(startup_64) #ifdef CONFIG_EFI_STUB +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL .org 0x390 +#endif SYM_FUNC_START(efi64_stub_entry) and $~0xf, %rsp /* realign the stack */ movq %rdx, %rbx /* save boot_params pointer */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index f912d7770130..d31982509654 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -406,7 +406,7 @@ xloadflags: # define XLF1 0 #endif -#ifdef CONFIG_EFI_STUB +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL # ifdef CONFIG_EFI_MIXED # define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64) # else diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a3725ad46c5a..bd247692b701 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -290,6 +290,7 @@ static void efi_stub_entry_update(void) { unsigned long addr = efi32_stub_entry; +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL #ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */ addr = efi64_stub_entry - 0x200; @@ -298,6 +299,7 @@ static void efi_stub_entry_update(void) #ifdef CONFIG_EFI_MIXED if (efi32_stub_entry != addr) die("32-bit and 64-bit EFI entry points do not match\n"); +#endif #endif put_unaligned_le32(addr, &buf[0x264]); } From 4f3077c3eae7e68e2c0ba6d1bd3f5afeb61eb269 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 9 Jan 2023 18:04:02 +0100 Subject: [PATCH 128/216] x86/boot: Robustify calling startup_{32,64}() from the decompressor code commit 7734a0f31e99c433df3063bbb7e8ee5a16a2cb82 upstream. After commit ce697ccee1a8 ("kbuild: remove head-y syntax"), I started digging whether x86 is ready for removing this old cruft. Removing its objects from the list makes the kernel unbootable. This applies only to bzImage, vmlinux still works correctly. The reason is that with no strict object order determined by the linker arguments, not the linker script, startup_64 can be placed not right at the beginning of the kernel. Here's vmlinux.map's beginning before removing: ffffffff81000000 vmlinux.o:(.head.text) ffffffff81000000 startup_64 ffffffff81000070 secondary_startup_64 ffffffff81000075 secondary_startup_64_no_verify ffffffff81000160 verify_cpu and after: ffffffff81000000 vmlinux.o:(.head.text) ffffffff81000000 pvh_start_xen ffffffff81000080 startup_64 ffffffff810000f0 secondary_startup_64 ffffffff810000f5 secondary_startup_64_no_verify Not a problem itself, but the self-extractor code has the address of that function hardcoded the beginning, not looking onto the ELF header, which always contains the address of startup_{32,64}(). So, instead of doing an "act of blind faith", just take the address from the ELF header and extract a relative offset to the entry point. The decompressor function already returns a pointer to the beginning of the kernel to the Asm code, which then jumps to it, so add that offset to the return value. This doesn't change anything for now, but allows to resign from the "head object list" for x86 and makes sure valid Kbuild or any other improvements won't break anything here in general. Signed-off-by: Alexander Lobakin Signed-off-by: Ingo Molnar Tested-by: Jiri Slaby Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20230109170403.4117105-2-alexandr.lobakin@intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_32.S | 2 +- arch/x86/boot/compressed/head_64.S | 2 +- arch/x86/boot/compressed/misc.c | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 6589ddd4cfaf..987ae727cf9f 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -187,7 +187,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) leal boot_heap@GOTOFF(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call extract_kernel /* returns kernel location in %eax */ + call extract_kernel /* returns kernel entry point in %eax */ addl $24, %esp /* diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 96c61c9883c3..3a970a388adb 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -580,7 +580,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) movl input_len(%rip), %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movl output_len(%rip), %r9d /* decompressed length, end of relocs */ - call extract_kernel /* returns kernel location in %rax */ + call extract_kernel /* returns kernel entry point in %rax */ popq %rsi /* diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index cf690d8712f4..014ff222bf4b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -277,7 +277,7 @@ static inline void handle_relocations(void *output, unsigned long output_len, { } #endif -static void parse_elf(void *output) +static size_t parse_elf(void *output) { #ifdef CONFIG_X86_64 Elf64_Ehdr ehdr; @@ -293,10 +293,8 @@ static void parse_elf(void *output) if (ehdr.e_ident[EI_MAG0] != ELFMAG0 || ehdr.e_ident[EI_MAG1] != ELFMAG1 || ehdr.e_ident[EI_MAG2] != ELFMAG2 || - ehdr.e_ident[EI_MAG3] != ELFMAG3) { + ehdr.e_ident[EI_MAG3] != ELFMAG3) error("Kernel is not a valid ELF file"); - return; - } debug_putstr("Parsing ELF... "); @@ -328,6 +326,8 @@ static void parse_elf(void *output) } free(phdrs); + + return ehdr.e_entry - LOAD_PHYSICAL_ADDR; } /* @@ -356,6 +356,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; unsigned long needed_size; + size_t entry_offset; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -456,14 +457,17 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putstr("\nDecompressing Linux... "); __decompress(input_data, input_len, NULL, NULL, output, output_len, NULL, error); - parse_elf(output); + entry_offset = parse_elf(output); handle_relocations(output, output_len, virt_addr); - debug_putstr("done.\nBooting the kernel.\n"); + + debug_putstr("done.\nBooting the kernel (entry_offset: 0x"); + debug_puthex(entry_offset); + debug_putstr(").\n"); /* Disable exception handling before booting the kernel */ cleanup_exception_handling(); - return output; + return output + entry_offset; } void fortify_panic(const char *name) From 51a0710218cea5c7d5528b92ba19e964423c7f5a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:00 +0200 Subject: [PATCH 129/216] x86/efistub: Branch straight to kernel entry point from C code commit d2d7a54f69b67cd0a30e0ebb5307cb2de625baac upstream. Instead of returning to the calling code in assembler that does nothing more than perform an indirect call with the boot_params pointer in register ESI/RSI, perform the jump directly from the EFI stub C code. This will allow the asm entrypoint code to be dropped entirely in subsequent patches. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-4-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/x86-stub.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 9ae0d6d0c285..9422fddfbc8f 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -279,7 +279,7 @@ adjust_memory_range_protection(unsigned long start, unsigned long size) #define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024) #define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024) -void startup_32(struct boot_params *boot_params); +extern const u8 startup_32[], startup_64[]; static void setup_memory_protection(unsigned long image_base, unsigned long image_size) @@ -760,10 +760,19 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) return EFI_SUCCESS; } +static void __noreturn enter_kernel(unsigned long kernel_addr, + struct boot_params *boot_params) +{ + /* enter decompressed kernel with boot_params pointer in RSI/ESI */ + asm("jmp *%0"::"r"(kernel_addr), "S"(boot_params)); + + unreachable(); +} + /* - * On success, we return the address of startup_32, which has potentially been - * relocated by efi_relocate_kernel. - * On failure, we exit to the firmware via efi_exit instead of returning. + * On success, this routine will jump to the relocated image directly and never + * return. On failure, it will exit to the firmware via efi_exit() instead of + * returning. */ asmlinkage unsigned long efi_main(efi_handle_t handle, efi_system_table_t *sys_table_arg, @@ -905,7 +914,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle, goto fail; } - return bzimage_addr; + if (IS_ENABLED(CONFIG_X86_64)) + bzimage_addr += startup_64 - startup_32; + + enter_kernel(bzimage_addr, boot_params); fail: efi_err("efi_main() failed!\n"); From 2cca5f519e3a967f4b5b72e69758521401f021eb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:04 +0200 Subject: [PATCH 130/216] x86/decompressor: Store boot_params pointer in callee save register commit 8b63cba746f86a754d66e302c43209cc9b9b6e39 upstream. Instead of pushing and popping %RSI several times to preserve the struct boot_params pointer across the execution of the startup code, move it into a callee save register before the first call into C, and copy it back when needed. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-8-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 42 ++++++++++++------------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 3a970a388adb..bb268d165906 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -408,10 +408,14 @@ SYM_CODE_START(startup_64) lretq .Lon_kernel_cs: + /* + * RSI holds a pointer to a boot_params structure provided by the + * loader, and this needs to be preserved across C function calls. So + * move it into a callee saved register. + */ + movq %rsi, %r15 - pushq %rsi call load_stage1_idt - popq %rsi #ifdef CONFIG_AMD_MEM_ENCRYPT /* @@ -422,12 +426,10 @@ SYM_CODE_START(startup_64) * CPUID instructions being issued, so go ahead and do that now via * sev_enable(), which will also handle the rest of the SEV-related * detection/setup to ensure that has been done in advance of any dependent - * code. + * code. Pass the boot_params pointer as the first argument. */ - pushq %rsi - movq %rsi, %rdi /* real mode address */ + movq %r15, %rdi call sev_enable - popq %rsi #endif /* @@ -440,13 +442,10 @@ SYM_CODE_START(startup_64) * - Non zero RDX means trampoline needs to enable 5-level * paging. * - * RSI holds real mode data and needs to be preserved across - * this function call. + * Pass the boot_params pointer as the first argument. */ - pushq %rsi - movq %rsi, %rdi /* real mode address */ + movq %r15, %rdi call paging_prepare - popq %rsi /* Save the trampoline address in RCX */ movq %rax, %rcx @@ -459,9 +458,9 @@ SYM_CODE_START(startup_64) * because the architecture does not guarantee that GPRs will retain * their full 64-bit values across a 32-bit mode switch. */ + pushq %r15 pushq %rbp pushq %rbx - pushq %rsi /* * Push the 64-bit address of trampoline_return() onto the new stack. @@ -478,9 +477,9 @@ SYM_CODE_START(startup_64) lretq trampoline_return: /* Restore live 64-bit registers */ - popq %rsi popq %rbx popq %rbp + popq %r15 /* Restore the stack, the 32-bit trampoline uses its own stack */ leaq rva(boot_stack_end)(%rbx), %rsp @@ -490,14 +489,9 @@ trampoline_return: * * RDI is address of the page table to use instead of page table * in trampoline memory (if required). - * - * RSI holds real mode data and needs to be preserved across - * this function call. */ - pushq %rsi leaq rva(top_pgtable)(%rbx), %rdi call cleanup_trampoline - popq %rsi /* Zero EFLAGS */ pushq $0 @@ -507,7 +501,6 @@ trampoline_return: * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - pushq %rsi leaq (_bss-8)(%rip), %rsi leaq rva(_bss-8)(%rbx), %rdi movl $(_bss - startup_32), %ecx @@ -515,7 +508,6 @@ trampoline_return: std rep movsq cld - popq %rsi /* * The GDT may get overwritten either during the copy we just did or @@ -562,30 +554,28 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) shrq $3, %rcx rep stosq - pushq %rsi call load_stage2_idt /* Pass boot_params to initialize_identity_maps() */ - movq (%rsp), %rdi + movq %r15, %rdi call initialize_identity_maps - popq %rsi /* * Do the extraction, and jump to the new kernel.. */ - pushq %rsi /* Save the real mode argument */ - movq %rsi, %rdi /* real mode address */ + /* pass struct boot_params pointer */ + movq %r15, %rdi leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl input_len(%rip), %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movl output_len(%rip), %r9d /* decompressed length, end of relocs */ call extract_kernel /* returns kernel entry point in %rax */ - popq %rsi /* * Jump to the decompressed kernel. */ + movq %r15, %rsi jmp *%rax SYM_FUNC_END(.Lrelocated) From 99a20f58913a4093c73817f5b364f0cf050a6d75 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:05 +0200 Subject: [PATCH 131/216] x86/decompressor: Assign paging related global variables earlier commit 00c6b0978ec182f1a672095930872168b9d5b1e2 upstream. There is no need to defer the assignment of the paging related global variables 'pgdir_shift' and 'ptrs_per_p4d' until after the trampoline is cleaned up, so assign them as soon as it is clear that 5-level paging will be enabled. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-9-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/misc.h | 2 -- arch/x86/boot/compressed/pgtable_64.c | 14 +++++--------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a49d9219c06e..b6e46435b90b 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -170,9 +170,7 @@ static inline int count_immovable_mem_regions(void) { return 0; } #endif /* ident_map_64.c */ -#ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; -#endif extern void kernel_add_identity_map(unsigned long start, unsigned long end); /* Used by PAGE_KERN* macros: */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 2ac12ff4111b..f8092d3244c9 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -130,6 +130,11 @@ struct paging_config paging_prepare(void *rmode) native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { paging_config.l5_required = 1; + + /* Initialize variables for 5-level paging */ + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; } paging_config.trampoline_start = find_trampoline_placement(); @@ -206,13 +211,4 @@ void cleanup_trampoline(void *pgtable) /* Restore trampoline memory */ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); - - /* Initialize variables for 5-level paging */ -#ifdef CONFIG_X86_5LEVEL - if (__read_cr4() & X86_CR4_LA57) { - __pgtable_l5_enabled = 1; - pgdir_shift = 48; - ptrs_per_p4d = 512; - } -#endif } From 640f27fc2e7bd69d511675c0c62a90bd9ed977cb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:06 +0200 Subject: [PATCH 132/216] x86/decompressor: Call trampoline as a normal function commit e8972a76aa90c05a0078043413f806c02fcb3487 upstream. Move the long return to switch to 32-bit mode into the trampoline code so it can be called as an ordinary function. This will allow it to be called directly from C code in a subsequent patch. While at it, reorganize the code somewhat to keep the prologue and epilogue of the function together, making the code a bit easier to follow. Also, given that the trampoline is now entered in 64-bit mode, a simple RIP-relative reference can be used to take the address of the exit point. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20230807162720.545787-10-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 79 +++++++++++++----------------- arch/x86/boot/compressed/pgtable.h | 2 +- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index bb268d165906..381ca6bf62d3 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -450,39 +450,8 @@ SYM_CODE_START(startup_64) /* Save the trampoline address in RCX */ movq %rax, %rcx - /* Set up 32-bit addressable stack */ - leaq TRAMPOLINE_32BIT_STACK_END(%rcx), %rsp - - /* - * Preserve live 64-bit registers on the stack: this is necessary - * because the architecture does not guarantee that GPRs will retain - * their full 64-bit values across a 32-bit mode switch. - */ - pushq %r15 - pushq %rbp - pushq %rbx - - /* - * Push the 64-bit address of trampoline_return() onto the new stack. - * It will be used by the trampoline to return to the main code. Due to - * the 32-bit mode switch, it cannot be kept it in a register either. - */ - leaq trampoline_return(%rip), %rdi - pushq %rdi - - /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ - pushq $__KERNEL32_CS leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax - pushq %rax - lretq -trampoline_return: - /* Restore live 64-bit registers */ - popq %rbx - popq %rbp - popq %r15 - - /* Restore the stack, the 32-bit trampoline uses its own stack */ - leaq rva(boot_stack_end)(%rbx), %rsp + call *%rax /* * cleanup_trampoline() would restore trampoline memory. @@ -579,7 +548,6 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) jmp *%rax SYM_FUNC_END(.Lrelocated) - .code32 /* * This is the 32-bit trampoline that will be copied over to low memory. * @@ -588,6 +556,39 @@ SYM_FUNC_END(.Lrelocated) * Non zero RDX means trampoline needs to enable 5-level paging. */ SYM_CODE_START(trampoline_32bit_src) + /* + * Preserve live 64-bit registers on the stack: this is necessary + * because the architecture does not guarantee that GPRs will retain + * their full 64-bit values across a 32-bit mode switch. + */ + pushq %r15 + pushq %rbp + pushq %rbx + + /* Set up 32-bit addressable stack and push the old RSP value */ + leaq (TRAMPOLINE_32BIT_STACK_END - 8)(%rcx), %rbx + movq %rsp, (%rbx) + movq %rbx, %rsp + + /* Take the address of the trampoline exit code */ + leaq .Lret(%rip), %rbx + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS + leaq 0f(%rip), %rax + pushq %rax + lretq + +.Lret: + /* Restore the preserved 64-bit registers */ + movq (%rsp), %rsp + popq %rbx + popq %rbp + popq %r15 + retq + + .code32 +0: /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds @@ -651,12 +652,9 @@ SYM_CODE_START(trampoline_32bit_src) 1: movl %eax, %cr4 - /* Calculate address of paging_enabled() once we are executing in the trampoline */ - leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS - pushl %eax + pushl %ebx /* Enable paging again. */ movl %cr0, %eax @@ -666,12 +664,6 @@ SYM_CODE_START(trampoline_32bit_src) lret SYM_CODE_END(trampoline_32bit_src) - .code64 -SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled) - /* Return from the trampoline */ - retq -SYM_FUNC_END(.Lpaging_enabled) - /* * The trampoline code has a size limit. * Make sure we fail to compile if the trampoline code grows @@ -679,7 +671,6 @@ SYM_FUNC_END(.Lpaging_enabled) */ .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE - .code32 SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index cc9b2529a086..91dbb99203fb 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -6,7 +6,7 @@ #define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 +#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 #define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE From 6083b4c5908e0e6d1b578af04103f64c257ffb82 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:07 +0200 Subject: [PATCH 133/216] x86/decompressor: Use standard calling convention for trampoline commit 918a7a04e71745e99a0efc6753e587439b794b29 upstream. Update the trampoline code so its arguments are passed via RDI and RSI, which matches the ordinary SysV calling convention for x86_64. This will allow this code to be called directly from C. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20230807162720.545787-11-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 27 +++++++++++++-------------- arch/x86/boot/compressed/pgtable.h | 2 +- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 381ca6bf62d3..c28f7ef80ad2 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -447,9 +447,9 @@ SYM_CODE_START(startup_64) movq %r15, %rdi call paging_prepare - /* Save the trampoline address in RCX */ - movq %rax, %rcx - + /* Pass the trampoline address and boolean flag as args #1 and #2 */ + movq %rax, %rdi + movq %rdx, %rsi leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax call *%rax @@ -549,11 +549,14 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) SYM_FUNC_END(.Lrelocated) /* - * This is the 32-bit trampoline that will be copied over to low memory. + * This is the 32-bit trampoline that will be copied over to low memory. It + * will be called using the ordinary 64-bit calling convention from code + * running in 64-bit mode. * * Return address is at the top of the stack (might be above 4G). - * ECX contains the base address of the trampoline memory. - * Non zero RDX means trampoline needs to enable 5-level paging. + * The first argument (EDI) contains the 32-bit addressable base of the + * trampoline memory. A non-zero second argument (ESI) means that the + * trampoline needs to enable 5-level paging. */ SYM_CODE_START(trampoline_32bit_src) /* @@ -600,7 +603,7 @@ SYM_CODE_START(trampoline_32bit_src) movl %eax, %cr0 /* Check what paging mode we want to be in after the trampoline */ - testl %edx, %edx + testl %esi, %esi jz 1f /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ @@ -615,21 +618,17 @@ SYM_CODE_START(trampoline_32bit_src) jz 3f 2: /* Point CR3 to the trampoline's new top level page table */ - leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax + leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%edi), %eax movl %eax, %cr3 3: /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ - pushl %ecx - pushl %edx movl $MSR_EFER, %ecx rdmsr btsl $_EFER_LME, %eax /* Avoid writing EFER if no change was made (for TDX guest) */ jc 1f wrmsr -1: popl %edx - popl %ecx - +1: #ifdef CONFIG_X86_MCE /* * Preserve CR4.MCE if the kernel will enable #MC support. @@ -646,7 +645,7 @@ SYM_CODE_START(trampoline_32bit_src) /* Enable PAE and LA57 (if required) paging modes */ orl $X86_CR4_PAE, %eax - testl %edx, %edx + testl %esi, %esi jz 1f orl $X86_CR4_LA57, %eax 1: diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index 91dbb99203fb..4e8cef135226 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -14,7 +14,7 @@ extern unsigned long *trampoline_32bit; -extern void trampoline_32bit_src(void *return_ptr); +extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl); #endif /* __ASSEMBLER__ */ #endif /* BOOT_COMPRESSED_PAGETABLE_H */ From 1523291591de054393ff4d732f18abd222ff5949 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:08 +0200 Subject: [PATCH 134/216] x86/decompressor: Avoid the need for a stack in the 32-bit trampoline commit bd328aa01ff77a45aeffea5fc4521854291db11f upstream. The 32-bit trampoline no longer uses the stack for anything except performing a far return back to long mode, and preserving the caller's stack pointer value. Currently, the trampoline stack is placed in the same page that carries the trampoline code, which means this page must be mapped writable and executable, and the stack is therefore executable as well. Replace the far return with a far jump, so that the return address can be pre-calculated and patched into the code before it is called. This removes the need for a 32-bit addressable stack entirely, and in a later patch, this will be taken advantage of by removing writable permissions from (and adding executable permissions to) the trampoline code page when booting via the EFI stub. Note that the value of RSP still needs to be preserved explicitly across the switch into 32-bit mode, as the register may get truncated to 32 bits. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20230807162720.545787-12-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 45 ++++++++++++++++----------- arch/x86/boot/compressed/pgtable.h | 4 +-- arch/x86/boot/compressed/pgtable_64.c | 12 ++++++- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index c28f7ef80ad2..40848b817ac5 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -558,6 +558,7 @@ SYM_FUNC_END(.Lrelocated) * trampoline memory. A non-zero second argument (ESI) means that the * trampoline needs to enable 5-level paging. */ + .section ".rodata", "a", @progbits SYM_CODE_START(trampoline_32bit_src) /* * Preserve live 64-bit registers on the stack: this is necessary @@ -568,13 +569,9 @@ SYM_CODE_START(trampoline_32bit_src) pushq %rbp pushq %rbx - /* Set up 32-bit addressable stack and push the old RSP value */ - leaq (TRAMPOLINE_32BIT_STACK_END - 8)(%rcx), %rbx - movq %rsp, (%rbx) - movq %rbx, %rsp - - /* Take the address of the trampoline exit code */ - leaq .Lret(%rip), %rbx + /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */ + movq %rsp, %rbx + shrq $32, %rbx /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS @@ -582,9 +579,17 @@ SYM_CODE_START(trampoline_32bit_src) pushq %rax lretq + /* + * The 32-bit code below will do a far jump back to long mode and end + * up here after reconfiguring the number of paging levels. First, the + * stack pointer needs to be restored to its full 64-bit value before + * the callee save register contents can be popped from the stack. + */ .Lret: + shlq $32, %rbx + orq %rbx, %rsp + /* Restore the preserved 64-bit registers */ - movq (%rsp), %rsp popq %rbx popq %rbp popq %r15 @@ -592,11 +597,6 @@ SYM_CODE_START(trampoline_32bit_src) .code32 0: - /* Set up data and stack segments */ - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %ss - /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax @@ -651,18 +651,26 @@ SYM_CODE_START(trampoline_32bit_src) 1: movl %eax, %cr4 - /* Prepare the stack for far return to Long Mode */ - pushl $__KERNEL_CS - pushl %ebx - /* Enable paging again. */ movl %cr0, %eax btsl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - lret + /* + * Return to the 64-bit calling code using LJMP rather than LRET, to + * avoid the need for a 32-bit addressable stack. The destination + * address will be adjusted after the template code is copied into a + * 32-bit addressable buffer. + */ +.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src) SYM_CODE_END(trampoline_32bit_src) +/* + * This symbol is placed right after trampoline_32bit_src() so its address can + * be used to infer the size of the trampoline code. + */ +SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src) + /* * The trampoline code has a size limit. * Make sure we fail to compile if the trampoline code grows @@ -670,6 +678,7 @@ SYM_CODE_END(trampoline_32bit_src) */ .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE + .text SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index 4e8cef135226..c6b0903aded0 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -8,13 +8,13 @@ #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE #define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 -#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE - #ifndef __ASSEMBLER__ extern unsigned long *trampoline_32bit; extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl); +extern const u16 trampoline_ljmp_imm_offset; + #endif /* __ASSEMBLER__ */ #endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index f8092d3244c9..5198a05aefa8 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -109,6 +109,7 @@ static unsigned long find_trampoline_placement(void) struct paging_config paging_prepare(void *rmode) { struct paging_config paging_config = {}; + void *tramp_code; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ boot_params = rmode; @@ -148,9 +149,18 @@ struct paging_config paging_prepare(void *rmode) memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); /* Copy trampoline code in place */ - memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + tramp_code = memcpy(trampoline_32bit + + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); + /* + * Avoid the need for a stack in the 32-bit trampoline code, by using + * LJMP rather than LRET to return back to long mode. LJMP takes an + * immediate absolute address, which needs to be adjusted based on the + * placement of the trampoline. + */ + *(u32 *)(tramp_code + trampoline_ljmp_imm_offset) += (unsigned long)tramp_code; + /* * The code below prepares page table in trampoline memory. * From 364d7745974f20ed940918e3129d10c271638153 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:09 +0200 Subject: [PATCH 135/216] x86/decompressor: Call trampoline directly from C code commit 64ef578b6b6866bec012544416946533444036c8 upstream. Instead of returning to the asm calling code to invoke the trampoline, call it straight from the C code that sets it up. That way, the struct return type is no longer needed for returning two values, and the call can be made conditional more cleanly in a subsequent patch. This means that all callee save 64-bit registers need to be preserved and restored, as their contents may not survive the legacy mode switch. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20230807162720.545787-13-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 31 +++++++++++--------------- arch/x86/boot/compressed/pgtable_64.c | 32 +++++++++++---------------- 2 files changed, 26 insertions(+), 37 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 40848b817ac5..7d0cf029c824 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -433,25 +433,14 @@ SYM_CODE_START(startup_64) #endif /* - * paging_prepare() sets up the trampoline and checks if we need to - * enable 5-level paging. - * - * paging_prepare() returns a two-quadword structure which lands - * into RDX:RAX: - * - Address of the trampoline is returned in RAX. - * - Non zero RDX means trampoline needs to enable 5-level - * paging. + * configure_5level_paging() updates the number of paging levels using + * a trampoline in 32-bit addressable memory if the current number does + * not match the desired number. * * Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call paging_prepare - - /* Pass the trampoline address and boolean flag as args #1 and #2 */ - movq %rax, %rdi - movq %rdx, %rsi - leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax - call *%rax + call configure_5level_paging /* * cleanup_trampoline() would restore trampoline memory. @@ -561,11 +550,14 @@ SYM_FUNC_END(.Lrelocated) .section ".rodata", "a", @progbits SYM_CODE_START(trampoline_32bit_src) /* - * Preserve live 64-bit registers on the stack: this is necessary - * because the architecture does not guarantee that GPRs will retain - * their full 64-bit values across a 32-bit mode switch. + * Preserve callee save 64-bit registers on the stack: this is + * necessary because the architecture does not guarantee that GPRs will + * retain their full 64-bit values across a 32-bit mode switch. */ pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 pushq %rbp pushq %rbx @@ -592,6 +584,9 @@ SYM_CODE_START(trampoline_32bit_src) /* Restore the preserved 64-bit registers */ popq %rbx popq %rbp + popq %r12 + popq %r13 + popq %r14 popq %r15 retq diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 5198a05aefa8..f9cc86b2ee55 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -16,11 +16,6 @@ unsigned int __section(".data") pgdir_shift = 39; unsigned int __section(".data") ptrs_per_p4d = 1; #endif -struct paging_config { - unsigned long trampoline_start; - unsigned long l5_required; -}; - /* Buffer to preserve trampoline memory */ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; @@ -29,7 +24,7 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; * purposes. * * Avoid putting the pointer into .bss as it will be cleared between - * paging_prepare() and extract_kernel(). + * configure_5level_paging() and extract_kernel(). */ unsigned long *trampoline_32bit __section(".data"); @@ -106,13 +101,13 @@ static unsigned long find_trampoline_placement(void) return bios_start - TRAMPOLINE_32BIT_SIZE; } -struct paging_config paging_prepare(void *rmode) +asmlinkage void configure_5level_paging(struct boot_params *bp) { - struct paging_config paging_config = {}; - void *tramp_code; + void (*toggle_la57)(void *trampoline, bool enable_5lvl); + bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ - boot_params = rmode; + boot_params = bp; /* * Check if LA57 is desired and supported. @@ -130,7 +125,7 @@ struct paging_config paging_prepare(void *rmode) !cmdline_find_option_bool("no5lvl") && native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { - paging_config.l5_required = 1; + l5_required = true; /* Initialize variables for 5-level paging */ __pgtable_l5_enabled = 1; @@ -138,9 +133,7 @@ struct paging_config paging_prepare(void *rmode) ptrs_per_p4d = 512; } - paging_config.trampoline_start = find_trampoline_placement(); - - trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + trampoline_32bit = (unsigned long *)find_trampoline_placement(); /* Preserve trampoline memory */ memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); @@ -149,7 +142,7 @@ struct paging_config paging_prepare(void *rmode) memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); /* Copy trampoline code in place */ - tramp_code = memcpy(trampoline_32bit + + toggle_la57 = memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); @@ -159,7 +152,8 @@ struct paging_config paging_prepare(void *rmode) * immediate absolute address, which needs to be adjusted based on the * placement of the trampoline. */ - *(u32 *)(tramp_code + trampoline_ljmp_imm_offset) += (unsigned long)tramp_code; + *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) += + (unsigned long)toggle_la57; /* * The code below prepares page table in trampoline memory. @@ -175,10 +169,10 @@ struct paging_config paging_prepare(void *rmode) * We are not going to use the page table in trampoline memory if we * are already in the desired paging mode. */ - if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + if (l5_required == !!(native_read_cr4() & X86_CR4_LA57)) goto out; - if (paging_config.l5_required) { + if (l5_required) { /* * For 4- to 5-level paging transition, set up current CR3 as * the first and the only entry in a new top-level page table. @@ -201,7 +195,7 @@ struct paging_config paging_prepare(void *rmode) } out: - return paging_config; + toggle_la57(trampoline_32bit, l5_required); } void cleanup_trampoline(void *pgtable) From e2fa53a04cc722aaaedbd91cd414d170067fb09d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:10 +0200 Subject: [PATCH 136/216] x86/decompressor: Only call the trampoline when changing paging levels commit f97b67a773cd84bd8b55c0a0ec32448a87fc56bb upstream. Since the current and desired number of paging levels are known when the trampoline is being prepared, avoid calling the trampoline at all if it is clear that calling it is not going to result in a change to the number of paging levels. Given that the CPU is already running in long mode, the PAE and LA57 settings are necessarily consistent with the currently active page tables, and other fields in CR4 will be initialized by the startup code in the kernel proper. So limit the manipulation of CR4 to toggling the LA57 bit, which is the only thing that really needs doing at this point in the boot. This also means that there is no need to pass the value of l5_required to toggle_la57(), as it will not be called unless CR4.LA57 needs to toggle. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20230807162720.545787-14-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 45 +++------------------------ arch/x86/boot/compressed/pgtable_64.c | 22 ++++++------- 2 files changed, 13 insertions(+), 54 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 7d0cf029c824..4645d7b66b1f 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -390,10 +390,6 @@ SYM_CODE_START(startup_64) * For the trampoline, we need the top page table to reside in lower * memory as we don't have a way to load 64-bit values into CR3 in * 32-bit mode. - * - * We go though the trampoline even if we don't have to: if we're - * already in a desired paging mode. This way the trampoline code gets - * tested on every boot. */ /* Make sure we have GDT with 32-bit code segment */ @@ -544,8 +540,7 @@ SYM_FUNC_END(.Lrelocated) * * Return address is at the top of the stack (might be above 4G). * The first argument (EDI) contains the 32-bit addressable base of the - * trampoline memory. A non-zero second argument (ESI) means that the - * trampoline needs to enable 5-level paging. + * trampoline memory. */ .section ".rodata", "a", @progbits SYM_CODE_START(trampoline_32bit_src) @@ -597,25 +592,10 @@ SYM_CODE_START(trampoline_32bit_src) btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - /* Check what paging mode we want to be in after the trampoline */ - testl %esi, %esi - jz 1f - - /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ - movl %cr4, %eax - testl $X86_CR4_LA57, %eax - jnz 3f - jmp 2f -1: - /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ - movl %cr4, %eax - testl $X86_CR4_LA57, %eax - jz 3f -2: /* Point CR3 to the trampoline's new top level page table */ leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%edi), %eax movl %eax, %cr3 -3: + /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ movl $MSR_EFER, %ecx rdmsr @@ -624,26 +604,9 @@ SYM_CODE_START(trampoline_32bit_src) jc 1f wrmsr 1: -#ifdef CONFIG_X86_MCE - /* - * Preserve CR4.MCE if the kernel will enable #MC support. - * Clearing MCE may fault in some environments (that also force #MC - * support). Any machine check that occurs before #MC support is fully - * configured will crash the system regardless of the CR4.MCE value set - * here. - */ + /* Toggle CR4.LA57 */ movl %cr4, %eax - andl $X86_CR4_MCE, %eax -#else - movl $0, %eax -#endif - - /* Enable PAE and LA57 (if required) paging modes */ - orl $X86_CR4_PAE, %eax - testl %esi, %esi - jz 1f - orl $X86_CR4_LA57, %eax -1: + btcl $X86_CR4_LA57_BIT, %eax movl %eax, %cr4 /* Enable paging again. */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index f9cc86b2ee55..4213473ae548 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -103,7 +103,7 @@ static unsigned long find_trampoline_placement(void) asmlinkage void configure_5level_paging(struct boot_params *bp) { - void (*toggle_la57)(void *trampoline, bool enable_5lvl); + void (*toggle_la57)(void *trampoline); bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ @@ -133,6 +133,13 @@ asmlinkage void configure_5level_paging(struct boot_params *bp) ptrs_per_p4d = 512; } + /* + * The trampoline will not be used if the paging mode is already set to + * the desired one. + */ + if (l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + return; + trampoline_32bit = (unsigned long *)find_trampoline_placement(); /* Preserve trampoline memory */ @@ -160,18 +167,8 @@ asmlinkage void configure_5level_paging(struct boot_params *bp) * * The new page table will be used by trampoline code for switching * from 4- to 5-level paging or vice versa. - * - * If switching is not required, the page table is unused: trampoline - * code wouldn't touch CR3. */ - /* - * We are not going to use the page table in trampoline memory if we - * are already in the desired paging mode. - */ - if (l5_required == !!(native_read_cr4() & X86_CR4_LA57)) - goto out; - if (l5_required) { /* * For 4- to 5-level paging transition, set up current CR3 as @@ -194,8 +191,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp) (void *)src, PAGE_SIZE); } -out: - toggle_la57(trampoline_32bit, l5_required); + toggle_la57(trampoline_32bit); } void cleanup_trampoline(void *pgtable) From df3dec320b7c14780484e824f3ec9c213e4996e1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:11 +0200 Subject: [PATCH 137/216] x86/decompressor: Pass pgtable address to trampoline directly commit cb83cece57e1889109dd73ea08ee338668c9d1b8 upstream. The only remaining use of the trampoline address by the trampoline itself is deriving the page table address from it, and this involves adding an offset of 0x0. So simplify this, and pass the new CR3 value directly. This makes the fact that the page table happens to be at the start of the trampoline allocation an implementation detail of the caller. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-15-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 8 ++++---- arch/x86/boot/compressed/pgtable.h | 2 -- arch/x86/boot/compressed/pgtable_64.c | 9 ++++----- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 4645d7b66b1f..148b349c193e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -539,8 +539,9 @@ SYM_FUNC_END(.Lrelocated) * running in 64-bit mode. * * Return address is at the top of the stack (might be above 4G). - * The first argument (EDI) contains the 32-bit addressable base of the - * trampoline memory. + * The first argument (EDI) contains the address of the temporary PGD level + * page table in 32-bit addressable memory which will be programmed into + * register CR3. */ .section ".rodata", "a", @progbits SYM_CODE_START(trampoline_32bit_src) @@ -593,8 +594,7 @@ SYM_CODE_START(trampoline_32bit_src) movl %eax, %cr0 /* Point CR3 to the trampoline's new top level page table */ - leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%edi), %eax - movl %eax, %cr3 + movl %edi, %cr3 /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ movl $MSR_EFER, %ecx diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index c6b0903aded0..6d595abe06b3 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -3,8 +3,6 @@ #define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) -#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 - #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE #define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 4213473ae548..eab4e6b568ae 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -103,7 +103,7 @@ static unsigned long find_trampoline_placement(void) asmlinkage void configure_5level_paging(struct boot_params *bp) { - void (*toggle_la57)(void *trampoline); + void (*toggle_la57)(void *cr3); bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ @@ -174,7 +174,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp) * For 4- to 5-level paging transition, set up current CR3 as * the first and the only entry in a new top-level page table. */ - trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; + *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC; } else { unsigned long src; @@ -187,8 +187,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp) * may be above 4G. */ src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; - memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), - (void *)src, PAGE_SIZE); + memcpy(trampoline_32bit, (void *)src, PAGE_SIZE); } toggle_la57(trampoline_32bit); @@ -198,7 +197,7 @@ void cleanup_trampoline(void *pgtable) { void *trampoline_pgtable; - trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long); + trampoline_pgtable = trampoline_32bit; /* * Move the top level page table out of trampoline memory, From 463b51e90c576cd63269f8420c0a0b09152092e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:12 +0200 Subject: [PATCH 138/216] x86/decompressor: Merge trampoline cleanup with switching code commit 03dda95137d3247564854ad9032c0354273a159d upstream. Now that the trampoline setup code and the actual invocation of it are all done from the C routine, the trampoline cleanup can be merged into it as well, instead of returning to asm just to call another C function. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20230807162720.545787-16-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_64.S | 14 ++++---------- arch/x86/boot/compressed/pgtable_64.c | 18 ++++-------------- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 148b349c193e..81458f77131b 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -433,20 +433,14 @@ SYM_CODE_START(startup_64) * a trampoline in 32-bit addressable memory if the current number does * not match the desired number. * - * Pass the boot_params pointer as the first argument. + * Pass the boot_params pointer as the first argument. The second + * argument is the relocated address of the page table to use instead + * of the page table in trampoline memory (if required). */ movq %r15, %rdi + leaq rva(top_pgtable)(%rbx), %rsi call configure_5level_paging - /* - * cleanup_trampoline() would restore trampoline memory. - * - * RDI is address of the page table to use instead of page table - * in trampoline memory (if required). - */ - leaq rva(top_pgtable)(%rbx), %rdi - call cleanup_trampoline - /* Zero EFLAGS */ pushq $0 popfq diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index eab4e6b568ae..7939eb6e6ce9 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -101,7 +101,7 @@ static unsigned long find_trampoline_placement(void) return bios_start - TRAMPOLINE_32BIT_SIZE; } -asmlinkage void configure_5level_paging(struct boot_params *bp) +asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) { void (*toggle_la57)(void *cr3); bool l5_required = false; @@ -191,22 +191,12 @@ asmlinkage void configure_5level_paging(struct boot_params *bp) } toggle_la57(trampoline_32bit); -} - -void cleanup_trampoline(void *pgtable) -{ - void *trampoline_pgtable; - - trampoline_pgtable = trampoline_32bit; /* - * Move the top level page table out of trampoline memory, - * if it's there. + * Move the top level page table out of trampoline memory. */ - if ((void *)__native_read_cr3() == trampoline_pgtable) { - memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); - native_write_cr3((unsigned long)pgtable); - } + memcpy(pgtable, trampoline_32bit, PAGE_SIZE); + native_write_cr3((unsigned long)pgtable); /* Restore trampoline memory */ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); From 5c4feadb0011983bbc4587bc61056c7b379d9969 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:16 +0200 Subject: [PATCH 139/216] x86/decompressor: Move global symbol references to C code commit 24388292e2d7fae79a0d4183cc91716b851299cf upstream. It is no longer necessary to be cautious when referring to global variables in the position independent decompressor code, now that it is built using PIE codegen and makes an assertion in the linker script that no GOT entries exist (which would require adjustment for the actual runtime load address of the decompressor binary). This means global variables can be referenced directly from C code, instead of having to pass their runtime addresses into C routines from asm code, which needs to happen at each call site. Do so for the code that will be called directly from the EFI stub after a subsequent patch, and avoid the need to duplicate this logic a third time. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-20-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/head_32.S | 8 -------- arch/x86/boot/compressed/head_64.S | 10 ++-------- arch/x86/boot/compressed/misc.c | 16 +++++++++------- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 987ae727cf9f..3ecc1bbe971e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -179,13 +179,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) */ /* push arguments for extract_kernel: */ - pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */ pushl %ebp /* output address */ - pushl input_len@GOTOFF(%ebx) /* input_len */ - leal input_data@GOTOFF(%ebx), %eax - pushl %eax /* input_data */ - leal boot_heap@GOTOFF(%ebx), %eax - pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call extract_kernel /* returns kernel entry point in %eax */ addl $24, %esp @@ -213,8 +207,6 @@ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) */ .bss .balign 4 -boot_heap: - .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 81458f77131b..fafd0a59f396 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -511,13 +511,9 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) /* * Do the extraction, and jump to the new kernel.. */ - /* pass struct boot_params pointer */ + /* pass struct boot_params pointer and output target address */ movq %r15, %rdi - leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ - leaq input_data(%rip), %rdx /* input_data */ - movl input_len(%rip), %ecx /* input_len */ - movq %rbp, %r8 /* output target address */ - movl output_len(%rip), %r9d /* decompressed length, end of relocs */ + movq %rbp, %rsi call extract_kernel /* returns kernel entry point in %rax */ /* @@ -675,8 +671,6 @@ SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) */ .bss .balign 4 -SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0) - SYM_DATA_START_LOCAL(boot_stack) .fill BOOT_STACK_SIZE, 1, 0 .balign 16 diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 014ff222bf4b..e4e3e49fcc37 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -330,6 +330,11 @@ static size_t parse_elf(void *output) return ehdr.e_entry - LOAD_PHYSICAL_ADDR; } +static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); + +extern unsigned char input_data[]; +extern unsigned int input_len, output_len; + /* * The compressed kernel image (ZO), has been moved so that its position * is against the end of the buffer used to hold the uncompressed kernel @@ -347,14 +352,11 @@ static size_t parse_elf(void *output) * |-------uncompressed kernel image---------| * */ -asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, - unsigned char *input_data, - unsigned long input_len, - unsigned char *output, - unsigned long output_len) +asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) { const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + memptr heap = (memptr)boot_heap; unsigned long needed_size; size_t entry_offset; @@ -412,7 +414,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * entries. This ensures the full mapped area is usable RAM * and doesn't include any reserved areas. */ - needed_size = max(output_len, kernel_total_size); + needed_size = max_t(unsigned long, output_len, kernel_total_size); #ifdef CONFIG_X86_64 needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); #endif @@ -443,7 +445,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #ifdef CONFIG_X86_64 if (heap > 0x3fffffffffffUL) error("Destination address too large"); - if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE) + if (virt_addr + needed_size > KERNEL_IMAGE_SIZE) error("Destination virtual address is beyond the kernel mapping area"); #else if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) From bf0ca988e250af95824c121873b2f76fccfc91df Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:27:15 +0200 Subject: [PATCH 140/216] decompress: Use 8 byte alignment commit 8217ad0a435ff06d651d7298ea8ae8d72388179e upstream. The ZSTD decompressor requires malloc() allocations to be 8 byte aligned, so ensure that this the case. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-19-ardb@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/decompress/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h index 9192986b1a73..ac862422df15 100644 --- a/include/linux/decompress/mm.h +++ b/include/linux/decompress/mm.h @@ -48,7 +48,7 @@ MALLOC_VISIBLE void *malloc(int size) if (!malloc_ptr) malloc_ptr = free_mem_ptr; - malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */ + malloc_ptr = (malloc_ptr + 7) & ~7; /* Align */ p = (void *)malloc_ptr; malloc_ptr += size; From 04dd4403ff3721ad0bff925116fada773ed6ae69 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 30 Nov 2023 17:34:07 -0500 Subject: [PATCH 141/216] drm/amd/display: Increase frame warning limit with KASAN or KCSAN in dml commit 5b750b22530fe53bf7fd6a30baacd53ada26911b upstream. Does the same thing as: commit 6740ec97bcdb ("drm/amd/display: Increase frame warning limit with KASAN or KCSAN in dml2") Reviewed-by: Harry Wentland Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311302107.hUDXVyWT-lkp@intel.com/ Fixes: 67e38874b85b ("drm/amd/display: Increase num voltage states to 40") Signed-off-by: Alex Deucher Cc: Alvin Lee Cc: Hamza Mahfooz Cc: Samson Tam Cc: Harry Wentland Cc: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/dc/dml/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 6fdf87a6e240..6c7b286e1123 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -51,8 +51,12 @@ endif endif ifneq ($(CONFIG_FRAME_WARN),0) +ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) +frame_warn_flag := -Wframe-larger-than=3072 +else frame_warn_flag := -Wframe-larger-than=2048 endif +endif CFLAGS_$(AMDDALPATH)/dc/dml/display_mode_lib.o := $(dml_ccflags) From 831e9e63cc3b90f62d82df854cda8232408526a9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 28 Feb 2024 10:25:49 +1100 Subject: [PATCH 142/216] NFS: Fix data corruption caused by congestion. when AOP_WRITEPAGE_ACTIVATE is returned (as NFS does when it detects congestion) it is important that the page is redirtied. nfs_writepage_locked() doesn't do this, so files can become corrupted as writes can be lost. Note that this is not needed in v6.8 as AOP_WRITEPAGE_ACTIVATE cannot be returned. It is needed for kernels v5.18..v6.7. From 6.3 onward the patch is different as it needs to mention "folio", not "page". Reported-and-tested-by: Jacek Tomaka Fixes: 6df25e58532b ("nfs: remove reliance on bdi congestion") Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman --- fs/nfs/write.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f41d24b54fd1..6a0606668417 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -667,8 +667,10 @@ static int nfs_writepage_locked(struct page *page, int err; if (wbc->sync_mode == WB_SYNC_NONE && - NFS_SERVER(inode)->write_congested) + NFS_SERVER(inode)->write_congested) { + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; + } nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, From d03a9855cbe6f41b2928c4df2e33e05f32a8e7fa Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 13 Sep 2022 14:01:51 -0400 Subject: [PATCH 143/216] NFSD: Simplify READ_PLUS [ Upstream commit eeadcb75794516839078c28b3730132aeb700ce6 ] Chuck had suggested reverting READ_PLUS so it returns a single DATA segment covering the requested read range. This prepares the server for a future "sparse read" function so support can easily be added without needing to rip out the old READ_PLUS code at the same time. Signed-off-by: Anna Schumaker Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------------- 1 file changed, 32 insertions(+), 107 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 89a579be042e..51a598ee68fe 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4777,79 +4777,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof, - loff_t *pos) + struct nfsd4_read *read) { - struct xdr_stream *xdr = resp->xdr; + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); struct file *file = read->rd_nf->nf_file; - int starting_len = xdr->buf->len; - loff_t hole_pos; - __be32 nfserr; - __be32 *p, tmp; - __be64 tmp64; - - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); - if (hole_pos > read->rd_offset) - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + struct xdr_stream *xdr = resp->xdr; + unsigned long maxcount; + __be32 nfserr, *p; /* Content type, offset, byte count */ p = xdr_reserve_space(xdr, 4 + 8 + 4); if (!p) - return nfserr_resource; + return nfserr_io; + if (resp->xdr->buf->page_len && splice_ok) { + WARN_ON_ONCE(splice_ok); + return nfserr_serverfault; + } - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); - if (read->rd_vlen < 0) - return nfserr_resource; + maxcount = min_t(unsigned long, read->rd_length, + (xdr->buf->buflen - xdr->buf->len)); - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (file->f_op->splice_read && splice_ok) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); - tmp = htonl(NFS4_CONTENT_DATA); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp64 = cpu_to_be64(read->rd_offset); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); - tmp = htonl(*maxcount); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); - - tmp = xdr_zero; - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, - xdr_pad_size(*maxcount)); - return nfs_ok; -} - -static __be32 -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof) -{ - struct file *file = read->rd_nf->nf_file; - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); - loff_t f_size = i_size_read(file_inode(file)); - unsigned long count; - __be32 *p; - - if (data_pos == -ENXIO) - data_pos = f_size; - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); - count = data_pos - read->rd_offset; - - /* Content type, offset, byte count */ - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); - if (!p) - return nfserr_resource; - - *p++ = htonl(NFS4_CONTENT_HOLE); + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + *p = cpu_to_be32(read->rd_length); - *eof = (read->rd_offset + count) >= f_size; - *maxcount = min_t(unsigned long, count, *maxcount); return nfs_ok; } @@ -4857,69 +4815,36 @@ static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { - unsigned long maxcount, count; + struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; - struct file *file; int starting_len = xdr->buf->len; - int last_segment = xdr->buf->len; - int segments = 0; - __be32 *p, tmp; - bool is_data; - loff_t pos; - u32 eof; + u32 segments = 0; + __be32 *p; if (nfserr) return nfserr; - file = read->rd_nf->nf_file; /* eof flag, segment count */ p = xdr_reserve_space(xdr, 4 + 4); if (!p) - return nfserr_resource; + return nfserr_io; xdr_commit_encode(xdr); - maxcount = min_t(unsigned long, read->rd_length, - (xdr->buf->buflen - xdr->buf->len)); - count = maxcount; - - eof = read->rd_offset >= i_size_read(file_inode(file)); - if (eof) + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); + if (read->rd_eof) goto out; - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); - is_data = pos > read->rd_offset; - - while (count > 0 && !eof) { - maxcount = count; - if (is_data) - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, - segments == 0 ? &pos : NULL); - else - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); - if (nfserr) - goto out; - count -= maxcount; - read->rd_offset += maxcount; - is_data = !is_data; - last_segment = xdr->buf->len; - segments++; + nfserr = nfsd4_encode_read_plus_data(resp, read); + if (nfserr) { + xdr_truncate_encode(xdr, starting_len); + return nfserr; } + segments++; + out: - if (nfserr && segments == 0) - xdr_truncate_encode(xdr, starting_len); - else { - if (nfserr) { - xdr_truncate_encode(xdr, last_segment); - nfserr = nfs_ok; - eof = 0; - } - tmp = htonl(eof); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp = htonl(segments); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); - } - + p = xdr_encode_bool(p, read->rd_eof); + *p = cpu_to_be32(segments); return nfserr; } From 0a49efb94888b6381d9c43fda17115ffda40a039 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Oct 2022 21:24:23 +0100 Subject: [PATCH 144/216] NFSD: Remove redundant assignment to variable host_err [ Upstream commit 69eed23baf877bbb1f14d7f4df54f89807c9ee2a ] Variable host_err is assigned a value that is never read, it is being re-assigned a value in every different execution path in the following switch statement. The assignment is redundant and can be removed. Cleans up clang-scan warning: warning: Value stored to 'host_err' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/vfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index eccc6ce55a63..fce7a35a5e64 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1317,7 +1317,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode &= ~current_umask(); err = 0; - host_err = 0; switch (type) { case S_IFREG: host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); From bfef0cfab41cb4894bc5cf8b93e76327ac04b9b9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:54 -0400 Subject: [PATCH 145/216] nfsd: ignore requests to disable unsupported versions [ Upstream commit 8e823bafff2308753d430566256c83d8085952da ] The kernel currently errors out if you attempt to enable or disable a version that it doesn't recognize. Change it to ignore attempts to disable an unrecognized version. If we don't support it, then there is no harm in doing so. Signed-off-by: Jeff Layton Reviewed-by: Tom Talpey Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 573de0d49e17..c21f5815d726 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -601,7 +601,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } break; default: - return -EINVAL; + /* Ignore requests to disable non-existent versions */ + if (cmd == NFSD_SET) + return -EINVAL; } vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); From 850333a25aab582118d9fa405af00caae32faa62 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:55 -0400 Subject: [PATCH 146/216] nfsd: move nfserrno() to vfs.c [ Upstream commit cb12fae1c34b1fa7eaae92c5aadc72d86d7fae19 ] nfserrno() is common to all nfs versions, but nfsproc.c is specifically for NFSv2. Move it to vfs.c, and the prototype to vfs.h. While we're in here, remove the #ifdef EDQUOT check in this function. It's apparently a holdover from the initial merge of the nfsd code in 1997. No other place in the kernel checks that that symbol is defined before using it, so I think we can dispense with it here. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/blocklayout.c | 1 + fs/nfsd/blocklayoutxdr.c | 1 + fs/nfsd/export.h | 1 - fs/nfsd/flexfilelayout.c | 1 + fs/nfsd/nfs4idmap.c | 1 + fs/nfsd/nfsproc.c | 62 --------------------------------------- fs/nfsd/vfs.c | 63 ++++++++++++++++++++++++++++++++++++++++ fs/nfsd/vfs.h | 1 + 8 files changed, 68 insertions(+), 63 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e7e6e78d965d..01d7fd108cf3 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -12,6 +12,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" #include "filecache.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 2455dc8be18a..1ed2f691ebb9 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -9,6 +9,7 @@ #include "nfsd.h" #include "blocklayoutxdr.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index ee0e3aba4a6e..d03f7f6a8642 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -115,7 +115,6 @@ struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); int exp_rootfh(struct net *, struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -__be32 nfserrno(int errno); static inline void exp_put(struct svc_export *exp) { diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index 070f90ed09b6..3ca5304440ff 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -15,6 +15,7 @@ #include "flexfilelayoutxdr.h" #include "pnfs.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index e70a1a2999b7..5e9809aff37e 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -41,6 +41,7 @@ #include "idmap.h" #include "nfsd.h" #include "netns.h" +#include "vfs.h" /* * Turn off idmapping when using AUTH_SYS. diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 82b3ddeacc33..52fc222c34f2 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -848,65 +848,3 @@ const struct svc_version nfsd_version2 = { .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS2_SVC_XDRSIZE, }; - -/* - * Map errnos to NFS errnos. - */ -__be32 -nfserrno (int errno) -{ - static struct { - __be32 nfserr; - int syserr; - } nfs_errtbl[] = { - { nfs_ok, 0 }, - { nfserr_perm, -EPERM }, - { nfserr_noent, -ENOENT }, - { nfserr_io, -EIO }, - { nfserr_nxio, -ENXIO }, - { nfserr_fbig, -E2BIG }, - { nfserr_stale, -EBADF }, - { nfserr_acces, -EACCES }, - { nfserr_exist, -EEXIST }, - { nfserr_xdev, -EXDEV }, - { nfserr_mlink, -EMLINK }, - { nfserr_nodev, -ENODEV }, - { nfserr_notdir, -ENOTDIR }, - { nfserr_isdir, -EISDIR }, - { nfserr_inval, -EINVAL }, - { nfserr_fbig, -EFBIG }, - { nfserr_nospc, -ENOSPC }, - { nfserr_rofs, -EROFS }, - { nfserr_mlink, -EMLINK }, - { nfserr_nametoolong, -ENAMETOOLONG }, - { nfserr_notempty, -ENOTEMPTY }, -#ifdef EDQUOT - { nfserr_dquot, -EDQUOT }, -#endif - { nfserr_stale, -ESTALE }, - { nfserr_jukebox, -ETIMEDOUT }, - { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_jukebox, -EAGAIN }, - { nfserr_jukebox, -EWOULDBLOCK }, - { nfserr_jukebox, -ENOMEM }, - { nfserr_io, -ETXTBSY }, - { nfserr_notsupp, -EOPNOTSUPP }, - { nfserr_toosmall, -ETOOSMALL }, - { nfserr_serverfault, -ESERVERFAULT }, - { nfserr_serverfault, -ENFILE }, - { nfserr_io, -EREMOTEIO }, - { nfserr_stale, -EOPENSTALE }, - { nfserr_io, -EUCLEAN }, - { nfserr_perm, -ENOKEY }, - { nfserr_no_grace, -ENOGRACE}, - }; - int i; - - for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { - if (nfs_errtbl[i].syserr == errno) - return nfs_errtbl[i].nfserr; - } - WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); - return nfserr_io; -} - diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fce7a35a5e64..5d6a61d47a90 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,6 +49,69 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +/** + * nfserrno - Map Linux errnos to NFS errnos + * @errno: POSIX(-ish) error code to be mapped + * + * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If + * it's an error we don't expect, log it once and return nfserr_io. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, + { nfserr_dquot, -EDQUOT }, + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, + { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 9744b041105b..dbdfef7ae85b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -60,6 +60,7 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs) posix_acl_release(attrs->na_dpacl); } +__be32 nfserrno (int errno); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, From f82865e2a026b6d491377e64ad18326a413e6421 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:56 -0400 Subject: [PATCH 147/216] nfsd: allow disabling NFSv2 at compile time [ Upstream commit 2f3a4b2ac2f28b9be78ad21f401f31e263845214 ] rpc.nfsd stopped supporting NFSv2 a year ago. Take the next logical step toward deprecating it and allow NFSv2 support to be compiled out. Add a new CONFIG_NFSD_V2 option that can be turned off and rework the CONFIG_NFSD_V?_ACL option dependencies. Add a description that discourages enabling it. Also, change the description of CONFIG_NFSD to state that the always-on version is now 3 instead of 2. Finally, add an #ifdef around "case 2:" in __write_versions. When NFSv2 is disabled at compile time, this should make the kernel ignore attempts to disable it at runtime, but still error out when trying to enable it. Signed-off-by: Jeff Layton Reviewed-by: Tom Talpey Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/Kconfig | 19 +++++++++++++++---- fs/nfsd/Makefile | 5 +++-- fs/nfsd/nfsctl.c | 2 ++ fs/nfsd/nfsd.h | 3 +-- fs/nfsd/nfssvc.c | 6 ++++++ 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f6a2fd3015e7..7c441f2bd444 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -8,6 +8,7 @@ config NFSD select SUNRPC select EXPORTFS select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFS_ACL_SUPPORT if NFSD_V3_ACL depends on MULTIUSER help Choose Y here if you want to allow other computers to access @@ -26,19 +27,29 @@ config NFSD Below you can choose which versions of the NFS protocol are available to clients mounting the NFS server on this system. - Support for NFS version 2 (RFC 1094) is always available when + Support for NFS version 3 (RFC 1813) is always available when CONFIG_NFSD is selected. If unsure, say N. -config NFSD_V2_ACL - bool +config NFSD_V2 + bool "NFS server support for NFS version 2 (DEPRECATED)" depends on NFSD + default n + help + NFSv2 (RFC 1094) was the first publicly-released version of NFS. + Unless you are hosting ancient (1990's era) NFS clients, you don't + need this. + + If unsure, say N. + +config NFSD_V2_ACL + bool "NFS server support for the NFSv2 ACL protocol extension" + depends on NFSD_V2 config NFSD_V3_ACL bool "NFS server support for the NFSv3 ACL protocol extension" depends on NFSD - select NFSD_V2_ACL help Solaris NFS servers support an auxiliary NFSv3 ACL protocol that never became an official part of the NFS version 3 protocol. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 805c06d5f1b4..6fffc8f03f74 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -10,9 +10,10 @@ obj-$(CONFIG_NFSD) += nfsd.o # this one should be compiled first, as the tracing macros can easily blow up nfsd-y += trace.o -nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o \ +nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o \ stats.o filecache.o nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c21f5815d726..a8884d0b4638 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -581,7 +581,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { +#ifdef CONFIG_NFSD_V2 case 2: +#endif case 3: nfsd_vers(nn, num, cmd); break; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 09726c5b9a31..93b42ef9ed91 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -64,8 +64,7 @@ struct readdir_cd { extern struct svc_program nfsd_program; -extern const struct svc_version nfsd_version2, nfsd_version3, - nfsd_version4; +extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index c7695ebd28dc..6f4a38f5ab0c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -91,8 +91,12 @@ unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { +# if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, +# endif +# if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, +# endif }; #define NFSD_ACL_MINVERS 2 @@ -116,7 +120,9 @@ static struct svc_stat nfsd_acl_svcstats = { #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { +#if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, +#endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, From 137d20da8ea0daa9e0a2787acc4b66261e8796df Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 21 Oct 2022 14:24:14 +0200 Subject: [PATCH 148/216] exportfs: use pr_debug for unreachable debug statements [ Upstream commit 427505ffeaa464f683faba945a88d3e3248f6979 ] expfs.c has a bunch of dprintk statements which are unusable due to: #define dprintk(fmt, args...) do{}while(0) Use pr_debug so that they can be enabled dynamically. Also make some minor changes to the debug statements to fix some incorrect types, and remove __func__ which can be handled by dynamic debug separately. Signed-off-by: David Disseldorp Reviewed-by: NeilBrown Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/exportfs/expfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c648a493faf2..3204bd33e4e8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -18,7 +18,7 @@ #include #include -#define dprintk(fmt, args...) do{}while(0) +#define dprintk(fmt, args...) pr_debug(fmt, ##args) static int get_name(const struct path *path, char *name, struct dentry *child); @@ -132,8 +132,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { - dprintk("%s: get_parent of %ld failed, err %d\n", - __func__, dentry->d_inode->i_ino, PTR_ERR(parent)); + dprintk("get_parent of %lu failed, err %ld\n", + dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } @@ -147,7 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, dprintk("%s: found name: %s\n", __func__, nbuf); tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { - dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); goto out_err; } From e62d8c1281662a0cff23df2948162c1fe705d613 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Nov 2022 13:30:46 -0400 Subject: [PATCH 149/216] NFSD: Flesh out a documenting comment for filecache.c [ Upstream commit b3276c1f5b268ff56622e9e125b792b4c3dc03ac ] Record what we've learned recently about the NFSD filecache in a documenting comment so our future selves don't forget what all this is for. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/filecache.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 5cb8cce153a5..24ed511ed0d3 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -2,6 +2,30 @@ * Open file cache. * * (c) 2015 - Jeff Layton + * + * An nfsd_file object is a per-file collection of open state that binds + * together: + * - a struct file * + * - a user credential + * - a network namespace + * - a read-ahead context + * - monitoring for writeback errors + * + * nfsd_file objects are reference-counted. Consumers acquire a new + * object via the nfsd_file_acquire API. They manage their interest in + * the acquired object, and hence the object's reference count, via + * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file + * object: + * + * * non-garbage-collected: When a consumer wants to precisely control + * the lifetime of a file's open state, it acquires a non-garbage- + * collected nfsd_file. The final nfsd_file_put releases the open + * state immediately. + * + * * garbage-collected: When a consumer does not control the lifetime + * of open state, it acquires a garbage-collected nfsd_file. The + * final nfsd_file_put allows the open state to linger for a period + * during which it may be re-used. */ #include From 519a80ea5a1770f1bb7d0627f4670ca1c1767f80 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:57 -0400 Subject: [PATCH 150/216] NFSD: Clean up nfs4_preprocess_stateid_op() call sites [ Upstream commit eeff73f7c1c583f79a401284f46c619294859310 ] Remove the lame-duck dprintk()s around nfs4_preprocess_stateid_op() call sites. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a9105e95b59c..ba53cd89ec62 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -943,12 +943,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, &read->rd_nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); - goto out; - } - status = nfs_ok; -out: + read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -1117,10 +1112,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, WR_STATE, NULL, NULL); - if (status) { - dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + if (status) return status; - } } err = fh_want_write(&cstate->current_fh); if (err) @@ -1170,10 +1163,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, stateid, WR_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + if (status) return status; - } write->wr_how_written = write->wr_stable_how; @@ -1204,17 +1195,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, src_stateid, RD_STATE, src, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + if (status) goto out; - } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, dst_stateid, WR_STATE, dst, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + if (status) goto out_put_src; - } /* fix up for NFS-specific error code */ if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || @@ -1935,10 +1922,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, WR_STATE, &nf, NULL); - if (status != nfs_ok) { - dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + if (status != nfs_ok) return status; - } status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, @@ -1994,10 +1979,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, RD_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + if (status) return status; - } switch (seek->seek_whence) { case NFS4_CONTENT_DATA: From 9fbef7dcd8aa552d5a7e6867eec570e89d7d1631 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:03 -0400 Subject: [PATCH 151/216] NFSD: Trace stateids returned via DELEGRETURN [ Upstream commit 20eee313ff4b8a7e71ae9560f5c4ba27cd763005 ] Handing out a delegation stateid is recorded with the nfsd_deleg_read tracepoint, but there isn't a matching tracepoint for recording when the stateid is returned. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 1 + fs/nfsd/trace.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b3f6dda930d8..6f974fdb47de 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6935,6 +6935,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto put_stateid; + trace_nfsd_deleg_return(stateid); wake_up_var(d_inode(cstate->current_fh.fh_dentry)); destroy_delegation(dp); put_stateid: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 132335011cca..fedf676ef446 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -604,6 +604,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); DECLARE_EVENT_CLASS(nfsd_stateseqid_class, From fae3f8b554fae8631a954e2b205ad84c531ba71b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:09 -0400 Subject: [PATCH 152/216] NFSD: Trace delegation revocations [ Upstream commit a1c74569bbde91299f24535abf711be5c84df9de ] Delegation revocation is an exceptional event that is not otherwise visible externally (eg, no network traffic is emitted). Generate a trace record when it occurs so that revocation can be observed or other activity can be triggered. Example: nfsd-1104 [005] 1912.002544: nfsd_stid_revoke: client 633c9343:4e82788d stateid 00000003:00000001 ref=2 type=DELEG Trace infrastructure is provided for subsequent additional tracing related to nfs4_stid activity. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 2 ++ fs/nfsd/trace.h | 55 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6f974fdb47de..5d55812f5a2a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1367,6 +1367,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + trace_nfsd_stid_revoke(&dp->dl_stid); + if (clp->cl_minorversion) { spin_lock(&clp->cl_lock); dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index fedf676ef446..d261a06b6140 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -637,6 +637,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); +TRACE_DEFINE_ENUM(NFS4_OPEN_STID); +TRACE_DEFINE_ENUM(NFS4_LOCK_STID); +TRACE_DEFINE_ENUM(NFS4_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); +TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); + +#define show_stid_type(x) \ + __print_flags(x, "|", \ + { NFS4_OPEN_STID, "OPEN" }, \ + { NFS4_LOCK_STID, "LOCK" }, \ + { NFS4_DELEG_STID, "DELEG" }, \ + { NFS4_CLOSED_STID, "CLOSED" }, \ + { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ + { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ + { NFS4_LAYOUT_STID, "LAYOUT" }) + +DECLARE_EVENT_CLASS(nfsd_stid_class, + TP_PROTO( + const struct nfs4_stid *stid + ), + TP_ARGS(stid), + TP_STRUCT__entry( + __field(unsigned long, sc_type) + __field(int, sc_count) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + + __entry->sc_type = stid->sc_type; + __entry->sc_count = refcount_read(&stid->sc_count); + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation, + __entry->sc_count, show_stid_type(__entry->sc_type) + ) +); + +#define DEFINE_STID_EVENT(name) \ +DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ + TP_PROTO(const struct nfs4_stid *stid), \ + TP_ARGS(stid)) + +DEFINE_STID_EVENT(revoke); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), From 255ac53d78d562fe27b486360699dcbeb0bfacf8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:16 -0400 Subject: [PATCH 153/216] NFSD: Use const pointers as parameters to fh_ helpers [ Upstream commit b48f8056c034f28dd54668399f1d22be421b0bef ] Enable callers to use const pointers where they are able to. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsfh.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index c3ae6414fc5c..513e028b0bbe 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -220,7 +220,7 @@ __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); static __inline__ struct svc_fh * -fh_copy(struct svc_fh *dst, struct svc_fh *src) +fh_copy(struct svc_fh *dst, const struct svc_fh *src) { WARN_ON(src->fh_dentry); @@ -229,7 +229,7 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src) } static inline void -fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src) { dst->fh_size = src->fh_size; memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size); @@ -243,7 +243,8 @@ fh_init(struct svc_fh *fhp, int maxsize) return fhp; } -static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_size != fh2->fh_size) return false; @@ -252,7 +253,8 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) return true; } -static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_fsid_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_fsid_type != fh2->fh_fsid_type) return false; From 6ee5c4e269a9136da48df4126c4dde9b899d35cf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:22 -0400 Subject: [PATCH 154/216] NFSD: Update file_hashtbl() helpers [ Upstream commit 3fe828caddd81e68e9d29353c6e9285a658ca056 ] Enable callers to use const pointers for type safety. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5d55812f5a2a..b6e7db0f6a69 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -721,7 +721,7 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -static unsigned int file_hashval(struct svc_fh *fh) +static unsigned int file_hashval(const struct svc_fh *fh) { struct inode *inode = d_inode(fh->fh_dentry); @@ -4687,7 +4687,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) /* search file_hashtbl[] for file */ static struct nfs4_file * -find_file_locked(struct svc_fh *fh, unsigned int hashval) +find_file_locked(const struct svc_fh *fh, unsigned int hashval) { struct nfs4_file *fp; From c8d8876aae34f2609d3ea815106024645fa85112 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:28 -0400 Subject: [PATCH 155/216] NFSD: Clean up nfsd4_init_file() [ Upstream commit 81a21fa3e7fdecb3c5b97014f0fc5a17d5806cae ] Name this function more consistently. I'm going to use nfsd4_file_ and nfsd4_file_hash_ for these helpers. Change the @fh parameter to be const pointer for better type safety. Finally, move the hash insertion operation to the caller. This is typical for most other "init_object" type helpers, and it is where most of the other nfs4_file hash table operations are located. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b6e7db0f6a69..80e11da95282 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4278,11 +4278,9 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, - struct nfs4_file *fp) -{ - lockdep_assert_held(&state_lock); +static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) +{ refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); @@ -4300,7 +4298,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); #endif - hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -4718,7 +4715,8 @@ static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, fp->fi_aliased = alias_found = true; } if (likely(ret == NULL)) { - nfsd4_init_file(fh, hashval, new); + nfsd4_file_init(fh, new); + hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); new->fi_aliased = alias_found; ret = new; } From 5aa0c564c017a008b3d971a6228cfae171695f57 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:34 -0400 Subject: [PATCH 156/216] NFSD: Add a nfsd4_file_hash_remove() helper [ Upstream commit 3341678f2fd6106055cead09e513fad6950a0d19 ] Refactor to relocate hash deletion operation to a helper function that is close to most other nfs4_file data structure operations. The "noinline" annotation will become useful in a moment when the hlist_del_rcu() is replaced with a more complex rhash remove operation. It also guarantees that hash remove operations can be traced with "-p function -l remove_nfs4_file_locked". This also simplifies the organization of forward declarations: the to-be-added rhashtable and its param structure will be defined /after/ put_nfs4_file(). Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 80e11da95282..8e26edbe54a3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -84,6 +84,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); +static void nfsd4_file_hash_remove(struct nfs4_file *fi); /* Locking: */ @@ -591,7 +592,7 @@ put_nfs4_file(struct nfs4_file *fi) might_lock(&state_lock); if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { - hlist_del_rcu(&fi->fi_hash); + nfsd4_file_hash_remove(fi); spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); @@ -4750,6 +4751,11 @@ find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) return insert_file(new, fh, hashval); } +static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) +{ + hlist_del_rcu(&fi->fi_hash); +} + /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid From 0d4150f5eb20b2f14153474af7ca3a26814850a8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:41 -0400 Subject: [PATCH 157/216] NFSD: Clean up find_or_add_file() [ Upstream commit 9270fc514ba7d415636b23bcb937573a1ce54f6a ] Remove the call to find_file_locked() in insert_nfs4_file(). Tracing shows that over 99% of these calls return NULL. Thus it is not worth the expense of the extra bucket list traversal. insert_file() already deals correctly with the case where the item is already in the hash bucket. Since nfsd4_file_hash_insert() is now just a wrapper around insert_file(), move the meat of insert_file() into nfsd4_file_hash_insert() and get rid of it. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 68 ++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8e26edbe54a3..da6a7574ac55 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4699,32 +4699,6 @@ find_file_locked(const struct svc_fh *fh, unsigned int hashval) return NULL; } -static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, - unsigned int hashval) -{ - struct nfs4_file *fp; - struct nfs4_file *ret = NULL; - bool alias_found = false; - - spin_lock(&state_lock); - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - ret = fp; - } else if (d_inode(fh->fh_dentry) == fp->fi_inode) - fp->fi_aliased = alias_found = true; - } - if (likely(ret == NULL)) { - nfsd4_file_init(fh, new); - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); - new->fi_aliased = alias_found; - ret = new; - } - spin_unlock(&state_lock); - return ret; -} - static struct nfs4_file * find_file(struct svc_fh *fh) { struct nfs4_file *fp; @@ -4736,19 +4710,37 @@ static struct nfs4_file * find_file(struct svc_fh *fh) return fp; } -static struct nfs4_file * -find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) +/* + * On hash insertion, identify entries with the same inode but + * distinct filehandles. They will all be in the same hash bucket + * because nfs4_file's are hashed by the address in the fi_inode + * field. + */ +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); + unsigned int hashval = file_hashval(fhp); + struct nfs4_file *ret = NULL; + bool alias_found = false; + struct nfs4_file *fi; - rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - if (fp) - return fp; - - return insert_file(new, fh, hashval); + spin_lock(&state_lock); + hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) + ret = fi; + } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) + fi->fi_aliased = alias_found = true; + } + if (likely(ret == NULL)) { + nfsd4_file_init(fhp, new); + hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); + new->fi_aliased = alias_found; + ret = new; + } + spin_unlock(&state_lock); + return ret; } static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) @@ -5661,7 +5653,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. * If not found, create the nfs4_file struct */ - fp = find_or_add_file(open->op_file, current_fh); + fp = nfsd4_file_hash_insert(open->op_file, current_fh); if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) From 49e8d9f465006ba7197cbcc6d297528b72a2f196 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:47 -0400 Subject: [PATCH 158/216] NFSD: Refactor find_file() [ Upstream commit 15424748001a9b5ea62b3e6ad45f0a8b27f01df9 ] find_file() is now the only caller of find_file_locked(), so just fold these two together. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index da6a7574ac55..78b8f1442e00 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4683,31 +4683,24 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) nfs4_put_stid(&last->st_stid); } -/* search file_hashtbl[] for file */ -static struct nfs4_file * -find_file_locked(const struct svc_fh *fh, unsigned int hashval) +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - struct nfs4_file *fp; - - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - return fp; - } - } - return NULL; -} - -static struct nfs4_file * find_file(struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); + unsigned int hashval = file_hashval(fhp); + struct nfs4_file *fi; rcu_read_lock(); - fp = find_file_locked(fh, hashval); + hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) { + rcu_read_unlock(); + return fi; + } + } + } rcu_read_unlock(); - return fp; + return NULL; } /* @@ -4758,9 +4751,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_file *fp; __be32 ret = nfs_ok; - fp = find_file(current_fh); + fp = nfsd4_file_hash_lookup(current_fh); if (!fp) return ret; + /* Check for conflicting share reservations */ spin_lock(&fp->fi_lock); if (fp->fi_share_deny & deny_type) From 5a1f61516f802d95959944e7529d23dfa6868031 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:53 -0400 Subject: [PATCH 159/216] NFSD: Use rhashtable for managing nfs4_file objects [ Upstream commit d47b295e8d76a4d69f0e2ea0cd8a79c9d3488280 ] fh_match() is costly, especially when filehandles are large (as is the case for NFSv4). It needs to be used sparingly when searching data structures. Unfortunately, with common workloads, I see multiple thousands of objects stored in file_hashtbl[], which has just 256 buckets, making its bucket hash chains quite lengthy. Walking long hash chains with the state_lock held blocks other activity that needs that lock. Sizable hash chains are a common occurrance once the server has handed out some delegations, for example -- IIUC, each delegated file is held open on the server by an nfs4_file object. To help mitigate the cost of searching with fh_match(), replace the nfs4_file hash table with an rhashtable, which can dynamically resize its bucket array to minimize hash chain length. The result of this modification is an improvement in the latency of NFSv4 operations, and the reduction of nfsd CPU utilization due to eliminating the cost of multiple calls to fh_match() and reducing the CPU cache misses incurred while walking long hash chains in the nfs4_file hash table. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 97 +++++++++++++++++++++++++++++---------------- fs/nfsd/state.h | 5 +-- 2 files changed, 63 insertions(+), 39 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 78b8f1442e00..64a61c3d8c60 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -44,7 +44,9 @@ #include #include #include +#include #include + #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -589,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) void put_nfs4_file(struct nfs4_file *fi) { - might_lock(&state_lock); - - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { + if (refcount_dec_and_test(&fi->fi_ref)) { nfsd4_file_hash_remove(fi); - spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); @@ -718,19 +717,20 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) return ret & OWNER_HASH_MASK; } -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; -static unsigned int file_hashval(const struct svc_fh *fh) -{ - struct inode *inode = d_inode(fh->fh_dentry); +static const struct rhashtable_params nfs4_file_rhash_params = { + .key_len = sizeof_field(struct nfs4_file, fi_inode), + .key_offset = offsetof(struct nfs4_file, fi_inode), + .head_offset = offsetof(struct nfs4_file, fi_rlist), - /* XXX: why not (here & in file cache) use inode? */ - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); -} - -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. + */ + .min_size = 256, + .automatic_shrinking = true, +}; /* * Check if courtesy clients have conflicting access and resolve it if possible @@ -4686,12 +4686,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) static noinline_for_stack struct nfs4_file * nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *fi; rcu_read_lock(); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) { rcu_read_unlock(); @@ -4705,40 +4707,56 @@ nfsd4_file_hash_lookup(const struct svc_fh *fhp) /* * On hash insertion, identify entries with the same inode but - * distinct filehandles. They will all be in the same hash bucket - * because nfs4_file's are hashed by the address in the fi_inode - * field. + * distinct filehandles. They will all be on the list returned + * by rhltable_lookup(). + * + * inode->i_lock prevents racing insertions from adding an entry + * for the same inode/fhp pair twice. */ static noinline_for_stack struct nfs4_file * nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *ret = NULL; bool alias_found = false; struct nfs4_file *fi; + int err; - spin_lock(&state_lock); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + rcu_read_lock(); + spin_lock(&inode->i_lock); + + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) ret = fi; - } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) + } else fi->fi_aliased = alias_found = true; } - if (likely(ret == NULL)) { - nfsd4_file_init(fhp, new); - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); - new->fi_aliased = alias_found; - ret = new; - } - spin_unlock(&state_lock); + if (ret) + goto out_unlock; + + nfsd4_file_init(fhp, new); + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, + nfs4_file_rhash_params); + if (err) + goto out_unlock; + + new->fi_aliased = alias_found; + ret = new; + +out_unlock: + spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ret; } static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { - hlist_del_rcu(&fi->fi_hash); + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, + nfs4_file_rhash_params); } /* @@ -5648,6 +5666,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * If not found, create the nfs4_file struct */ fp = nfsd4_file_hash_insert(open->op_file, current_fh); + if (unlikely(!fp)) + return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) @@ -8064,10 +8084,16 @@ nfs4_state_start(void) { int ret; - ret = nfsd4_create_callback_queue(); + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params); if (ret) return ret; + ret = nfsd4_create_callback_queue(); + if (ret) { + rhltable_destroy(&nfs4_file_rhltable); + return ret; + } + set_max_delegations(); return 0; } @@ -8098,6 +8124,7 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); + rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e2daef3cc003..eadd7f465bf5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -536,16 +536,13 @@ struct nfs4_clnt_odstate { * inode can have multiple filehandles associated with it, so there is * (potentially) a many to one relationship between this struct and struct * inode. - * - * These are hashed by filehandle in the file_hashtbl, which is protected by - * the global state_lock spinlock. */ struct nfs4_file { refcount_t fi_ref; struct inode * fi_inode; bool fi_aliased; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash on fi_fhandle */ + struct rhlist_head fi_rlist; struct list_head fi_stateids; union { struct list_head fi_delegations; From 6b12589f610ae5ca924573315b4cf3afd593cb09 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 31 Oct 2022 09:53:26 -0400 Subject: [PATCH 160/216] NFSD: Fix licensing header in filecache.c [ Upstream commit 3f054211b29c0fa06dfdcab402c795fd7e906be1 ] Add a missing SPDX header. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/filecache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 24ed511ed0d3..e37ecd8b8197 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * Open file cache. + * The NFSD open file cache. * * (c) 2015 - Jeff Layton * From 1f76cb66ff2257675666172aba42b4e661809a20 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:02:30 -0500 Subject: [PATCH 161/216] filelock: add a new locks_inode_context accessor function [ Upstream commit 401a8b8fd5acd51582b15238d72a8d0edd580e9f ] There are a number of places in the kernel that are accessing the inode->i_flctx field without smp_load_acquire. This is required to ensure that the caller doesn't see a partially-initialized structure. Add a new accessor function for it to make this clear and convert all of the relevant accesses in locks.c to use it. Also, convert locks_free_lock_context to use the helper as well instead of just doing a "bare" assignment. Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton Stable-dep-of: 77c67530e1f9 ("nfsd: use locks_inode_context helper") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/locks.c | 24 ++++++++++++------------ include/linux/fs.h | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 1047ab2b15e9..7d0918b8fe5d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type) struct file_lock_context *ctx; /* paired with cmpxchg() below */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type) */ if (cmpxchg(&inode->i_flctx, NULL, ctx)) { kmem_cache_free(flctx_cache, ctx); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); } out: trace_locks_get_lock_context(inode, type, ctx); @@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, void locks_free_lock_context(struct inode *inode) { - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); if (unlikely(ctx)) { locks_check_ctx_lists(inode); @@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) void *owner; void (*func)(void); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; return; @@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl->fl_flags = type; /* typically we will check that ctx is non-NULL before calling */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { WARN_ON_ONCE(1); goto free_lock; @@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) struct file_lock_context *ctx; struct file_lock *fl; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); fl = list_first_entry_or_null(&ctx->flc_lease, @@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp) int type = F_UNLCK; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -1823,7 +1823,7 @@ static int generic_delete_lease(struct file *filp, void *owner) struct file_lock_context *ctx; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; @@ -2562,7 +2562,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2635,7 +2635,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); + ctx = locks_inode_context(locks_inode(filp)); if (!ctx) return; @@ -2682,7 +2682,7 @@ bool vfs_inode_has_locks(struct inode *inode) struct file_lock_context *ctx; bool ret; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return false; @@ -2863,7 +2863,7 @@ void show_fd_locks(struct seq_file *f, struct file_lock_context *ctx; int id = 0; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return; diff --git a/include/linux/fs.h b/include/linux/fs.h index 67313881f8ac..092d8fa10153 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1189,6 +1189,13 @@ extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); extern bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return smp_load_acquire(&inode->i_flctx); +} + #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1334,6 +1341,13 @@ static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, { return false; } + +static inline struct file_lock_context * +locks_inode_context(const struct inode *inode) +{ + return NULL; +} + #endif /* !CONFIG_FILE_LOCKING */ static inline struct inode *file_inode(const struct file *f) From c66f9f22e6e555edd575d471c6a309466651a2f5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:19:43 -0500 Subject: [PATCH 162/216] lockd: use locks_inode_context helper [ Upstream commit 98b41ffe0afdfeaa1439a5d6bd2db4a94277e31b ] lockd currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/lockd/svcsubs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 3515f17eaf3f..e3b6229e7ae5 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -210,7 +210,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct nlm_host *lockhost; if (!flctx || list_empty_careful(&flctx->flc_posix)) @@ -265,7 +265,7 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; From e017486dadf9ebb890bdd654b67014a9aeaa41c1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:36:07 -0500 Subject: [PATCH 163/216] nfsd: use locks_inode_context helper [ Upstream commit 77c67530e1f95ac25c7075635f32f04367380894 ] nfsd currently doesn't access i_flctx safely everywhere. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Acked-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 64a61c3d8c60..5d6851fd4f97 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4784,7 +4784,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) static bool nfsd4_deleg_present(const struct inode *inode) { - struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); + struct file_lock_context *ctx = locks_inode_context(inode); return ctx && !list_empty_careful(&ctx->flc_lease); } @@ -5944,7 +5944,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { nf = stp->st_stid.sc_file; - ctx = nf->fi_inode->i_flctx; + ctx = locks_inode_context(nf->fi_inode); if (!ctx) continue; if (locks_owner_has_blockers(ctx, lo)) @@ -7761,7 +7761,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = locks_inode(nf->nf_file); - flctx = inode->i_flctx; + flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); From 8b7be6ef588e0df6036e99f0f637fdac641da396 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Nov 2022 14:44:50 -0400 Subject: [PATCH 164/216] nfsd: fix up the filecache laundrette scheduling [ Upstream commit 22ae4c114f77b55a4c5036e8f70409a0799a08f8 ] We don't really care whether there are hashed entries when it comes to scheduling the laundrette. They might all be non-gc entries, after all. We only want to schedule it if there are entries on the LRU. Switch to using list_lru_count, and move the check into nfsd_file_gc_worker. The other callsite in nfsd_file_put doesn't need to count entries, since it only schedules the laundrette after adding an entry to the LRU. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/filecache.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index e37ecd8b8197..697acf5c3c68 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -211,12 +211,9 @@ static const struct rhashtable_params nfsd_file_rhash_params = { static void nfsd_file_schedule_laundrette(void) { - if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) || - test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) - return; - - queue_delayed_work(system_wq, &nfsd_filecache_laundrette, - NFSD_LAUNDRETTE_DELAY); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) + queue_delayed_work(system_wq, &nfsd_filecache_laundrette, + NFSD_LAUNDRETTE_DELAY); } static void @@ -614,7 +611,8 @@ static void nfsd_file_gc_worker(struct work_struct *work) { nfsd_file_gc(); - nfsd_file_schedule_laundrette(); + if (list_lru_count(&nfsd_file_lru)) + nfsd_file_schedule_laundrette(); } static unsigned long From 12e63680a76cad3bf505669b753560582ccadfcb Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 11 Nov 2022 17:18:35 +0800 Subject: [PATCH 165/216] NFSD: Use struct_size() helper in alloc_session() [ Upstream commit 85a0d0c9a58002ef7d1bf5e3ea630f4fbd42a4f0 ] Use struct_size() helper to simplify the code, no functional changes. Signed-off-by: Xiu Jianfeng Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5d6851fd4f97..d2ed047dfdc8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1834,13 +1834,12 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, int numslots = fattrs->maxreqs; int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; - int mem, i; + int i; - BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) - + sizeof(struct nfsd4_session) > PAGE_SIZE); - mem = numslots * sizeof(struct nfsd4_slot *); + BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION) + > PAGE_SIZE); - new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL); if (!new) return NULL; /* allocate each struct nfsd4_slot and data cache in one piece */ From 8973a8f9b72dbafbd1083c220d975a0e7ec871d0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:36 -0500 Subject: [PATCH 166/216] lockd: set missing fl_flags field when retrieving args [ Upstream commit 75c7940d2a86d3f1b60a0a265478cb8fc887b970 ] Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/lockd/svc4proc.c | 1 + fs/lockd/svcproc.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 284b019cb652..b72023a6b4c1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -52,6 +52,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, *filp = file; /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_start = (loff_t)lock->lock_start; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index e35c05e27806..32784f508c81 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -77,6 +77,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Set up the missing parts of the file_lock structure */ mode = lock_to_openmode(&lock->fl); + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; From ccbf6efab8d37e3af007e83d7e7797f0ab2f3064 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:37 -0500 Subject: [PATCH 167/216] lockd: ensure we use the correct file descriptor when unlocking [ Upstream commit 69efce009f7df888e1fede3cb2913690eb829f52 ] Shared locks are set on O_RDONLY descriptors and exclusive locks are set on O_WRONLY ones. nlmsvc_unlock however calls vfs_lock_file twice, once for each descriptor, but it doesn't reset fl_file. Ensure that it does. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/lockd/svclock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9c1aa75441e1..9eae99e08e69 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -659,11 +659,13 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - if (file->f_file[O_RDONLY]) - error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_RDONLY]; + if (lock->fl.fl_file) + error = vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); - if (file->f_file[O_WRONLY]) - error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_WRONLY]; + if (lock->fl.fl_file) + error |= vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; From 0920deeec6dd2e8d142a688a81744702895d46c6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:38 -0500 Subject: [PATCH 168/216] lockd: fix file selection in nlmsvc_cancel_blocked [ Upstream commit 9f27783b4dd235ef3c8dbf69fc6322777450323c ] We currently do a lock_to_openmode call based on the arguments from the NLM_UNLOCK call, but that will always set the fl_type of the lock to F_UNLCK, and the O_RDONLY descriptor is always chosen. Fix it to use the file_lock from the block instead. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/lockd/svclock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9eae99e08e69..4e30f3c50970 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -699,9 +699,10 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - mode = lock_to_openmode(&lock->fl); - vfs_cancel_lock(block->b_file->f_file[mode], - &block->b_call->a_args.lock.fl); + struct file_lock *fl = &block->b_call->a_args.lock.fl; + + mode = lock_to_openmode(fl); + vfs_cancel_lock(block->b_file->f_file[mode], fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); } From 371e1c1b326b5de0a12204f217709e2f626c7fe7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Nov 2022 08:57:43 -0500 Subject: [PATCH 169/216] trace: Relocate event helper files [ Upstream commit 247c01ff5f8d66e62a404c91733be52fecb8b7f6 ] Steven Rostedt says: > The include/trace/events/ directory should only hold files that > are to create events, not headers that hold helper functions. > > Can you please move them out of include/trace/events/ as that > directory is "special" in the creation of events. Signed-off-by: Chuck Lever Acked-by: Leon Romanovsky Acked-by: Steven Rostedt (Google) Acked-by: Anna Schumaker Stable-dep-of: 638593be55c0 ("NFSD: add CB_RECALL_ANY tracepoints") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 7 +++++++ drivers/infiniband/core/cm_trace.h | 2 +- drivers/infiniband/core/cma_trace.h | 2 +- fs/nfs/nfs4trace.h | 6 +++--- fs/nfs/nfstrace.h | 6 +++--- include/trace/events/rpcgss.h | 2 +- include/trace/events/rpcrdma.h | 4 ++-- include/trace/events/sunrpc.h | 2 +- include/trace/{events => misc}/fs.h | 0 include/trace/{events => misc}/nfs.h | 0 include/trace/{events => misc}/rdma.h | 0 include/trace/{events/sunrpc_base.h => misc/sunrpc.h} | 0 12 files changed, 19 insertions(+), 12 deletions(-) rename include/trace/{events => misc}/fs.h (100%) rename include/trace/{events => misc}/nfs.h (100%) rename include/trace/{events => misc}/rdma.h (100%) rename include/trace/{events/sunrpc_base.h => misc/sunrpc.h} (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 13d1078808bb..bbfedb0b2093 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10051,6 +10051,7 @@ F: drivers/infiniband/ F: include/rdma/ F: include/trace/events/ib_mad.h F: include/trace/events/ib_umad.h +F: include/trace/misc/rdma.h F: include/uapi/linux/if_infiniband.h F: include/uapi/rdma/ F: samples/bpf/ibumad_kern.c @@ -11139,6 +11140,12 @@ F: fs/nfs_common/ F: fs/nfsd/ F: include/linux/lockd/ F: include/linux/sunrpc/ +F: include/trace/events/rpcgss.h +F: include/trace/events/rpcrdma.h +F: include/trace/events/sunrpc.h +F: include/trace/misc/fs.h +F: include/trace/misc/nfs.h +F: include/trace/misc/sunrpc.h F: include/uapi/linux/nfsd/ F: include/uapi/linux/sunrpc/ F: net/sunrpc/ diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h index e9d282679ef1..944d9071245d 100644 --- a/drivers/infiniband/core/cm_trace.h +++ b/drivers/infiniband/core/cm_trace.h @@ -16,7 +16,7 @@ #include #include -#include +#include /* * enum ib_cm_state, from include/rdma/ib_cm.h diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index e45264267bcc..47f3c6e4be89 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -15,7 +15,7 @@ #define _TRACE_RDMA_CMA_H #include -#include +#include DECLARE_EVENT_CLASS(cma_fsm_class, diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 3fa77ad7258f..c8a57cfde64b 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -9,10 +9,10 @@ #define _TRACE_NFS4_H #include -#include +#include -#include -#include +#include +#include #define show_nfs_fattr_flags(valid) \ __print_flags((unsigned long)valid, "|", \ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8c6cc58679ff..642f6921852f 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -11,9 +11,9 @@ #include #include -#include -#include -#include +#include +#include +#include #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index c9048f3e471b..3f121eed369e 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -13,7 +13,7 @@ #include -#include +#include /** ** GSS-API related trace events diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index fcd3b3f1020a..8f461e04e5f0 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -15,8 +15,8 @@ #include #include -#include -#include +#include +#include /** ** Event classes diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index f48f2ab9d238..ffe2679a13ce 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -14,7 +14,7 @@ #include #include -#include +#include TRACE_DEFINE_ENUM(SOCK_STREAM); TRACE_DEFINE_ENUM(SOCK_DGRAM); diff --git a/include/trace/events/fs.h b/include/trace/misc/fs.h similarity index 100% rename from include/trace/events/fs.h rename to include/trace/misc/fs.h diff --git a/include/trace/events/nfs.h b/include/trace/misc/nfs.h similarity index 100% rename from include/trace/events/nfs.h rename to include/trace/misc/nfs.h diff --git a/include/trace/events/rdma.h b/include/trace/misc/rdma.h similarity index 100% rename from include/trace/events/rdma.h rename to include/trace/misc/rdma.h diff --git a/include/trace/events/sunrpc_base.h b/include/trace/misc/sunrpc.h similarity index 100% rename from include/trace/events/sunrpc_base.h rename to include/trace/misc/sunrpc.h From 4481d72a4b63eb190e71e381050aa2959226e13b Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:45 -0800 Subject: [PATCH 170/216] NFSD: refactoring courtesy_client_reaper to a generic low memory shrinker [ Upstream commit a1049eb47f20b9eabf9afb218578fff16b4baca6 ] Refactoring courtesy_client_reaper to generic low memory shrinker so it can be used for other purposes. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d2ed047dfdc8..df3d421025aa 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4362,7 +4362,7 @@ out: } static unsigned long -nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) +nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int cnt; struct nfsd_net *nn = container_of(shrink, @@ -4375,7 +4375,7 @@ nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) } static unsigned long -nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc) +nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { return SHRINK_STOP; } @@ -4402,8 +4402,8 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); atomic_set(&nn->nfsd_courtesy_clients, 0); - nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan; - nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count; + nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); } @@ -6171,17 +6171,24 @@ laundromat_main(struct work_struct *laundry) } static void -courtesy_client_reaper(struct work_struct *reaper) +courtesy_client_reaper(struct nfsd_net *nn) { struct list_head reaplist; - struct delayed_work *dwork = to_delayed_work(reaper); - struct nfsd_net *nn = container_of(dwork, struct nfsd_net, - nfsd_shrinker_work); nfs4_get_courtesy_client_reaplist(nn, &reaplist); nfs4_process_client_reaplist(&reaplist); } +static void +nfsd4_state_shrinker_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + nfsd_shrinker_work); + + courtesy_client_reaper(nn); +} + static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) @@ -8007,7 +8014,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); - INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, courtesy_client_reaper); + INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); return 0; From f30f07ba5789ef5c68c2352b996d8a98fefca8a2 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:46 -0800 Subject: [PATCH 171/216] NFSD: add support for sending CB_RECALL_ANY [ Upstream commit 3959066b697b5dfbb7141124ae9665337d4bc638 ] Add XDR encode and decode function for CB_RECALL_ANY. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4callback.c | 72 ++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/state.h | 1 + fs/nfsd/xdr4.h | 5 +++ fs/nfsd/xdr4cb.h | 6 ++++ 4 files changed, 84 insertions(+) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 39989c14c8a1..4eae2c5af2ed 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -76,6 +76,17 @@ static __be32 *xdr_encode_empty_array(__be32 *p) * 1 Protocol" */ +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ + WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); +} + +static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, + size_t len) +{ + WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); +} + /* * nfs_cb_opnum4 * @@ -328,6 +339,24 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, hdr->nops++; } +/* + * CB_RECALLANY4args + * + * struct CB_RECALLANY4args { + * uint32_t craa_objects_to_keep; + * bitmap4 craa_type_mask; + * }; + */ +static void +encode_cb_recallany4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra) +{ + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY); + encode_uint32(xdr, ra->ra_keep); + encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval)); + hdr->nops++; +} + /* * CB_SEQUENCE4args * @@ -482,6 +511,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, encode_cb_nops(&hdr); } +/* + * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static void +nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfsd4_cb_recall_any *ra; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb); + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recallany4args(xdr, &hdr, ra); + encode_cb_nops(&hdr); +} /* * NFSv4.0 and NFSv4.1 XDR decode functions @@ -520,6 +569,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } +/* + * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static int +nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status); + return status; +} + #ifdef CONFIG_NFSD_PNFS /* * CB_LAYOUTRECALL4args @@ -783,6 +854,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { #endif PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), + PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index eadd7f465bf5..e30882f8b851 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -636,6 +636,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_OFFLOAD, NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, + NFSPROC4_CLNT_CB_RECALL_ANY, }; /* Returns true iff a is later than b: */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 36c3340c1d54..510978e602da 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -896,5 +896,10 @@ struct nfsd4_operation { union nfsd4_op_u *); }; +struct nfsd4_cb_recall_any { + struct nfsd4_callback ra_cb; + u32 ra_keep; + u32 ra_bmval[1]; +}; #endif diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 547cf07cf4e0..0d39af1b00a0 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -48,3 +48,9 @@ #define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 1 + 1) +#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) From f28dae54632c5ea45f32ddc6fba494f5efc15007 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:47 -0800 Subject: [PATCH 172/216] NFSD: add delegation reaper to react to low memory condition [ Upstream commit 44df6f439a1790a5f602e3842879efa88f346672 ] The delegation reaper is called by nfsd memory shrinker's on the 'count' callback. It scans the client list and sends the courtesy CB_RECALL_ANY to the clients that hold delegations. To avoid flooding the clients with CB_RECALL_ANY requests, the delegation reaper sends only one CB_RECALL_ANY request to each client per 5 seconds. Signed-off-by: Dai Ngo [ cel: moved definition of RCA4_TYPE_MASK_RDATA_DLG ] Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 88 ++++++++++++++++++++++++++++++++++++++++++-- fs/nfsd/state.h | 5 +++ include/linux/nfs4.h | 13 +++++++ 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index df3d421025aa..1e030e030930 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2145,6 +2145,7 @@ static void __free_client(struct kref *k) kfree(clp->cl_nii_domain.data); kfree(clp->cl_nii_name.data); idr_destroy(&clp->cl_stateids); + kfree(clp->cl_ra); kmem_cache_free(client_slab, clp); } @@ -2872,6 +2873,36 @@ static const struct tree_descr client_files[] = { [3] = {""}, }; +static int +nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + put_client_renew_locked(clp); + spin_unlock(&nn->client_lock); +} + +static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { + .done = nfsd4_cb_recall_any_done, + .release = nfsd4_cb_recall_any_release, +}; + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -2909,6 +2940,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name, free_client(clp); return NULL; } + clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL); + if (!clp->cl_ra) { + free_client(clp); + return NULL; + } + clp->cl_ra_time = 0; + nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops, + NFSPROC4_CLNT_CB_RECALL_ANY); return clp; } @@ -4364,14 +4403,16 @@ out: static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - int cnt; + int count; struct nfsd_net *nn = container_of(shrink, struct nfsd_net, nfsd_client_shrinker); - cnt = atomic_read(&nn->nfsd_courtesy_clients); - if (cnt > 0) + count = atomic_read(&nn->nfsd_courtesy_clients); + if (!count) + count = atomic_long_read(&num_delegations); + if (count) mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); - return (unsigned long)cnt; + return (unsigned long)count; } static unsigned long @@ -6179,6 +6220,44 @@ courtesy_client_reaper(struct nfsd_net *nn) nfs4_process_client_reaplist(&reaplist); } +static void +deleg_reaper(struct nfsd_net *nn) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + struct list_head cblist; + + INIT_LIST_HEAD(&cblist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state != NFSD4_ACTIVE || + list_empty(&clp->cl_delegations) || + atomic_read(&clp->cl_delegs_in_recall) || + test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || + (ktime_get_boottime_seconds() - + clp->cl_ra_time < 5)) { + continue; + } + list_add(&clp->cl_ra_cblist, &cblist); + + /* release in nfsd4_cb_recall_any_release */ + atomic_inc(&clp->cl_rpc_users); + set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + clp->cl_ra_time = ktime_get_boottime_seconds(); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&cblist)) { + clp = list_first_entry(&cblist, struct nfs4_client, + cl_ra_cblist); + list_del_init(&clp->cl_ra_cblist); + clp->cl_ra->ra_keep = 0; + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + nfsd4_run_cb(&clp->cl_ra->ra_cb); + } +} + static void nfsd4_state_shrinker_worker(struct work_struct *work) { @@ -6187,6 +6266,7 @@ nfsd4_state_shrinker_worker(struct work_struct *work) nfsd_shrinker_work); courtesy_client_reaper(nn); + deleg_reaper(nn); } static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e30882f8b851..e94634d30591 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -368,6 +368,7 @@ struct nfs4_client { #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) +#define NFSD4_CLIENT_CB_RECALL_ANY (6) unsigned long cl_flags; const struct cred *cl_cb_cred; struct rpc_clnt *cl_cb_client; @@ -411,6 +412,10 @@ struct nfs4_client { unsigned int cl_state; atomic_t cl_delegs_in_recall; + + struct nfsd4_cb_recall_any *cl_ra; + time64_t cl_ra_time; + struct list_head cl_ra_cblist; }; /* struct nfs4_client_reset diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8d04b6a5964c..730003c4f4af 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -732,4 +732,17 @@ enum nfs4_setxattr_options { SETXATTR4_CREATE = 1, SETXATTR4_REPLACE = 2, }; + +enum { + RCA4_TYPE_MASK_RDATA_DLG = 0, + RCA4_TYPE_MASK_WDATA_DLG = 1, + RCA4_TYPE_MASK_DIR_DLG = 2, + RCA4_TYPE_MASK_FILE_LAYOUT = 3, + RCA4_TYPE_MASK_BLK_LAYOUT = 4, + RCA4_TYPE_MASK_OBJ_LAYOUT_MIN = 8, + RCA4_TYPE_MASK_OBJ_LAYOUT_MAX = 9, + RCA4_TYPE_MASK_OTHER_LAYOUT_MIN = 12, + RCA4_TYPE_MASK_OTHER_LAYOUT_MAX = 15, +}; + #endif From 7b2b8a6c75f0c0175f626d61a74e4f7f75d38df4 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:48 -0800 Subject: [PATCH 173/216] NFSD: add CB_RECALL_ANY tracepoints [ Upstream commit 638593be55c0b37a1930038460a9918215d5c24b ] Add tracepoints to trace start and end of CB_RECALL_ANY operation. Signed-off-by: Dai Ngo [ cel: added show_rca_mask() macro ] Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 2 ++ fs/nfsd/trace.h | 50 ++++++++++++++++++++++++++++++++++++++++ include/trace/misc/nfs.h | 12 ++++++++++ 3 files changed, 64 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1e030e030930..d8829fa53fda 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2877,6 +2877,7 @@ static int nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, struct rpc_task *task) { + trace_nfsd_cb_recall_any_done(cb, task); switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); @@ -6254,6 +6255,7 @@ deleg_reaper(struct nfsd_net *nn) list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d261a06b6140..4183819ea082 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,9 +9,12 @@ #define _NFSD_TRACE_H #include +#include +#include #include "export.h" #include "nfsfh.h" +#include "xdr4.h" #define NFSD_TRACE_PROC_RES_FIELDS \ __field(unsigned int, netns_ino) \ @@ -1492,6 +1495,32 @@ TRACE_EVENT(nfsd_cb_offload, __entry->fh_hash, __entry->count, __entry->status) ); +TRACE_EVENT(nfsd_cb_recall_any, + TP_PROTO( + const struct nfsd4_cb_recall_any *ra + ), + TP_ARGS(ra), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, keep) + __field(unsigned long, bmval0) + __sockaddr(addr, ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = ra->ra_cb.cb_clp->cl_clientid.cl_boot; + __entry->cl_id = ra->ra_cb.cb_clp->cl_clientid.cl_id; + __entry->keep = ra->ra_keep; + __entry->bmval0 = ra->ra_bmval[0]; + __assign_sockaddr(addr, &ra->ra_cb.cb_clp->cl_addr, + ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen); + ), + TP_printk("addr=%pISpc client %08x:%08x keep=%u bmval0=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->keep, show_rca_mask(__entry->bmval0) + ) +); + DECLARE_EVENT_CLASS(nfsd_cb_done_class, TP_PROTO( const stateid_t *stp, @@ -1531,6 +1560,27 @@ DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done); DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done); DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done); +TRACE_EVENT(nfsd_cb_recall_any_done, + TP_PROTO( + const struct nfsd4_callback *cb, + const struct rpc_task *task + ), + TP_ARGS(cb, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(int, status) + ), + TP_fast_assign( + __entry->status = task->tk_status; + __entry->cl_boot = cb->cb_clp->cl_clientid.cl_boot; + __entry->cl_id = cb->cb_clp->cl_clientid.cl_id; + ), + TP_printk("client %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->status + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h index 09ffdbb04134..0d9d48dca38a 100644 --- a/include/trace/misc/nfs.h +++ b/include/trace/misc/nfs.h @@ -360,6 +360,18 @@ TRACE_DEFINE_ENUM(IOMODE_ANY); { IOMODE_RW, "RW" }, \ { IOMODE_ANY, "ANY" }) +#define show_rca_mask(x) \ + __print_flags(x, "|", \ + { BIT(RCA4_TYPE_MASK_RDATA_DLG), "RDATA_DLG" }, \ + { BIT(RCA4_TYPE_MASK_WDATA_DLG), "WDATA_DLG" }, \ + { BIT(RCA4_TYPE_MASK_DIR_DLG), "DIR_DLG" }, \ + { BIT(RCA4_TYPE_MASK_FILE_LAYOUT), "FILE_LAYOUT" }, \ + { BIT(RCA4_TYPE_MASK_BLK_LAYOUT), "BLK_LAYOUT" }, \ + { BIT(RCA4_TYPE_MASK_OBJ_LAYOUT_MIN), "OBJ_LAYOUT_MIN" }, \ + { BIT(RCA4_TYPE_MASK_OBJ_LAYOUT_MAX), "OBJ_LAYOUT_MAX" }, \ + { BIT(RCA4_TYPE_MASK_OTHER_LAYOUT_MIN), "OTHER_LAYOUT_MIN" }, \ + { BIT(RCA4_TYPE_MASK_OTHER_LAYOUT_MAX), "OTHER_LAYOUT_MAX" }) + #define show_nfs4_seq4_status(x) \ __print_flags(x, "|", \ { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ From eb73733124305ce47d86d74fc3610ea7a4e55260 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 26 Nov 2022 15:55:30 -0500 Subject: [PATCH 174/216] NFSD: Use only RQ_DROPME to signal the need to drop a reply [ Upstream commit 9315564747cb6a570e99196b3a4880fb817635fd ] Clean up: NFSv2 has the only two usages of rpc_drop_reply in the NFSD code base. Since NFSv2 is going away at some point, replace these in order to simplify the "drop this reply?" check in nfsd_dispatch(). Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsproc.c | 4 ++-- fs/nfsd/nfssvc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 52fc222c34f2..a5570cf75f3f 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -211,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } @@ -246,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6f4a38f5ab0c..0c75636054a5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1071,7 +1071,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); - if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) + if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) From 5c6c2fb3c12f7d7bb7f04259878ac965a8ea2d2d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Dec 2022 12:48:59 -0800 Subject: [PATCH 175/216] NFSD: Avoid clashing function prototypes [ Upstream commit e78e274eb22d966258a3845acc71d3c5b8ee2ea8 ] When built with Control Flow Integrity, function prototypes between caller and function declaration must match. These mismatches are visible at compile time with the new -Wcast-function-type-strict in Clang[1]. There were 97 warnings produced by NFS. For example: fs/nfsd/nfs4xdr.c:2228:17: warning: cast from '__be32 (*)(struct nfsd4_compoundargs *, struct nfsd4_access *)' (aka 'unsigned int (*)(struct nfsd4_compoundargs *, struct nfsd4_access *)') to 'nfsd4_dec' (aka 'unsigned int (*)(struct nfsd4_compoundargs *, void *)') converts to incompatible function type [-Wcast-function-type-strict] [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The enc/dec callbacks were defined as passing "void *" as the second argument, but were being implicitly cast to a new type. Replace the argument with union nfsd4_op_u, and perform explicit member selection in the function body. There are no resulting binary differences. Changes were made mechanically using the following Coccinelle script, with minor by-hand fixes for members that didn't already match their existing argument name: @find@ identifier func; type T, opsT; identifier ops, N; @@ opsT ops[] = { [N] = (T) func, }; @already_void@ identifier find.func; identifier name; @@ func(..., -void +union nfsd4_op_u *name) { ... } @proto depends on !already_void@ identifier find.func; type T; identifier name; position p; @@ func@p(..., T name ) { ... } @script:python get_member@ type_name << proto.T; member; @@ coccinelle.member = cocci.make_ident(type_name.split("_", 1)[1].split(' ',1)[0]) @convert@ identifier find.func; type proto.T; identifier proto.name; position proto.p; identifier get_member.member; @@ func@p(..., - T name + union nfsd4_op_u *u ) { + T name = &u->member; ... } @cast@ identifier find.func; type T, opsT; identifier ops, N; @@ opsT ops[] = { [N] = - (T) func, }; Cc: Chuck Lever Cc: Jeff Layton Cc: Gustavo A. R. Silva Cc: linux-nfs@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4xdr.c | 632 +++++++++++++++++++++++++++------------------- 1 file changed, 377 insertions(+), 255 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51a598ee68fe..597f14a80512 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -770,16 +770,18 @@ nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, - struct nfsd4_access *access) + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0) return nfserr_bad_xdr; return nfs_ok; } static __be32 -nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0) return nfserr_bad_xdr; return nfsd4_decode_stateid4(argp, &close->cl_stateid); @@ -787,8 +789,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) static __be32 -nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) @@ -798,8 +801,9 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit } static __be32 -nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; __be32 *p, status; memset(create, 0, sizeof(*create)); @@ -844,22 +848,25 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create } static inline __be32 -nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_delegreturn *dr = &u->delegreturn; return nfsd4_decode_stateid4(argp, &dr->dr_stateid); } static inline __be32 -nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_getattr *getattr = &u->getattr; memset(getattr, 0, sizeof(*getattr)); return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, ARRAY_SIZE(getattr->ga_bmval)); } static __be32 -nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; memset(link, 0, sizeof(*link)); return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); } @@ -907,8 +914,9 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; memset(lock, 0, sizeof(*lock)); if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) return nfserr_bad_xdr; @@ -924,8 +932,9 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; memset(lockt, 0, sizeof(*lockt)); if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0) return nfserr_bad_xdr; @@ -940,8 +949,9 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) } static __be32 -nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; __be32 status; if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0) @@ -962,8 +972,9 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) } static __be32 -nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lookup *lookup = &u->lookup; return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len); } @@ -1143,8 +1154,9 @@ nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; __be32 status; u32 dummy; @@ -1171,8 +1183,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) } static __be32 -nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *open_conf = &u->open_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1190,8 +1204,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con } static __be32 -nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *open_down = &u->open_downgrade; __be32 status; memset(open_down, 0, sizeof(*open_down)); @@ -1209,8 +1225,9 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d } static __be32 -nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_putfh *putfh = &u->putfh; __be32 *p; if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0) @@ -1229,7 +1246,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) } static __be32 -nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { if (argp->minorversion == 0) return nfs_ok; @@ -1237,8 +1254,9 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) } static __be32 -nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; __be32 status; memset(read, 0, sizeof(*read)); @@ -1254,8 +1272,9 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) } static __be32 -nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; __be32 status; memset(readdir, 0, sizeof(*readdir)); @@ -1276,15 +1295,17 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read } static __be32 -nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo)); return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } static __be32 -nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; __be32 status; memset(rename, 0, sizeof(*rename)); @@ -1295,22 +1316,25 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename } static __be32 -nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + clientid_t *clientid = &u->renew; return nfsd4_decode_clientid4(argp, clientid); } static __be32 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; secinfo->si_exp = NULL; return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } static __be32 -nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; __be32 status; memset(setattr, 0, sizeof(*setattr)); @@ -1324,8 +1348,9 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta } static __be32 -nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setclientid *setclientid = &u->setclientid; __be32 *p, status; memset(setclientid, 0, sizeof(*setclientid)); @@ -1367,8 +1392,10 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient } static __be32 -nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_setclientid_confirm *scd_c = &u->setclientid_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1382,8 +1409,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s /* Also used for NVERIFY */ static __be32 -nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_verify *verify = &u->verify; __be32 *p, status; memset(verify, 0, sizeof(*verify)); @@ -1409,8 +1437,9 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify } static __be32 -nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; __be32 status; status = nfsd4_decode_stateid4(argp, &write->wr_stateid); @@ -1434,8 +1463,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) } static __be32 -nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; __be32 status; if (argp->minorversion >= 1) @@ -1452,16 +1483,20 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel return nfs_ok; } -static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; memset(bc, 0, sizeof(*bc)); if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) return nfserr_bad_xdr; return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); } -static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; u32 use_conn_in_rdma_mode; __be32 status; @@ -1603,8 +1638,9 @@ nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; __be32 status; memset(exid, 0, sizeof(*exid)); @@ -1656,8 +1692,9 @@ nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; __be32 status; memset(sess, 0, sizeof(*sess)); @@ -1681,23 +1718,26 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_session *destroy_session) + union nfsd4_op_u *u) { + struct nfsd4_destroy_session *destroy_session = &u->destroy_session; return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid); } static __be32 nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, - struct nfsd4_free_stateid *free_stateid) + union nfsd4_op_u *u) { + struct nfsd4_free_stateid *free_stateid = &u->free_stateid; return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); } #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; __be32 status; memset(gdev, 0, sizeof(*gdev)); @@ -1717,8 +1757,9 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; __be32 *p, status; memset(lcp, 0, sizeof(*lcp)); @@ -1753,8 +1794,9 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; __be32 status; memset(lgp, 0, sizeof(*lgp)); @@ -1781,8 +1823,9 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; memset(lrp, 0, sizeof(*lrp)); if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) return nfserr_bad_xdr; @@ -1795,8 +1838,9 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, #endif /* CONFIG_NFSD_PNFS */ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo_no_name *sin) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name; if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) return nfserr_bad_xdr; @@ -1806,8 +1850,9 @@ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; __be32 *p, status; status = nfsd4_decode_sessionid4(argp, &seq->sessionid); @@ -1826,8 +1871,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; __be32 status; u32 i; @@ -1852,14 +1899,16 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta } static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_clientid *dc) + union nfsd4_op_u *u) { + struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; return nfsd4_decode_clientid4(argp, &dc->clientid); } static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, - struct nfsd4_reclaim_complete *rc) + union nfsd4_op_u *u) { + struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0) return nfserr_bad_xdr; return nfs_ok; @@ -1867,8 +1916,9 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, - struct nfsd4_fallocate *fallocate) + union nfsd4_op_u *u) { + struct nfsd4_fallocate *fallocate = &u->allocate; __be32 status; status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid); @@ -1924,8 +1974,9 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; u32 consecutive, i, count, sync; struct nl4_server *ns_dummy; __be32 status; @@ -1982,8 +2033,9 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) static __be32 nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; __be32 status; memset(cn, 0, sizeof(*cn)); @@ -2002,16 +2054,18 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; os->count = 0; os->status = 0; return nfsd4_decode_stateid4(argp, &os->stateid); } static __be32 -nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 status; status = nfsd4_decode_stateid4(argp, &seek->seek_stateid); @@ -2028,8 +2082,9 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) } static __be32 -nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_clone *clone = &u->clone; __be32 status; status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid); @@ -2154,8 +2209,9 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) */ static __be32 nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; __be32 status; u32 maxcount; @@ -2173,8 +2229,9 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; u32 flags, maxcount, size; __be32 status; @@ -2214,8 +2271,9 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; u32 maxcount; memset(listxattrs, 0, sizeof(*listxattrs)); @@ -2245,113 +2303,114 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, - struct nfsd4_removexattr *removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; memset(removexattr, 0, sizeof(*removexattr)); return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); } static __be32 -nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfs_ok; } static __be32 -nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfserr_notsupp; } -typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u); static const nfsd4_dec nfsd4_dec_ops[] = { - [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, - [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, - [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, - [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, - [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, - [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, - [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, - [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, - [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, - [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, - [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, - [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, - [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, - [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, - [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, - [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_READ] = (nfsd4_dec)nfsd4_decode_read, - [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, - [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, - [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, - [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, - [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, - [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, - [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, - [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, - [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, + [OP_ACCESS] = nfsd4_decode_access, + [OP_CLOSE] = nfsd4_decode_close, + [OP_COMMIT] = nfsd4_decode_commit, + [OP_CREATE] = nfsd4_decode_create, + [OP_DELEGPURGE] = nfsd4_decode_notsupp, + [OP_DELEGRETURN] = nfsd4_decode_delegreturn, + [OP_GETATTR] = nfsd4_decode_getattr, + [OP_GETFH] = nfsd4_decode_noop, + [OP_LINK] = nfsd4_decode_link, + [OP_LOCK] = nfsd4_decode_lock, + [OP_LOCKT] = nfsd4_decode_lockt, + [OP_LOCKU] = nfsd4_decode_locku, + [OP_LOOKUP] = nfsd4_decode_lookup, + [OP_LOOKUPP] = nfsd4_decode_noop, + [OP_NVERIFY] = nfsd4_decode_verify, + [OP_OPEN] = nfsd4_decode_open, + [OP_OPENATTR] = nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade, + [OP_PUTFH] = nfsd4_decode_putfh, + [OP_PUTPUBFH] = nfsd4_decode_putpubfh, + [OP_PUTROOTFH] = nfsd4_decode_noop, + [OP_READ] = nfsd4_decode_read, + [OP_READDIR] = nfsd4_decode_readdir, + [OP_READLINK] = nfsd4_decode_noop, + [OP_REMOVE] = nfsd4_decode_remove, + [OP_RENAME] = nfsd4_decode_rename, + [OP_RENEW] = nfsd4_decode_renew, + [OP_RESTOREFH] = nfsd4_decode_noop, + [OP_SAVEFH] = nfsd4_decode_noop, + [OP_SECINFO] = nfsd4_decode_secinfo, + [OP_SETATTR] = nfsd4_decode_setattr, + [OP_SETCLIENTID] = nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = nfsd4_decode_verify, + [OP_WRITE] = nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner, /* new operations for NFSv4.1 */ - [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, - [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl, + [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = nfsd4_decode_free_stateid, + [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] = nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp, + [OP_LAYOUTGET] = nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = nfsd4_decode_notsupp, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, - [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, - [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_decode_sequence, + [OP_SET_SSV] = nfsd4_decode_notsupp, + [OP_TEST_STATEID] = nfsd4_decode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid, + [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, - [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read, - [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, - [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, + [OP_ALLOCATE] = nfsd4_decode_fallocate, + [OP_COPY] = nfsd4_decode_copy, + [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify, + [OP_DEALLOCATE] = nfsd4_decode_fallocate, + [OP_IO_ADVISE] = nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status, + [OP_READ_PLUS] = nfsd4_decode_read, + [OP_SEEK] = nfsd4_decode_seek, + [OP_WRITE_SAME] = nfsd4_decode_notsupp, + [OP_CLONE] = nfsd4_decode_clone, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr, - [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr, - [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr, + [OP_GETXATTR] = nfsd4_decode_getxattr, + [OP_SETXATTR] = nfsd4_decode_setxattr, + [OP_LISTXATTRS] = nfsd4_decode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_decode_removexattr, }; static inline bool @@ -3643,8 +3702,10 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) } static __be32 -nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3656,8 +3717,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return 0; } -static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3673,8 +3736,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, } static __be32 -nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &close->cl_stateid); @@ -3682,8 +3747,10 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 -nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3696,8 +3763,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3710,8 +3779,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_getattr *getattr = &u->getattr; struct svc_fh *fhp = getattr->ga_fhp; struct xdr_stream *xdr = resp->xdr; @@ -3720,8 +3791,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct svc_fh **fhpp = &u->getfh; struct xdr_stream *xdr = resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; @@ -3775,8 +3848,10 @@ again: } static __be32 -nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; struct xdr_stream *xdr = resp->xdr; if (!nfserr) @@ -3788,8 +3863,10 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo } static __be32 -nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; struct xdr_stream *xdr = resp->xdr; if (nfserr == nfserr_denied) @@ -3798,8 +3875,10 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l } static __be32 -nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &locku->lu_stateid); @@ -3807,8 +3886,10 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 -nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3821,8 +3902,10 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 -nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3915,16 +3998,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } static __be32 -nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *oc = &u->open_confirm; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); } static __be32 -nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *od = &u->open_downgrade; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &od->od_stateid); @@ -4023,8 +4110,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); unsigned long maxcount; struct xdr_stream *xdr = resp->xdr; @@ -4065,8 +4153,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readlink *readlink = &u->readlink; __be32 *p, *maxcount_p, zero = xdr_zero; struct xdr_stream *xdr = resp->xdr; int length_offset = xdr->buf->len; @@ -4110,8 +4200,10 @@ out_err: } static __be32 -nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; int maxcount; int bytes_left; loff_t offset; @@ -4201,8 +4293,10 @@ err_no_verf: } static __be32 -nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4214,8 +4308,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4297,8 +4393,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); @@ -4306,8 +4403,9 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo_no_name *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); @@ -4318,8 +4416,10 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, * regardless of the error status. */ static __be32 -nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4342,8 +4442,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setclientid *scd = &u->setclientid; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4366,8 +4468,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n } static __be32 -nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4383,8 +4487,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; struct xdr_stream *xdr = resp->xdr; __be32 *p; char *major_id; @@ -4461,8 +4566,9 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4514,8 +4620,9 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4537,8 +4644,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_test_stateid *test_stateid) + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; @@ -4558,8 +4666,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; @@ -4611,8 +4720,9 @@ toosmall: static __be32 nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; __be32 *p; @@ -4638,8 +4748,9 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4659,8 +4770,9 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4745,8 +4857,9 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) static __be32 nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy *copy) + union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; __be32 *p; nfserr = nfsd42_encode_write_res(resp, ©->cp_res, @@ -4762,8 +4875,9 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4813,8 +4927,9 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; int starting_len = xdr->buf->len; @@ -4850,8 +4965,9 @@ out: static __be32 nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4885,8 +5001,9 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_seek *seek) + union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 *p; p = xdr_reserve_space(resp->xdr, 4 + 8); @@ -4897,7 +5014,8 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *p) { return nfserr; } @@ -4948,8 +5066,9 @@ nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) static __be32 nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p, err; @@ -4972,8 +5091,9 @@ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5013,8 +5133,9 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, static __be32 nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; struct xdr_stream *xdr = resp->xdr; u32 cookie_offset, count_offset, eof; u32 left, xdrleft, slen, count; @@ -5124,8 +5245,9 @@ out: static __be32 nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_removexattr *removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5137,7 +5259,7 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, return 0; } -typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); /* * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 @@ -5145,93 +5267,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); * done in the decoding phase. */ static const nfsd4_enc nfsd4_enc_ops[] = { - [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, - [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, - [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, - [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, - [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, - [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, - [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, - [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, - [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, - [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, - [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, - [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, - [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_READ] = (nfsd4_enc)nfsd4_encode_read, - [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, - [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, - [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, - [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, - [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, - [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, - [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, - [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ACCESS] = nfsd4_encode_access, + [OP_CLOSE] = nfsd4_encode_close, + [OP_COMMIT] = nfsd4_encode_commit, + [OP_CREATE] = nfsd4_encode_create, + [OP_DELEGPURGE] = nfsd4_encode_noop, + [OP_DELEGRETURN] = nfsd4_encode_noop, + [OP_GETATTR] = nfsd4_encode_getattr, + [OP_GETFH] = nfsd4_encode_getfh, + [OP_LINK] = nfsd4_encode_link, + [OP_LOCK] = nfsd4_encode_lock, + [OP_LOCKT] = nfsd4_encode_lockt, + [OP_LOCKU] = nfsd4_encode_locku, + [OP_LOOKUP] = nfsd4_encode_noop, + [OP_LOOKUPP] = nfsd4_encode_noop, + [OP_NVERIFY] = nfsd4_encode_noop, + [OP_OPEN] = nfsd4_encode_open, + [OP_OPENATTR] = nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade, + [OP_PUTFH] = nfsd4_encode_noop, + [OP_PUTPUBFH] = nfsd4_encode_noop, + [OP_PUTROOTFH] = nfsd4_encode_noop, + [OP_READ] = nfsd4_encode_read, + [OP_READDIR] = nfsd4_encode_readdir, + [OP_READLINK] = nfsd4_encode_readlink, + [OP_REMOVE] = nfsd4_encode_remove, + [OP_RENAME] = nfsd4_encode_rename, + [OP_RENEW] = nfsd4_encode_noop, + [OP_RESTOREFH] = nfsd4_encode_noop, + [OP_SAVEFH] = nfsd4_encode_noop, + [OP_SECINFO] = nfsd4_encode_secinfo, + [OP_SETATTR] = nfsd4_encode_setattr, + [OP_SETCLIENTID] = nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop, + [OP_VERIFY] = nfsd4_encode_noop, + [OP_WRITE] = nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop, /* NFSv4.1 operations */ - [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = nfsd4_encode_noop, + [OP_FREE_STATEID] = nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = nfsd4_encode_noop, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_noop, + [OP_LAYOUTGET] = nfsd4_encode_noop, + [OP_LAYOUTRETURN] = nfsd4_encode_noop, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, - [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, - [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_encode_sequence, + [OP_SET_SSV] = nfsd4_encode_noop, + [OP_TEST_STATEID] = nfsd4_encode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop, /* NFSv4.2 operations */ - [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, - [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, - [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus, - [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, - [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, - [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ALLOCATE] = nfsd4_encode_noop, + [OP_COPY] = nfsd4_encode_copy, + [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify, + [OP_DEALLOCATE] = nfsd4_encode_noop, + [OP_IO_ADVISE] = nfsd4_encode_noop, + [OP_LAYOUTERROR] = nfsd4_encode_noop, + [OP_LAYOUTSTATS] = nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status, + [OP_READ_PLUS] = nfsd4_encode_read_plus, + [OP_SEEK] = nfsd4_encode_seek, + [OP_WRITE_SAME] = nfsd4_encode_noop, + [OP_CLONE] = nfsd4_encode_noop, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr, - [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr, - [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr, + [OP_GETXATTR] = nfsd4_encode_getxattr, + [OP_SETXATTR] = nfsd4_encode_setxattr, + [OP_LISTXATTRS] = nfsd4_encode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_encode_removexattr, }; /* From ce606d5334c2abd772bac18c5ee83f3dd82f2a11 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 7 Jan 2023 10:15:35 -0500 Subject: [PATCH 176/216] NFSD: Use set_bit(RQ_DROPME) [ Upstream commit 5304930dbae82d259bcf7e5611db7c81e7a42eff ] The premise that "Once an svc thread is scheduled and executing an RPC, no other processes will touch svc_rqst::rq_flags" is false. svc_xprt_enqueue() examines the RQ_BUSY flag in scheduled nfsd threads when determining which thread to wake up next. Fixes: 9315564747cb ("NFSD: Use only RQ_DROPME to signal the need to drop a reply") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsproc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a5570cf75f3f..9744443c3965 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -211,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - __set_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } @@ -246,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - __set_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } From c479755cb80a85bbd7569fa7a7e133a66f792a31 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 11 Jan 2023 12:17:09 -0800 Subject: [PATCH 177/216] NFSD: register/unregister of nfsd-client shrinker at nfsd startup/shutdown time [ Upstream commit f385f7d244134246f984975ed34cd75f77de479f ] Currently the nfsd-client shrinker is registered and unregistered at the time the nfsd module is loaded and unloaded. The problem with this is the shrinker is being registered before all of the relevant fields in nfsd_net are initialized when nfsd is started. This can lead to an oops when memory is low and the shrinker is called while nfsd is not running. This patch moves the register/unregister of nfsd-client shrinker from module load/unload time to nfsd startup/shutdown time. Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") Reported-by: Mike Galbraith Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 22 +++++++++++----------- fs/nfsd/nfsctl.c | 7 +------ fs/nfsd/nfsd.h | 6 ++---- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d8829fa53fda..02a3629ba307 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4422,7 +4422,7 @@ nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) return SHRINK_STOP; } -int +void nfsd4_init_leases_net(struct nfsd_net *nn) { struct sysinfo si; @@ -4444,16 +4444,6 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); atomic_set(&nn->nfsd_courtesy_clients, 0); - nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; - nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; - nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; - return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); -} - -void -nfsd4_leases_net_shutdown(struct nfsd_net *nn) -{ - unregister_shrinker(&nn->nfsd_client_shrinker); } static void init_nfs4_replay(struct nfs4_replay *rp) @@ -8099,8 +8089,17 @@ static int nfs4_state_create_net(struct net *net) INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); + nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; + + if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client")) + goto err_shrinker; return 0; +err_shrinker: + put_net(net); + kfree(nn->sessionid_hashtbl); err_sessionid: kfree(nn->unconf_id_hashtbl); err_unconf_id: @@ -8193,6 +8192,7 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unregister_shrinker(&nn->nfsd_client_shrinker); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index a8884d0b4638..76a60e7a7509 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1452,9 +1452,7 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; - retval = nfsd4_init_leases_net(nn); - if (retval) - goto out_drc_error; + nfsd4_init_leases_net(nn); retval = nfsd_reply_cache_init(nn); if (retval) goto out_cache_error; @@ -1464,8 +1462,6 @@ static __net_init int nfsd_init_net(struct net *net) return 0; out_cache_error: - nfsd4_leases_net_shutdown(nn); -out_drc_error: nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); @@ -1481,7 +1477,6 @@ static __net_exit void nfsd_exit_net(struct net *net) nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); - nfsd4_leases_net_shutdown(nn); } static struct pernet_operations nfsd_net_ops = { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 93b42ef9ed91..fa0144a74267 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -504,8 +504,7 @@ extern void unregister_cld_notifier(void); extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); #endif -extern int nfsd4_init_leases_net(struct nfsd_net *nn); -extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn); +extern void nfsd4_init_leases_net(struct nfsd_net *nn); #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) @@ -513,8 +512,7 @@ static inline int nfsd4_is_junction(struct dentry *dentry) return 0; } -static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; }; -static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {}; +static inline void nfsd4_init_leases_net(struct nfsd_net *nn) { }; #define register_cld_notifier() 0 #define unregister_cld_notifier() do { } while(0) From f3ea5ec83d1a827f074b2b660749817e0bf2b23e Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 11 Jan 2023 16:06:51 -0800 Subject: [PATCH 178/216] NFSD: replace delayed_work with work_struct for nfsd_client_shrinker [ Upstream commit 7c24fa225081f31bc6da6a355c1ba801889ab29a ] Since nfsd4_state_shrinker_count always calls mod_delayed_work with 0 delay, we can replace delayed_work with work_struct to save some space and overhead. Also add the call to cancel_work after unregister the shrinker in nfs4_state_shutdown_net. Signed-off-by: Dai Ngo Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/netns.h | 2 +- fs/nfsd/nfs4state.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 8c854ba3285b..51a4b7885cae 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -195,7 +195,7 @@ struct nfsd_net { atomic_t nfsd_courtesy_clients; struct shrinker nfsd_client_shrinker; - struct delayed_work nfsd_shrinker_work; + struct work_struct nfsd_shrinker_work; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 02a3629ba307..a6fcde4f7b13 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4412,7 +4412,7 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) if (!count) count = atomic_long_read(&num_delegations); if (count) - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); + queue_work(laundry_wq, &nn->nfsd_shrinker_work); return (unsigned long)count; } @@ -6253,8 +6253,7 @@ deleg_reaper(struct nfsd_net *nn) static void nfsd4_state_shrinker_worker(struct work_struct *work) { - struct delayed_work *dwork = to_delayed_work(work); - struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + struct nfsd_net *nn = container_of(work, struct nfsd_net, nfsd_shrinker_work); courtesy_client_reaper(nn); @@ -8086,7 +8085,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); - INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); + INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; @@ -8193,6 +8192,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); unregister_shrinker(&nn->nfsd_client_shrinker); + cancel_work(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); From 56587affe21c5cd806523a89efd8da5b49872a72 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 11 Feb 2023 07:50:08 -0500 Subject: [PATCH 179/216] nfsd: don't destroy global nfs4_file table in per-net shutdown [ Upstream commit 4102db175b5d884d133270fdbd0e59111ce688fc ] The nfs4_file table is global, so shutting it down when a containerized nfsd is shut down is wrong and can lead to double-frees. Tear down the nfs4_file_rhltable in nfs4_state_shutdown instead of nfs4_state_shutdown_net. Fixes: d47b295e8d76 ("NFSD: Use rhashtable for managing nfs4_file objects") Link: https://bugzilla.redhat.com/show_bug.cgi?id=2169017 Reported-by: JianHong Yin Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a6fcde4f7b13..b9d694ec25d1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8212,7 +8212,6 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif @@ -8222,6 +8221,7 @@ void nfs4_state_shutdown(void) { nfsd4_destroy_callback_queue(); + rhltable_destroy(&nfs4_file_rhltable); } static void From e58f2862e9fe500b073d20f94e73abc52fb70634 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:39 +0100 Subject: [PATCH 180/216] arm64: efi: Limit allocations to 48-bit addressable physical region From: Ard Biesheuvel [ Commit a37dac5c5dcfe0f1fd58513c16cdbc280a47f628 upstream ] The UEFI spec does not mention or reason about the configured size of the virtual address space at all, but it does mention that all memory should be identity mapped using a page size of 4 KiB. This means that a LPA2 capable system that has any system memory outside of the 48-bit addressable physical range and follows the spec to the letter may serve page allocation requests from regions of memory that the kernel cannot access unless it was built with LPA2 support and enables it at runtime. So let's ensure that all page allocations are limited to the 48-bit range. Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/efi.h | 1 + drivers/firmware/efi/libstub/alignedmem.c | 2 ++ drivers/firmware/efi/libstub/arm64-stub.c | 5 +++-- drivers/firmware/efi/libstub/efistub.h | 4 ++++ drivers/firmware/efi/libstub/mem.c | 2 ++ drivers/firmware/efi/libstub/randomalloc.c | 2 +- 6 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 62c846be2d76..a75c0772ecfc 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -103,6 +103,7 @@ static inline void free_screen_info(struct screen_info *si) } #define EFI_ALLOC_ALIGN SZ_64K +#define EFI_ALLOC_LIMIT ((1UL << 48) - 1) /* * On ARM systems, virtually remapped UEFI runtime services are set up in two diff --git a/drivers/firmware/efi/libstub/alignedmem.c b/drivers/firmware/efi/libstub/alignedmem.c index 174832661251..6b83c492c3b8 100644 --- a/drivers/firmware/efi/libstub/alignedmem.c +++ b/drivers/firmware/efi/libstub/alignedmem.c @@ -29,6 +29,8 @@ efi_status_t efi_allocate_pages_aligned(unsigned long size, unsigned long *addr, efi_status_t status; int slack; + max = min(max, EFI_ALLOC_LIMIT); + if (align < EFI_ALLOC_ALIGN) align = EFI_ALLOC_ALIGN; diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 08f46c072da5..40275c3131c8 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -191,10 +191,11 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, if (status != EFI_SUCCESS) { if (!check_image_region((u64)_text, kernel_memsize)) { efi_err("FIRMWARE BUG: Image BSS overlaps adjacent EFI memory region\n"); - } else if (IS_ALIGNED((u64)_text, min_kimg_align)) { + } else if (IS_ALIGNED((u64)_text, min_kimg_align) && + (u64)_end < EFI_ALLOC_LIMIT) { /* * Just execute from wherever we were loaded by the - * UEFI PE/COFF loader if the alignment is suitable. + * UEFI PE/COFF loader if the placement is suitable. */ *image_addr = (u64)_text; *reserve_size = 0; diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index ab505b07e626..002f02a6d359 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -29,6 +29,10 @@ #define EFI_ALLOC_ALIGN EFI_PAGE_SIZE #endif +#ifndef EFI_ALLOC_LIMIT +#define EFI_ALLOC_LIMIT ULONG_MAX +#endif + extern bool efi_nochunk; extern bool efi_nokaslr; extern int efi_loglevel; diff --git a/drivers/firmware/efi/libstub/mem.c b/drivers/firmware/efi/libstub/mem.c index 03d147f17185..4f1fa302234d 100644 --- a/drivers/firmware/efi/libstub/mem.c +++ b/drivers/firmware/efi/libstub/mem.c @@ -89,6 +89,8 @@ efi_status_t efi_allocate_pages(unsigned long size, unsigned long *addr, efi_physical_addr_t alloc_addr; efi_status_t status; + max = min(max, EFI_ALLOC_LIMIT); + if (EFI_ALLOC_ALIGN > EFI_PAGE_SIZE) return efi_allocate_pages_aligned(size, addr, max, EFI_ALLOC_ALIGN, diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index ec44bb7e092f..1692d19ae80f 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -29,7 +29,7 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, return 0; region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1, - (u64)ULONG_MAX); + (u64)EFI_ALLOC_LIMIT); if (region_end < size) return 0; From 33d064aecd89846d5cf284ab75eeb9098b5ff49e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:40 +0100 Subject: [PATCH 181/216] efi: efivars: prevent double registration From: Johan Hovold [ Commit 0217a40d7ba6e71d7f3422fbe89b436e8ee7ece7 upstream ] Add the missing sanity check to efivars_register() so that it is no longer possible to override an already registered set of efivar ops (without first deregistering them). This can help debug initialisation ordering issues where drivers have so far unknowingly been relying on overriding the generic ops. Signed-off-by: Johan Hovold Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/vars.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 0ba9f18312f5..4ca256bcd697 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -66,19 +66,28 @@ int efivars_register(struct efivars *efivars, const struct efivar_operations *ops, struct kobject *kobject) { + int rv; + if (down_interruptible(&efivars_lock)) return -EINTR; + if (__efivars) { + pr_warn("efivars already registered\n"); + rv = -EBUSY; + goto out; + } + efivars->ops = ops; efivars->kobject = kobject; __efivars = efivars; pr_info("Registered efivars operations\n"); - + rv = 0; +out: up(&efivars_lock); - return 0; + return rv; } EXPORT_SYMBOL_GPL(efivars_register); From f0acafd6f79fa6068b7fc4af7980ac9bbd14f1d1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:41 +0100 Subject: [PATCH 182/216] x86/efistub: Simplify and clean up handover entry code From: Ard Biesheuvel [ Commit df9215f15206c2a81909ccf60f21d170801dce38 upstream ] Now that the EFI entry code in assembler is only used by the optional and deprecated EFI handover protocol, and given that the EFI stub C code no longer returns to it, most of it can simply be dropped. While at it, clarify the symbol naming, by merging efi_main() and efi_stub_entry(), making the latter the shared entry point for all different boot modes that enter via the EFI stub. The efi32_stub_entry() and efi64_stub_entry() names are referenced explicitly by the tooling that populates the setup header, so these must be retained, but can be emitted as aliases of efi_stub_entry() where appropriate. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-5-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- Documentation/x86/boot.rst | 2 +- arch/x86/boot/compressed/efi_mixed.S | 22 ++++++++++++---------- arch/x86/boot/compressed/head_32.S | 11 ----------- arch/x86/boot/compressed/head_64.S | 12 ++---------- drivers/firmware/efi/libstub/x86-stub.c | 20 ++++++++++++++++---- 5 files changed, 31 insertions(+), 36 deletions(-) diff --git a/Documentation/x86/boot.rst b/Documentation/x86/boot.rst index 894a19897005..bac3789f3e8f 100644 --- a/Documentation/x86/boot.rst +++ b/Documentation/x86/boot.rst @@ -1416,7 +1416,7 @@ execution context provided by the EFI firmware. The function prototype for the handover entry point looks like this:: - efi_main(void *handle, efi_system_table_t *table, struct boot_params *bp) + efi_stub_entry(void *handle, efi_system_table_t *table, struct boot_params *bp) 'handle' is the EFI image handle passed to the boot loader by the EFI firmware, 'table' is the EFI system table - these are the first two diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index 8b02e507d3bb..d05f0250bbbc 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -26,8 +26,8 @@ * When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode() * is the first thing that runs after switching to long mode. Depending on * whether the EFI handover protocol or the compat entry point was used to - * enter the kernel, it will either branch to the 64-bit EFI handover - * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF + * enter the kernel, it will either branch to the common 64-bit EFI stub + * entrypoint efi_stub_entry() directly, or via the 64-bit EFI PE/COFF * entrypoint efi_pe_entry(). In the former case, the bootloader must provide a * struct bootparams pointer as the third argument, so the presence of such a * pointer is used to disambiguate. @@ -37,21 +37,23 @@ * | efi32_pe_entry |---->| | | +-----------+--+ * +------------------+ | | +------+----------------+ | * | startup_32 |---->| startup_64_mixed_mode | | - * +------------------+ | | +------+----------------+ V - * | efi32_stub_entry |---->| | | +------------------+ - * +------------------+ +------------+ +---->| efi64_stub_entry | - * +-------------+----+ - * +------------+ +----------+ | - * | startup_64 |<----| efi_main |<--------------+ - * +------------+ +----------+ + * +------------------+ | | +------+----------------+ | + * | efi32_stub_entry |---->| | | | + * +------------------+ +------------+ | | + * V | + * +------------+ +----------------+ | + * | startup_64 |<----| efi_stub_entry |<--------+ + * +------------+ +----------------+ */ SYM_FUNC_START(startup_64_mixed_mode) lea efi32_boot_args(%rip), %rdx mov 0(%rdx), %edi mov 4(%rdx), %esi +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL mov 8(%rdx), %edx // saved bootparams pointer test %edx, %edx - jnz efi64_stub_entry + jnz efi_stub_entry +#endif /* * efi_pe_entry uses MS calling convention, which requires 32 bytes of * shadow space on the stack even if all arguments are passed in diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3ecc1bbe971e..3af4a383615b 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -150,17 +150,6 @@ SYM_FUNC_START(startup_32) jmp *%eax SYM_FUNC_END(startup_32) -#ifdef CONFIG_EFI_STUB -SYM_FUNC_START(efi32_stub_entry) - add $0x4, %esp - movl 8(%esp), %esi /* save boot_params pointer */ - call efi_main - /* efi_main returns the possibly relocated address of startup_32 */ - jmp *%eax -SYM_FUNC_END(efi32_stub_entry) -SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry) -#endif - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fafd0a59f396..d4ccae574c4f 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -474,19 +474,11 @@ SYM_CODE_START(startup_64) jmp *%rax SYM_CODE_END(startup_64) -#ifdef CONFIG_EFI_STUB -#ifdef CONFIG_EFI_HANDOVER_PROTOCOL +#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL) .org 0x390 -#endif SYM_FUNC_START(efi64_stub_entry) - and $~0xf, %rsp /* realign the stack */ - movq %rdx, %rbx /* save boot_params pointer */ - call efi_main - movq %rbx,%rsi - leaq rva(startup_64)(%rax), %rax - jmp *%rax + jmp efi_stub_entry SYM_FUNC_END(efi64_stub_entry) -SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry) #endif .text diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 9422fddfbc8f..9661d5a5769e 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -774,9 +774,9 @@ static void __noreturn enter_kernel(unsigned long kernel_addr, * return. On failure, it will exit to the firmware via efi_exit() instead of * returning. */ -asmlinkage unsigned long efi_main(efi_handle_t handle, - efi_system_table_t *sys_table_arg, - struct boot_params *boot_params) +void __noreturn efi_stub_entry(efi_handle_t handle, + efi_system_table_t *sys_table_arg, + struct boot_params *boot_params) { unsigned long bzimage_addr = (unsigned long)startup_32; unsigned long buffer_start, buffer_end; @@ -919,7 +919,19 @@ asmlinkage unsigned long efi_main(efi_handle_t handle, enter_kernel(bzimage_addr, boot_params); fail: - efi_err("efi_main() failed!\n"); + efi_err("efi_stub_entry() failed!\n"); efi_exit(handle, status); } + +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL +#ifndef CONFIG_EFI_MIXED +extern __alias(efi_stub_entry) +void efi32_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, + struct boot_params *boot_params); + +extern __alias(efi_stub_entry) +void efi64_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, + struct boot_params *boot_params); +#endif +#endif From 1f3fd81bff03355c3acc8558c3c4da2f2d4e1d18 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:42 +0100 Subject: [PATCH 183/216] x86/decompressor: Avoid magic offsets for EFI handover entrypoint From: Ard Biesheuvel [ Commit 12792064587623065250069d1df980e2c9ac3e67 upstream ] The native 32-bit or 64-bit EFI handover protocol entrypoint offset relative to the respective startup_32/64 address is described in boot_params as handover_offset, so that the special Linux/x86 aware EFI loader can find it there. When mixed mode is enabled, this single field has to describe this offset for both the 32-bit and 64-bit entrypoints, so their respective relative offsets have to be identical. Given that startup_32 and startup_64 are 0x200 bytes apart, and the EFI handover entrypoint resides at a fixed offset, the 32-bit and 64-bit versions of those entrypoints must be exactly 0x200 bytes apart as well. Currently, hard-coded fixed offsets are used to ensure this, but it is sufficient to emit the 64-bit entrypoint 0x200 bytes after the 32-bit one, wherever it happens to reside. This allows this code (which is now EFI mixed mode specific) to be moved into efi_mixed.S and out of the startup code in head_64.S. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-6-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/efi_mixed.S | 20 +++++++++++++++++++- arch/x86/boot/compressed/head_64.S | 18 ------------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index d05f0250bbbc..deb36129e3a9 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -146,6 +146,16 @@ SYM_FUNC_START(__efi64_thunk) SYM_FUNC_END(__efi64_thunk) .code32 +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL +SYM_FUNC_START(efi32_stub_entry) + add $0x4, %esp /* Discard return address */ + popl %ecx + popl %edx + popl %esi + jmp efi32_entry +SYM_FUNC_END(efi32_stub_entry) +#endif + /* * EFI service pointer must be in %edi. * @@ -226,7 +236,7 @@ SYM_FUNC_END(efi_enter32) * stub may still exit and return to the firmware using the Exit() EFI boot * service.] */ -SYM_FUNC_START(efi32_entry) +SYM_FUNC_START_LOCAL(efi32_entry) call 1f 1: pop %ebx @@ -326,6 +336,14 @@ SYM_FUNC_START(efi32_pe_entry) RET SYM_FUNC_END(efi32_pe_entry) +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL + .org efi32_stub_entry + 0x200 + .code64 +SYM_FUNC_START_NOALIGN(efi64_stub_entry) + jmp efi_stub_entry +SYM_FUNC_END(efi64_stub_entry) +#endif + .section ".rodata" /* EFI loaded image protocol GUID */ .balign 4 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d4ccae574c4f..9a0d83b4d266 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -286,17 +286,6 @@ SYM_FUNC_START(startup_32) lret SYM_FUNC_END(startup_32) -#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL) - .org 0x190 -SYM_FUNC_START(efi32_stub_entry) - add $0x4, %esp /* Discard return address */ - popl %ecx - popl %edx - popl %esi - jmp efi32_entry -SYM_FUNC_END(efi32_stub_entry) -#endif - .code64 .org 0x200 SYM_CODE_START(startup_64) @@ -474,13 +463,6 @@ SYM_CODE_START(startup_64) jmp *%rax SYM_CODE_END(startup_64) -#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL) - .org 0x390 -SYM_FUNC_START(efi64_stub_entry) - jmp efi_stub_entry -SYM_FUNC_END(efi64_stub_entry) -#endif - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) From 34378d7ad273ff859c1ed9ab77bb71e55f652b06 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:43 +0100 Subject: [PATCH 184/216] x86/efistub: Clear BSS in EFI handover protocol entrypoint From: Ard Biesheuvel [ Commit d7156b986d4cc0657fa6dc05c9fcf51c3d55a0fe upstream ] The so-called EFI handover protocol is value-add from the distros that permits a loader to simply copy a PE kernel image into memory and call an alternative entrypoint that is described by an embedded boot_params structure. Most implementations of this protocol do not bother to check the PE header for minimum alignment, section placement, etc, and therefore also don't clear the image's BSS, or even allocate enough memory for it. Allocating more memory on the fly is rather difficult, but at least clear the BSS region explicitly when entering in this manner, so that the EFI stub code does not get confused by global variables that were not zero-initialized correctly. When booting in mixed mode, this BSS clearing must occur before any global state is created, so clear it in the 32-bit asm entry point. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-7-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/efi_mixed.S | 14 +++++++++++++- drivers/firmware/efi/libstub/x86-stub.c | 13 +++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index deb36129e3a9..d6d1b76b594d 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -148,6 +148,18 @@ SYM_FUNC_END(__efi64_thunk) .code32 #ifdef CONFIG_EFI_HANDOVER_PROTOCOL SYM_FUNC_START(efi32_stub_entry) + call 1f +1: popl %ecx + + /* Clear BSS */ + xorl %eax, %eax + leal (_bss - 1b)(%ecx), %edi + leal (_ebss - 1b)(%ecx), %ecx + subl %edi, %ecx + shrl $2, %ecx + cld + rep stosl + add $0x4, %esp /* Discard return address */ popl %ecx popl %edx @@ -340,7 +352,7 @@ SYM_FUNC_END(efi32_pe_entry) .org efi32_stub_entry + 0x200 .code64 SYM_FUNC_START_NOALIGN(efi64_stub_entry) - jmp efi_stub_entry + jmp efi_handover_entry SYM_FUNC_END(efi64_stub_entry) #endif diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 9661d5a5769e..764bac6b58f9 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -925,12 +925,21 @@ fail: } #ifdef CONFIG_EFI_HANDOVER_PROTOCOL +void efi_handover_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, + struct boot_params *boot_params) +{ + extern char _bss[], _ebss[]; + + memset(_bss, 0, _ebss - _bss); + efi_stub_entry(handle, sys_table_arg, boot_params); +} + #ifndef CONFIG_EFI_MIXED -extern __alias(efi_stub_entry) +extern __alias(efi_handover_entry) void efi32_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, struct boot_params *boot_params); -extern __alias(efi_stub_entry) +extern __alias(efi_handover_entry) void efi64_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, struct boot_params *boot_params); #endif From 8ff6d88c0443acdd4199aacb69f1dd4a24120e8e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:45 +0100 Subject: [PATCH 185/216] efi/libstub: Add memory attribute protocol definitions From: Evgeniy Baskov [ Commit 79729f26b074a5d2722c27fa76cc45ef721e65cd upstream ] EFI_MEMORY_ATTRIBUTE_PROTOCOL servers as a better alternative to DXE services for setting memory attributes in EFI Boot Services environment. This protocol is better since it is a part of UEFI specification itself and not UEFI PI specification like DXE services. Add EFI_MEMORY_ATTRIBUTE_PROTOCOL definitions. Support mixed mode properly for its calls. Tested-by: Mario Limonciello Signed-off-by: Evgeniy Baskov Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/efi.h | 7 +++++++ drivers/firmware/efi/libstub/efistub.h | 20 ++++++++++++++++++++ include/linux/efi.h | 1 + 3 files changed, 28 insertions(+) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 233ae6986d6f..522ff2e443b3 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -325,6 +325,13 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \ (__efi64_split(phys), __efi64_split(size), __efi64_split(flags)) +/* Memory Attribute Protocol */ +#define __efi64_argmap_set_memory_attributes(protocol, phys, size, flags) \ + ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags)) + +#define __efi64_argmap_clear_memory_attributes(protocol, phys, size, flags) \ + ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 002f02a6d359..6f5a1a16db15 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -419,6 +419,26 @@ union efi_dxe_services_table { } mixed_mode; }; +typedef union efi_memory_attribute_protocol efi_memory_attribute_protocol_t; + +union efi_memory_attribute_protocol { + struct { + efi_status_t (__efiapi *get_memory_attributes)( + efi_memory_attribute_protocol_t *, efi_physical_addr_t, u64, u64 *); + + efi_status_t (__efiapi *set_memory_attributes)( + efi_memory_attribute_protocol_t *, efi_physical_addr_t, u64, u64); + + efi_status_t (__efiapi *clear_memory_attributes)( + efi_memory_attribute_protocol_t *, efi_physical_addr_t, u64, u64); + }; + struct { + u32 get_memory_attributes; + u32 set_memory_attributes; + u32 clear_memory_attributes; + } mixed_mode; +}; + typedef union efi_uga_draw_protocol efi_uga_draw_protocol_t; union efi_uga_draw_protocol { diff --git a/include/linux/efi.h b/include/linux/efi.h index 4e1bfee9675d..de6d6558a4d3 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -390,6 +390,7 @@ void efi_native_runtime_setup(void); #define EFI_RT_PROPERTIES_TABLE_GUID EFI_GUID(0xeb66918a, 0x7eef, 0x402a, 0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9) #define EFI_DXE_SERVICES_TABLE_GUID EFI_GUID(0x05ad34ba, 0x6f02, 0x4214, 0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9) #define EFI_SMBIOS_PROTOCOL_GUID EFI_GUID(0x03583ff6, 0xcb36, 0x4940, 0x94, 0x7e, 0xb9, 0xb3, 0x9f, 0x4a, 0xfa, 0xf7) +#define EFI_MEMORY_ATTRIBUTE_PROTOCOL_GUID EFI_GUID(0xf4560cf6, 0x40ec, 0x4b4a, 0xa1, 0x92, 0xbf, 0x1d, 0x57, 0xd0, 0xb1, 0x89) #define EFI_IMAGE_SECURITY_DATABASE_GUID EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f) #define EFI_SHIM_LOCK_GUID EFI_GUID(0x605dab50, 0xe046, 0x4300, 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23) From 476a48cd37c948b160cc3d5ff5b4d2e711f1ca36 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:46 +0100 Subject: [PATCH 186/216] efi/libstub: Add limit argument to efi_random_alloc() From: Ard Biesheuvel [ Commit bc5ddceff4c14494d83449ad45c985e6cd353fce upstream ] x86 will need to limit the kernel memory allocation to the lowest 512 MiB of memory, to match the behavior of the existing bare metal KASLR physical randomization logic. So in preparation for that, add a limit parameter to efi_random_alloc() and wire it up. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-22-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/arm64-stub.c | 2 +- drivers/firmware/efi/libstub/efistub.h | 2 +- drivers/firmware/efi/libstub/randomalloc.c | 10 ++++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 40275c3131c8..16377b452119 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -181,7 +181,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, */ status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed, - EFI_LOADER_CODE); + EFI_LOADER_CODE, EFI_ALLOC_LIMIT); if (status != EFI_SUCCESS) efi_warn("efi_random_alloc() failed: 0x%lx\n", status); } else { diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 6f5a1a16db15..8a343ea1231a 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -905,7 +905,7 @@ efi_status_t efi_get_random_bytes(unsigned long size, u8 *out); efi_status_t efi_random_alloc(unsigned long size, unsigned long align, unsigned long *addr, unsigned long random_seed, - int memory_type); + int memory_type, unsigned long alloc_limit); efi_status_t efi_random_get_seed(void); diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 1692d19ae80f..ed6f6087a9ea 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -16,7 +16,8 @@ */ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, unsigned long size, - unsigned long align_shift) + unsigned long align_shift, + u64 alloc_limit) { unsigned long align = 1UL << align_shift; u64 first_slot, last_slot, region_end; @@ -29,7 +30,7 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, return 0; region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1, - (u64)EFI_ALLOC_LIMIT); + alloc_limit); if (region_end < size) return 0; @@ -54,7 +55,8 @@ efi_status_t efi_random_alloc(unsigned long size, unsigned long align, unsigned long *addr, unsigned long random_seed, - int memory_type) + int memory_type, + unsigned long alloc_limit) { unsigned long total_slots = 0, target_slot; unsigned long total_mirrored_slots = 0; @@ -76,7 +78,7 @@ efi_status_t efi_random_alloc(unsigned long size, efi_memory_desc_t *md = (void *)map->map + map_offset; unsigned long slots; - slots = get_entry_num_slots(md, size, ilog2(align)); + slots = get_entry_num_slots(md, size, ilog2(align), alloc_limit); MD_NUM_SLOTS(md) = slots; total_slots += slots; if (md->attribute & EFI_MEMORY_MORE_RELIABLE) From 350265a753d8b39e2bb11660f2109c8dd5306b45 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:47 +0100 Subject: [PATCH 187/216] x86/efistub: Perform 4/5 level paging switch from the stub From: Ard Biesheuvel [ Commit cb380000dd23cbbf8bd7d023b51896804c1f7e68 upstream ] In preparation for updating the EFI stub boot flow to avoid the bare metal decompressor code altogether, implement the support code for switching between 4 and 5 levels of paging before jumping to the kernel proper. This reuses the newly refactored trampoline that the bare metal decompressor uses, but relies on EFI APIs to allocate 32-bit addressable memory and remap it with the appropriate permissions. Given that the bare metal decompressor will no longer call into the trampoline if the number of paging levels is already set correctly, it is no longer needed to remove NX restrictions from the memory range where this trampoline may end up. Acked-by: Kirill A. Shutemov Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/Makefile | 1 + .../firmware/efi/libstub/efi-stub-helper.c | 2 + drivers/firmware/efi/libstub/efistub.h | 1 + drivers/firmware/efi/libstub/x86-5lvl.c | 95 +++++++++++++++++++ drivers/firmware/efi/libstub/x86-stub.c | 40 +++----- drivers/firmware/efi/libstub/x86-stub.h | 17 ++++ 6 files changed, 130 insertions(+), 26 deletions(-) create mode 100644 drivers/firmware/efi/libstub/x86-5lvl.c create mode 100644 drivers/firmware/efi/libstub/x86-stub.h diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index b6e1dcb98a64..473ef18421db 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -84,6 +84,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB) += efi-stub.o string.o intrinsics.o systable.o lib-$(CONFIG_ARM) += arm32-stub.o lib-$(CONFIG_ARM64) += arm64-stub.o smbios.o lib-$(CONFIG_X86) += x86-stub.o +lib-$(CONFIG_X86_64) += x86-5lvl.o lib-$(CONFIG_RISCV) += riscv-stub.o lib-$(CONFIG_LOONGARCH) += loongarch-stub.o diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 3d9b2469a0df..97744822dd95 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -216,6 +216,8 @@ efi_status_t efi_parse_options(char const *cmdline) efi_loglevel = CONSOLE_LOGLEVEL_QUIET; } else if (!strcmp(param, "noinitrd")) { efi_noinitrd = true; + } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) { + efi_no5lvl = true; } else if (!strcmp(param, "efi") && val) { efi_nochunk = parse_option_str(val, "nochunk"); efi_novamap |= parse_option_str(val, "novamap"); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 8a343ea1231a..4b4055877f3d 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -33,6 +33,7 @@ #define EFI_ALLOC_LIMIT ULONG_MAX #endif +extern bool efi_no5lvl; extern bool efi_nochunk; extern bool efi_nokaslr; extern int efi_loglevel; diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c new file mode 100644 index 000000000000..479dd445acdc --- /dev/null +++ b/drivers/firmware/efi/libstub/x86-5lvl.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include +#include +#include + +#include "efistub.h" +#include "x86-stub.h" + +bool efi_no5lvl; + +static void (*la57_toggle)(void *cr3); + +static const struct desc_struct gdt[] = { + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), +}; + +/* + * Enabling (or disabling) 5 level paging is tricky, because it can only be + * done from 32-bit mode with paging disabled. This means not only that the + * code itself must be running from 32-bit addressable physical memory, but + * also that the root page table must be 32-bit addressable, as programming + * a 64-bit value into CR3 when running in 32-bit mode is not supported. + */ +efi_status_t efi_setup_5level_paging(void) +{ + u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src; + efi_status_t status; + u8 *la57_code; + + if (!efi_is_64bit()) + return EFI_SUCCESS; + + /* check for 5 level paging support */ + if (native_cpuid_eax(0) < 7 || + !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return EFI_SUCCESS; + + /* allocate some 32-bit addressable memory for code and a page table */ + status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code, + U32_MAX); + if (status != EFI_SUCCESS) + return status; + + la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size); + memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size); + + /* + * To avoid the need to allocate a 32-bit addressable stack, the + * trampoline uses a LJMP instruction to switch back to long mode. + * LJMP takes an absolute destination address, which needs to be + * fixed up at runtime. + */ + *(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code; + + efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE); + + return EFI_SUCCESS; +} + +void efi_5level_switch(void) +{ + bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl; + bool have_la57 = native_read_cr4() & X86_CR4_LA57; + bool need_toggle = want_la57 ^ have_la57; + u64 *pgt = (void *)la57_toggle + PAGE_SIZE; + u64 *cr3 = (u64 *)__native_read_cr3(); + u64 *new_cr3; + + if (!la57_toggle || !need_toggle) + return; + + if (!have_la57) { + /* + * 5 level paging will be enabled, so a root level page needs + * to be allocated from the 32-bit addressable physical region, + * with its first entry referring to the existing hierarchy. + */ + new_cr3 = memset(pgt, 0, PAGE_SIZE); + new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC; + } else { + /* take the new root table pointer from the current entry #0 */ + new_cr3 = (u64 *)(cr3[0] & PAGE_MASK); + + /* copy the new root table if it is not 32-bit addressable */ + if ((u64)new_cr3 > U32_MAX) + new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE); + } + + native_load_gdt(&(struct desc_ptr){ sizeof(gdt) - 1, (u64)gdt }); + + la57_toggle(new_cr3); +} diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 764bac6b58f9..adaddd38d97d 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -17,6 +17,7 @@ #include #include "efistub.h" +#include "x86-stub.h" /* Maximum physical address for 64-bit kernel with 4-level paging */ #define MAXMEM_X86_64_4LEVEL (1ull << 46) @@ -212,8 +213,8 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) } } -static void -adjust_memory_range_protection(unsigned long start, unsigned long size) +void efi_adjust_memory_range_protection(unsigned long start, + unsigned long size) { efi_status_t status; efi_gcd_memory_space_desc_t desc; @@ -267,35 +268,14 @@ adjust_memory_range_protection(unsigned long start, unsigned long size) } } -/* - * Trampoline takes 2 pages and can be loaded in first megabyte of memory - * with its end placed between 128k and 640k where BIOS might start. - * (see arch/x86/boot/compressed/pgtable_64.c) - * - * We cannot find exact trampoline placement since memory map - * can be modified by UEFI, and it can alter the computed address. - */ - -#define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024) -#define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024) - extern const u8 startup_32[], startup_64[]; static void setup_memory_protection(unsigned long image_base, unsigned long image_size) { - /* - * Allow execution of possible trampoline used - * for switching between 4- and 5-level page tables - * and relocated kernel image. - */ - - adjust_memory_range_protection(TRAMPOLINE_PLACEMENT_BASE, - TRAMPOLINE_PLACEMENT_SIZE); - #ifdef CONFIG_64BIT if (image_base != (unsigned long)startup_32) - adjust_memory_range_protection(image_base, image_size); + efi_adjust_memory_range_protection(image_base, image_size); #else /* * Clear protection flags on a whole range of possible @@ -305,8 +285,8 @@ setup_memory_protection(unsigned long image_base, unsigned long image_size) * need to remove possible protection on relocated image * itself disregarding further relocations. */ - adjust_memory_range_protection(LOAD_PHYSICAL_ADDR, - KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR); + efi_adjust_memory_range_protection(LOAD_PHYSICAL_ADDR, + KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR); #endif } @@ -796,6 +776,12 @@ void __noreturn efi_stub_entry(efi_handle_t handle, efi_dxe_table = NULL; } + status = efi_setup_5level_paging(); + if (status != EFI_SUCCESS) { + efi_err("efi_setup_5level_paging() failed!\n"); + goto fail; + } + /* * If the kernel isn't already loaded at a suitable address, * relocate it. @@ -914,6 +900,8 @@ void __noreturn efi_stub_entry(efi_handle_t handle, goto fail; } + efi_5level_switch(); + if (IS_ENABLED(CONFIG_X86_64)) bzimage_addr += startup_64 - startup_32; diff --git a/drivers/firmware/efi/libstub/x86-stub.h b/drivers/firmware/efi/libstub/x86-stub.h new file mode 100644 index 000000000000..37c5a36b9d8c --- /dev/null +++ b/drivers/firmware/efi/libstub/x86-stub.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include + +extern void trampoline_32bit_src(void *, bool); +extern const u16 trampoline_ljmp_imm_offset; + +void efi_adjust_memory_range_protection(unsigned long start, + unsigned long size); + +#ifdef CONFIG_X86_64 +efi_status_t efi_setup_5level_paging(void); +void efi_5level_switch(void); +#else +static inline efi_status_t efi_setup_5level_paging(void) { return EFI_SUCCESS; } +static inline void efi_5level_switch(void) {} +#endif From 5a664585a71c3af82a64aa9b38cadfa02f11c841 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:48 +0100 Subject: [PATCH 188/216] x86/decompressor: Factor out kernel decompression and relocation From: Ard Biesheuvel [ Commit 83381519352d6b5b3e429bf72aaab907480cb6b6 upstream ] Factor out the decompressor sequence that invokes the decompressor, parses the ELF and applies the relocations so that it can be called directly from the EFI stub. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-21-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/misc.c | 29 ++++++++++++++++++++++++----- arch/x86/include/asm/boot.h | 8 ++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e4e3e49fcc37..fb55ac18af6f 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -330,11 +330,33 @@ static size_t parse_elf(void *output) return ehdr.e_entry - LOAD_PHYSICAL_ADDR; } +const unsigned long kernel_total_size = VO__end - VO__text; + static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); extern unsigned char input_data[]; extern unsigned int input_len, output_len; +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)) +{ + unsigned long entry; + + if (!free_mem_ptr) { + free_mem_ptr = (unsigned long)boot_heap; + free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap); + } + + if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len, + NULL, error) < 0) + return ULONG_MAX; + + entry = parse_elf(outbuf); + handle_relocations(outbuf, output_len, virt_addr); + + return entry; +} + /* * The compressed kernel image (ZO), has been moved so that its position * is against the end of the buffer used to hold the uncompressed kernel @@ -354,7 +376,6 @@ extern unsigned int input_len, output_len; */ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) { - const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; memptr heap = (memptr)boot_heap; unsigned long needed_size; @@ -457,10 +478,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) #endif debug_putstr("\nDecompressing Linux... "); - __decompress(input_data, input_len, NULL, NULL, output, output_len, - NULL, error); - entry_offset = parse_elf(output); - handle_relocations(output, output_len, virt_addr); + + entry_offset = decompress_kernel(output, virt_addr, error); debug_putstr("done.\nBooting the kernel (entry_offset: 0x"); debug_puthex(entry_offset); diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 215d37f7dde8..b3a7cfb0d99e 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -79,4 +79,12 @@ # define BOOT_STACK_SIZE 0x1000 #endif +#ifndef __ASSEMBLY__ +extern unsigned int output_len; +extern const unsigned long kernel_total_size; + +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)); +#endif + #endif /* _ASM_X86_BOOT_H */ From 77330c123d7c443936585f25b31d3979876ba1d0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:49 +0100 Subject: [PATCH 189/216] x86/efistub: Prefer EFI memory attributes protocol over DXE services From: Ard Biesheuvel [ Commit 11078876b7a6a1b7226344fecab968945c806832 upstream ] Currently, the EFI stub relies on DXE services in some cases to clear non-execute restrictions from page allocations that need to be executable. This is dodgy, because DXE services are not specified by UEFI but by PI, and they are not intended for consumption by OS loaders. However, no alternative existed at the time. Now, there is a new UEFI protocol that should be used instead, so if it exists, prefer it over the DXE services calls. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-18-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/x86-stub.c | 29 ++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index adaddd38d97d..01af018b9315 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -26,6 +26,7 @@ const efi_system_table_t *efi_system_table; const efi_dxe_services_table_t *efi_dxe_table; u32 image_offset __section(".data"); static efi_loaded_image_t *image = NULL; +static efi_memory_attribute_protocol_t *memattr; static efi_status_t preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) @@ -222,12 +223,18 @@ void efi_adjust_memory_range_protection(unsigned long start, unsigned long rounded_start, rounded_end; unsigned long unprotect_start, unprotect_size; - if (efi_dxe_table == NULL) - return; - rounded_start = rounddown(start, EFI_PAGE_SIZE); rounded_end = roundup(start + size, EFI_PAGE_SIZE); + if (memattr != NULL) { + efi_call_proto(memattr, clear_memory_attributes, rounded_start, + rounded_end - rounded_start, EFI_MEMORY_XP); + return; + } + + if (efi_dxe_table == NULL) + return; + /* * Don't modify memory region attributes, they are * already suitable, to lower the possibility to @@ -758,6 +765,7 @@ void __noreturn efi_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, struct boot_params *boot_params) { + efi_guid_t guid = EFI_MEMORY_ATTRIBUTE_PROTOCOL_GUID; unsigned long bzimage_addr = (unsigned long)startup_32; unsigned long buffer_start, buffer_end; struct setup_header *hdr = &boot_params->hdr; @@ -769,13 +777,18 @@ void __noreturn efi_stub_entry(efi_handle_t handle, if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); - efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID); - if (efi_dxe_table && - efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) { - efi_warn("Ignoring DXE services table: invalid signature\n"); - efi_dxe_table = NULL; + if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) { + efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID); + if (efi_dxe_table && + efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) { + efi_warn("Ignoring DXE services table: invalid signature\n"); + efi_dxe_table = NULL; + } } + /* grab the memory attributes protocol if it exists */ + efi_bs_call(locate_protocol, &guid, NULL, (void **)&memattr); + status = efi_setup_5level_paging(); if (status != EFI_SUCCESS) { efi_err("efi_setup_5level_paging() failed!\n"); From fff7614f576f802fb0f4ff169cb251c180ce377e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:50 +0100 Subject: [PATCH 190/216] x86/efistub: Perform SNP feature test while running in the firmware From: Ard Biesheuvel [ Commit 31c77a50992e8dd136feed7b67073bb5f1f978cc upstream ] Before refactoring the EFI stub boot flow to avoid the legacy bare metal decompressor, duplicate the SNP feature check in the EFI stub before handing over to the kernel proper. The SNP feature check can be performed while running under the EFI boot services, which means it can force the boot to fail gracefully and return an error to the bootloader if the loaded kernel does not implement support for all the features that the hypervisor enabled. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-23-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/sev.c | 112 ++++++++++++++---------- arch/x86/include/asm/sev.h | 5 ++ drivers/firmware/efi/libstub/x86-stub.c | 17 ++++ 3 files changed, 88 insertions(+), 46 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 9c91cc40f456..8b21c57bc470 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -327,20 +327,25 @@ static void enforce_vmpl0(void) */ #define SNP_FEATURES_PRESENT (0) +u64 snp_get_unsupported_features(u64 status) +{ + if (!(status & MSR_AMD64_SEV_SNP_ENABLED)) + return 0; + + return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; +} + void snp_check_features(void) { u64 unsupported; - if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) - return; - /* * Terminate the boot if hypervisor has enabled any feature lacking * guest side implementation. Pass on the unsupported features mask through * EXIT_INFO_2 of the GHCB protocol so that those features can be reported * as part of the guest boot failure. */ - unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; + unsupported = snp_get_unsupported_features(sev_status); if (unsupported) { if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb())) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); @@ -350,10 +355,45 @@ void snp_check_features(void) } } -void sev_enable(struct boot_params *bp) +/* + * sev_check_cpu_support - Check for SEV support in the CPU capabilities + * + * Returns < 0 if SEV is not supported, otherwise the position of the + * encryption bit in the page table descriptors. + */ +static int sev_check_cpu_support(void) { unsigned int eax, ebx, ecx, edx; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return -ENODEV; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + /* Check whether SEV is supported */ + if (!(eax & BIT(1))) + return -ENODEV; + + return ebx & 0x3f; +} + +void sev_enable(struct boot_params *bp) +{ struct msr m; + int bitpos; bool snp; /* @@ -373,26 +413,7 @@ void sev_enable(struct boot_params *bp) * which is good enough. */ - /* Check for the SME/SEV support leaf */ - eax = 0x80000000; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (eax < 0x8000001f) - return; - - /* - * Check for the SME/SEV feature: - * CPUID Fn8000_001F[EAX] - * - Bit 0 - Secure Memory Encryption support - * - Bit 1 - Secure Encrypted Virtualization support - * CPUID Fn8000_001F[EBX] - * - Bits 5:0 - Pagetable bit position used to indicate encryption - */ - eax = 0x8000001f; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - /* Check whether SEV is supported */ - if (!(eax & BIT(1))) + if (sev_check_cpu_support() < 0) return; /* @@ -403,26 +424,8 @@ void sev_enable(struct boot_params *bp) /* Now repeat the checks with the SNP CPUID table. */ - /* Recheck the SME/SEV support leaf */ - eax = 0x80000000; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (eax < 0x8000001f) - return; - - /* - * Recheck for the SME/SEV feature: - * CPUID Fn8000_001F[EAX] - * - Bit 0 - Secure Memory Encryption support - * - Bit 1 - Secure Encrypted Virtualization support - * CPUID Fn8000_001F[EBX] - * - Bits 5:0 - Pagetable bit position used to indicate encryption - */ - eax = 0x8000001f; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - /* Check whether SEV is supported */ - if (!(eax & BIT(1))) { + bitpos = sev_check_cpu_support(); + if (bitpos < 0) { if (snp) error("SEV-SNP support indicated by CC blob, but not CPUID."); return; @@ -454,7 +457,24 @@ void sev_enable(struct boot_params *bp) if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); - sme_me_mask = BIT_ULL(ebx & 0x3f); + sme_me_mask = BIT_ULL(bitpos); +} + +/* + * sev_get_status - Retrieve the SEV status mask + * + * Returns 0 if the CPU is not SEV capable, otherwise the value of the + * AMD64_SEV MSR. + */ +u64 sev_get_status(void) +{ + struct msr m; + + if (sev_check_cpu_support() < 0) + return 0; + + boot_rdmsr(MSR_AMD64_SEV, &m); + return m.q; } /* Search for Confidential Computing blob in the EFI config table. */ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 7ca5c9ec8b52..e231638ba19a 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -202,6 +202,8 @@ void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void __init __noreturn snp_abort(void); int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); +u64 snp_get_unsupported_features(u64 status); +u64 sev_get_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -225,6 +227,9 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in { return -ENOTTY; } + +static inline u64 snp_get_unsupported_features(u64 status) { return 0; } +static inline u64 sev_get_status(void) { return 0; } #endif #endif diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 01af018b9315..8d3ce383bcbb 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "efistub.h" #include "x86-stub.h" @@ -747,6 +748,19 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) return EFI_SUCCESS; } +static bool have_unsupported_snp_features(void) +{ + u64 unsupported; + + unsupported = snp_get_unsupported_features(sev_get_status()); + if (unsupported) { + efi_err("Unsupported SEV-SNP features detected: 0x%llx\n", + unsupported); + return true; + } + return false; +} + static void __noreturn enter_kernel(unsigned long kernel_addr, struct boot_params *boot_params) { @@ -777,6 +791,9 @@ void __noreturn efi_stub_entry(efi_handle_t handle, if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); + if (have_unsupported_snp_features()) + efi_exit(handle, EFI_UNSUPPORTED); + if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) { efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID); if (efi_dxe_table && From 2dfaeac3f38e4e550d215204eedd97a061fdc118 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:51 +0100 Subject: [PATCH 191/216] x86/efistub: Avoid legacy decompressor when doing EFI boot From: Ard Biesheuvel [ Commit a1b87d54f4e45ff5e0d081fb1d9db3bf1a8fb39a upstream ] The bare metal decompressor code was never really intended to run in a hosted environment such as the EFI boot services, and does a few things that are becoming problematic in the context of EFI boot now that the logo requirements are getting tighter: EFI executables will no longer be allowed to consist of a single executable section that is mapped with read, write and execute permissions if they are intended for use in a context where Secure Boot is enabled (and where Microsoft's set of certificates is used, i.e., every x86 PC built to run Windows). To avoid stepping on reserved memory before having inspected the E820 tables, and to ensure the correct placement when running a kernel build that is non-relocatable, the bare metal decompressor moves its own executable image to the end of the allocation that was reserved for it, in order to perform the decompression in place. This means the region in question requires both write and execute permissions, which either need to be given upfront (which EFI will no longer permit), or need to be applied on demand using the existing page fault handling framework. However, the physical placement of the kernel is usually randomized anyway, and even if it isn't, a dedicated decompression output buffer can be allocated anywhere in memory using EFI APIs when still running in the boot services, given that EFI support already implies a relocatable kernel. This means that decompression in place is never necessary, nor is moving the compressed image from one end to the other. Since EFI already maps all of memory 1:1, it is also unnecessary to create new page tables or handle page faults when decompressing the kernel. That means there is also no need to replace the special exception handlers for SEV. Generally, there is little need to do any of the things that the decompressor does beyond - initialize SEV encryption, if needed, - perform the 4/5 level paging switch, if needed, - decompress the kernel - relocate the kernel So do all of this from the EFI stub code, and avoid the bare metal decompressor altogether. Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230807162720.545787-24-ardb@kernel.org Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/Makefile | 5 + arch/x86/boot/compressed/efi_mixed.S | 55 -------- arch/x86/boot/compressed/head_32.S | 13 -- arch/x86/boot/compressed/head_64.S | 27 ---- arch/x86/include/asm/efi.h | 7 +- arch/x86/include/asm/sev.h | 2 + drivers/firmware/efi/libstub/x86-stub.c | 166 ++++++++++-------------- 7 files changed, 84 insertions(+), 191 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0c9ebf74fac5..3965b2c9efee 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -74,6 +74,11 @@ LDFLAGS_vmlinux += -z noexecstack ifeq ($(CONFIG_LD_IS_BFD),y) LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments) endif +ifeq ($(CONFIG_EFI_STUB),y) +# ensure that the static EFI stub library will be pulled in, even if it is +# never referenced explicitly from the startup code +LDFLAGS_vmlinux += -u efi_pe_entry +endif LDFLAGS_vmlinux += -T hostprogs := mkpiggy diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index d6d1b76b594d..8232c5b2a9bf 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -275,10 +275,6 @@ SYM_FUNC_START_LOCAL(efi32_entry) jmp startup_32 SYM_FUNC_END(efi32_entry) -#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) -#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) -#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - /* * efi_status_t efi32_pe_entry(efi_handle_t image_handle, * efi_system_table_32_t *sys_table) @@ -286,8 +282,6 @@ SYM_FUNC_END(efi32_entry) SYM_FUNC_START(efi32_pe_entry) pushl %ebp movl %esp, %ebp - pushl %eax // dummy push to allocate loaded_image - pushl %ebx // save callee-save registers pushl %edi @@ -296,48 +290,8 @@ SYM_FUNC_START(efi32_pe_entry) movl $0x80000003, %eax // EFI_UNSUPPORTED jnz 2f - call 1f -1: pop %ebx - - /* Get the loaded image protocol pointer from the image handle */ - leal -4(%ebp), %eax - pushl %eax // &loaded_image - leal (loaded_image_proto - 1b)(%ebx), %eax - pushl %eax // pass the GUID address - pushl 8(%ebp) // pass the image handle - - /* - * Note the alignment of the stack frame. - * sys_table - * handle <-- 16-byte aligned on entry by ABI - * return address - * frame pointer - * loaded_image <-- local variable - * saved %ebx <-- 16-byte aligned here - * saved %edi - * &loaded_image - * &loaded_image_proto - * handle <-- 16-byte aligned for call to handle_protocol - */ - - movl 12(%ebp), %eax // sys_table - movl ST32_boottime(%eax), %eax // sys_table->boottime - call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol - addl $12, %esp // restore argument space - testl %eax, %eax - jnz 2f - movl 8(%ebp), %ecx // image_handle movl 12(%ebp), %edx // sys_table - movl -4(%ebp), %esi // loaded_image - movl LI32_image_base(%esi), %esi // loaded_image->image_base - leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32 - /* - * We need to set the image_offset variable here since startup_32() will - * use it before we get to the 64-bit efi_pe_entry() in C code. - */ - subl %esi, %ebp // calculate image_offset - movl %ebp, (image_offset - 1b)(%ebx) // save image_offset xorl %esi, %esi jmp efi32_entry // pass %ecx, %edx, %esi // no other registers remain live @@ -356,15 +310,6 @@ SYM_FUNC_START_NOALIGN(efi64_stub_entry) SYM_FUNC_END(efi64_stub_entry) #endif - .section ".rodata" - /* EFI loaded image protocol GUID */ - .balign 4 -SYM_DATA_START_LOCAL(loaded_image_proto) - .long 0x5b1b31a1 - .word 0x9562, 0x11d2 - .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b -SYM_DATA_END(loaded_image_proto) - .data .balign 8 SYM_DATA_START_LOCAL(efi32_boot_gdt) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3af4a383615b..1cfe9802a42f 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -84,19 +84,6 @@ SYM_FUNC_START(startup_32) #ifdef CONFIG_RELOCATABLE leal startup_32@GOTOFF(%edx), %ebx - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32() will be at an - * offset to the start of the space allocated for the image. efi_pe_entry() will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - subl image_offset@GOTOFF(%edx), %ebx -#endif - movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 9a0d83b4d266..0d7aef10b19a 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -138,19 +138,6 @@ SYM_FUNC_START(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32 will be at an - * offset to the start of the space allocated for the image. efi_pe_entry will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - subl rva(image_offset)(%ebp), %ebx -#endif - movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx @@ -327,20 +314,6 @@ SYM_CODE_START(startup_64) /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32 will be at an - * offset to the start of the space allocated for the image. efi_pe_entry will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - movl image_offset(%rip), %eax - subq %rax, %rbp -#endif - movl BP_kernel_alignment(%rsi), %eax decl %eax addq %rax, %rbp diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 522ff2e443b3..e601264b1a24 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -88,6 +88,8 @@ static inline void efi_fpu_end(void) } #ifdef CONFIG_X86_32 +#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) + #define arch_efi_call_virt_setup() \ ({ \ efi_fpu_begin(); \ @@ -101,8 +103,7 @@ static inline void efi_fpu_end(void) }) #else /* !CONFIG_X86_32 */ - -#define EFI_LOADER_SIGNATURE "EL64" +#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT extern asmlinkage u64 __efi_call(void *fp, ...); @@ -214,6 +215,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, #ifdef CONFIG_EFI_MIXED +#define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX) + #define ARCH_HAS_EFISTUB_WRAPPERS static inline bool efi_is_64bit(void) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index e231638ba19a..cf98fc28601f 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -157,6 +157,7 @@ static __always_inline void sev_es_nmi_complete(void) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +extern void sev_enable(struct boot_params *bp); static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { @@ -210,6 +211,7 @@ static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline void sev_enable(struct boot_params *bp) { } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } static inline void setup_ghcb(void) { } diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 8d3ce383bcbb..61017921f9ca 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -15,17 +15,14 @@ #include #include #include +#include #include #include "efistub.h" #include "x86-stub.h" -/* Maximum physical address for 64-bit kernel with 4-level paging */ -#define MAXMEM_X86_64_4LEVEL (1ull << 46) - const efi_system_table_t *efi_system_table; const efi_dxe_services_table_t *efi_dxe_table; -u32 image_offset __section(".data"); static efi_loaded_image_t *image = NULL; static efi_memory_attribute_protocol_t *memattr; @@ -276,33 +273,9 @@ void efi_adjust_memory_range_protection(unsigned long start, } } -extern const u8 startup_32[], startup_64[]; - -static void -setup_memory_protection(unsigned long image_base, unsigned long image_size) -{ -#ifdef CONFIG_64BIT - if (image_base != (unsigned long)startup_32) - efi_adjust_memory_range_protection(image_base, image_size); -#else - /* - * Clear protection flags on a whole range of possible - * addresses used for KASLR. We don't need to do that - * on x86_64, since KASLR/extraction is performed after - * dedicated identity page tables are built and we only - * need to remove possible protection on relocated image - * itself disregarding further relocations. - */ - efi_adjust_memory_range_protection(LOAD_PHYSICAL_ADDR, - KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR); -#endif -} - static const efi_char16_t apple[] = L"Apple"; -static void setup_quirks(struct boot_params *boot_params, - unsigned long image_base, - unsigned long image_size) +static void setup_quirks(struct boot_params *boot_params) { efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) efi_table_attr(efi_system_table, fw_vendor); @@ -311,9 +284,6 @@ static void setup_quirks(struct boot_params *boot_params, if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) retrieve_apple_device_properties(boot_params); } - - if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) - setup_memory_protection(image_base, image_size); } /* @@ -466,7 +436,6 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, } image_base = efi_table_attr(image, image_base); - image_offset = (void *)startup_32 - image_base; status = efi_allocate_pages(sizeof(struct boot_params), (unsigned long *)&boot_params, ULONG_MAX); @@ -761,6 +730,61 @@ static bool have_unsupported_snp_features(void) return false; } +static void efi_get_seed(void *seed, int size) +{ + efi_get_random_bytes(size, seed); + + /* + * This only updates seed[0] when running on 32-bit, but in that case, + * seed[1] is not used anyway, as there is no virtual KASLR on 32-bit. + */ + *(unsigned long *)seed ^= kaslr_get_random_long("EFI"); +} + +static void error(char *str) +{ + efi_warn("Decompression failed: %s\n", str); +} + +static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) +{ + unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + unsigned long addr, alloc_size, entry; + efi_status_t status; + u32 seed[2] = {}; + + /* determine the required size of the allocation */ + alloc_size = ALIGN(max_t(unsigned long, output_len, kernel_total_size), + MIN_KERNEL_ALIGN); + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && !efi_nokaslr) { + u64 range = KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR - kernel_total_size; + + efi_get_seed(seed, sizeof(seed)); + + virt_addr += (range * seed[1]) >> 32; + virt_addr &= ~(CONFIG_PHYSICAL_ALIGN - 1); + } + + status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr, + seed[0], EFI_LOADER_CODE, + EFI_X86_KERNEL_ALLOC_LIMIT); + if (status != EFI_SUCCESS) + return status; + + entry = decompress_kernel((void *)addr, virt_addr, error); + if (entry == ULONG_MAX) { + efi_free(alloc_size, addr); + return EFI_LOAD_ERROR; + } + + *kernel_entry = addr + entry; + + efi_adjust_memory_range_protection(addr, kernel_total_size); + + return EFI_SUCCESS; +} + static void __noreturn enter_kernel(unsigned long kernel_addr, struct boot_params *boot_params) { @@ -780,10 +804,9 @@ void __noreturn efi_stub_entry(efi_handle_t handle, struct boot_params *boot_params) { efi_guid_t guid = EFI_MEMORY_ATTRIBUTE_PROTOCOL_GUID; - unsigned long bzimage_addr = (unsigned long)startup_32; - unsigned long buffer_start, buffer_end; struct setup_header *hdr = &boot_params->hdr; const struct linux_efi_initrd *initrd = NULL; + unsigned long kernel_entry; efi_status_t status; efi_system_table = sys_table_arg; @@ -812,60 +835,6 @@ void __noreturn efi_stub_entry(efi_handle_t handle, goto fail; } - /* - * If the kernel isn't already loaded at a suitable address, - * relocate it. - * - * It must be loaded above LOAD_PHYSICAL_ADDR. - * - * The maximum address for 64-bit is 1 << 46 for 4-level paging. This - * is defined as the macro MAXMEM, but unfortunately that is not a - * compile-time constant if 5-level paging is configured, so we instead - * define our own macro for use here. - * - * For 32-bit, the maximum address is complicated to figure out, for - * now use KERNEL_IMAGE_SIZE, which will be 512MiB, the same as what - * KASLR uses. - * - * Also relocate it if image_offset is zero, i.e. the kernel wasn't - * loaded by LoadImage, but rather by a bootloader that called the - * handover entry. The reason we must always relocate in this case is - * to handle the case of systemd-boot booting a unified kernel image, - * which is a PE executable that contains the bzImage and an initrd as - * COFF sections. The initrd section is placed after the bzImage - * without ensuring that there are at least init_size bytes available - * for the bzImage, and thus the compressed kernel's startup code may - * overwrite the initrd unless it is moved out of the way. - */ - - buffer_start = ALIGN(bzimage_addr - image_offset, - hdr->kernel_alignment); - buffer_end = buffer_start + hdr->init_size; - - if ((buffer_start < LOAD_PHYSICAL_ADDR) || - (IS_ENABLED(CONFIG_X86_32) && buffer_end > KERNEL_IMAGE_SIZE) || - (IS_ENABLED(CONFIG_X86_64) && buffer_end > MAXMEM_X86_64_4LEVEL) || - (image_offset == 0)) { - extern char _bss[]; - - status = efi_relocate_kernel(&bzimage_addr, - (unsigned long)_bss - bzimage_addr, - hdr->init_size, - hdr->pref_address, - hdr->kernel_alignment, - LOAD_PHYSICAL_ADDR); - if (status != EFI_SUCCESS) { - efi_err("efi_relocate_kernel() failed!\n"); - goto fail; - } - /* - * Now that we've copied the kernel elsewhere, we no longer - * have a set up block before startup_32(), so reset image_offset - * to zero in case it was set earlier. - */ - image_offset = 0; - } - #ifdef CONFIG_CMDLINE_BOOL status = efi_parse_options(CONFIG_CMDLINE); if (status != EFI_SUCCESS) { @@ -883,6 +852,12 @@ void __noreturn efi_stub_entry(efi_handle_t handle, } } + status = efi_decompress_kernel(&kernel_entry); + if (status != EFI_SUCCESS) { + efi_err("Failed to decompress kernel\n"); + goto fail; + } + /* * At this point, an initrd may already have been loaded by the * bootloader and passed via bootparams. We permit an initrd loaded @@ -922,7 +897,7 @@ void __noreturn efi_stub_entry(efi_handle_t handle, setup_efi_pci(boot_params); - setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start); + setup_quirks(boot_params); status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) { @@ -930,12 +905,15 @@ void __noreturn efi_stub_entry(efi_handle_t handle, goto fail; } + /* + * Call the SEV init code while still running with the firmware's + * GDT/IDT, so #VC exceptions will be handled by EFI. + */ + sev_enable(boot_params); + efi_5level_switch(); - if (IS_ENABLED(CONFIG_X86_64)) - bzimage_addr += startup_64 - startup_32; - - enter_kernel(bzimage_addr, boot_params); + enter_kernel(kernel_entry, boot_params); fail: efi_err("efi_stub_entry() failed!\n"); From 1b54062576792b41f0acb8d562deea7c4c718c33 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:52 +0100 Subject: [PATCH 192/216] efi/x86: Avoid physical KASLR on older Dell systems From: Ard Biesheuvel [ Commit 50d7cdf7a9b1ab6f4f74a69c84e974d5dc0c1bf1 upstream ] River reports boot hangs with v6.6 and v6.7, and the bisect points to commit a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") which moves the memory allocation and kernel decompression from the legacy decompressor (which executes *after* ExitBootServices()) to the EFI stub, using boot services for allocating the memory. The memory allocation succeeds but the subsequent call to decompress_kernel() never returns, resulting in a failed boot and a hanging system. As it turns out, this issue only occurs when physical address randomization (KASLR) is enabled, and given that this is a feature we can live without (virtual KASLR is much more important), let's disable the physical part of KASLR when booting on AMI UEFI firmware claiming to implement revision v2.0 of the specification (which was released in 2006), as this is the version these systems advertise. Fixes: a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218173 Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/x86-stub.c | 31 +++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 61017921f9ca..47ebc85c0d22 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -273,17 +273,20 @@ void efi_adjust_memory_range_protection(unsigned long start, } } +static efi_char16_t *efistub_fw_vendor(void) +{ + unsigned long vendor = efi_table_attr(efi_system_table, fw_vendor); + + return (efi_char16_t *)vendor; +} + static const efi_char16_t apple[] = L"Apple"; static void setup_quirks(struct boot_params *boot_params) { - efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) - efi_table_attr(efi_system_table, fw_vendor); - - if (!memcmp(fw_vendor, apple, sizeof(apple))) { - if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) - retrieve_apple_device_properties(boot_params); - } + if (IS_ENABLED(CONFIG_APPLE_PROPERTIES) && + !memcmp(efistub_fw_vendor(), apple, sizeof(apple))) + retrieve_apple_device_properties(boot_params); } /* @@ -759,11 +762,25 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && !efi_nokaslr) { u64 range = KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR - kernel_total_size; + static const efi_char16_t ami[] = L"American Megatrends"; efi_get_seed(seed, sizeof(seed)); virt_addr += (range * seed[1]) >> 32; virt_addr &= ~(CONFIG_PHYSICAL_ALIGN - 1); + + /* + * Older Dell systems with AMI UEFI firmware v2.0 may hang + * while decompressing the kernel if physical address + * randomization is enabled. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=218173 + */ + if (efi_system_table->hdr.revision <= EFI_2_00_SYSTEM_TABLE_REVISION && + !memcmp(efistub_fw_vendor(), ami, sizeof(ami))) { + efi_debug("AMI firmware v2.0 or older detected - disabling physical KASLR\n"); + seed[0] = 0; + } } status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr, From 86c909d2275b91fb34be07b081c7343a0c2351f2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:53 +0100 Subject: [PATCH 193/216] x86/efistub: Avoid placing the kernel below LOAD_PHYSICAL_ADDR From: Ard Biesheuvel [ Commit 2f77465b05b1270c832b5e2ee27037672ad2a10a upstream ] The EFI stub's kernel placement logic randomizes the physical placement of the kernel by taking all available memory into account, and picking a region at random, based on a random seed. When KASLR is disabled, this seed is set to 0x0, and this results in the lowest available region of memory to be selected for loading the kernel, even if this is below LOAD_PHYSICAL_ADDR. Some of this memory is typically reserved for the GFP_DMA region, to accommodate masters that can only access the first 16 MiB of system memory. Even if such devices are rare these days, we may still end up with a warning in the kernel log, as reported by Tom: swapper/0: page allocation failure: order:10, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 Fix this by tweaking the random allocation logic to accept a low bound on the placement, and set it to LOAD_PHYSICAL_ADDR. Fixes: a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") Reported-by: Tom Englund Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218404 Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/arm64-stub.c | 2 +- drivers/firmware/efi/libstub/efistub.h | 3 ++- drivers/firmware/efi/libstub/randomalloc.c | 12 +++++++----- drivers/firmware/efi/libstub/x86-stub.c | 1 + 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 16377b452119..16f15e36f9a7 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -181,7 +181,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, */ status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed, - EFI_LOADER_CODE, EFI_ALLOC_LIMIT); + EFI_LOADER_CODE, 0, EFI_ALLOC_LIMIT); if (status != EFI_SUCCESS) efi_warn("efi_random_alloc() failed: 0x%lx\n", status); } else { diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 4b4055877f3d..6741f3d900c5 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -906,7 +906,8 @@ efi_status_t efi_get_random_bytes(unsigned long size, u8 *out); efi_status_t efi_random_alloc(unsigned long size, unsigned long align, unsigned long *addr, unsigned long random_seed, - int memory_type, unsigned long alloc_limit); + int memory_type, unsigned long alloc_min, + unsigned long alloc_max); efi_status_t efi_random_get_seed(void); diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index ed6f6087a9ea..7ba05719a53b 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -17,7 +17,7 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, unsigned long size, unsigned long align_shift, - u64 alloc_limit) + u64 alloc_min, u64 alloc_max) { unsigned long align = 1UL << align_shift; u64 first_slot, last_slot, region_end; @@ -30,11 +30,11 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, return 0; region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1, - alloc_limit); + alloc_max); if (region_end < size) return 0; - first_slot = round_up(md->phys_addr, align); + first_slot = round_up(max(md->phys_addr, alloc_min), align); last_slot = round_down(region_end - size + 1, align); if (first_slot > last_slot) @@ -56,7 +56,8 @@ efi_status_t efi_random_alloc(unsigned long size, unsigned long *addr, unsigned long random_seed, int memory_type, - unsigned long alloc_limit) + unsigned long alloc_min, + unsigned long alloc_max) { unsigned long total_slots = 0, target_slot; unsigned long total_mirrored_slots = 0; @@ -78,7 +79,8 @@ efi_status_t efi_random_alloc(unsigned long size, efi_memory_desc_t *md = (void *)map->map + map_offset; unsigned long slots; - slots = get_entry_num_slots(md, size, ilog2(align), alloc_limit); + slots = get_entry_num_slots(md, size, ilog2(align), alloc_min, + alloc_max); MD_NUM_SLOTS(md) = slots; total_slots += slots; if (md->attribute & EFI_MEMORY_MORE_RELIABLE) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 47ebc85c0d22..c1dcc86fcc3d 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -785,6 +785,7 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr, seed[0], EFI_LOADER_CODE, + LOAD_PHYSICAL_ADDR, EFI_X86_KERNEL_ALLOC_LIMIT); if (status != EFI_SUCCESS) return status; From 8f05493706ff8296d26b449db295b1dbb1de31dd Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:54 +0100 Subject: [PATCH 194/216] x86/boot: Rename conflicting 'boot_params' pointer to 'boot_params_ptr' From: Ard Biesheuvel [ Commit b9e909f78e7e4b826f318cfe7bedf3ce229920e6 upstream ] The x86 decompressor is built and linked as a separate executable, but it shares components with the kernel proper, which are either #include'd as C files, or linked into the decompresor as a static library (e.g, the EFI stub) Both the kernel itself and the decompressor define a global symbol 'boot_params' to refer to the boot_params struct, but in the former case, it refers to the struct directly, whereas in the decompressor, it refers to a global pointer variable referring to the struct boot_params passed by the bootloader or constructed from scratch. This ambiguity is unfortunate, and makes it impossible to assign this decompressor variable from the x86 EFI stub, given that declaring it as extern results in a clash. So rename the decompressor version (whose scope is limited) to boot_params_ptr. [ mingo: Renamed 'boot_params_p' to 'boot_params_ptr' for clarity ] Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/acpi.c | 14 ++++++------- arch/x86/boot/compressed/cmdline.c | 4 ++-- arch/x86/boot/compressed/ident_map_64.c | 7 ++++--- arch/x86/boot/compressed/kaslr.c | 26 ++++++++++++------------- arch/x86/boot/compressed/misc.c | 24 +++++++++++------------ arch/x86/boot/compressed/misc.h | 1 - arch/x86/boot/compressed/pgtable_64.c | 9 ++++----- arch/x86/boot/compressed/sev.c | 2 +- arch/x86/include/asm/boot.h | 2 ++ 9 files changed, 45 insertions(+), 44 deletions(-) diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 9caf89063e77..55c98fdd67d2 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -30,13 +30,13 @@ __efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to * ACPI_TABLE_GUID because it has more features. */ - rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, ACPI_20_TABLE_GUID); if (rsdp_addr) return (acpi_physical_address)rsdp_addr; /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */ - rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, ACPI_TABLE_GUID); if (rsdp_addr) return (acpi_physical_address)rsdp_addr; @@ -56,15 +56,15 @@ static acpi_physical_address efi_get_rsdp_addr(void) enum efi_type et; int ret; - et = efi_get_type(boot_params); + et = efi_get_type(boot_params_ptr); if (et == EFI_TYPE_NONE) return 0; - systab_pa = efi_get_system_table(boot_params); + systab_pa = efi_get_system_table(boot_params_ptr); if (!systab_pa) error("EFI support advertised, but unable to locate system table."); - ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len); + ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len); if (ret || !cfg_tbl_pa) error("EFI config table not found."); @@ -156,7 +156,7 @@ acpi_physical_address get_rsdp_addr(void) { acpi_physical_address pa; - pa = boot_params->acpi_rsdp_addr; + pa = boot_params_ptr->acpi_rsdp_addr; if (!pa) pa = efi_get_rsdp_addr(); @@ -210,7 +210,7 @@ static unsigned long get_acpi_srat_table(void) rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); if (!rsdp) rsdp = (struct acpi_table_rsdp *)(long) - boot_params->acpi_rsdp_addr; + boot_params_ptr->acpi_rsdp_addr; if (!rsdp) return 0; diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index f1add5d85da9..c1bb180973ea 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -14,9 +14,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params_ptr->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index d34222816c9f..b8c42339bc35 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -167,8 +167,9 @@ void initialize_identity_maps(void *rmode) * or does not touch all the pages covering them. */ kernel_add_identity_map((unsigned long)_head, (unsigned long)_end); - boot_params = rmode; - kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); + boot_params_ptr = rmode; + kernel_add_identity_map((unsigned long)boot_params_ptr, + (unsigned long)(boot_params_ptr + 1)); cmdline = get_cmd_line_ptr(); kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); @@ -176,7 +177,7 @@ void initialize_identity_maps(void *rmode) * Also map the setup_data entries passed via boot_params in case they * need to be accessed by uncompressed kernel via the identity mapping. */ - sd = (struct setup_data *)boot_params->hdr.setup_data; + sd = (struct setup_data *)boot_params_ptr->hdr.setup_data; while (sd) { unsigned long sd_addr = (unsigned long)sd; diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index e476bcbd9b42..9794d9174795 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -63,7 +63,7 @@ static unsigned long get_boot_seed(void) unsigned long hash = 0; hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); + hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr)); return hash; } @@ -383,7 +383,7 @@ static void handle_mem_options(void) static void mem_avoid_init(unsigned long input, unsigned long input_size, unsigned long output) { - unsigned long init_size = boot_params->hdr.init_size; + unsigned long init_size = boot_params_ptr->hdr.init_size; u64 initrd_start, initrd_size; unsigned long cmd_line, cmd_line_size; @@ -395,10 +395,10 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; /* Avoid initrd. */ - initrd_start = (u64)boot_params->ext_ramdisk_image << 32; - initrd_start |= boot_params->hdr.ramdisk_image; - initrd_size = (u64)boot_params->ext_ramdisk_size << 32; - initrd_size |= boot_params->hdr.ramdisk_size; + initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32; + initrd_start |= boot_params_ptr->hdr.ramdisk_image; + initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32; + initrd_size |= boot_params_ptr->hdr.ramdisk_size; mem_avoid[MEM_AVOID_INITRD].start = initrd_start; mem_avoid[MEM_AVOID_INITRD].size = initrd_size; /* No need to set mapping for initrd, it will be handled in VO. */ @@ -413,8 +413,8 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, } /* Avoid boot parameters. */ - mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; - mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr); /* We don't need to set a mapping for setup_data. */ @@ -447,7 +447,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, } /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; while (ptr) { struct mem_vector avoid; @@ -679,7 +679,7 @@ static bool process_mem_region(struct mem_vector *region, static bool process_efi_entries(unsigned long minimum, unsigned long image_size) { - struct efi_info *e = &boot_params->efi_info; + struct efi_info *e = &boot_params_ptr->efi_info; bool efi_mirror_found = false; struct mem_vector region; efi_memory_desc_t *md; @@ -761,8 +761,8 @@ static void process_e820_entries(unsigned long minimum, struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < boot_params->e820_entries; i++) { - entry = &boot_params->e820_table[i]; + for (i = 0; i < boot_params_ptr->e820_entries; i++) { + entry = &boot_params_ptr->e820_table[i]; /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) continue; @@ -836,7 +836,7 @@ void choose_random_location(unsigned long input, return; } - boot_params->hdr.loadflags |= KASLR_FLAG; + boot_params_ptr->hdr.loadflags |= KASLR_FLAG; if (IS_ENABLED(CONFIG_X86_32)) mem_limit = KERNEL_IMAGE_SIZE; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index fb55ac18af6f..8ae7893d712f 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -46,7 +46,7 @@ void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *boot_params; +struct boot_params *boot_params_ptr; struct port_io_ops pio_ops; @@ -132,8 +132,8 @@ void __putstr(const char *s) if (lines == 0 || cols == 0) return; - x = boot_params->screen_info.orig_x; - y = boot_params->screen_info.orig_y; + x = boot_params_ptr->screen_info.orig_x; + y = boot_params_ptr->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -154,8 +154,8 @@ void __putstr(const char *s) } } - boot_params->screen_info.orig_x = x; - boot_params->screen_info.orig_y = y; + boot_params_ptr->screen_info.orig_x = x; + boot_params_ptr->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -382,14 +382,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) size_t entry_offset; /* Retain x86 boot parameters pointer passed from startup_32/64. */ - boot_params = rmode; + boot_params_ptr = rmode; /* Clear flags intended for solely in-kernel use. */ - boot_params->hdr.loadflags &= ~KASLR_FLAG; + boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(boot_params); + sanitize_boot_params(boot_params_ptr); - if (boot_params->screen_info.orig_video_mode == 7) { + if (boot_params_ptr->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -397,8 +397,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) vidport = 0x3d4; } - lines = boot_params->screen_info.orig_video_lines; - cols = boot_params->screen_info.orig_video_cols; + lines = boot_params_ptr->screen_info.orig_video_lines; + cols = boot_params_ptr->screen_info.orig_video_cols; init_default_io_ops(); @@ -417,7 +417,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) * so that early debugging output from the RSDP parsing code can be * collected. */ - boot_params->acpi_rsdp_addr = get_rsdp_addr(); + boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr(); debug_putstr("early console in extract_kernel\n"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b6e46435b90b..254acd76efde 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -52,7 +52,6 @@ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; void *malloc(int size); void free(void *where); -extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 7939eb6e6ce9..51f957b24ba7 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -28,7 +28,6 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; */ unsigned long *trampoline_32bit __section(".data"); -extern struct boot_params *boot_params; int cmdline_find_option_bool(const char *option); static unsigned long find_trampoline_placement(void) @@ -49,7 +48,7 @@ static unsigned long find_trampoline_placement(void) * * Only look for values in the legacy ROM for non-EFI system. */ - signature = (char *)&boot_params->efi_info.efi_loader_signature; + signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature; if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) { ebda_start = *(unsigned short *)0x40e << 4; @@ -65,10 +64,10 @@ static unsigned long find_trampoline_placement(void) bios_start = round_down(bios_start, PAGE_SIZE); /* Find the first usable memory region under bios_start. */ - for (i = boot_params->e820_entries - 1; i >= 0; i--) { + for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) { unsigned long new = bios_start; - entry = &boot_params->e820_table[i]; + entry = &boot_params_ptr->e820_table[i]; /* Skip all entries above bios_start. */ if (bios_start <= entry->addr) @@ -107,7 +106,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ - boot_params = bp; + boot_params_ptr = bp; /* * Check if LA57 is desired and supported. diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 8b21c57bc470..d07e665bb265 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -565,7 +565,7 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) * accessed after switchover. */ if (sev_snp_enabled()) { - unsigned long cc_info_pa = boot_params->cc_blob_address; + unsigned long cc_info_pa = boot_params_ptr->cc_blob_address; struct cc_blob_sev_info *cc_info; kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info)); diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index b3a7cfb0d99e..a38cc0afc90a 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -85,6 +85,8 @@ extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, void (*error)(char *x)); + +extern struct boot_params *boot_params_ptr; #endif #endif /* _ASM_X86_BOOT_H */ From 3a396c409a39ce701533f3f55f3db0ab700aaeae Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:55 +0100 Subject: [PATCH 195/216] x86/boot: efistub: Assign global boot_params variable From: Ard Biesheuvel [ Commit 50dcc2e0d62e3c4a54f39673c4dc3dcde7c74d52 upstream ] Now that the x86 EFI stub calls into some APIs exposed by the decompressor (e.g., kaslr_get_random_long()), it is necessary to ensure that the global boot_params variable is set correctly before doing so. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/x86-stub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index c1dcc86fcc3d..b183b40195ee 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -827,6 +827,8 @@ void __noreturn efi_stub_entry(efi_handle_t handle, unsigned long kernel_entry; efi_status_t status; + boot_params_ptr = boot_params; + efi_system_table = sys_table_arg; /* Check if we were booted by the EFI firmware */ if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) From 2402392bed4e440e05442fb1de4ef97536ff5a96 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Mar 2024 12:19:56 +0100 Subject: [PATCH 196/216] efi/x86: Fix the missing KASLR_FLAG bit in boot_params->hdr.loadflags From: Yuntao Wang [ Commit 01638431c465741e071ab34acf3bef3c2570f878 upstream ] When KASLR is enabled, the KASLR_FLAG bit in boot_params->hdr.loadflags should be set to 1 to propagate KASLR status from compressed kernel to kernel, just as the choose_random_location() function does. Currently, when the kernel is booted via the EFI stub, the KASLR_FLAG bit in boot_params->hdr.loadflags is not set, even though it should be. This causes some functions, such as kernel_randomize_memory(), not to execute as expected. Fix it. Fixes: a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") Signed-off-by: Yuntao Wang [ardb: drop 'else' branch clearing KASLR_FLAG] Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/x86-stub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index b183b40195ee..a0757a37b482 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -781,6 +781,8 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) efi_debug("AMI firmware v2.0 or older detected - disabling physical KASLR\n"); seed[0] = 0; } + + boot_params_ptr->hdr.loadflags |= KASLR_FLAG; } status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr, From c4c795b21dd23d9514ae1c6646c3fb2c78b5be60 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 Feb 2024 09:46:57 -0800 Subject: [PATCH 197/216] af_unix: Drop oob_skb ref before purging queue in GC. commit aa82ac51d63328714645c827775d64dbfd9941f3 upstream. syzbot reported another task hung in __unix_gc(). [0] The current while loop assumes that all of the left candidates have oob_skb and calling kfree_skb(oob_skb) releases the remaining candidates. However, I missed a case that oob_skb has self-referencing fd and another fd and the latter sk is placed before the former in the candidate list. Then, the while loop never proceeds, resulting the task hung. __unix_gc() has the same loop just before purging the collected skb, so we can call kfree_skb(oob_skb) there and let __skb_queue_purge() release all inflight sockets. [0]: Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 2784 Comm: kworker/u4:8 Not tainted 6.8.0-rc4-syzkaller-01028-g71b605d32017 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Workqueue: events_unbound __unix_gc RIP: 0010:__sanitizer_cov_trace_pc+0x0/0x70 kernel/kcov.c:200 Code: 89 fb e8 23 00 00 00 48 8b 3d 84 f5 1a 0c 48 89 de 5b e9 43 26 57 00 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1e fa 48 8b 04 24 65 48 8b 0d 90 52 70 7e 65 8b 15 91 52 70 RSP: 0018:ffffc9000a17fa78 EFLAGS: 00000287 RAX: ffffffff8a0a6108 RBX: ffff88802b6c2640 RCX: ffff88802c0b3b80 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffffc9000a17fbf0 R08: ffffffff89383f1d R09: 1ffff1100ee5ff84 R10: dffffc0000000000 R11: ffffed100ee5ff85 R12: 1ffff110056d84ee R13: ffffc9000a17fae0 R14: 0000000000000000 R15: ffffffff8f47b840 FS: 0000000000000000(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffef5687ff8 CR3: 0000000029b34000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __unix_gc+0xe69/0xf40 net/unix/garbage.c:343 process_one_work kernel/workqueue.c:2633 [inline] process_scheduled_works+0x913/0x1420 kernel/workqueue.c:2706 worker_thread+0xa5f/0x1000 kernel/workqueue.c:2787 kthread+0x2ef/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242 Reported-and-tested-by: syzbot+ecab4d36f920c3574bf9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ecab4d36f920c3574bf9 Fixes: 25236c91b5ab ("af_unix: Fix task hung while purging oob_skb in GC.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 9e1bab97c05b..ab2c83d58b62 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -284,9 +284,17 @@ void unix_gc(void) * which are creating the cycle(s). */ skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) + list_for_each_entry(u, &gc_candidates, link) { scan_children(&u->sk, inc_inflight, &hitlist); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif + } + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ @@ -314,18 +322,6 @@ void unix_gc(void) /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - while (!list_empty(&gc_candidates)) { - u = list_entry(gc_candidates.next, struct unix_sock, link); - if (u->oob_skb) { - struct sk_buff *skb = u->oob_skb; - - u->oob_skb = NULL; - kfree_skb(skb); - } - } -#endif - spin_lock(&unix_gc_lock); /* There could be io_uring registered files, just push them back to From 2c96f66cd0cca5695ec326398f98b58f545ac087 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 10 Jan 2024 10:33:43 +0100 Subject: [PATCH 198/216] phy: freescale: phy-fsl-imx8-mipi-dphy: Fix alias name to use dashes [ Upstream commit 7936378cb6d87073163130e1e1fc1e5f76a597cf ] Devicetree spec lists only dashes as valid characters for alias names. Table 3.2: Valid characters for alias names, Devicee Specification, Release v0.4 Signed-off-by: Alexander Stein Fixes: 3fbae284887de ("phy: freescale: phy-fsl-imx8-mipi-dphy: Add i.MX8qxp LVDS PHY mode support") Link: https://lore.kernel.org/r/20240110093343.468810-1-alexander.stein@ew.tq-group.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/phy/freescale/phy-fsl-imx8-mipi-dphy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/freescale/phy-fsl-imx8-mipi-dphy.c b/drivers/phy/freescale/phy-fsl-imx8-mipi-dphy.c index e625b32889bf..0928a526e2ab 100644 --- a/drivers/phy/freescale/phy-fsl-imx8-mipi-dphy.c +++ b/drivers/phy/freescale/phy-fsl-imx8-mipi-dphy.c @@ -706,7 +706,7 @@ static int mixel_dphy_probe(struct platform_device *pdev) return ret; } - priv->id = of_alias_get_id(np, "mipi_dphy"); + priv->id = of_alias_get_id(np, "mipi-dphy"); if (priv->id < 0) { dev_err(dev, "Failed to get phy node alias id: %d\n", priv->id); From 7eb95e0af5c9c2e6fad50356eaf32d216d0e7bc3 Mon Sep 17 00:00:00 2001 From: Gaurav Batra Date: Thu, 25 Jan 2024 14:30:17 -0600 Subject: [PATCH 199/216] powerpc/pseries/iommu: IOMMU table is not initialized for kdump over SR-IOV [ Upstream commit 09a3c1e46142199adcee372a420b024b4fc61051 ] When kdump kernel tries to copy dump data over SR-IOV, LPAR panics due to NULL pointer exception: Kernel attempted to read user page (0) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000000 Faulting instruction address: 0xc000000020847ad4 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: mlx5_core(+) vmx_crypto pseries_wdt papr_scm libnvdimm mlxfw tls psample sunrpc fuse overlay squashfs loop CPU: 12 PID: 315 Comm: systemd-udevd Not tainted 6.4.0-Test102+ #12 Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries NIP: c000000020847ad4 LR: c00000002083b2dc CTR: 00000000006cd18c REGS: c000000029162ca0 TRAP: 0300 Not tainted (6.4.0-Test102+) MSR: 800000000280b033 CR: 48288244 XER: 00000008 CFAR: c00000002083b2d8 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1 ... NIP _find_next_zero_bit+0x24/0x110 LR bitmap_find_next_zero_area_off+0x5c/0xe0 Call Trace: dev_printk_emit+0x38/0x48 (unreliable) iommu_area_alloc+0xc4/0x180 iommu_range_alloc+0x1e8/0x580 iommu_alloc+0x60/0x130 iommu_alloc_coherent+0x158/0x2b0 dma_iommu_alloc_coherent+0x3c/0x50 dma_alloc_attrs+0x170/0x1f0 mlx5_cmd_init+0xc0/0x760 [mlx5_core] mlx5_function_setup+0xf0/0x510 [mlx5_core] mlx5_init_one+0x84/0x210 [mlx5_core] probe_one+0x118/0x2c0 [mlx5_core] local_pci_probe+0x68/0x110 pci_call_probe+0x68/0x200 pci_device_probe+0xbc/0x1a0 really_probe+0x104/0x540 __driver_probe_device+0xb4/0x230 driver_probe_device+0x54/0x130 __driver_attach+0x158/0x2b0 bus_for_each_dev+0xa8/0x130 driver_attach+0x34/0x50 bus_add_driver+0x16c/0x300 driver_register+0xa4/0x1b0 __pci_register_driver+0x68/0x80 mlx5_init+0xb8/0x100 [mlx5_core] do_one_initcall+0x60/0x300 do_init_module+0x7c/0x2b0 At the time of LPAR dump, before kexec hands over control to kdump kernel, DDWs (Dynamic DMA Windows) are scanned and added to the FDT. For the SR-IOV case, default DMA window "ibm,dma-window" is removed from the FDT and DDW added, for the device. Now, kexec hands over control to the kdump kernel. When the kdump kernel initializes, PCI busses are scanned and IOMMU group/tables created, in pci_dma_bus_setup_pSeriesLP(). For the SR-IOV case, there is no "ibm,dma-window". The original commit: b1fc44eaa9ba, fixes the path where memory is pre-mapped (direct mapped) to the DDW. When TCEs are direct mapped, there is no need to initialize IOMMU tables. iommu_table_setparms_lpar() only considers "ibm,dma-window" property when initiallizing IOMMU table. In the scenario where TCEs are dynamically allocated for SR-IOV, newly created IOMMU table is not initialized. Later, when the device driver tries to enter TCEs for the SR-IOV device, NULL pointer execption is thrown from iommu_area_alloc(). The fix is to initialize the IOMMU table with DDW property stored in the FDT. There are 2 points to remember: 1. For the dedicated adapter, kdump kernel would encounter both default and DDW in FDT. In this case, DDW property is used to initialize the IOMMU table. 2. A DDW could be direct or dynamic mapped. kdump kernel would initialize IOMMU table and mark the existing DDW as "dynamic". This works fine since, at the time of table initialization, iommu_table_clear() makes some space in the DDW, for some predefined number of TCEs which are needed for kdump to succeed. Fixes: b1fc44eaa9ba ("pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window") Signed-off-by: Gaurav Batra Reviewed-by: Brian King Signed-off-by: Michael Ellerman Link: https://msgid.link/20240125203017.61014-1-gbatra@linux.ibm.com Signed-off-by: Sasha Levin --- arch/powerpc/platforms/pseries/iommu.c | 156 +++++++++++++++++-------- 1 file changed, 105 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 97b026130c71..1e5f083cdb72 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -569,29 +569,6 @@ static void iommu_table_setparms(struct pci_controller *phb, struct iommu_table_ops iommu_table_lpar_multi_ops; -/* - * iommu_table_setparms_lpar - * - * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. - */ -static void iommu_table_setparms_lpar(struct pci_controller *phb, - struct device_node *dn, - struct iommu_table *tbl, - struct iommu_table_group *table_group, - const __be32 *dma_window) -{ - unsigned long offset, size, liobn; - - of_parse_dma_window(dn, dma_window, &liobn, &offset, &size); - - iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL, - &iommu_table_lpar_multi_ops); - - - table_group->tce32_start = offset; - table_group->tce32_size = size; -} - struct iommu_table_ops iommu_table_pseries_ops = { .set = tce_build_pSeries, .clear = tce_free_pSeries, @@ -719,26 +696,71 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = { * dynamic 64bit DMA window, walking up the device tree. */ static struct device_node *pci_dma_find(struct device_node *dn, - const __be32 **dma_window) + struct dynamic_dma_window_prop *prop) { - const __be32 *dw = NULL; + const __be32 *default_prop = NULL; + const __be32 *ddw_prop = NULL; + struct device_node *rdn = NULL; + bool default_win = false, ddw_win = false; for ( ; dn && PCI_DN(dn); dn = dn->parent) { - dw = of_get_property(dn, "ibm,dma-window", NULL); - if (dw) { - if (dma_window) - *dma_window = dw; - return dn; + default_prop = of_get_property(dn, "ibm,dma-window", NULL); + if (default_prop) { + rdn = dn; + default_win = true; } - dw = of_get_property(dn, DIRECT64_PROPNAME, NULL); - if (dw) - return dn; - dw = of_get_property(dn, DMA64_PROPNAME, NULL); - if (dw) - return dn; + ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; + } + ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; + } + + /* At least found default window, which is the case for normal boot */ + if (default_win) + break; } - return NULL; + /* For PCI devices there will always be a DMA window, either on the device + * or parent bus + */ + WARN_ON(!(default_win | ddw_win)); + + /* caller doesn't want to get DMA window property */ + if (!prop) + return rdn; + + /* parse DMA window property. During normal system boot, only default + * DMA window is passed in OF. But, for kdump, a dedicated adapter might + * have both default and DDW in FDT. In this scenario, DDW takes precedence + * over default window. + */ + if (ddw_win) { + struct dynamic_dma_window_prop *p; + + p = (struct dynamic_dma_window_prop *)ddw_prop; + prop->liobn = p->liobn; + prop->dma_base = p->dma_base; + prop->tce_shift = p->tce_shift; + prop->window_shift = p->window_shift; + } else if (default_win) { + unsigned long offset, size, liobn; + + of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size); + + prop->liobn = cpu_to_be32((u32)liobn); + prop->dma_base = cpu_to_be64(offset); + prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K); + prop->window_shift = cpu_to_be32(order_base_2(size)); + } + + return rdn; } static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) @@ -746,17 +768,20 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) struct iommu_table *tbl; struct device_node *dn, *pdn; struct pci_dn *ppci; - const __be32 *dma_window = NULL; + struct dynamic_dma_window_prop prop; dn = pci_bus_to_OF_node(bus); pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn); - pdn = pci_dma_find(dn, &dma_window); + pdn = pci_dma_find(dn, &prop); - if (dma_window == NULL) - pr_debug(" no ibm,dma-window property !\n"); + /* In PPC architecture, there will always be DMA window on bus or one of the + * parent bus. During reboot, there will be ibm,dma-window property to + * define DMA window. For kdump, there will at least be default window or DDW + * or both. + */ ppci = PCI_DN(pdn); @@ -766,13 +791,24 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) if (!ppci->table_group) { ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); tbl = ppci->table_group->tables[0]; - if (dma_window) { - iommu_table_setparms_lpar(ppci->phb, pdn, tbl, - ppci->table_group, dma_window); - if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) - panic("Failed to initialize iommu table"); - } + iommu_table_setparms_common(tbl, ppci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. + */ + ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); + + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) + panic("Failed to initialize iommu table"); + iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -960,6 +996,12 @@ static void find_existing_ddw_windows_named(const char *name) continue; } + /* If at the time of system initialization, there are DDWs in OF, + * it means this is during kexec. DDW could be direct or dynamic. + * We will just mark DDWs as "dynamic" since this is kdump path, + * no need to worry about perforance. ddw_list_new_entry() will + * set window->direct = false. + */ window = ddw_list_new_entry(pdn, dma64); if (!window) { of_node_put(pdn); @@ -1525,8 +1567,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; struct iommu_table *tbl; - const __be32 *dma_window = NULL; struct pci_dn *pci; + struct dynamic_dma_window_prop prop; pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); @@ -1539,7 +1581,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); pr_debug(" node is %pOF\n", dn); - pdn = pci_dma_find(dn, &dma_window); + pdn = pci_dma_find(dn, &prop); if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " "no DMA window found for pci dev=%s dn=%pOF\n", @@ -1552,8 +1594,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pci->table_group) { pci->table_group = iommu_pseries_alloc_group(pci->phb->node); tbl = pci->table_group->tables[0]; - iommu_table_setparms_lpar(pci->phb, pdn, tbl, - pci->table_group, dma_window); + + iommu_table_setparms_common(tbl, pci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. + */ + pci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, From a3d369aeb332bc7a29ba1facb9a3d3d8ba8d2568 Mon Sep 17 00:00:00 2001 From: Arturas Moskvinas Date: Fri, 1 Mar 2024 09:12:04 +0200 Subject: [PATCH 200/216] gpio: 74x164: Enable output pins after registers are reset [ Upstream commit 530b1dbd97846b110ea8a94c7cc903eca21786e5 ] Chip outputs are enabled[1] before actual reset is performed[2] which might cause pin output value to flip flop if previous pin value was set to 1. Fix that behavior by making sure chip is fully reset before all outputs are enabled. Flip-flop can be noticed when module is removed and inserted again and one of the pins was changed to 1 before removal. 100 microsecond flipping is noticeable on oscilloscope (100khz SPI bus). For a properly reset chip - output is enabled around 100 microseconds (on 100khz SPI bus) later during probing process hence should be irrelevant behavioral change. Fixes: 7ebc194d0fd4 (gpio: 74x164: Introduce 'enable-gpios' property) Link: https://elixir.bootlin.com/linux/v6.7.4/source/drivers/gpio/gpio-74x164.c#L130 [1] Link: https://elixir.bootlin.com/linux/v6.7.4/source/drivers/gpio/gpio-74x164.c#L150 [2] Signed-off-by: Arturas Moskvinas Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-74x164.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-74x164.c b/drivers/gpio/gpio-74x164.c index e00c33310517..753e7be039e4 100644 --- a/drivers/gpio/gpio-74x164.c +++ b/drivers/gpio/gpio-74x164.c @@ -127,8 +127,6 @@ static int gen_74x164_probe(struct spi_device *spi) if (IS_ERR(chip->gpiod_oe)) return PTR_ERR(chip->gpiod_oe); - gpiod_set_value_cansleep(chip->gpiod_oe, 1); - spi_set_drvdata(spi, chip); chip->gpio_chip.label = spi->modalias; @@ -153,6 +151,8 @@ static int gen_74x164_probe(struct spi_device *spi) goto exit_destroy; } + gpiod_set_value_cansleep(chip->gpiod_oe, 1); + ret = gpiochip_add_data(&chip->gpio_chip, chip); if (!ret) return 0; From 17acece41de3dafb63018fecbf54d288366901eb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 21 Feb 2024 21:28:46 +0200 Subject: [PATCH 201/216] gpiolib: Fix the error path order in gpiochip_add_data_with_key() [ Upstream commit e4aec4daa8c009057b5e063db1b7322252c92dc8 ] After shuffling the code, error path wasn't updated correctly. Fix it here. Fixes: 2f4133bb5f14 ("gpiolib: No need to call gpiochip_remove_pin_ranges() twice") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpiolib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 6d3e3454a6ed..f646df7f1b41 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -815,11 +815,11 @@ err_remove_irqchip_mask: gpiochip_irqchip_free_valid_mask(gc); err_remove_acpi_chip: acpi_gpiochip_remove(gc); + gpiochip_remove_pin_ranges(gc); err_remove_of_chip: gpiochip_free_hogs(gc); of_gpiochip_remove(gc); err_free_gpiochip_mask: - gpiochip_remove_pin_ranges(gc); gpiochip_free_valid_mask(gc); if (gdev->dev.release) { /* release() has been registered by gpiochip_setup_dev() */ From c6ff5fb6b157cf4101889c1f3e169eb6897e8f50 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 29 Feb 2024 18:25:49 +0100 Subject: [PATCH 202/216] gpio: fix resource unwinding order in error path [ Upstream commit ec5c54a9d3c4f9c15e647b049fea401ee5258696 ] Hogs are added *after* ACPI so should be removed *before* in error path. Fixes: a411e81e61df ("gpiolib: add hogs support for machine code") Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Signed-off-by: Sasha Levin --- drivers/gpio/gpiolib.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index f646df7f1b41..9d8c78312403 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -784,11 +784,11 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, ret = gpiochip_irqchip_init_valid_mask(gc); if (ret) - goto err_remove_acpi_chip; + goto err_free_hogs; ret = gpiochip_irqchip_init_hw(gc); if (ret) - goto err_remove_acpi_chip; + goto err_remove_irqchip_mask; ret = gpiochip_add_irqchip(gc, lock_key, request_key); if (ret) @@ -813,11 +813,11 @@ err_remove_irqchip: gpiochip_irqchip_remove(gc); err_remove_irqchip_mask: gpiochip_irqchip_free_valid_mask(gc); -err_remove_acpi_chip: +err_free_hogs: + gpiochip_free_hogs(gc); acpi_gpiochip_remove(gc); gpiochip_remove_pin_ranges(gc); err_remove_of_chip: - gpiochip_free_hogs(gc); of_gpiochip_remove(gc); err_free_gpiochip_mask: gpiochip_free_valid_mask(gc); From 0e351d1aa2e4c1a7a4cb2a5753b86db89796d3c8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 25 Feb 2024 11:01:41 +0800 Subject: [PATCH 203/216] block: define bvec_iter as __packed __aligned(4) [ Upstream commit 7838b4656110d950afdd92a081cc0f33e23e0ea8 ] In commit 19416123ab3e ("block: define 'struct bvec_iter' as packed"), what we need is to save the 4byte padding, and avoid `bio` to spread on one extra cache line. It is enough to define it as '__packed __aligned(4)', as '__packed' alone means byte aligned, and can cause compiler to generate horrible code on architectures that don't support unaligned access in case that bvec_iter is embedded in other structures. Cc: Mikulas Patocka Suggested-by: Linus Torvalds Fixes: 19416123ab3e ("block: define 'struct bvec_iter' as packed") Signed-off-by: Ming Lei Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/bvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 9e3dac51eb26..d4dbaae8b521 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -59,7 +59,7 @@ struct bvec_iter { unsigned int bi_bvec_done; /* number of bytes completed in current bvec */ -} __packed; +} __packed __aligned(4); struct bvec_iter_all { struct bio_vec bv; From 19ec82b3cad1abef2a929262b8c1528f4e0c192d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 4 Mar 2024 14:10:12 +0100 Subject: [PATCH 204/216] Revert "interconnect: Fix locking for runpm vs reclaim" This reverts commit ee42bfc791aa3cd78e29046f26a09d189beb3efb which is commit af42269c3523492d71ebbe11fefae2653e9cdc78 upstream. It is reported to cause boot crashes in Android systems, so revert it from the stable trees for now. Cc: Rob Clark Cc: Georgi Djakov Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/interconnect/core.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 1d9494f64a21..87f380e0e982 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -29,7 +29,6 @@ static LIST_HEAD(icc_providers); static int providers_count; static bool synced_state; static DEFINE_MUTEX(icc_lock); -static DEFINE_MUTEX(icc_bw_lock); static struct dentry *icc_debugfs_dir; static void icc_summary_show_one(struct seq_file *s, struct icc_node *n) @@ -636,7 +635,7 @@ int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw) if (WARN_ON(IS_ERR(path) || !path->num_nodes)) return -EINVAL; - mutex_lock(&icc_bw_lock); + mutex_lock(&icc_lock); old_avg = path->reqs[0].avg_bw; old_peak = path->reqs[0].peak_bw; @@ -668,7 +667,7 @@ int icc_set_bw(struct icc_path *path, u32 avg_bw, u32 peak_bw) apply_constraints(path); } - mutex_unlock(&icc_bw_lock); + mutex_unlock(&icc_lock); trace_icc_set_bw_end(path, ret); @@ -971,7 +970,6 @@ void icc_node_add(struct icc_node *node, struct icc_provider *provider) return; mutex_lock(&icc_lock); - mutex_lock(&icc_bw_lock); node->provider = provider; list_add_tail(&node->node_list, &provider->nodes); @@ -997,7 +995,6 @@ void icc_node_add(struct icc_node *node, struct icc_provider *provider) node->avg_bw = 0; node->peak_bw = 0; - mutex_unlock(&icc_bw_lock); mutex_unlock(&icc_lock); } EXPORT_SYMBOL_GPL(icc_node_add); @@ -1137,7 +1134,6 @@ void icc_sync_state(struct device *dev) return; mutex_lock(&icc_lock); - mutex_lock(&icc_bw_lock); synced_state = true; list_for_each_entry(p, &icc_providers, provider_list) { dev_dbg(p->dev, "interconnect provider is in synced state\n"); From 559035e04e442a0c7fd58d5fe00308b0d99e2318 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 4 Mar 2024 14:12:15 +0100 Subject: [PATCH 205/216] Revert "interconnect: Teach lockdep about icc_bw_lock order" This reverts commit 0db211ec0f1d32b93486e8f6565249ad4d1bece5 which is commit 13619170303878e1dae86d9a58b039475c957fcf upstream. It is reported to cause boot crashes in Android systems, so revert it from the stable trees for now. Cc: Rob Clark Cc: Georgi Djakov Cc: Guenter Roeck Cc: Jon Hunter Signed-off-by: Greg Kroah-Hartman --- drivers/interconnect/core.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 87f380e0e982..4526ff2e1bd5 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -1146,21 +1146,13 @@ void icc_sync_state(struct device *dev) } } } - mutex_unlock(&icc_bw_lock); mutex_unlock(&icc_lock); } EXPORT_SYMBOL_GPL(icc_sync_state); static int __init icc_init(void) { - struct device_node *root; - - /* Teach lockdep about lock ordering wrt. shrinker: */ - fs_reclaim_acquire(GFP_KERNEL); - might_lock(&icc_bw_lock); - fs_reclaim_release(GFP_KERNEL); - - root = of_find_node_by_path("/"); + struct device_node *root = of_find_node_by_path("/"); providers_count = of_count_icc_providers(root); of_node_put(root); From 29d3e02fb448b50ffd5d83156de9680daf16f47a Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Mar 2024 01:23:53 -0800 Subject: [PATCH 206/216] x86/bugs: Add asm helpers for executing VERW commit baf8361e54550a48a7087b603313ad013cc13386 upstream. MDS mitigation requires clearing the CPU buffers before returning to user. This needs to be done late in the exit-to-user path. Current location of VERW leaves a possibility of kernel data ending up in CPU buffers for memory accesses done after VERW such as: 1. Kernel data accessed by an NMI between VERW and return-to-user can remain in CPU buffers since NMI returning to kernel does not execute VERW to clear CPU buffers. 2. Alyssa reported that after VERW is executed, CONFIG_GCC_PLUGIN_STACKLEAK=y scrubs the stack used by a system call. Memory accesses during stack scrubbing can move kernel stack contents into CPU buffers. 3. When caller saved registers are restored after a return from function executing VERW, the kernel stack accesses can remain in CPU buffers(since they occur after VERW). To fix this VERW needs to be moved very late in exit-to-user path. In preparation for moving VERW to entry/exit asm code, create macros that can be used in asm. Also make VERW patching depend on a new feature flag X86_FEATURE_CLEAR_CPU_BUF. [pawan: - Runtime patch jmp instead of verw in macro CLEAR_CPU_BUFFERS due to lack of relative addressing support for relocations in kernels < v6.5. - Add UNWIND_HINT_EMPTY to avoid warning: arch/x86/entry/entry.o: warning: objtool: mds_verw_sel+0x0: unreachable instruction] Reported-by: Alyssa Milburn Suggested-by: Andrew Cooper Suggested-by: Peter Zijlstra Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-1-a6216d83edb7%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry.S | 23 +++++++++++++++++++++++ arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/nospec-branch.h | 15 +++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index bfb7bcb362bc..09e99d13fc0b 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include .pushsection .noinstr.text, "ax" @@ -20,3 +23,23 @@ SYM_FUNC_END(entry_ibpb) EXPORT_SYMBOL_GPL(entry_ibpb); .popsection + +/* + * Define the VERW operand that is disguised as entry code so that + * it can be referenced with KPTI enabled. This ensure VERW can be + * used late in exit-to-user path after page tables are switched. + */ +.pushsection .entry.text, "ax" + +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_START_NOALIGN(mds_verw_sel) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + .word __KERNEL_DS +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_END(mds_verw_sel); +/* For KVM */ +EXPORT_SYMBOL_GPL(mds_verw_sel); + +.popsection + diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b122708792c4..b60f24b30cb9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -304,7 +304,7 @@ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ - +#define X86_FEATURE_CLEAR_CPU_BUF (11*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d3706de91a93..2c66b2081f87 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -194,6 +194,19 @@ #endif .endm +/* + * Macro to execute VERW instruction that mitigate transient data sampling + * attacks such as MDS. On affected systems a microcode update overloaded VERW + * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. + * + * Note: Only the memory operand variant of VERW clears the CPU buffers. + */ +.macro CLEAR_CPU_BUFFERS + ALTERNATIVE "jmp .Lskip_verw_\@", "", X86_FEATURE_CLEAR_CPU_BUF + verw _ASM_RIP(mds_verw_sel) +.Lskip_verw_\@: +.endm + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -375,6 +388,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +extern u16 mds_verw_sel; + #include /** From 22444d079b4ccc608b9bac3e591cd88629c73df7 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Mar 2024 01:23:59 -0800 Subject: [PATCH 207/216] x86/entry_64: Add VERW just before userspace transition commit 3c7501722e6b31a6e56edd23cea5e77dbb9ffd1a upstream. Mitigation for MDS is to use VERW instruction to clear any secrets in CPU Buffers. Any memory accesses after VERW execution can still remain in CPU buffers. It is safer to execute VERW late in return to user path to minimize the window in which kernel data can end up in CPU buffers. There are not many kernel secrets to be had after SWITCH_TO_USER_CR3. Add support for deploying VERW mitigation after user register state is restored. This helps minimize the chances of kernel data ending up into CPU buffers after executing VERW. Note that the mitigation at the new location is not yet enabled. Corner case not handled ======================= Interrupts returning to kernel don't clear CPUs buffers since the exit-to-user path is expected to do that anyways. But, there could be a case when an NMI is generated in kernel after the exit-to-user path has cleared the buffers. This case is not handled and NMI returning to kernel don't clear CPU buffers because: 1. It is rare to get an NMI after VERW, but before returning to user. 2. For an unprivileged user, there is no known way to make that NMI less rare or target it. 3. It would take a large number of these precisely-timed NMIs to mount an actual attack. There's presumably not enough bandwidth. 4. The NMI in question occurs after a VERW, i.e. when user state is restored and most interesting data is already scrubbed. Whats left is only the data that NMI touches, and that may or may not be of any interest. [ pawan: resolved conflict for hunk swapgs_restore_regs_and_return_to_usermode in backport ] Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-2-a6216d83edb7%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 11 +++++++++++ arch/x86/entry/entry_64_compat.S | 1 + 2 files changed, 12 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9953d966d124..c2383c2880ec 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -223,6 +223,7 @@ syscall_return_via_sysret: SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR swapgs + CLEAR_CPU_BUFFERS sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -656,6 +657,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) /* Restore RDI. */ popq %rdi swapgs + CLEAR_CPU_BUFFERS jmp .Lnative_iret @@ -767,6 +769,8 @@ native_irq_return_ldt: */ popq %rax /* Restore user RAX */ + CLEAR_CPU_BUFFERS + /* * RSP now points to an ordinary IRET frame, except that the page * is read-only and RSP[31:16] are preloaded with the userspace @@ -1493,6 +1497,12 @@ nmi_restore: std movq $0, 5*8(%rsp) /* clear "NMI executing" */ + /* + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like + * NMI in kernel after user state is restored. For an unprivileged user + * these conditions are hard to meet. + */ + /* * iretq reads the "iret" frame and exits the NMI stack in a * single instruction. We are returning to kernel mode, so this @@ -1511,6 +1521,7 @@ SYM_CODE_START(ignore_sysret) UNWIND_HINT_EMPTY ENDBR mov $-ENOSYS, %eax + CLEAR_CPU_BUFFERS sysretl SYM_CODE_END(ignore_sysret) #endif diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index d6c08d8986b1..4bcd009a232b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -272,6 +272,7 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) xorl %r9d, %r9d xorl %r10d, %r10d swapgs + CLEAR_CPU_BUFFERS sysretl SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR From 2e3087505ddb8ba2d3d4c81306cca11e868fcdb9 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Mar 2024 01:24:05 -0800 Subject: [PATCH 208/216] x86/entry_32: Add VERW just before userspace transition commit a0e2dab44d22b913b4c228c8b52b2a104434b0b3 upstream. As done for entry_64, add support for executing VERW late in exit to user path for 32-bit mode. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-3-a6216d83edb7%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e309e7156038..ee5def1060c8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -912,6 +912,7 @@ SYM_FUNC_START(entry_SYSENTER_32) BUG_IF_WRONG_CR3 no_user_check=1 popfl popl %eax + CLEAR_CPU_BUFFERS /* * Return back to the vDSO, which will pop ecx and edx. @@ -981,6 +982,7 @@ restore_all_switch_stack: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code + CLEAR_CPU_BUFFERS .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -1173,6 +1175,7 @@ SYM_CODE_START(asm_exc_nmi) /* Not on SYSENTER stack. */ call exc_nmi + CLEAR_CPU_BUFFERS jmp .Lnmi_return .Lnmi_from_sysenter_stack: From 07946d956b55703102d5eb1518888f0d0ac87e14 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Mar 2024 01:24:11 -0800 Subject: [PATCH 209/216] x86/bugs: Use ALTERNATIVE() instead of mds_user_clear static key commit 6613d82e617dd7eb8b0c40b2fe3acea655b1d611 upstream. The VERW mitigation at exit-to-user is enabled via a static branch mds_user_clear. This static branch is never toggled after boot, and can be safely replaced with an ALTERNATIVE() which is convenient to use in asm. Switch to ALTERNATIVE() to use the VERW mitigation late in exit-to-user path. Also remove the now redundant VERW in exc_nmi() and arch_exit_to_user_mode(). Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240213-delay-verw-v8-4-a6216d83edb7%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- Documentation/x86/mds.rst | 34 ++++++++++++++++++++-------- arch/x86/include/asm/entry-common.h | 1 - arch/x86/include/asm/nospec-branch.h | 12 ---------- arch/x86/kernel/cpu/bugs.c | 15 +++++------- arch/x86/kernel/nmi.c | 3 --- arch/x86/kvm/vmx/vmx.c | 2 +- 6 files changed, 32 insertions(+), 35 deletions(-) diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 5d4330be200f..e801df0bb3a8 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -95,6 +95,9 @@ The kernel provides a function to invoke the buffer clearing: mds_clear_cpu_buffers() +Also macro CLEAR_CPU_BUFFERS can be used in ASM late in exit-to-user path. +Other than CFLAGS.ZF, this macro doesn't clobber any registers. + The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state (idle) transitions. @@ -138,17 +141,30 @@ Mitigation points When transitioning from kernel to user space the CPU buffers are flushed on affected CPUs when the mitigation is not disabled on the kernel - command line. The migitation is enabled through the static key - mds_user_clear. + command line. The mitigation is enabled through the feature flag + X86_FEATURE_CLEAR_CPU_BUF. - The mitigation is invoked in prepare_exit_to_usermode() which covers - all but one of the kernel to user space transitions. The exception - is when we return from a Non Maskable Interrupt (NMI), which is - handled directly in do_nmi(). + The mitigation is invoked just before transitioning to userspace after + user registers are restored. This is done to minimize the window in + which kernel data could be accessed after VERW e.g. via an NMI after + VERW. - (The reason that NMI is special is that prepare_exit_to_usermode() can - enable IRQs. In NMI context, NMIs are blocked, and we don't want to - enable IRQs with NMIs blocked.) + **Corner case not handled** + Interrupts returning to kernel don't clear CPUs buffers since the + exit-to-user path is expected to do that anyways. But, there could be + a case when an NMI is generated in kernel after the exit-to-user path + has cleared the buffers. This case is not handled and NMI returning to + kernel don't clear CPU buffers because: + + 1. It is rare to get an NMI after VERW, but before returning to userspace. + 2. For an unprivileged user, there is no known way to make that NMI + less rare or target it. + 3. It would take a large number of these precisely-timed NMIs to mount + an actual attack. There's presumably not enough bandwidth. + 4. The NMI in question occurs after a VERW, i.e. when user state is + restored and most interesting data is already scrubbed. Whats left + is only the data that NMI touches, and that may or may not be of + any interest. 2. C-State transition diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 11203a9fe0a8..ffe72790ceaf 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -91,7 +91,6 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, static __always_inline void arch_exit_to_user_mode(void) { - mds_user_clear_cpu_buffers(); amd_clear_divider(); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 2c66b2081f87..8f6f17a8617b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -381,7 +381,6 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); @@ -415,17 +414,6 @@ static __always_inline void mds_clear_cpu_buffers(void) asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); } -/** - * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability - * - * Clear CPU buffers if the corresponding static key is enabled - */ -static __always_inline void mds_user_clear_cpu_buffers(void) -{ - if (static_branch_likely(&mds_user_clear)) - mds_clear_cpu_buffers(); -} - /** * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability * diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 13dffc43ded0..d1895930e6eb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -110,9 +110,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -/* Control MDS CPU buffer clear before returning to user space */ -DEFINE_STATIC_KEY_FALSE(mds_user_clear); -EXPORT_SYMBOL_GPL(mds_user_clear); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -251,7 +248,7 @@ static void __init mds_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) @@ -355,7 +352,7 @@ static void __init taa_select_mitigation(void) * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); @@ -423,7 +420,7 @@ static void __init mmio_select_mitigation(void) */ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else static_branch_enable(&mmio_stale_data_clear); @@ -483,12 +480,12 @@ static void __init md_clear_update_mitigation(void) if (cpu_mitigations_off()) return; - if (!static_key_enabled(&mds_user_clear)) + if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) goto out; /* - * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data - * mitigation, if necessary. + * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO + * Stale Data mitigation, if necessary. */ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index cec0bfa3bc04..ed6cce6c3950 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -522,9 +522,6 @@ nmi_restart: write_cr2(this_cpu_read(nmi_cr2)); if (this_cpu_dec_return(nmi_state)) goto nmi_restart; - - if (user_mode(regs)) - mds_user_clear_cpu_buffers(); } #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 57c1374fdfd4..3b76f1bf001e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7123,7 +7123,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mds_user_clear)) + else if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF)) mds_clear_cpu_buffers(); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) From edfaad334a11d4fba21cbd860ba9a61213f4bd0b Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Mar 2024 01:24:18 -0800 Subject: [PATCH 210/216] KVM/VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs. VMLAUNCH From: Sean Christopherson commit 706a189dcf74d3b3f955e9384785e726ed6c7c80 upstream. Use EFLAGS.CF instead of EFLAGS.ZF to track whether to use VMRESUME versus VMLAUNCH. Freeing up EFLAGS.ZF will allow doing VERW, which clobbers ZF, for MDS mitigations as late as possible without needing to duplicate VERW for both paths. [ pawan: resolved merge conflict in __vmx_vcpu_run in backport. ] Signed-off-by: Sean Christopherson Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/all/20240213-delay-verw-v8-5-a6216d83edb7%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx/run_flags.h | 7 +++++-- arch/x86/kvm/vmx/vmenter.S | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h index edc3f16cc189..6a9bfdfbb6e5 100644 --- a/arch/x86/kvm/vmx/run_flags.h +++ b/arch/x86/kvm/vmx/run_flags.h @@ -2,7 +2,10 @@ #ifndef __KVM_X86_VMX_RUN_FLAGS_H #define __KVM_X86_VMX_RUN_FLAGS_H -#define VMX_RUN_VMRESUME (1 << 0) -#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) +#define VMX_RUN_VMRESUME_SHIFT 0 +#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 + +#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) +#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) #endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0b5db4de4d09..42c0b2c3aee1 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -106,7 +106,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - testb $VMX_RUN_VMRESUME, %bl + bt $VMX_RUN_VMRESUME_SHIFT, %bx /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -128,8 +128,8 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Check EFLAGS.ZF from 'testb' above */ - jz .Lvmlaunch + /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ + jnc .Lvmlaunch /* * After a successful VMRESUME/VMLAUNCH, control flow "magically" From da67116b74e6aa9c531de386e1d99f2e460d1cc4 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 4 Mar 2024 01:24:24 -0800 Subject: [PATCH 211/216] KVM/VMX: Move VERW closer to VMentry for MDS mitigation commit 43fb862de8f628c5db5e96831c915b9aebf62d33 upstream. During VMentry VERW is executed to mitigate MDS. After VERW, any memory access like register push onto stack may put host data in MDS affected CPU buffers. A guest can then use MDS to sample host data. Although likelihood of secrets surviving in registers at current VERW callsite is less, but it can't be ruled out. Harden the MDS mitigation by moving the VERW mitigation late in VMentry path. Note that VERW for MMIO Stale Data mitigation is unchanged because of the complexity of per-guest conditional VERW which is not easy to handle that late in asm with no GPRs available. If the CPU is also affected by MDS, VERW is unconditionally executed late in asm regardless of guest having MMIO access. [ pawan: conflict resolved in backport ] Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Acked-by: Sean Christopherson Link: https://lore.kernel.org/all/20240213-delay-verw-v8-6-a6216d83edb7%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx/vmenter.S | 3 +++ arch/x86/kvm/vmx/vmx.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 42c0b2c3aee1..0b2cad66dee1 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -128,6 +128,9 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX + /* Clobbers EFLAGS.ZF */ + CLEAR_CPU_BUFFERS + /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ jnc .Lvmlaunch diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3b76f1bf001e..5c1590855ffc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -407,7 +407,8 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && + vmx_fb_clear_ctrl_available; /* * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS @@ -7120,11 +7121,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, { guest_state_enter_irqoff(); - /* L1D Flush includes CPU buffer clear to mitigate MDS */ + /* + * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW + * mitigation for MDS is done late in VMentry and is still + * executed in spite of L1D Flush. This is because an extra VERW + * should not matter much after the big hammer L1D Flush. + */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF)) - mds_clear_cpu_buffers(); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); From 5fafd8254add75d8337df44ba8536e407ffe8928 Mon Sep 17 00:00:00 2001 From: Louis DeLosSantos Date: Wed, 31 May 2023 15:38:48 -0400 Subject: [PATCH 212/216] bpf: Add table ID to bpf_fib_lookup BPF helper commit 8ad77e72caae22a1ddcfd0c03f2884929e93b7a4 upstream. Add ability to specify routing table ID to the `bpf_fib_lookup` BPF helper. A new field `tbid` is added to `struct bpf_fib_lookup` used as parameters to the `bpf_fib_lookup` BPF helper. When the helper is called with the `BPF_FIB_LOOKUP_DIRECT` and `BPF_FIB_LOOKUP_TBID` flags the `tbid` field in `struct bpf_fib_lookup` will be used as the table ID for the fib lookup. If the `tbid` does not exist the fib lookup will fail with `BPF_FIB_LKUP_RET_NOT_FWDED`. The `tbid` field becomes a union over the vlan related output fields in `struct bpf_fib_lookup` and will be zeroed immediately after usage. This functionality is useful in containerized environments. For instance, if a CNI wants to dictate the next-hop for traffic leaving a container it can create a container-specific routing table and perform a fib lookup against this table in a "host-net-namespace-side" TC program. This functionality also allows `ip rule` like functionality at the TC layer, allowing an eBPF program to pick a routing table based on some aspect of the sk_buff. As a concrete use case, this feature will be used in Cilium's SRv6 L3VPN datapath. When egress traffic leaves a Pod an eBPF program attached by Cilium will determine which VRF the egress traffic should target, and then perform a FIB lookup in a specific table representing this VRF's FIB. Signed-off-by: Louis DeLosSantos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230505-bpf-add-tbid-fib-lookup-v2-1-0a31c22c748c@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/bpf.h | 21 ++++++++++++++++++--- net/core/filter.c | 14 +++++++++++++- tools/include/uapi/linux/bpf.h | 21 ++++++++++++++++++--- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 201dc77ebbd7..02cf4d9d8eab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3109,6 +3109,10 @@ union bpf_attr { * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). @@ -6687,6 +6691,7 @@ enum { BPF_FIB_LOOKUP_DIRECT = (1U << 0), BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), }; enum { @@ -6747,9 +6752,19 @@ struct bpf_fib_lookup { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; - /* output */ - __be16 h_vlan_proto; - __be16 h_vlan_TCI; + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. + */ + __u32 tbid; + }; + __u8 smac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */ }; diff --git a/net/core/filter.c b/net/core/filter.c index 3a6110ea4009..085d21108565 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5752,6 +5752,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -5885,6 +5891,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib6_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -5957,7 +5969,7 @@ set_fwd_params: #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ - BPF_FIB_LOOKUP_SKIP_NEIGH) + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 201dc77ebbd7..02cf4d9d8eab 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3109,6 +3109,10 @@ union bpf_attr { * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). @@ -6687,6 +6691,7 @@ enum { BPF_FIB_LOOKUP_DIRECT = (1U << 0), BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), }; enum { @@ -6747,9 +6752,19 @@ struct bpf_fib_lookup { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; - /* output */ - __be16 h_vlan_proto; - __be16 h_vlan_TCI; + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. + */ + __u32 tbid; + }; + __u8 smac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */ }; From 2d7ebcb5d878b4311db56eeaf7bdd76dbe9b9a13 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Sat, 7 Oct 2023 10:14:14 +0200 Subject: [PATCH 213/216] bpf: Derive source IP addr via bpf_*_fib_lookup() commit dab4e1f06cabb6834de14264394ccab197007302 upstream. Extend the bpf_fib_lookup() helper by making it to return the source IPv4/IPv6 address if the BPF_FIB_LOOKUP_SRC flag is set. For example, the following snippet can be used to derive the desired source IP address: struct bpf_fib_lookup p = { .ipv4_dst = ip4->daddr }; ret = bpf_skb_fib_lookup(skb, p, sizeof(p), BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH); if (ret != BPF_FIB_LKUP_RET_SUCCESS) return TC_ACT_SHOT; /* the p.ipv4_src now contains the source address */ The inability to derive the proper source address may cause malfunctions in BPF-based dataplanes for hosts containing netdevs with more than one routable IP address or for multi-homed hosts. For example, Cilium implements packet masquerading in BPF. If an egressing netdev to which the Cilium's BPF prog is attached has multiple IP addresses, then only one [hardcoded] IP address can be used for masquerading. This breaks connectivity if any other IP address should have been selected instead, for example, when a public and private addresses are attached to the same egress interface. The change was tested with Cilium [1]. Nikolay Aleksandrov helped to figure out the IPv6 addr selection. [1]: https://github.com/cilium/cilium/pull/28283 Signed-off-by: Martynas Pumputis Link: https://lore.kernel.org/r/20231007081415.33502-2-m@lambda.lt Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- include/net/ipv6_stubs.h | 5 +++++ include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 18 +++++++++++++++++- net/ipv6/af_inet6.c | 1 + tools/include/uapi/linux/bpf.h | 10 ++++++++++ 5 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index c48186bf4737..21da31e1dff5 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -85,6 +85,11 @@ struct ipv6_bpf_stub { sockptr_t optval, unsigned int optlen); int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); + int (*ipv6_dev_get_saddr)(struct net *net, + const struct net_device *dst_dev, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 02cf4d9d8eab..d5d2183730b9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3121,6 +3121,11 @@ union bpf_attr { * and *params*->smac will not be set as output. A common * use case is to call **bpf_redirect_neigh**\ () after * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -6692,6 +6697,7 @@ enum { BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), }; enum { @@ -6704,6 +6710,7 @@ enum { BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ }; struct bpf_fib_lookup { @@ -6738,6 +6745,9 @@ struct bpf_fib_lookup { __u32 rt_metric; }; + /* input: source address to consider for lookup + * output: source address result from lookup + */ union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ diff --git a/net/core/filter.c b/net/core/filter.c index 085d21108565..cb7c4651eaec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5809,6 +5809,9 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) + params->ipv4_src = fib_result_prefsrc(net, &res); + /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ @@ -5951,6 +5954,18 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) { + if (res.f6i->fib6_prefsrc.plen) { + *src = res.f6i->fib6_prefsrc.addr; + } else { + err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, + &fl6.daddr, 0, + src); + if (err) + return BPF_FIB_LKUP_RET_NO_SRC_ADDR; + } + } + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; @@ -5969,7 +5984,8 @@ set_fwd_params: #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ - BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID) + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ + BPF_FIB_LOOKUP_SRC) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0b42eb8c55aa..62247621cea5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1077,6 +1077,7 @@ static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .udp6_lib_lookup = __udp6_lib_lookup, .ipv6_setsockopt = do_ipv6_setsockopt, .ipv6_getsockopt = do_ipv6_getsockopt, + .ipv6_dev_get_saddr = ipv6_dev_get_saddr, }; static int __init inet6_init(void) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 02cf4d9d8eab..d5d2183730b9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3121,6 +3121,11 @@ union bpf_attr { * and *params*->smac will not be set as output. A common * use case is to call **bpf_redirect_neigh**\ () after * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -6692,6 +6697,7 @@ enum { BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), }; enum { @@ -6704,6 +6710,7 @@ enum { BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ }; struct bpf_fib_lookup { @@ -6738,6 +6745,9 @@ struct bpf_fib_lookup { __u32 rt_metric; }; + /* input: source address to consider for lookup + * output: source address result from lookup + */ union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ From 8866334e35102d054160a86750b7db9203f721f9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Jan 2024 12:14:30 +0100 Subject: [PATCH 214/216] x86/efistub: Give up if memory attribute protocol returns an error commit a7a6a01f88e87dec4bf2365571dd2dc7403d52d0 upstream. The recently introduced EFI memory attributes protocol should be used if it exists to ensure that the memory allocation created for the kernel permits execution. This is needed for compatibility with tightened requirements related to Windows logo certification for x86 PCs. Currently, we simply strip the execute protect (XP) attribute from the entire range, but this might be rejected under some firmware security policies, and so in a subsequent patch, this will be changed to only strip XP from the executable region that runs early, and make it read-only (RO) as well. In order to catch any issues early, ensure that the memory attribute protocol works as intended, and give up if it produces spurious errors. Note that the DXE services based fallback was always based on best effort, so don't propagate any errors returned by that API. Fixes: a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/libstub/x86-stub.c | 24 ++++++++++++++---------- drivers/firmware/efi/libstub/x86-stub.h | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index a0757a37b482..784e1b2ae5cc 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -212,8 +212,8 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) } } -void efi_adjust_memory_range_protection(unsigned long start, - unsigned long size) +efi_status_t efi_adjust_memory_range_protection(unsigned long start, + unsigned long size) { efi_status_t status; efi_gcd_memory_space_desc_t desc; @@ -225,13 +225,17 @@ void efi_adjust_memory_range_protection(unsigned long start, rounded_end = roundup(start + size, EFI_PAGE_SIZE); if (memattr != NULL) { - efi_call_proto(memattr, clear_memory_attributes, rounded_start, - rounded_end - rounded_start, EFI_MEMORY_XP); - return; + status = efi_call_proto(memattr, clear_memory_attributes, + rounded_start, + rounded_end - rounded_start, + EFI_MEMORY_XP); + if (status != EFI_SUCCESS) + efi_warn("Failed to clear EFI_MEMORY_XP attribute\n"); + return status; } if (efi_dxe_table == NULL) - return; + return EFI_SUCCESS; /* * Don't modify memory region attributes, they are @@ -244,7 +248,7 @@ void efi_adjust_memory_range_protection(unsigned long start, status = efi_dxe_call(get_memory_space_descriptor, start, &desc); if (status != EFI_SUCCESS) - return; + break; next = desc.base_address + desc.length; @@ -269,8 +273,10 @@ void efi_adjust_memory_range_protection(unsigned long start, unprotect_start, unprotect_start + unprotect_size, status); + break; } } + return EFI_SUCCESS; } static efi_char16_t *efistub_fw_vendor(void) @@ -800,9 +806,7 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) *kernel_entry = addr + entry; - efi_adjust_memory_range_protection(addr, kernel_total_size); - - return EFI_SUCCESS; + return efi_adjust_memory_range_protection(addr, kernel_total_size); } static void __noreturn enter_kernel(unsigned long kernel_addr, diff --git a/drivers/firmware/efi/libstub/x86-stub.h b/drivers/firmware/efi/libstub/x86-stub.h index 37c5a36b9d8c..1c20e99a6494 100644 --- a/drivers/firmware/efi/libstub/x86-stub.h +++ b/drivers/firmware/efi/libstub/x86-stub.h @@ -5,8 +5,8 @@ extern void trampoline_32bit_src(void *, bool); extern const u16 trampoline_ljmp_imm_offset; -void efi_adjust_memory_range_protection(unsigned long start, - unsigned long size); +efi_status_t efi_adjust_memory_range_protection(unsigned long start, + unsigned long size); #ifdef CONFIG_X86_64 efi_status_t efi_setup_5level_paging(void); From 585a344af6bcac222608a158fc2830ff02712af5 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Wed, 24 Jan 2024 16:31:28 +0000 Subject: [PATCH 215/216] xen/events: close evtchn after mapping cleanup commit fa765c4b4aed2d64266b694520ecb025c862c5a9 upstream. shutdown_pirq and startup_pirq are not taking the irq_mapping_update_lock because they can't due to lock inversion. Both are called with the irq_desc->lock being taking. The lock order, however, is first irq_mapping_update_lock and then irq_desc->lock. This opens multiple races: - shutdown_pirq can be interrupted by a function that allocates an event channel: CPU0 CPU1 shutdown_pirq { xen_evtchn_close(e) __startup_pirq { EVTCHNOP_bind_pirq -> returns just freed evtchn e set_evtchn_to_irq(e, irq) } xen_irq_info_cleanup() { set_evtchn_to_irq(e, -1) } } Assume here event channel e refers here to the same event channel number. After this race the evtchn_to_irq mapping for e is invalid (-1). - __startup_pirq races with __unbind_from_irq in a similar way. Because __startup_pirq doesn't take irq_mapping_update_lock it can grab the evtchn that __unbind_from_irq is currently freeing and cleaning up. In this case even though the event channel is allocated, its mapping can be unset in evtchn_to_irq. The fix is to first cleanup the mappings and then close the event channel. In this way, when an event channel gets allocated it's potential previous evtchn_to_irq mappings are guaranteed to be unset already. This is also the reverse order of the allocation where first the event channel is allocated and then the mappings are setup. On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces"), we hit a BUG like the following during probing of NVMe devices. The issue is that during nvme_setup_io_queues, pci_free_irq is called for every device which results in a call to shutdown_pirq. With many nvme devices it's therefore likely to hit this race during boot because there will be multiple calls to shutdown_pirq and startup_pirq are running potentially in parallel. ------------[ cut here ]------------ blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled kernel BUG at drivers/xen/events/events_base.c:499! invalid opcode: 0000 [#1] SMP PTI CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1 Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006 Workqueue: nvme-reset-wq nvme_reset_work RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0 Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_trace_log_lvl+0x1c1/0x2d9 ? show_trace_log_lvl+0x1c1/0x2d9 ? set_affinity_irq+0xdc/0x1c0 ? __die_body.cold+0x8/0xd ? die+0x2b/0x50 ? do_trap+0x90/0x110 ? bind_evtchn_to_cpu+0xdf/0xf0 ? do_error_trap+0x65/0x80 ? bind_evtchn_to_cpu+0xdf/0xf0 ? exc_invalid_op+0x4e/0x70 ? bind_evtchn_to_cpu+0xdf/0xf0 ? asm_exc_invalid_op+0x12/0x20 ? bind_evtchn_to_cpu+0xdf/0xf0 ? bind_evtchn_to_cpu+0xc5/0xf0 set_affinity_irq+0xdc/0x1c0 irq_do_set_affinity+0x1d7/0x1f0 irq_setup_affinity+0xd6/0x1a0 irq_startup+0x8a/0xf0 __setup_irq+0x639/0x6d0 ? nvme_suspend+0x150/0x150 request_threaded_irq+0x10c/0x180 ? nvme_suspend+0x150/0x150 pci_request_irq+0xa8/0xf0 ? __blk_mq_free_request+0x74/0xa0 queue_request_irq+0x6f/0x80 nvme_create_queue+0x1af/0x200 nvme_create_io_queues+0xbd/0xf0 nvme_setup_io_queues+0x246/0x320 ? nvme_irq_check+0x30/0x30 nvme_reset_work+0x1c8/0x400 process_one_work+0x1b0/0x350 worker_thread+0x49/0x310 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace a11715de1eee1873 ]--- Fixes: d46a78b05c0e ("xen: implement pirq type event channels") Cc: stable@vger.kernel.org Co-debugged-by: Andrew Panyakin Signed-off-by: Maximilian Heyne Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de Signed-off-by: Juergen Gross Signed-off-by: Maximilian Heyne Signed-off-by: Greg Kroah-Hartman --- drivers/xen/events/events_base.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 00f8e349921d..96b96516c980 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -937,8 +937,8 @@ static void shutdown_pirq(struct irq_data *data) return; do_mask(info, EVT_MASK_REASON_EXPLICIT); - xen_evtchn_close(evtchn); xen_irq_info_cleanup(info); + xen_evtchn_close(evtchn); } static void enable_pirq(struct irq_data *data) @@ -982,8 +982,6 @@ static void __unbind_from_irq(unsigned int irq) unsigned int cpu = cpu_from_irq(irq); struct xenbus_device *dev; - xen_evtchn_close(evtchn); - switch (type_from_irq(irq)) { case IRQT_VIRQ: per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; @@ -1001,6 +999,7 @@ static void __unbind_from_irq(unsigned int irq) } xen_irq_info_cleanup(info); + xen_evtchn_close(evtchn); } xen_free_irq(irq); From 61adba85cc40287232a539e607164f273260e0fe Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 6 Mar 2024 14:45:20 +0000 Subject: [PATCH 216/216] Linux 6.1.81 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://lore.kernel.org/r/20240304211556.993132804@linuxfoundation.org Tested-by: SeongJae Park Tested-by: Ron Economos Tested-by: Salvatore Bonaccorso Tested-by: Jon Hunter Tested-by: Pavel Machek (CIP) Tested-by: Shuah Khan Tested-by: Mateusz Jończyk Tested-by: Florian Fainelli Tested-by: Linux Kernel Functional Testing Tested-by: Yann Sionneau Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bc4adb561a7c..e13df565a1cb 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 80 +SUBLEVEL = 81 EXTRAVERSION = NAME = Curry Ramen