From 77965c98cffe41994dce3389c4aae80e2072f098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 1 Jul 2021 09:29:25 +0200 Subject: [PATCH 001/149] pwm: Move legacy driver handling into a dedicated function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no change in behaviour, only some code is moved from pwm_apply_state to a separate function. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 130 ++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 60 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index fb04a439462c..c4bbe12cd850 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -522,6 +522,64 @@ static void pwm_apply_state_debug(struct pwm_device *pwm, } } +static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + /* + * FIXME: restore the initial state in case of error. + */ + if (state->polarity != pwm->state.polarity) { + if (!chip->ops->set_polarity) + return -EINVAL; + + /* + * Changing the polarity of a running PWM is only allowed when + * the PWM driver implements ->apply(). + */ + if (pwm->state.enabled) { + chip->ops->disable(chip, pwm); + + /* + * Update pwm->state already here in case + * .set_polarity() or another callback depend on that. + */ + pwm->state.enabled = false; + } + + err = chip->ops->set_polarity(chip, pwm, state->polarity); + if (err) + return err; + + pwm->state.polarity = state->polarity; + } + + if (state->period != pwm->state.period || + state->duty_cycle != pwm->state.duty_cycle) { + err = chip->ops->config(pwm->chip, pwm, + state->duty_cycle, + state->period); + if (err) + return err; + + pwm->state.period = state->period; + pwm->state.duty_cycle = state->duty_cycle; + } + + if (state->enabled != pwm->state.enabled) { + if (!pwm->state.enabled) { + err = chip->ops->enable(chip, pwm); + if (err) + return err; + } else { + chip->ops->disable(chip, pwm); + } + } + + return 0; +} + /** * pwm_apply_state() - atomically apply a new state to a PWM device * @pwm: PWM device @@ -554,70 +612,22 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state) state->usage_power == pwm->state.usage_power) return 0; - if (chip->ops->apply) { + if (chip->ops->apply) err = chip->ops->apply(chip, pwm, state); - if (err) - return err; + else + err = pwm_apply_legacy(chip, pwm, state); + if (err) + return err; - trace_pwm_apply(pwm, state); + trace_pwm_apply(pwm, state); - pwm->state = *state; + pwm->state = *state; - /* - * only do this after pwm->state was applied as some - * implementations of .get_state depend on this - */ - pwm_apply_state_debug(pwm, state); - } else { - /* - * FIXME: restore the initial state in case of error. - */ - if (state->polarity != pwm->state.polarity) { - if (!chip->ops->set_polarity) - return -EINVAL; - - /* - * Changing the polarity of a running PWM is - * only allowed when the PWM driver implements - * ->apply(). - */ - if (pwm->state.enabled) { - chip->ops->disable(chip, pwm); - pwm->state.enabled = false; - } - - err = chip->ops->set_polarity(chip, pwm, - state->polarity); - if (err) - return err; - - pwm->state.polarity = state->polarity; - } - - if (state->period != pwm->state.period || - state->duty_cycle != pwm->state.duty_cycle) { - err = chip->ops->config(pwm->chip, pwm, - state->duty_cycle, - state->period); - if (err) - return err; - - pwm->state.duty_cycle = state->duty_cycle; - pwm->state.period = state->period; - } - - if (state->enabled != pwm->state.enabled) { - if (state->enabled) { - err = chip->ops->enable(chip, pwm); - if (err) - return err; - } else { - chip->ops->disable(chip, pwm); - } - - pwm->state.enabled = state->enabled; - } - } + /* + * only do this after pwm->state was applied as some + * implementations of .get_state depend on this + */ + pwm_apply_state_debug(pwm, state); return 0; } From 92f69e582e15bf281ff1ab3ccc7abdd8392550a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 1 Jul 2021 09:29:26 +0200 Subject: [PATCH 002/149] pwm: Prevent a glitch for legacy drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a running PWM is reconfigured to disabled calling the ->config() callback before disabling the hardware might result in a glitch where the (maybe) new period and duty_cycle are visible on the output before disabling the hardware. So handle disabling before calling ->config(). Also exit early in this case which is possible because period and duty_cycle don't matter for disabled PWMs. In return however ->config has to be called even if state->period == pwm->state.period && state->duty_cycle != pwm->state.duty_cycle because setting these might have been skipped in the previous call. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index c4bbe12cd850..dedf38a81bf9 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -555,26 +555,33 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, pwm->state.polarity = state->polarity; } - if (state->period != pwm->state.period || - state->duty_cycle != pwm->state.duty_cycle) { - err = chip->ops->config(pwm->chip, pwm, - state->duty_cycle, - state->period); - if (err) - return err; + if (!state->enabled) { + if (pwm->state.enabled) + chip->ops->disable(chip, pwm); - pwm->state.period = state->period; - pwm->state.duty_cycle = state->duty_cycle; + return 0; } - if (state->enabled != pwm->state.enabled) { - if (!pwm->state.enabled) { - err = chip->ops->enable(chip, pwm); - if (err) - return err; - } else { - chip->ops->disable(chip, pwm); - } + /* + * We cannot skip calling ->config even if state->period == + * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle + * because we might have exited early in the last call to + * pwm_apply_state because of !state->enabled and so the two values in + * pwm->state might not be configured in hardware. + */ + err = chip->ops->config(pwm->chip, pwm, + state->duty_cycle, + state->period); + if (err) + return err; + + pwm->state.period = state->period; + pwm->state.duty_cycle = state->duty_cycle; + + if (!pwm->state.enabled) { + err = chip->ops->enable(chip, pwm); + if (err) + return err; } return 0; From e45a178e9e285c86265611a56705a1e6444037e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 1 Jul 2021 09:29:27 +0200 Subject: [PATCH 003/149] pwm: Restore initial state if a legacy callback fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not entirely accurate to go back to the initial state after e.g. .enable() failed, as .config() still modified the hardware, but this same inconsistency exists for drivers that implement .apply(). Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index dedf38a81bf9..57b8cedfec3f 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -526,10 +526,8 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state) { int err; + struct pwm_state initial_state = pwm->state; - /* - * FIXME: restore the initial state in case of error. - */ if (state->polarity != pwm->state.polarity) { if (!chip->ops->set_polarity) return -EINVAL; @@ -550,7 +548,7 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, err = chip->ops->set_polarity(chip, pwm, state->polarity); if (err) - return err; + goto rollback; pwm->state.polarity = state->polarity; } @@ -573,7 +571,7 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, state->duty_cycle, state->period); if (err) - return err; + goto rollback; pwm->state.period = state->period; pwm->state.duty_cycle = state->duty_cycle; @@ -581,10 +579,14 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, if (!pwm->state.enabled) { err = chip->ops->enable(chip, pwm); if (err) - return err; + goto rollback; } return 0; + +rollback: + pwm->state = initial_state; + return err; } /** From 5e93d7782f7fda242e0e696da918f720660854bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 28 Oct 2021 12:09:42 +0200 Subject: [PATCH 004/149] pwm: twl: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushes down a slightly optimized variant of how legacy drivers are handled in the core. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-twl.c | 62 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c index 203194f2c92e..86567add79db 100644 --- a/drivers/pwm/pwm-twl.c +++ b/drivers/pwm/pwm-twl.c @@ -58,9 +58,9 @@ static inline struct twl_pwm_chip *to_twl(struct pwm_chip *chip) } static int twl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { - int duty_cycle = DIV_ROUND_UP(duty_ns * TWL_PWM_MAX, period_ns) + 1; + int duty_cycle = DIV64_U64_ROUND_UP(duty_ns * TWL_PWM_MAX, period_ns) + 1; u8 pwm_config[2] = { 1, 0 }; int base, ret; @@ -279,19 +279,65 @@ out: mutex_unlock(&twl->mutex); } +static int twl4030_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl4030_pwm_disable(chip, pwm); + + return 0; + } + + err = twl_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = twl4030_pwm_enable(chip, pwm); + + return err; +} + +static int twl6030_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl6030_pwm_disable(chip, pwm); + + return 0; + } + + err = twl_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = twl6030_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops twl4030_pwm_ops = { - .config = twl_pwm_config, - .enable = twl4030_pwm_enable, - .disable = twl4030_pwm_disable, + .apply = twl4030_pwm_apply, .request = twl4030_pwm_request, .free = twl4030_pwm_free, .owner = THIS_MODULE, }; static const struct pwm_ops twl6030_pwm_ops = { - .config = twl_pwm_config, - .enable = twl6030_pwm_enable, - .disable = twl6030_pwm_disable, + .apply = twl6030_pwm_apply, .owner = THIS_MODULE, }; From 0ee11b87c38b64c693881a53969056e16a03fcd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 29 Oct 2021 12:56:17 +0200 Subject: [PATCH 005/149] pwm: img: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushes down a slightly optimized variant of how legacy drivers are handled in the core. Signed-off-by: Uwe Kleine-König Tested-by: Hauke Mehrtens Signed-off-by: Thierry Reding --- drivers/pwm/pwm-img.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c index f97f82548293..1f3d6346ab86 100644 --- a/drivers/pwm/pwm-img.c +++ b/drivers/pwm/pwm-img.c @@ -184,10 +184,33 @@ static void img_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) pm_runtime_put_autosuspend(chip->dev); } +static int img_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + img_pwm_disable(chip, pwm); + + return 0; + } + + err = img_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = img_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops img_pwm_ops = { - .config = img_pwm_config, - .enable = img_pwm_enable, - .disable = img_pwm_disable, + .apply = img_pwm_apply, .owner = THIS_MODULE, }; From 14d8956548ad48574138d8fd377d434083c3c3cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 2 Nov 2021 10:28:03 +0100 Subject: [PATCH 006/149] pwm: vt8500: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushes down a slightly optimized variant of how legacy drivers are handled in the core. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-vt8500.c | 57 +++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index 480bfc29782f..7170a315535b 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -70,7 +70,7 @@ static inline void vt8500_pwm_busy_wait(struct vt8500_chip *vt8500, int nr, u8 b } static int vt8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { struct vt8500_chip *vt8500 = to_vt8500_chip(chip); unsigned long long c; @@ -102,8 +102,8 @@ static int vt8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } c = (unsigned long long)pv * duty_ns; - do_div(c, period_ns); - dc = c; + + dc = div64_u64(c, period_ns); writel(prescale, vt8500->base + REG_SCALAR(pwm->hwpwm)); vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_SCALAR_UPDATE); @@ -176,11 +176,54 @@ static int vt8500_pwm_set_polarity(struct pwm_chip *chip, return 0; } +static int vt8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity) { + /* + * Changing the polarity of a running PWM is only allowed when + * the PWM driver implements ->apply(). + */ + if (enabled) { + vt8500_pwm_disable(chip, pwm); + + enabled = false; + } + + err = vt8500_pwm_set_polarity(chip, pwm, state->polarity); + if (err) + return err; + } + + if (!state->enabled) { + if (enabled) + vt8500_pwm_disable(chip, pwm); + + return 0; + } + + /* + * We cannot skip calling ->config even if state->period == + * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle + * because we might have exited early in the last call to + * pwm_apply_state because of !state->enabled and so the two values in + * pwm->state might not be configured in hardware. + */ + err = vt8500_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!enabled) + err = vt8500_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops vt8500_pwm_ops = { - .enable = vt8500_pwm_enable, - .disable = vt8500_pwm_disable, - .config = vt8500_pwm_config, - .set_polarity = vt8500_pwm_set_polarity, + .apply = vt8500_pwm_apply, .owner = THIS_MODULE, }; From b6ce2af8766c39a5b09afa466ed4d0ef2d8b5a65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 12 Nov 2021 20:00:15 +0100 Subject: [PATCH 007/149] pwm: img: Use only a single idiom to get a runtime PM reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there are two very similar approaches in use by this driver: img_pwm_config() uses pm_runtime_get_sync() and calls pm_runtime_put_autosuspend() in the error path; img_pwm_enable() calls pm_runtime_resume_and_get() which already puts the reference in its own error path. Align pm_runtime usage and use the same idiom in both locations. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-img.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c index 1f3d6346ab86..5996049f66ec 100644 --- a/drivers/pwm/pwm-img.c +++ b/drivers/pwm/pwm-img.c @@ -128,11 +128,9 @@ static int img_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, duty = DIV_ROUND_UP(timebase * duty_ns, period_ns); - ret = pm_runtime_get_sync(chip->dev); - if (ret < 0) { - pm_runtime_put_autosuspend(chip->dev); + ret = pm_runtime_resume_and_get(chip->dev); + if (ret < 0) return ret; - } val = img_pwm_readl(pwm_chip, PWM_CTRL_CFG); val &= ~(PWM_CTRL_CFG_DIV_MASK << PWM_CTRL_CFG_DIV_SHIFT(pwm->hwpwm)); From 3f0565451cc0c5158513af0bc4e91aa8fb0b5e75 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Dec 2021 13:48:55 +0100 Subject: [PATCH 008/149] dt-bindings: pwm: Avoid selecting schema on node name match Currently any node whose name starts with the "pwm-" prefix will match this schema and in turn required the "#pwm-cells" property. Avoid this by marking the schema with select: false, therefore only activating the schema when directly included from a PWM controller schema file. Signed-off-by: Thierry Reding Acked-by: Rob Herring Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/pwm.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/pwm.yaml b/Documentation/devicetree/bindings/pwm/pwm.yaml index 2effe6c0de6b..3c01f85029e5 100644 --- a/Documentation/devicetree/bindings/pwm/pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/pwm.yaml @@ -9,6 +9,8 @@ title: PWM controllers (providers) maintainers: - Thierry Reding +select: false + properties: $nodename: pattern: "^pwm(@.*|-[0-9a-f])*$" From 21ab799585762e097387da82a4e0fd6c2ffb4000 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 1 Dec 2021 16:45:41 -0700 Subject: [PATCH 009/149] vfio/pci: Resolve sparse endian warnings in IGD support Sparse warns: sparse warnings: (new ones prefixed by >>) >> drivers/vfio/pci/vfio_pci_igd.c:146:21: sparse: sparse: incorrect type in assignment (different base types) @@ expected unsigned short [addressable] [usertype] val @@ got restricted __le16 [usertype] @@ drivers/vfio/pci/vfio_pci_igd.c:146:21: sparse: expected unsigned short [addressable] [usertype] val drivers/vfio/pci/vfio_pci_igd.c:146:21: sparse: got restricted __le16 [usertype] >> drivers/vfio/pci/vfio_pci_igd.c:161:21: sparse: sparse: incorrect type in assignment (different base types) @@ expected unsigned int [addressable] [usertype] val @@ got restricted __le32 [usertype] @@ drivers/vfio/pci/vfio_pci_igd.c:161:21: sparse: expected unsigned int [addressable] [usertype] val drivers/vfio/pci/vfio_pci_igd.c:161:21: sparse: got restricted __le32 [usertype] drivers/vfio/pci/vfio_pci_igd.c:176:21: sparse: sparse: incorrect type in assignment (different base types) @@ expected unsigned short [addressable] [usertype] val @@ got restricted __le16 [usertype] @@ drivers/vfio/pci/vfio_pci_igd.c:176:21: sparse: expected unsigned short [addressable] [usertype] val drivers/vfio/pci/vfio_pci_igd.c:176:21: sparse: got restricted __le16 [usertype] These are due to trying to use an unsigned to store the result of a cpu_to_leXX() conversion. These are small variables, so pointer tricks are wasteful and casting just generates different sparse warnings. Store to and copy results from a separate little endian variable. Reported-by: kernel test robot Link: https://lore.kernel.org/r/202111290026.O3vehj03-lkp@intel.com/ Link: https://lore.kernel.org/r/163840226123.138003.7668320168896210328.stgit@omen Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_igd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 362f91ec8845..352c725ccf18 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -309,13 +309,14 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, if ((pos & 3) && size > 2) { u16 val; + __le16 lval; ret = pci_user_read_config_word(pdev, pos, &val); if (ret) return ret; - val = cpu_to_le16(val); - if (copy_to_user(buf + count - size, &val, 2)) + lval = cpu_to_le16(val); + if (copy_to_user(buf + count - size, &lval, 2)) return -EFAULT; pos += 2; @@ -324,13 +325,14 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, while (size > 3) { u32 val; + __le32 lval; ret = pci_user_read_config_dword(pdev, pos, &val); if (ret) return ret; - val = cpu_to_le32(val); - if (copy_to_user(buf + count - size, &val, 4)) + lval = cpu_to_le32(val); + if (copy_to_user(buf + count - size, &lval, 4)) return -EFAULT; pos += 4; @@ -339,13 +341,14 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, while (size >= 2) { u16 val; + __le16 lval; ret = pci_user_read_config_word(pdev, pos, &val); if (ret) return ret; - val = cpu_to_le16(val); - if (copy_to_user(buf + count - size, &val, 2)) + lval = cpu_to_le16(val); + if (copy_to_user(buf + count - size, &lval, 2)) return -EFAULT; pos += 2; From 2bed2ced40c97b8540ff38df0149e8ecb2bf4c65 Mon Sep 17 00:00:00 2001 From: Jiacheng Shi Date: Sun, 12 Dec 2021 01:16:00 -0800 Subject: [PATCH 010/149] vfio/iommu_type1: replace kfree with kvfree Variables allocated by kvzalloc should not be freed by kfree. Because they may be allocated by vmalloc. So we replace kfree with kvfree here. Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking") Signed-off-by: Jiacheng Shi Link: https://lore.kernel.org/r/20211212091600.2560-1-billsjc@sjtu.edu.cn Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f17490ab238f..9394aa9444c1 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -256,7 +256,7 @@ static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize) static void vfio_dma_bitmap_free(struct vfio_dma *dma) { - kfree(dma->bitmap); + kvfree(dma->bitmap); dma->bitmap = NULL; } From 294277410cf3b46bee2b8282ab754e52975c0a70 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Tue, 23 Nov 2021 11:10:13 +0900 Subject: [PATCH 011/149] ksmbd: use oid registry functions to decode OIDs Use look_up_OID to decode OIDs rather than implementing functions. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/asn1.c | 142 +++++++----------------------------------------- 1 file changed, 19 insertions(+), 123 deletions(-) diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c index b014f4638610..c03eba090368 100644 --- a/fs/ksmbd/asn1.c +++ b/fs/ksmbd/asn1.c @@ -21,101 +21,11 @@ #include "ksmbd_spnego_negtokeninit.asn1.h" #include "ksmbd_spnego_negtokentarg.asn1.h" -#define SPNEGO_OID_LEN 7 #define NTLMSSP_OID_LEN 10 -#define KRB5_OID_LEN 7 -#define KRB5U2U_OID_LEN 8 -#define MSKRB5_OID_LEN 7 -static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; -static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; -static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; -static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; -static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; static char NTLMSSP_OID_STR[NTLMSSP_OID_LEN] = { 0x2b, 0x06, 0x01, 0x04, 0x01, 0x82, 0x37, 0x02, 0x02, 0x0a }; -static bool -asn1_subid_decode(const unsigned char **begin, const unsigned char *end, - unsigned long *subid) -{ - const unsigned char *ptr = *begin; - unsigned char ch; - - *subid = 0; - - do { - if (ptr >= end) - return false; - - ch = *ptr++; - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - - *begin = ptr; - return true; -} - -static bool asn1_oid_decode(const unsigned char *value, size_t vlen, - unsigned long **oid, size_t *oidlen) -{ - const unsigned char *iptr = value, *end = value + vlen; - unsigned long *optr; - unsigned long subid; - - vlen += 1; - if (vlen < 2 || vlen > UINT_MAX / sizeof(unsigned long)) - goto fail_nullify; - - *oid = kmalloc(vlen * sizeof(unsigned long), GFP_KERNEL); - if (!*oid) - return false; - - optr = *oid; - - if (!asn1_subid_decode(&iptr, end, &subid)) - goto fail; - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *oidlen = 2; - optr += 2; - - while (iptr < end) { - if (++(*oidlen) > vlen) - goto fail; - - if (!asn1_subid_decode(&iptr, end, optr++)) - goto fail; - } - return true; - -fail: - kfree(*oid); -fail_nullify: - *oid = NULL; - return false; -} - -static bool oid_eq(unsigned long *oid1, unsigned int oid1len, - unsigned long *oid2, unsigned int oid2len) -{ - if (oid1len != oid2len) - return false; - - return memcmp(oid1, oid2, oid1len) == 0; -} - int ksmbd_decode_negTokenInit(unsigned char *security_blob, int length, struct ksmbd_conn *conn) @@ -252,26 +162,18 @@ int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, int ksmbd_gssapi_this_mech(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { - unsigned long *oid; - size_t oidlen; - int err = 0; + enum OID oid; - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) { - err = -EBADMSG; - goto out; - } - - if (!oid_eq(oid, oidlen, SPNEGO_OID, SPNEGO_OID_LEN)) - err = -EBADMSG; - kfree(oid); -out: - if (err) { + oid = look_up_OID(value, vlen); + if (oid != OID_spnego) { char buf[50]; sprint_oid(value, vlen, buf, sizeof(buf)); ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; } - return err; + + return 0; } int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, @@ -279,37 +181,31 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, size_t vlen) { struct ksmbd_conn *conn = context; - unsigned long *oid; - size_t oidlen; + enum OID oid; int mech_type; - char buf[50]; - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) - goto fail; - - if (oid_eq(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) + oid = look_up_OID(value, vlen); + if (oid == OID_ntlmssp) { mech_type = KSMBD_AUTH_NTLMSSP; - else if (oid_eq(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) + } else if (oid == OID_mskrb5) { mech_type = KSMBD_AUTH_MSKRB5; - else if (oid_eq(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) + } else if (oid == OID_krb5) { mech_type = KSMBD_AUTH_KRB5; - else if (oid_eq(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN)) + } else if (oid == OID_krb5u2u) { mech_type = KSMBD_AUTH_KRB5U2U; - else - goto fail; + } else { + char buf[50]; + + sprint_oid(value, vlen, buf, sizeof(buf)); + ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; + } conn->auth_mechs |= mech_type; if (conn->preferred_auth_mech == 0) conn->preferred_auth_mech = mech_type; - kfree(oid); return 0; - -fail: - kfree(oid); - sprint_oid(value, vlen, buf, sizeof(buf)); - ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); - return -EBADMSG; } int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, From 80917f17e3f99027661a45262c310139e53a9faa Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Wed, 1 Dec 2021 21:41:19 +0100 Subject: [PATCH 012/149] ksmbd: Remove unused parameter from smb2_get_name() The 'share' parameter is no longer used by smb2_get_name() since commit 265fd1991c1d ("ksmbd: use LOOKUP_BENEATH to prevent the out of share access"). Acked-by: Namjae Jeon Signed-off-by: Marios Makassikis Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b8b3a4c28b74..225f2bcb46b6 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -610,7 +610,6 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) /** * smb2_get_name() - get filename string from on the wire smb format - * @share: ksmbd_share_config pointer * @src: source buffer * @maxlen: maxlen of source string * @nls_table: nls_table pointer @@ -618,8 +617,7 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) * Return: matching converted filename on success, otherwise error ptr */ static char * -smb2_get_name(struct ksmbd_share_config *share, const char *src, - const int maxlen, struct nls_table *local_nls) +smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) { char *name; @@ -2530,8 +2528,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - name = smb2_get_name(share, - req->Buffer, + name = smb2_get_name(req->Buffer, le16_to_cpu(req->NameLength), work->conn->local_nls); if (IS_ERR(name)) { @@ -5398,8 +5395,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; } - new_name = smb2_get_name(share, - file_info->FileName, + new_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(new_name)) { @@ -5510,8 +5506,7 @@ static int smb2_create_link(struct ksmbd_work *work, if (!pathname) return -ENOMEM; - link_name = smb2_get_name(share, - file_info->FileName, + link_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(link_name) || S_ISDIR(file_inode(filp)->i_mode)) { From 305f8bda15ebbe4004681286a5c67d0dc296c771 Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Wed, 1 Dec 2021 21:40:50 +0100 Subject: [PATCH 013/149] ksmbd: Remove unused fields from ksmbd_file struct definition These fields are remnants of the not upstreamed SMB1 code. Acked-by: Namjae Jeon Signed-off-by: Marios Makassikis Signed-off-by: Steve French --- fs/ksmbd/vfs_cache.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h index 448576fbe4b7..36239ce31afd 100644 --- a/fs/ksmbd/vfs_cache.h +++ b/fs/ksmbd/vfs_cache.h @@ -96,16 +96,6 @@ struct ksmbd_file { int durable_timeout; - /* for SMB1 */ - int pid; - - /* conflict lock fail count for SMB1 */ - unsigned int cflock_cnt; - /* last lock failure start offset for SMB1 */ - unsigned long long llock_fstart; - - int dirent_offset; - /* if ls is happening on directory, below is valid*/ struct ksmbd_readdir_data readdir_data; int dot_dotdot[2]; From a58b45a4dbfd0bf2ebb157789da4d8e6368afb1b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 16 Dec 2021 10:26:43 +0900 Subject: [PATCH 014/149] ksmbd: set RSS capable in FSCTL_QUERY_NETWORK_INTERFACE_INFO Set RSS capable in FSCTL_QUERY_NETWORK_INTERFACE_INFO if netdev has multi tx queues. And add ksmbd_compare_user() to avoid racy condition issue in ksmbd_free_user(). because windows client is simultaneously used to send session setup requests for multichannel connection. Tested-by: Ziwei Xie Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/mgmt/user_config.c | 10 ++++++++++ fs/ksmbd/mgmt/user_config.h | 1 + fs/ksmbd/smb2pdu.c | 15 ++++++++++----- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c index 1019d3677d55..279d00feff21 100644 --- a/fs/ksmbd/mgmt/user_config.c +++ b/fs/ksmbd/mgmt/user_config.c @@ -67,3 +67,13 @@ int ksmbd_anonymous_user(struct ksmbd_user *user) return 1; return 0; } + +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2) +{ + if (strcmp(u1->name, u2->name)) + return false; + if (memcmp(u1->passkey, u2->passkey, u1->passkey_sz)) + return false; + + return true; +} diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h index aff80b029579..6a44109617f1 100644 --- a/fs/ksmbd/mgmt/user_config.h +++ b/fs/ksmbd/mgmt/user_config.h @@ -64,4 +64,5 @@ struct ksmbd_user *ksmbd_login_user(const char *account); struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); #endif /* __USER_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 225f2bcb46b6..1ef8ef2c8e85 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1448,10 +1448,16 @@ static int ntlm_authenticate(struct ksmbd_work *work) ksmbd_free_user(user); return 0; } - ksmbd_free_user(sess->user); + + if (!ksmbd_compare_user(sess->user, user)) { + ksmbd_free_user(user); + return -EPERM; + } + ksmbd_free_user(user); + } else { + sess->user = user; } - sess->user = user; if (user_guest(sess->user)) { if (conn->sign) { ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n"); @@ -2055,9 +2061,6 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); - /* Got a valid session, set connection state */ - WARN_ON(sess->conn != conn); - /* setting CifsExiting here may race with start_tcp_sess */ ksmbd_conn_set_need_reconnect(work); ksmbd_close_session_fds(work); @@ -7260,6 +7263,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, nii_rsp->IfIndex = cpu_to_le32(netdev->ifindex); nii_rsp->Capability = 0; + if (netdev->real_num_tx_queues > 1) + nii_rsp->Capability |= cpu_to_le32(RSS_CAPABLE); if (ksmbd_rdma_capable_netdev(netdev)) nii_rsp->Capability |= cpu_to_le32(RDMA_CAPABLE); From 71cd9cb680cb5d536c0dcbddb1c1d0010d79b214 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 16 Dec 2021 10:31:44 +0900 Subject: [PATCH 015/149] ksmbd: set both ipv4 and ipv6 in FSCTL_QUERY_NETWORK_INTERFACE_INFO Set ipv4 and ipv6 address in FSCTL_QUERY_NETWORK_INTERFACE_INFO. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 1ef8ef2c8e85..fcc3a9d0ab50 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -7241,15 +7241,10 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, struct sockaddr_storage_rsp *sockaddr_storage; unsigned int flags; unsigned long long speed; - struct sockaddr_in6 *csin6 = (struct sockaddr_in6 *)&conn->peer_addr; rtnl_lock(); for_each_netdev(&init_net, netdev) { - if (out_buf_len < - nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { - rtnl_unlock(); - return -ENOSPC; - } + bool ipv4_set = false; if (netdev->type == ARPHRD_LOOPBACK) continue; @@ -7257,6 +7252,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, flags = dev_get_flags(netdev); if (!(flags & IFF_RUNNING)) continue; +ipv6_retry: + if (out_buf_len < + nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { + rtnl_unlock(); + return -ENOSPC; + } nii_rsp = (struct network_interface_info_ioctl_rsp *) &rsp->Buffer[nbytes]; @@ -7289,8 +7290,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, nii_rsp->SockAddr_Storage; memset(sockaddr_storage, 0, 128); - if (conn->peer_addr.ss_family == PF_INET || - ipv6_addr_v4mapped(&csin6->sin6_addr)) { + if (!ipv4_set) { struct in_device *idev; sockaddr_storage->Family = cpu_to_le16(INTERNETWORK); @@ -7301,6 +7301,9 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, continue; sockaddr_storage->addr4.IPv4address = idev_ipv4_address(idev); + nbytes += sizeof(struct network_interface_info_ioctl_rsp); + ipv4_set = true; + goto ipv6_retry; } else { struct inet6_dev *idev6; struct inet6_ifaddr *ifa; @@ -7322,9 +7325,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, break; } sockaddr_storage->addr6.ScopeId = 0; + nbytes += sizeof(struct network_interface_info_ioctl_rsp); } - - nbytes += sizeof(struct network_interface_info_ioctl_rsp); } rtnl_unlock(); From ce53d365378cde71bb6596d79c257e600d951d29 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 15 Dec 2021 14:57:27 +0900 Subject: [PATCH 016/149] ksmbd: fix multi session connection failure When RSS mode is enable, windows client do simultaneously send several session requests to server. There is racy issue using sess->ntlmssp.cryptkey on N connection : 1 session. So authetication failed using wrong cryptkey on some session. This patch move cryptkey to ksmbd_conn structure to use each cryptkey on connection. Tested-by: Ziwei Xie Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/auth.c | 27 ++++++++++++++------------- fs/ksmbd/auth.h | 10 +++++----- fs/ksmbd/connection.h | 7 +------ fs/ksmbd/mgmt/user_session.h | 1 - fs/ksmbd/smb2pdu.c | 8 ++++---- 5 files changed, 24 insertions(+), 29 deletions(-) diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index 3503b1c48cb4..dc3d061edda9 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -215,7 +215,7 @@ out: * Return: 0 on success, error number on error */ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name) + int blen, char *domain_name, char *cryptkey) { char ntlmv2_hash[CIFS_ENCPWD_SIZE]; char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE]; @@ -256,7 +256,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, goto out; } - memcpy(construct, sess->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); + memcpy(construct, cryptkey, CIFS_CRYPTO_KEY_SIZE); memcpy(construct + CIFS_CRYPTO_KEY_SIZE, &ntlmv2->blob_signature, blen); rc = crypto_shash_update(CRYPTO_HMACMD5(ctx), construct, len); @@ -295,7 +295,8 @@ out: * Return: 0 on success, error number on error */ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess) { char *domain_name; unsigned int nt_off, dn_off; @@ -324,7 +325,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, /* TODO : use domain name that imported from configuration file */ domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off, - dn_len, true, sess->conn->local_nls); + dn_len, true, conn->local_nls); if (IS_ERR(domain_name)) return PTR_ERR(domain_name); @@ -333,7 +334,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, domain_name); ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off), nt_len - CIFS_ENCPWD_SIZE, - domain_name); + domain_name, conn->ntlmssp.cryptkey); kfree(domain_name); return ret; } @@ -347,7 +348,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, * */ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn) { if (blob_len < sizeof(struct negotiate_message)) { ksmbd_debug(AUTH, "negotiate blob len %d too small\n", @@ -361,7 +362,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, return -EINVAL; } - sess->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); + conn->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); return 0; } @@ -375,14 +376,14 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, */ unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct ksmbd_session *sess) + struct ksmbd_conn *conn) { struct target_info *tinfo; wchar_t *name; __u8 *target_name; unsigned int flags, blob_off, blob_len, type, target_info_len = 0; int len, uni_len, conv_len; - int cflags = sess->ntlmssp.client_flags; + int cflags = conn->ntlmssp.client_flags; memcpy(chgblob->Signature, NTLMSSP_SIGNATURE, 8); chgblob->MessageType = NtLmChallenge; @@ -403,7 +404,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, if (cflags & NTLMSSP_REQUEST_TARGET) flags |= NTLMSSP_REQUEST_TARGET; - if (sess->conn->use_spnego && + if (conn->use_spnego && (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC; @@ -414,7 +415,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, return -ENOMEM; conv_len = smb_strtoUTF16((__le16 *)name, ksmbd_netbios_name(), len, - sess->conn->local_nls); + conn->local_nls); if (conv_len < 0 || conv_len > len) { kfree(name); return -EINVAL; @@ -430,8 +431,8 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, chgblob->TargetName.BufferOffset = cpu_to_le32(blob_off); /* Initialize random conn challenge */ - get_random_bytes(sess->ntlmssp.cryptkey, sizeof(__u64)); - memcpy(chgblob->Challenge, sess->ntlmssp.cryptkey, + get_random_bytes(conn->ntlmssp.cryptkey, sizeof(__u64)); + memcpy(chgblob->Challenge, conn->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); /* Add Target Information to security buffer */ diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h index 9c2d4badd05d..95629651cf26 100644 --- a/fs/ksmbd/auth.h +++ b/fs/ksmbd/auth.h @@ -38,16 +38,16 @@ struct kvec; int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, unsigned int nvec, int enc); void ksmbd_copy_gss_neg_header(void *buf); -int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf); int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name); + int blen, char *domain_name, char *cryptkey); int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess); int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn); unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct ksmbd_session *sess); + struct ksmbd_conn *conn); int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len); int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index e5403c587a58..72dfd155b5bf 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -72,12 +72,7 @@ struct ksmbd_conn { int connection_type; struct ksmbd_stats stats; char ClientGUID[SMB2_CLIENT_GUID_SIZE]; - union { - /* pending trans request table */ - struct trans_state *recent_trans; - /* Used by ntlmssp */ - char *ntlmssp_cryptkey; - }; + struct ntlmssp_auth ntlmssp; spinlock_t llist_lock; struct list_head lock_list; diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index 82289c3cbd2b..e241f16a3851 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -45,7 +45,6 @@ struct ksmbd_session { int state; __u8 *Preauth_HashValue; - struct ntlmssp_auth ntlmssp; char sess_key[CIFS_KEY_SIZE]; struct hlist_node hlist; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index fcc3a9d0ab50..036a0dc86859 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1301,7 +1301,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, int sz, rc; ksmbd_debug(SMB, "negotiate phase\n"); - rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->conn); if (rc) return rc; @@ -1311,7 +1311,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, memset(chgblob, 0, sizeof(struct challenge_message)); if (!work->conn->use_spnego) { - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) return -ENOMEM; @@ -1327,7 +1327,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, return -ENOMEM; chgblob = (struct challenge_message *)neg_blob; - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) { rc = -ENOMEM; goto out; @@ -1470,7 +1470,7 @@ static int ntlm_authenticate(struct ksmbd_work *work) authblob = user_authblob(conn, req); sz = le16_to_cpu(req->SecurityBufferLength); - rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, sess); + rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); ksmbd_debug(SMB, "authentication failed\n"); From e230d013378489bcd4b5589ca1d2a5b91ff8d098 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 17:07:11 +0800 Subject: [PATCH 017/149] ksmbd: Fix buffer_check_err() kernel-doc comment Add the description of @rsp_org in buffer_check_err() kernel-doc comment to remove a warning found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:4028: warning: Function parameter or member 'rsp_org' not described in 'buffer_check_err' Reported-by: Abaci Robot Fixes: cb4517201b8a ("ksmbd: remove smb2_buf_length in smb2_hdr") Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 036a0dc86859..08ff4684dee6 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -4018,6 +4018,7 @@ err_out2: * buffer_check_err() - helper function to check buffer errors * @reqOutputBufferLength: max buffer length expected in command response * @rsp: query info response buffer contains output buffer length + * @rsp_org: base response buffer pointer in case of chained response * @infoclass_size: query info class response buffer size * * Return: 0 on success, otherwise error From 4bfd9eed15e163969156e976c62db5ef423e5b0f Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 17:07:12 +0800 Subject: [PATCH 018/149] ksmbd: Fix smb2_set_info_file() kernel-doc comment Fix argument list that the kdoc format and script verified in smb2_set_info_file(). The warnings were found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:5862: warning: Function parameter or member 'req' not described in 'smb2_set_info_file' fs/ksmbd/smb2pdu.c:5862: warning: Excess function parameter 'info_class' description in 'smb2_set_info_file' Reported-by: Abaci Robot Fixes: 9496e268e3af ("ksmbd: add request buffer validation in smb2_set_info") Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 08ff4684dee6..722f14a1b77a 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -5848,7 +5848,7 @@ static int set_file_mode_info(struct ksmbd_file *fp, * smb2_set_info_file() - handler for smb2 set info command * @work: smb work containing set info command buffer * @fp: ksmbd_file pointer - * @info_class: smb2 set info class + * @req: request buffer pointer * @share: ksmbd_share_config pointer * * Return: 0 on success, otherwise error From f5c381392948dcae19f854b9586b806654f08a11 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 17:07:13 +0800 Subject: [PATCH 019/149] ksmbd: Delete an invalid argument description in smb2_populate_readdir_entry() A warning is reported because an invalid argument description, it is found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:3406: warning: Excess function parameter 'user_ns' description in 'smb2_populate_readdir_entry' Reported-by: Abaci Robot Fixes: 475d6f98804c ("ksmbd: fix translation in smb2_populate_readdir_entry()") Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 722f14a1b77a..43882f5baa3b 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3392,7 +3392,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level) * @conn: connection instance * @info_level: smb information level * @d_info: structure included variables for query dir - * @user_ns: user namespace * @ksmbd_kstat: ksmbd wrapper of dirent stat information * * if directory has many entries, find first can't read it fully. From d4eeb82674acadf789277b577986e8e7d3faf695 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 20:48:57 +0900 Subject: [PATCH 020/149] ksmbd: Fix smb2_get_name() kernel-doc comment Remove some warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:623: warning: Function parameter or member 'local_nls' not described in 'smb2_get_name' fs/ksmbd/smb2pdu.c:623: warning: Excess function parameter 'nls_table' description in 'smb2_get_name' Reported-by: Abaci Robot Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 43882f5baa3b..beae94f6033a 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -612,7 +612,7 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) * smb2_get_name() - get filename string from on the wire smb format * @src: source buffer * @maxlen: maxlen of source string - * @nls_table: nls_table pointer + * @local_nls: nls_table pointer * * Return: matching converted filename on success, otherwise error ptr */ From 31928a001bed0d9642711d2eba520fc46d41c376 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Wed, 29 Dec 2021 23:02:15 +0900 Subject: [PATCH 021/149] ksmbd: register ksmbd ib client with ib_register_client() Register ksmbd ib client with ib_register_client() to find the rdma capable network adapter. If ops.get_netdev(Chelsio NICs) is NULL, ksmbd will find it using ib_device_get_by_netdev in old way. Signed-off-by: Hyunchul Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 107 ++++++++++++++++++++++++++++++++++---- fs/ksmbd/transport_rdma.h | 2 +- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 7e57cbb0bb35..339fa4f025f7 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -79,6 +79,14 @@ static int smb_direct_max_read_write_size = 1024 * 1024; static int smb_direct_max_outstanding_rw_ops = 8; +static LIST_HEAD(smb_direct_device_list); +static DEFINE_RWLOCK(smb_direct_device_lock); + +struct smb_direct_device { + struct ib_device *ib_dev; + struct list_head list; +}; + static struct smb_direct_listener { struct rdma_cm_id *cm_id; } smb_direct_listener; @@ -2007,12 +2015,61 @@ err: return ret; } +static int smb_direct_ib_client_add(struct ib_device *ib_dev) +{ + struct smb_direct_device *smb_dev; + + if (!ib_dev->ops.get_netdev || + !rdma_frwr_is_supported(&ib_dev->attrs)) + return 0; + + smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL); + if (!smb_dev) + return -ENOMEM; + smb_dev->ib_dev = ib_dev; + + write_lock(&smb_direct_device_lock); + list_add(&smb_dev->list, &smb_direct_device_list); + write_unlock(&smb_direct_device_lock); + + ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name); + return 0; +} + +static void smb_direct_ib_client_remove(struct ib_device *ib_dev, + void *client_data) +{ + struct smb_direct_device *smb_dev, *tmp; + + write_lock(&smb_direct_device_lock); + list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) { + if (smb_dev->ib_dev == ib_dev) { + list_del(&smb_dev->list); + kfree(smb_dev); + break; + } + } + write_unlock(&smb_direct_device_lock); +} + +static struct ib_client smb_direct_ib_client = { + .name = "ksmbd_smb_direct_ib", + .add = smb_direct_ib_client_add, + .remove = smb_direct_ib_client_remove, +}; + int ksmbd_rdma_init(void) { int ret; smb_direct_listener.cm_id = NULL; + ret = ib_register_client(&smb_direct_ib_client); + if (ret) { + pr_err("failed to ib_register_client\n"); + return ret; + } + /* When a client is running out of send credits, the credits are * granted by the server's sending a packet using this queue. * This avoids the situation that a clients cannot send packets @@ -2036,30 +2093,60 @@ int ksmbd_rdma_init(void) return 0; } -int ksmbd_rdma_destroy(void) +void ksmbd_rdma_destroy(void) { - if (smb_direct_listener.cm_id) - rdma_destroy_id(smb_direct_listener.cm_id); + if (!smb_direct_listener.cm_id) + return; + + ib_unregister_client(&smb_direct_ib_client); + rdma_destroy_id(smb_direct_listener.cm_id); + smb_direct_listener.cm_id = NULL; if (smb_direct_wq) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; } - return 0; } bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { - struct ib_device *ibdev; + struct smb_direct_device *smb_dev; + int i; bool rdma_capable = false; - ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); - if (ibdev) { - if (rdma_frwr_is_supported(&ibdev->attrs)) - rdma_capable = true; - ib_device_put(ibdev); + read_lock(&smb_direct_device_lock); + list_for_each_entry(smb_dev, &smb_direct_device_list, list) { + for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { + struct net_device *ndev; + + ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev, + i + 1); + if (!ndev) + continue; + + if (ndev == netdev) { + dev_put(ndev); + rdma_capable = true; + goto out; + } + dev_put(ndev); + } } +out: + read_unlock(&smb_direct_device_lock); + + if (rdma_capable == false) { + struct ib_device *ibdev; + + ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); + if (ibdev) { + if (rdma_frwr_is_supported(&ibdev->attrs)) + rdma_capable = true; + ib_device_put(ibdev); + } + } + return rdma_capable; } diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 0fa8adc0776f..ab9250a7cb86 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -52,7 +52,7 @@ struct smb_direct_data_transfer { #ifdef CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); -int ksmbd_rdma_destroy(void); +void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); #else static inline int ksmbd_rdma_init(void) { return 0; } From cb097b3dd5ece9596a0a0b7e33893c02a9bde8c6 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 29 Dec 2021 23:02:50 +0900 Subject: [PATCH 022/149] ksmbd: set 445 port to smbdirect port by default When SMB Direct is used with iWARP, Windows use 5445 port for smb direct port, 445 port for SMB. This patch check ib_device using ib_client to know if NICs type is iWARP or Infiniband. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 15 ++++++++++++--- fs/ksmbd/transport_rdma.h | 2 -- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 339fa4f025f7..f89b64e27836 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -34,7 +34,8 @@ #include "smbstatus.h" #include "transport_rdma.h" -#define SMB_DIRECT_PORT 5445 +#define SMB_DIRECT_PORT_IWARP 5445 +#define SMB_DIRECT_PORT_INFINIBAND 445 #define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) @@ -60,6 +61,10 @@ * as defined in [MS-SMBD] 3.1.1.1 * Those may change after a SMB_DIRECT negotiation */ + +/* Set 445 port to SMB Direct port by default */ +static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND; + /* The local peer's maximum number of credits to grant to the peer */ static int smb_direct_receive_credit_max = 255; @@ -1942,7 +1947,7 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, KSMBD_TRANS(t)->conn, "ksmbd:r%u", - SMB_DIRECT_PORT); + smb_direct_port); if (IS_ERR(KSMBD_TRANS(t)->handler)) { int ret = PTR_ERR(KSMBD_TRANS(t)->handler); @@ -2019,6 +2024,10 @@ static int smb_direct_ib_client_add(struct ib_device *ib_dev) { struct smb_direct_device *smb_dev; + /* Set 5445 port if device type is iWARP(No IB) */ + if (ib_dev->node_type != RDMA_NODE_IB_CA) + smb_direct_port = SMB_DIRECT_PORT_IWARP; + if (!ib_dev->ops.get_netdev || !rdma_frwr_is_supported(&ib_dev->attrs)) return 0; @@ -2080,7 +2089,7 @@ int ksmbd_rdma_init(void) if (!smb_direct_wq) return -ENOMEM; - ret = smb_direct_listen(SMB_DIRECT_PORT); + ret = smb_direct_listen(smb_direct_port); if (ret) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index ab9250a7cb86..5567d93a6f96 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,8 +7,6 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ -#define SMB_DIRECT_PORT 5445 - /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; From 004443b3f6d722b455cf963ed7c3edd7f4772405 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 29 Dec 2021 23:08:46 +0900 Subject: [PATCH 023/149] ksmbd: add support for smb2 max credit parameter Add smb2 max credits parameter to adjust maximum credits value to limit number of outstanding requests. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.h | 1 - fs/ksmbd/ksmbd_netlink.h | 1 + fs/ksmbd/smb2misc.c | 2 +- fs/ksmbd/smb2ops.c | 16 ++++++++++++---- fs/ksmbd/smb2pdu.c | 8 ++++---- fs/ksmbd/smb2pdu.h | 1 + fs/ksmbd/smb_common.h | 1 + fs/ksmbd/transport_ipc.c | 2 ++ 8 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 72dfd155b5bf..42ffb6d9c5d8 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -62,7 +62,6 @@ struct ksmbd_conn { /* References which are made for this Server object*/ atomic_t r_count; unsigned short total_credits; - unsigned short max_credits; spinlock_t credits_lock; wait_queue_head_t req_running_q; /* Lock to protect requests list*/ diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index c6718a05d347..a5c2861792ae 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -103,6 +103,7 @@ struct ksmbd_startup_request { * we set the SPARSE_FILES bit (0x40). */ __u32 sub_auth[3]; /* Subauth value for Security ID */ + __u32 smb2_max_credits; /* MAX credits */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 50d0b1022289..6892d1822269 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -326,7 +326,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n", credit_charge, calc_credit_num); return 1; - } else if (credit_charge > conn->max_credits) { + } else if (credit_charge > conn->vals->max_credits) { ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge); return 1; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 02a44d28bdaf..ab23da2120b9 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -19,6 +19,7 @@ static struct smb_version_values smb21_server_values = { .max_read_size = SMB21_DEFAULT_IOSIZE, .max_write_size = SMB21_DEFAULT_IOSIZE, .max_trans_size = SMB21_DEFAULT_IOSIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -44,6 +45,7 @@ static struct smb_version_values smb30_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -70,6 +72,7 @@ static struct smb_version_values smb302_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -96,6 +99,7 @@ static struct smb_version_values smb311_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -197,7 +201,6 @@ void init_smb2_1_server(struct ksmbd_conn *conn) conn->ops = &smb2_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -215,7 +218,6 @@ void init_smb3_0_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -240,7 +242,6 @@ void init_smb3_02_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -265,7 +266,6 @@ int init_smb3_11_server(struct ksmbd_conn *conn) conn->ops = &smb3_11_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -304,3 +304,11 @@ void init_smb2_max_trans_size(unsigned int sz) smb302_server_values.max_trans_size = sz; smb311_server_values.max_trans_size = sz; } + +void init_smb2_max_credits(unsigned int sz) +{ + smb21_server_values.max_credits = sz; + smb30_server_values.max_credits = sz; + smb302_server_values.max_credits = sz; + smb311_server_values.max_credits = sz; +} diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index beae94f6033a..7fb0545c21ab 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -308,7 +308,7 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) hdr->CreditCharge = req_hdr->CreditCharge; - if (conn->total_credits > conn->max_credits) { + if (conn->total_credits > conn->vals->max_credits) { hdr->CreditRequest = 0; pr_err("Total credits overflow: %d\n", conn->total_credits); return -EINVAL; @@ -329,12 +329,12 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) if (hdr->Command == SMB2_NEGOTIATE) aux_max = 0; else - aux_max = conn->max_credits - credit_charge; + aux_max = conn->vals->max_credits - credit_charge; aux_credits = min_t(unsigned short, aux_credits, aux_max); credits_granted = credit_charge + aux_credits; - if (conn->max_credits - conn->total_credits < credits_granted) - credits_granted = conn->max_credits - + if (conn->vals->max_credits - conn->total_credits < credits_granted) + credits_granted = conn->vals->max_credits - conn->total_credits; conn->total_credits += credits_granted; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 4a3e4339d4c4..725b800c29c8 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -980,6 +980,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn); void init_smb2_max_read_size(unsigned int sz); void init_smb2_max_write_size(unsigned int sz); void init_smb2_max_trans_size(unsigned int sz); +void init_smb2_max_credits(unsigned int sz); bool is_smb2_neg_cmd(struct ksmbd_work *work); bool is_smb2_rsp(struct ksmbd_work *work); diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 50590842b651..e1369b4345a9 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -365,6 +365,7 @@ struct smb_version_values { __u32 max_read_size; __u32 max_write_size; __u32 max_trans_size; + __u32 max_credits; __u32 large_lock_type; __u32 exclusive_lock_type; __u32 shared_lock_type; diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 1acf1892a466..3ad6881e0f7e 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -301,6 +301,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); + if (req->smb2_max_credits) + init_smb2_max_credits(req->smb2_max_credits); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); From 914d7e5709ac59ded70bea7956d408fe2acd7c3c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 29 Dec 2021 23:10:03 +0900 Subject: [PATCH 024/149] ksmbd: move credit charge deduction under processing request Moves the credit charge deduction from total_credits under the processing a request. When repeating smb2 lock request and other command request, there will be a problem that ->total_credits does not decrease. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2misc.c | 7 ++----- fs/ksmbd/smb2pdu.c | 16 ++++++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 6892d1822269..fedcb753c7af 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -289,7 +289,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge); void *__hdr = hdr; - int ret; + int ret = 0; switch (hdr->Command) { case SMB2_QUERY_INFO: @@ -332,10 +332,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, } spin_lock(&conn->credits_lock); - if (credit_charge <= conn->total_credits) { - conn->total_credits -= credit_charge; - ret = 0; - } else { + if (credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", credit_charge, conn->total_credits); ret = 1; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 7fb0545c21ab..706191f5e475 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -299,9 +299,8 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned short credits_requested; + unsigned short credits_requested, aux_max; unsigned short credit_charge, credits_granted = 0; - unsigned short aux_max, aux_credits; if (work->send_no_response) return 0; @@ -316,6 +315,13 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) credit_charge = max_t(unsigned short, le16_to_cpu(req_hdr->CreditCharge), 1); + if (credit_charge > conn->total_credits) { + ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", + credit_charge, conn->total_credits); + return -EINVAL; + } + + conn->total_credits -= credit_charge; credits_requested = max_t(unsigned short, le16_to_cpu(req_hdr->CreditRequest), 1); @@ -325,13 +331,11 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) * TODO: Need to adjuct CreditRequest value according to * current cpu load */ - aux_credits = credits_requested - 1; if (hdr->Command == SMB2_NEGOTIATE) - aux_max = 0; + aux_max = 1; else aux_max = conn->vals->max_credits - credit_charge; - aux_credits = min_t(unsigned short, aux_credits, aux_max); - credits_granted = credit_charge + aux_credits; + credits_granted = min_t(unsigned short, credits_requested, aux_max); if (conn->vals->max_credits - conn->total_credits < credits_granted) credits_granted = conn->vals->max_credits - From b589f5db6d4af8f14d70e31e1276b4c017668a26 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 31 Dec 2021 09:26:25 +0900 Subject: [PATCH 025/149] ksmbd: limits exceeding the maximum allowable outstanding requests If the client ignores the CreditResponse received from the server and continues to send the request, ksmbd limits the requests if it exceeds smb2 max credits. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.c | 1 + fs/ksmbd/connection.h | 3 ++- fs/ksmbd/smb2misc.c | 9 +++++++++ fs/ksmbd/smb2pdu.c | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 83a94d0bb480..d1d0105be5b1 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,6 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; + conn->outstanding_credits = 1; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 42ffb6d9c5d8..7e0730a262da 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -61,7 +61,8 @@ struct ksmbd_conn { atomic_t req_running; /* References which are made for this Server object*/ atomic_t r_count; - unsigned short total_credits; + unsigned int total_credits; + unsigned int outstanding_credits; spinlock_t credits_lock; wait_queue_head_t req_running_q; /* Lock to protect requests list*/ diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index fedcb753c7af..4a9460153b59 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -337,7 +337,16 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, credit_charge, conn->total_credits); ret = 1; } + + if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", + credit_charge, conn->outstanding_credits); + ret = 1; + } else + conn->outstanding_credits += credit_charge; + spin_unlock(&conn->credits_lock); + return ret; } diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 706191f5e475..867ed982f729 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -322,6 +322,7 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) } conn->total_credits -= credit_charge; + conn->outstanding_credits -= credit_charge; credits_requested = max_t(unsigned short, le16_to_cpu(req_hdr->CreditRequest), 1); From 99b7650ac51847e81b4d5139824e321e6cb76130 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Tue, 4 Jan 2022 14:56:26 +0900 Subject: [PATCH 026/149] ksmbd: smbd: call rdma_accept() under CM handler if CONFIG_LOCKDEP is enabled, the following kernel warning message is generated because rdma_accept() checks whehter the handler_mutex is held by lockdep_assert_held. CM(Connection Manager) holds the mutex before CM handler callback is called. [ 63.211405 ] WARNING: CPU: 1 PID: 345 at drivers/infiniband/core/cma.c:4405 rdma_accept+0x17a/0x350 [ 63.212080 ] RIP: 0010:rdma_accept+0x17a/0x350 ... [ 63.214036 ] Call Trace: [ 63.214098 ] [ 63.214185 ] smb_direct_accept_client+0xb4/0x170 [ksmbd] [ 63.214412 ] smb_direct_prepare+0x322/0x8c0 [ksmbd] [ 63.214555 ] ? rcu_read_lock_sched_held+0x3a/0x70 [ 63.214700 ] ksmbd_conn_handler_loop+0x63/0x270 [ksmbd] [ 63.214826 ] ? ksmbd_conn_alive+0x80/0x80 [ksmbd] [ 63.214952 ] kthread+0x171/0x1a0 [ 63.215039 ] ? set_kthread_struct+0x40/0x40 [ 63.215128 ] ret_from_fork+0x22/0x30 To avoid this, move creating a queue pair and accepting a client from transport_ops->prepare() to smb_direct_handle_connect_request(). Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 102 ++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index f89b64e27836..0fd706d01790 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -568,6 +568,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; case SMB_DIRECT_MSG_DATA_TRANSFER: { @@ -1594,19 +1595,13 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) pr_err("error at rdma_accept: %d\n", ret); return ret; } - - wait_event_interruptible(t->wait_status, - t->status != SMB_DIRECT_CS_NEW); - if (t->status != SMB_DIRECT_CS_CONNECTED) - return -ENOTCONN; return 0; } -static int smb_direct_negotiate(struct smb_direct_transport *t) +static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) { int ret; struct smb_direct_recvmsg *recvmsg; - struct smb_direct_negotiate_req *req; recvmsg = get_free_recvmsg(t); if (!recvmsg) @@ -1616,44 +1611,20 @@ static int smb_direct_negotiate(struct smb_direct_transport *t) ret = smb_direct_post_recv(t, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); - goto out; + goto out_err; } t->negotiation_requested = false; ret = smb_direct_accept_client(t); if (ret) { pr_err("Can't accept client\n"); - goto out; + goto out_err; } smb_direct_post_recv_credits(&t->post_recv_credits_work.work); - - ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); - ret = wait_event_interruptible_timeout(t->wait_status, - t->negotiation_requested || - t->status == SMB_DIRECT_CS_DISCONNECTED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || t->status == SMB_DIRECT_CS_DISCONNECTED) { - ret = ret < 0 ? ret : -ETIMEDOUT; - goto out; - } - - ret = smb_direct_check_recvmsg(recvmsg); - if (ret == -ECONNABORTED) - goto out; - - req = (struct smb_direct_negotiate_req *)recvmsg->packet; - t->max_recv_size = min_t(int, t->max_recv_size, - le32_to_cpu(req->preferred_send_size)); - t->max_send_size = min_t(int, t->max_send_size, - le32_to_cpu(req->max_receive_size)); - t->max_fragmented_send_size = - le32_to_cpu(req->max_fragmented_size); - - ret = smb_direct_send_negotiate_response(t, ret); -out: - if (recvmsg) - put_recvmsg(t, recvmsg); + return 0; +out_err: + put_recvmsg(t, recvmsg); return ret; } @@ -1890,6 +1861,47 @@ err: static int smb_direct_prepare(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_recvmsg *recvmsg; + struct smb_direct_negotiate_req *req; + int ret; + + ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); + ret = wait_event_interruptible_timeout(st->wait_status, + st->negotiation_requested || + st->status == SMB_DIRECT_CS_DISCONNECTED, + SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); + if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED) + return ret < 0 ? ret : -ETIMEDOUT; + + recvmsg = get_first_reassembly(st); + if (!recvmsg) + return -ECONNABORTED; + + ret = smb_direct_check_recvmsg(recvmsg); + if (ret == -ECONNABORTED) + goto out; + + req = (struct smb_direct_negotiate_req *)recvmsg->packet; + st->max_recv_size = min_t(int, st->max_recv_size, + le32_to_cpu(req->preferred_send_size)); + st->max_send_size = min_t(int, st->max_send_size, + le32_to_cpu(req->max_receive_size)); + st->max_fragmented_send_size = + le32_to_cpu(req->max_fragmented_size); + + ret = smb_direct_send_negotiate_response(st, ret); +out: + spin_lock_irq(&st->reassembly_queue_lock); + st->reassembly_queue_length--; + list_del(&recvmsg->list); + spin_unlock_irq(&st->reassembly_queue_lock); + put_recvmsg(st, recvmsg); + + return ret; +} + +static int smb_direct_connect(struct smb_direct_transport *st) +{ int ret; struct ib_qp_cap qp_cap; @@ -1911,13 +1923,11 @@ static int smb_direct_prepare(struct ksmbd_transport *t) return ret; } - ret = smb_direct_negotiate(st); + ret = smb_direct_prepare_negotiation(st); if (ret) { pr_err("Can't negotiate: %d\n", ret); return ret; } - - st->status = SMB_DIRECT_CS_CONNECTED; return 0; } @@ -1933,6 +1943,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { ksmbd_debug(RDMA, @@ -1945,18 +1956,23 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (!t) return -ENOMEM; + ret = smb_direct_connect(t); + if (ret) + goto out_err; + KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, KSMBD_TRANS(t)->conn, "ksmbd:r%u", smb_direct_port); if (IS_ERR(KSMBD_TRANS(t)->handler)) { - int ret = PTR_ERR(KSMBD_TRANS(t)->handler); - + ret = PTR_ERR(KSMBD_TRANS(t)->handler); pr_err("Can't start thread\n"); - free_transport(t); - return ret; + goto out_err; } return 0; +out_err: + free_transport(t); + return ret; } static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, From 41dbda16a0902798e732abc6599de256b9dc3b27 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 6 Jan 2022 10:30:31 +0900 Subject: [PATCH 027/149] ksmbd: add reserved room in ipc request/response Whenever new parameter is added to smb configuration, It is possible to break the execution of the IPC daemon by mismatch size of request/response. This patch tries to reserve space in ipc request/response in advance to prevent that. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/ksmbd_netlink.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index a5c2861792ae..71bfb7de4472 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -104,6 +104,7 @@ struct ksmbd_startup_request { */ __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ + __u32 reserved[128]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; @@ -114,7 +115,7 @@ struct ksmbd_startup_request { * IPC request to shutdown ksmbd server. */ struct ksmbd_shutdown_request { - __s32 reserved; + __s32 reserved[16]; }; /* @@ -123,6 +124,7 @@ struct ksmbd_shutdown_request { struct ksmbd_login_request { __u32 handle; __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -136,6 +138,7 @@ struct ksmbd_login_response { __u16 status; __u16 hash_sz; /* hash size */ __s8 hash[KSMBD_REQ_MAX_HASH_SZ]; /* password hash */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -144,6 +147,7 @@ struct ksmbd_login_response { struct ksmbd_share_config_request { __u32 handle; __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; /* share name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -158,6 +162,7 @@ struct ksmbd_share_config_response { __u16 force_directory_mode; __u16 force_uid; __u16 force_gid; + __u32 reserved[128]; /* Reserved room */ __u32 veto_list_sz; __s8 ____payload[]; }; @@ -188,6 +193,7 @@ struct ksmbd_tree_connect_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; __s8 share[KSMBD_REQ_MAX_SHARE_NAME]; __s8 peer_addr[64]; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -197,6 +203,7 @@ struct ksmbd_tree_connect_response { __u32 handle; __u16 status; __u16 connection_flags; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -205,6 +212,7 @@ struct ksmbd_tree_connect_response { struct ksmbd_tree_disconnect_request { __u64 session_id; /* session id */ __u64 connect_id; /* tree connection id */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -213,6 +221,7 @@ struct ksmbd_tree_disconnect_request { struct ksmbd_logout_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ __u32 account_flags; + __u32 reserved[16]; /* Reserved room */ }; /* From c9f189271cff85d5d735e25dfa4bc95952ec12d8 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Fri, 7 Jan 2022 14:45:30 +0900 Subject: [PATCH 028/149] ksmbd: smbd: create MR pool Create a memory region pool because rdma_rw_ctx_init() uses memory registration if memory registration yields better performance than using multiple SGE entries. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 0fd706d01790..f0b17da1cac2 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -428,6 +428,7 @@ static void free_transport(struct smb_direct_transport *t) if (t->qp) { ib_drain_qp(t->qp); + ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); ib_destroy_qp(t->qp); } @@ -1708,7 +1709,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = 0; + cap->max_rdma_ctxs = + rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * + smb_direct_max_outstanding_rw_ops; return 0; } @@ -1790,6 +1793,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, { int ret; struct ib_qp_init_attr qp_attr; + int pages_per_rw; t->pd = ib_alloc_pd(t->cm_id->device, 0); if (IS_ERR(t->pd)) { @@ -1837,6 +1841,23 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, t->qp = t->cm_id->qp; t->cm_id->event_handler = smb_direct_cm_handler; + pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; + if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { + int pages_per_mr, mr_count; + + pages_per_mr = min_t(int, pages_per_rw, + t->cm_id->device->attrs.max_fast_reg_page_list_len); + mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * + atomic_read(&t->rw_avail_ops); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, + IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + if (ret) { + pr_err("failed to init mr pool count %d pages %d\n", + mr_count, pages_per_mr); + goto err; + } + } + return 0; err: if (t->qp) { From 4d02c4fdc0e256b493f9a3b604c7ff18f0019f17 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Fri, 7 Jan 2022 14:45:31 +0900 Subject: [PATCH 029/149] ksmbd: smbd: change the default maximum read/write, receive size Due to restriction that cannot handle multiple buffer descriptor structures, decrease the maximum read/write size for Windows clients. And set the maximum fragmented receive size in consideration of the receive queue size. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index f0b17da1cac2..86fd64511512 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 1024 * 1024; +static int smb_direct_max_read_write_size = 1048512; static int smb_direct_max_outstanding_rw_ops = 8; @@ -1908,7 +1908,9 @@ static int smb_direct_prepare(struct ksmbd_transport *t) st->max_send_size = min_t(int, st->max_send_size, le32_to_cpu(req->max_receive_size)); st->max_fragmented_send_size = - le32_to_cpu(req->max_fragmented_size); + le32_to_cpu(req->max_fragmented_size); + st->max_fragmented_recv_size = + (st->recv_credit_max * st->max_recv_size) / 2; ret = smb_direct_send_negotiate_response(st, ret); out: From 136dff3a6b71dc16c30b35cc390feb0bfc32ed50 Mon Sep 17 00:00:00 2001 From: Yufan Chen Date: Sun, 9 Jan 2022 11:34:16 +0900 Subject: [PATCH 030/149] ksmbd: add smb-direct shutdown When killing ksmbd server after connecting rdma, ksmbd threads does not terminate properly because the rdma connection is still alive. This patch add shutdown operation to disconnect rdma connection while ksmbd threads terminate. Signed-off-by: Yufan Chen Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.c | 9 ++++++++- fs/ksmbd/connection.h | 1 + fs/ksmbd/transport_rdma.c | 10 ++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index d1d0105be5b1..208d2cff7bd3 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -387,17 +387,24 @@ out: static void stop_sessions(void) { struct ksmbd_conn *conn; + struct ksmbd_transport *t; again: read_lock(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { struct task_struct *task; - task = conn->transport->handler; + t = conn->transport; + task = t->handler; if (task) ksmbd_debug(CONN, "Stop session handler %s/%d\n", task->comm, task_pid_nr(task)); conn->status = KSMBD_SESS_EXITING; + if (t->ops->shutdown) { + read_unlock(&conn_list_lock); + t->ops->shutdown(t); + read_lock(&conn_list_lock); + } } read_unlock(&conn_list_lock); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 7e0730a262da..7a59aacb5daa 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -117,6 +117,7 @@ struct ksmbd_conn_ops { struct ksmbd_transport_ops { int (*prepare)(struct ksmbd_transport *t); void (*disconnect)(struct ksmbd_transport *t); + void (*shutdown)(struct ksmbd_transport *t); int (*read)(struct ksmbd_transport *t, char *buf, unsigned int size); int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 86fd64511512..3c1ec1ac0b27 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -1453,6 +1453,15 @@ static void smb_direct_disconnect(struct ksmbd_transport *t) free_transport(st); } +static void smb_direct_shutdown(struct ksmbd_transport *t) +{ + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + + ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id); + + smb_direct_disconnect_rdma_work(&st->disconnect_work); +} + static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -2201,6 +2210,7 @@ out: static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .prepare = smb_direct_prepare, .disconnect = smb_direct_disconnect, + .shutdown = smb_direct_shutdown, .writev = smb_direct_writev, .read = smb_direct_read, .rdma_read = smb_direct_rdma_read, From 51edb2ff1c6fc27d3fa73f0773a31597ecd8e230 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Jan 2022 20:48:17 +0100 Subject: [PATCH 031/149] netfilter: nf_tables: typo NULL check in _clone() function This should check for NULL in case memory allocation fails. Reported-by: Julian Wiedmann Fixes: 3b9e2ea6c11b ("netfilter: nft_limit: move stateful fields out of expression data") Fixes: 37f319f37d90 ("netfilter: nft_connlimit: move stateful fields out of expression data") Fixes: 33a24de37e81 ("netfilter: nft_last: move stateful fields out of expression data") Fixes: ed0a0c60f0e5 ("netfilter: nft_quota: move stateful fields out of expression data") Signed-off-by: Pablo Neira Ayuso Link: https://lore.kernel.org/r/20220110194817.53481-1-pablo@netfilter.org Signed-off-by: Jakub Kicinski --- net/netfilter/nft_connlimit.c | 2 +- net/netfilter/nft_last.c | 2 +- net/netfilter/nft_limit.c | 2 +- net/netfilter/nft_quota.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 58dcafe8bf79..7d00a1452b1d 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -206,7 +206,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_src = nft_expr_priv(src); priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC); - if (priv_dst->list) + if (!priv_dst->list) return -ENOMEM; nf_conncount_list_init(priv_dst->list); diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 5ee33d0ccd4e..4f745a409d34 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -106,7 +106,7 @@ static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_last_priv *priv_dst = nft_expr_priv(dst); priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC); - if (priv_dst->last) + if (!priv_dst->last) return -ENOMEM; return 0; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index f04be5be73a0..c4f308460dd1 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -145,7 +145,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst, priv_dst->invert = priv_src->invert; priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC); - if (priv_dst->limit) + if (!priv_dst->limit) return -ENOMEM; spin_lock_init(&priv_dst->limit->lock); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 0484aef74273..f394a0b562f6 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -237,7 +237,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_quota *priv_dst = nft_expr_priv(dst); priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC); - if (priv_dst->consumed) + if (!priv_dst->consumed) return -ENOMEM; atomic64_set(priv_dst->consumed, 0); From 7d6019b602de660bfc6a542a68630006ace83b90 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 10 Jan 2022 12:52:46 -0800 Subject: [PATCH 032/149] Revert "net: vertexcom: default to disabled on kbuild" This reverts commit 6bf950a8ff72920340dfdec93c18bd3f5f35de6a. To align with other vendors, NET_VENDOR configs are supposed to be ON by default, while their drivers should default to OFF. Suggested-by: Jakub Kicinski Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20220110205246.66298-1-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/vertexcom/Kconfig b/drivers/net/ethernet/vertexcom/Kconfig index 6e2cf062ddba..4184a635fe01 100644 --- a/drivers/net/ethernet/vertexcom/Kconfig +++ b/drivers/net/ethernet/vertexcom/Kconfig @@ -5,7 +5,7 @@ config NET_VENDOR_VERTEXCOM bool "Vertexcom devices" - default n + default y help If you have a network (Ethernet) card belonging to this class, say Y. From 1e9d74660d4df625b0889e77018f9e94727ceacd Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 8 Jan 2022 13:46:23 +0000 Subject: [PATCH 033/149] bpf: Fix mount source show for bpffs We noticed our tc ebpf tools can't start after we upgrade our in-house kernel version from 4.19 to 5.10. That is because of the behaviour change in bpffs caused by commit d2935de7e4fd ("vfs: Convert bpf to use the new mount API"). In our tc ebpf tools, we do strict environment check. If the environment is not matched, we won't allow to start the ebpf progs. One of the check is whether bpffs is properly mounted. The mount information of bpffs in kernel-4.19 and kernel-5.10 are as follows: - kernel 4.19 $ mount -t bpf bpffs /sys/fs/bpf $ mount -t bpf bpffs on /sys/fs/bpf type bpf (rw,relatime) - kernel 5.10 $ mount -t bpf bpffs /sys/fs/bpf $ mount -t bpf none on /sys/fs/bpf type bpf (rw,relatime) The device name in kernel-5.10 is displayed as none instead of bpffs, then our environment check fails. Currently we modify the tools to adopt to the kernel behaviour change, but I think we'd better change the kernel code to keep the behavior consistent. After this change, the mount information will be displayed the same with the behavior in kernel-4.19, for example: $ mount -t bpf bpffs /sys/fs/bpf $ mount -t bpf bpffs on /sys/fs/bpf type bpf (rw,relatime) Fixes: d2935de7e4fd ("vfs: Convert bpf to use the new mount API") Suggested-by: Daniel Borkmann Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Christian Brauner Cc: David Howells Cc: Al Viro Link: https://lore.kernel.org/bpf/20220108134623.32467-1-laoar.shao@gmail.com --- kernel/bpf/inode.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 80da1db47c68..5a8d9f7467bf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -648,12 +648,22 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); - if (opt < 0) + if (opt < 0) { /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - return opt == -ENOPARAM ? 0 : opt; + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; + + return 0; + } + + if (opt < 0) + return opt; + } switch (opt) { case OPT_MODE: From 382778edc8262b7535f00523e9eb22edba1b9816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 7 Jan 2022 23:11:13 +0100 Subject: [PATCH 034/149] xdp: check prog type before updating BPF link MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_xdp_link_update() function didn't check the program type before updating the program, which made it possible to install any program type as an XDP program, which is obviously not good. Syzbot managed to trigger this by swapping in an LWT program on the XDP hook which would crash in a helper call. Fix this by adding a check and bailing out if the types don't match. Fixes: 026a4c28e1db ("bpf, xdp: Implement LINK_UPDATE for BPF XDP link") Reported-by: syzbot+983941aa85af6ded1fd9@syzkaller.appspotmail.com Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20220107221115.326171-1-toke@redhat.com Signed-off-by: Alexei Starovoitov --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 84a0d9542fe9..1baab07820f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8981,6 +8981,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, goto out_unlock; } old_prog = link->prog; + if (old_prog->type != new_prog->type || + old_prog->expected_attach_type != new_prog->expected_attach_type) { + err = -EINVAL; + goto out_unlock; + } + if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); From 4b27480dcaa71e7ee9f56907e419c6a1511fd2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 7 Jan 2022 23:11:14 +0100 Subject: [PATCH 035/149] bpf/selftests: convert xdp_link test to ASSERT_* macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the selftest to use the preferred ASSERT_* macros instead of the deprecated CHECK(). v2: - Don't add if statements around checks if they weren't there before. Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20220107221115.326171-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_link.c | 56 +++++++++---------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c index 983ab0b47d30..eec0bf83546b 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c @@ -8,46 +8,47 @@ void serial_test_xdp_link(void) { - __u32 duration = 0, id1, id2, id0 = 0, prog_fd1, prog_fd2, err; DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = -1); struct test_xdp_link *skel1 = NULL, *skel2 = NULL; + __u32 id1, id2, id0 = 0, prog_fd1, prog_fd2; struct bpf_link_info link_info; struct bpf_prog_info prog_info; struct bpf_link *link; + int err; __u32 link_info_len = sizeof(link_info); __u32 prog_info_len = sizeof(prog_info); skel1 = test_xdp_link__open_and_load(); - if (CHECK(!skel1, "skel_load", "skeleton open and load failed\n")) + if (!ASSERT_OK_PTR(skel1, "skel_load")) goto cleanup; prog_fd1 = bpf_program__fd(skel1->progs.xdp_handler); skel2 = test_xdp_link__open_and_load(); - if (CHECK(!skel2, "skel_load", "skeleton open and load failed\n")) + if (!ASSERT_OK_PTR(skel2, "skel_load")) goto cleanup; prog_fd2 = bpf_program__fd(skel2->progs.xdp_handler); memset(&prog_info, 0, sizeof(prog_info)); err = bpf_obj_get_info_by_fd(prog_fd1, &prog_info, &prog_info_len); - if (CHECK(err, "fd_info1", "failed %d\n", -errno)) + if (!ASSERT_OK(err, "fd_info1")) goto cleanup; id1 = prog_info.id; memset(&prog_info, 0, sizeof(prog_info)); err = bpf_obj_get_info_by_fd(prog_fd2, &prog_info, &prog_info_len); - if (CHECK(err, "fd_info2", "failed %d\n", -errno)) + if (!ASSERT_OK(err, "fd_info2")) goto cleanup; id2 = prog_info.id; /* set initial prog attachment */ err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd1, XDP_FLAGS_REPLACE, &opts); - if (CHECK(err, "fd_attach", "initial prog attach failed: %d\n", err)) + if (!ASSERT_OK(err, "fd_attach")) goto cleanup; /* validate prog ID */ err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); - CHECK(err || id0 != id1, "id1_check", - "loaded prog id %u != id1 %u, err %d", id0, id1, err); + if (!ASSERT_OK(err, "id1_check_err") || !ASSERT_EQ(id0, id1, "id1_check_val")) + goto cleanup; /* BPF link is not allowed to replace prog attachment */ link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO); @@ -62,7 +63,7 @@ void serial_test_xdp_link(void) /* detach BPF program */ opts.old_fd = prog_fd1; err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE, &opts); - if (CHECK(err, "prog_detach", "failed %d\n", err)) + if (!ASSERT_OK(err, "prog_detach")) goto cleanup; /* now BPF link should attach successfully */ @@ -73,24 +74,23 @@ void serial_test_xdp_link(void) /* validate prog ID */ err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); - if (CHECK(err || id0 != id1, "id1_check", - "loaded prog id %u != id1 %u, err %d", id0, id1, err)) + if (!ASSERT_OK(err, "id1_check_err") || !ASSERT_EQ(id0, id1, "id1_check_val")) goto cleanup; /* BPF prog attach is not allowed to replace BPF link */ opts.old_fd = prog_fd1; err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd2, XDP_FLAGS_REPLACE, &opts); - if (CHECK(!err, "prog_attach_fail", "unexpected success\n")) + if (!ASSERT_ERR(err, "prog_attach_fail")) goto cleanup; /* Can't force-update when BPF link is active */ err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd2, 0); - if (CHECK(!err, "prog_update_fail", "unexpected success\n")) + if (!ASSERT_ERR(err, "prog_update_fail")) goto cleanup; /* Can't force-detach when BPF link is active */ err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0); - if (CHECK(!err, "prog_detach_fail", "unexpected success\n")) + if (!ASSERT_ERR(err, "prog_detach_fail")) goto cleanup; /* BPF link is not allowed to replace another BPF link */ @@ -110,40 +110,34 @@ void serial_test_xdp_link(void) skel2->links.xdp_handler = link; err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); - if (CHECK(err || id0 != id2, "id2_check", - "loaded prog id %u != id2 %u, err %d", id0, id1, err)) + if (!ASSERT_OK(err, "id2_check_err") || !ASSERT_EQ(id0, id2, "id2_check_val")) goto cleanup; /* updating program under active BPF link works as expected */ err = bpf_link__update_program(link, skel1->progs.xdp_handler); - if (CHECK(err, "link_upd", "failed: %d\n", err)) + if (!ASSERT_OK(err, "link_upd")) goto cleanup; memset(&link_info, 0, sizeof(link_info)); err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len); - if (CHECK(err, "link_info", "failed: %d\n", err)) + if (!ASSERT_OK(err, "link_info")) goto cleanup; - CHECK(link_info.type != BPF_LINK_TYPE_XDP, "link_type", - "got %u != exp %u\n", link_info.type, BPF_LINK_TYPE_XDP); - CHECK(link_info.prog_id != id1, "link_prog_id", - "got %u != exp %u\n", link_info.prog_id, id1); - CHECK(link_info.xdp.ifindex != IFINDEX_LO, "link_ifindex", - "got %u != exp %u\n", link_info.xdp.ifindex, IFINDEX_LO); + ASSERT_EQ(link_info.type, BPF_LINK_TYPE_XDP, "link_type"); + ASSERT_EQ(link_info.prog_id, id1, "link_prog_id"); + ASSERT_EQ(link_info.xdp.ifindex, IFINDEX_LO, "link_ifindex"); err = bpf_link__detach(link); - if (CHECK(err, "link_detach", "failed %d\n", err)) + if (!ASSERT_OK(err, "link_detach")) goto cleanup; memset(&link_info, 0, sizeof(link_info)); err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len); - if (CHECK(err, "link_info", "failed: %d\n", err)) - goto cleanup; - CHECK(link_info.prog_id != id1, "link_prog_id", - "got %u != exp %u\n", link_info.prog_id, id1); + + ASSERT_OK(err, "link_info"); + ASSERT_EQ(link_info.prog_id, id1, "link_prog_id"); /* ifindex should be zeroed out */ - CHECK(link_info.xdp.ifindex != 0, "link_ifindex", - "got %u != exp %u\n", link_info.xdp.ifindex, 0); + ASSERT_EQ(link_info.xdp.ifindex, 0, "link_ifindex"); cleanup: test_xdp_link__destroy(skel1); From 036a05f50bd777134b1955f400e8d24c0149fef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 7 Jan 2022 23:11:15 +0100 Subject: [PATCH 036/149] bpf/selftests: Add check for updating XDP bpf_link with wrong program type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a check to the xdp_link selftest that the kernel rejects replacing an XDP program with a different program type on link update. v2: - Split this out into its own patch. Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20220107221115.326171-3-toke@redhat.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_link.c | 5 +++++ tools/testing/selftests/bpf/progs/test_xdp_link.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c index eec0bf83546b..b2b357f8c74c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c @@ -127,6 +127,11 @@ void serial_test_xdp_link(void) ASSERT_EQ(link_info.prog_id, id1, "link_prog_id"); ASSERT_EQ(link_info.xdp.ifindex, IFINDEX_LO, "link_ifindex"); + /* updating program under active BPF link with different type fails */ + err = bpf_link__update_program(link, skel1->progs.tc_handler); + if (!ASSERT_ERR(err, "link_upd_invalid")) + goto cleanup; + err = bpf_link__detach(link); if (!ASSERT_OK(err, "link_detach")) goto cleanup; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_link.c b/tools/testing/selftests/bpf/progs/test_xdp_link.c index ee7d6ac0f615..64ff32eaae92 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_link.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_link.c @@ -10,3 +10,9 @@ int xdp_handler(struct xdp_md *xdp) { return 0; } + +SEC("tc") +int tc_handler(struct __sk_buff *skb) +{ + return 0; +} From 343e53754b21ae45530623222aa079fecd3cf942 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Fri, 7 Jan 2022 16:58:54 -0800 Subject: [PATCH 037/149] bpf: Fix incorrect integer literal used for marking scratched stack. env->scratched_stack_slots is a 64-bit value, we should use ULL instead of UL literal values. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Christy Lee Acked-by: Song Liu Link: https://lore.kernel.org/r/20220108005854.658596-1-christylee@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfb45381fb3f..a8587210907d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -616,7 +616,7 @@ static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) { - env->scratched_stack_slots |= 1UL << spi; + env->scratched_stack_slots |= 1ULL << spi; } static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) @@ -637,14 +637,14 @@ static bool verifier_state_scratched(const struct bpf_verifier_env *env) static void mark_verifier_state_clean(struct bpf_verifier_env *env) { env->scratched_regs = 0U; - env->scratched_stack_slots = 0UL; + env->scratched_stack_slots = 0ULL; } /* Used for printing the entire verifier state. */ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) { env->scratched_regs = ~0U; - env->scratched_stack_slots = ~0UL; + env->scratched_stack_slots = ~0ULL; } /* The reg state of a pointer or a bounded scalar was saved when From 284a4d94e8e74fbd731ee67e29196656ca823423 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Mon, 10 Jan 2022 10:18:06 +0800 Subject: [PATCH 038/149] mctp: test: zero out sockaddr MCTP now requires that padding bytes are zero. Signed-off-by: Matt Johnston Fixes: 1e4b50f06d97 ("mctp: handle the struct sockaddr_mctp padding fields") Link: https://lore.kernel.org/r/20220110021806.2343023-1-matt@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/test/route-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 86ad15abf897..750f9f9b4daf 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -285,7 +285,7 @@ static void __mctp_route_test_init(struct kunit *test, struct mctp_test_route **rtp, struct socket **sockp) { - struct sockaddr_mctp addr; + struct sockaddr_mctp addr = {0}; struct mctp_test_route *rt; struct mctp_test_dev *dev; struct socket *sock; From 274c224062ff9545d8df2a6e4764a5fd660d792e Mon Sep 17 00:00:00 2001 From: Conley Lee Date: Tue, 11 Jan 2022 11:05:53 +0800 Subject: [PATCH 039/149] net: ethernet: sun4i-emac: replace magic number with macro This patch remove magic numbers in sun4i-emac.c and replace with macros defined in sun4i-emac.h Signed-off-by: Conley Lee Tested-by: Corentin Labbe Link: https://lore.kernel.org/r/tencent_71466C2135CD1780B19D7844BE3F167C940A@qq.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/allwinner/sun4i-emac.c | 30 ++++++++++++--------- drivers/net/ethernet/allwinner/sun4i-emac.h | 18 +++++++++++++ 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index 849de4564709..74635a6fa8ca 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -106,9 +106,9 @@ static void emac_update_speed(struct net_device *dev) /* set EMAC SPEED, depend on PHY */ reg_val = readl(db->membase + EMAC_MAC_SUPP_REG); - reg_val &= ~(0x1 << 8); + reg_val &= ~EMAC_MAC_SUPP_100M; if (db->speed == SPEED_100) - reg_val |= 1 << 8; + reg_val |= EMAC_MAC_SUPP_100M; writel(reg_val, db->membase + EMAC_MAC_SUPP_REG); } @@ -264,7 +264,7 @@ static void emac_dma_done_callback(void *arg) /* re enable interrupt */ reg_val = readl(db->membase + EMAC_INT_CTL_REG); - reg_val |= (0x01 << 8); + reg_val |= EMAC_INT_CTL_RX_EN; writel(reg_val, db->membase + EMAC_INT_CTL_REG); db->emacrx_completed_flag = 1; @@ -429,7 +429,7 @@ static unsigned int emac_powerup(struct net_device *ndev) /* initial EMAC */ /* flush RX FIFO */ reg_val = readl(db->membase + EMAC_RX_CTL_REG); - reg_val |= 0x8; + reg_val |= EMAC_RX_CTL_FLUSH_FIFO; writel(reg_val, db->membase + EMAC_RX_CTL_REG); udelay(1); @@ -441,8 +441,8 @@ static unsigned int emac_powerup(struct net_device *ndev) /* set MII clock */ reg_val = readl(db->membase + EMAC_MAC_MCFG_REG); - reg_val &= (~(0xf << 2)); - reg_val |= (0xD << 2); + reg_val &= ~EMAC_MAC_MCFG_MII_CLKD_MASK; + reg_val |= EMAC_MAC_MCFG_MII_CLKD_72; writel(reg_val, db->membase + EMAC_MAC_MCFG_REG); /* clear RX counter */ @@ -506,7 +506,7 @@ static void emac_init_device(struct net_device *dev) /* enable RX/TX0/RX Hlevel interrup */ reg_val = readl(db->membase + EMAC_INT_CTL_REG); - reg_val |= (0xf << 0) | (0x01 << 8); + reg_val |= (EMAC_INT_CTL_TX_EN | EMAC_INT_CTL_TX_ABRT_EN | EMAC_INT_CTL_RX_EN); writel(reg_val, db->membase + EMAC_INT_CTL_REG); spin_unlock_irqrestore(&db->lock, flags); @@ -637,7 +637,9 @@ static void emac_rx(struct net_device *dev) if (!rxcount) { db->emacrx_completed_flag = 1; reg_val = readl(db->membase + EMAC_INT_CTL_REG); - reg_val |= (0xf << 0) | (0x01 << 8); + reg_val |= (EMAC_INT_CTL_TX_EN | + EMAC_INT_CTL_TX_ABRT_EN | + EMAC_INT_CTL_RX_EN); writel(reg_val, db->membase + EMAC_INT_CTL_REG); /* had one stuck? */ @@ -669,7 +671,9 @@ static void emac_rx(struct net_device *dev) writel(reg_val | EMAC_CTL_RX_EN, db->membase + EMAC_CTL_REG); reg_val = readl(db->membase + EMAC_INT_CTL_REG); - reg_val |= (0xf << 0) | (0x01 << 8); + reg_val |= (EMAC_INT_CTL_TX_EN | + EMAC_INT_CTL_TX_ABRT_EN | + EMAC_INT_CTL_RX_EN); writel(reg_val, db->membase + EMAC_INT_CTL_REG); db->emacrx_completed_flag = 1; @@ -783,20 +787,20 @@ static irqreturn_t emac_interrupt(int irq, void *dev_id) } /* Transmit Interrupt check */ - if (int_status & (0x01 | 0x02)) + if (int_status & EMAC_INT_STA_TX_COMPLETE) emac_tx_done(dev, db, int_status); - if (int_status & (0x04 | 0x08)) + if (int_status & EMAC_INT_STA_TX_ABRT) netdev_info(dev, " ab : %x\n", int_status); /* Re-enable interrupt mask */ if (db->emacrx_completed_flag == 1) { reg_val = readl(db->membase + EMAC_INT_CTL_REG); - reg_val |= (0xf << 0) | (0x01 << 8); + reg_val |= (EMAC_INT_CTL_TX_EN | EMAC_INT_CTL_TX_ABRT_EN | EMAC_INT_CTL_RX_EN); writel(reg_val, db->membase + EMAC_INT_CTL_REG); } else { reg_val = readl(db->membase + EMAC_INT_CTL_REG); - reg_val |= (0xf << 0); + reg_val |= (EMAC_INT_CTL_TX_EN | EMAC_INT_CTL_TX_ABRT_EN); writel(reg_val, db->membase + EMAC_INT_CTL_REG); } diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.h b/drivers/net/ethernet/allwinner/sun4i-emac.h index 38c72d9ec600..90bd9ad77607 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.h +++ b/drivers/net/ethernet/allwinner/sun4i-emac.h @@ -38,6 +38,7 @@ #define EMAC_RX_CTL_REG (0x3c) #define EMAC_RX_CTL_AUTO_DRQ_EN (1 << 1) #define EMAC_RX_CTL_DMA_EN (1 << 2) +#define EMAC_RX_CTL_FLUSH_FIFO (1 << 3) #define EMAC_RX_CTL_PASS_ALL_EN (1 << 4) #define EMAC_RX_CTL_PASS_CTL_EN (1 << 5) #define EMAC_RX_CTL_PASS_CRC_ERR_EN (1 << 6) @@ -61,7 +62,21 @@ #define EMAC_RX_IO_DATA_STATUS_OK (1 << 7) #define EMAC_RX_FBC_REG (0x50) #define EMAC_INT_CTL_REG (0x54) +#define EMAC_INT_CTL_RX_EN (1 << 8) +#define EMAC_INT_CTL_TX0_EN (1) +#define EMAC_INT_CTL_TX1_EN (1 << 1) +#define EMAC_INT_CTL_TX_EN (EMAC_INT_CTL_TX0_EN | EMAC_INT_CTL_TX1_EN) +#define EMAC_INT_CTL_TX0_ABRT_EN (0x1 << 2) +#define EMAC_INT_CTL_TX1_ABRT_EN (0x1 << 3) +#define EMAC_INT_CTL_TX_ABRT_EN (EMAC_INT_CTL_TX0_ABRT_EN | EMAC_INT_CTL_TX1_ABRT_EN) #define EMAC_INT_STA_REG (0x58) +#define EMAC_INT_STA_TX0_COMPLETE (0x1) +#define EMAC_INT_STA_TX1_COMPLETE (0x1 << 1) +#define EMAC_INT_STA_TX_COMPLETE (EMAC_INT_STA_TX0_COMPLETE | EMAC_INT_STA_TX1_COMPLETE) +#define EMAC_INT_STA_TX0_ABRT (0x1 << 2) +#define EMAC_INT_STA_TX1_ABRT (0x1 << 3) +#define EMAC_INT_STA_TX_ABRT (EMAC_INT_STA_TX0_ABRT | EMAC_INT_STA_TX1_ABRT) +#define EMAC_INT_STA_RX_COMPLETE (0x1 << 8) #define EMAC_MAC_CTL0_REG (0x5c) #define EMAC_MAC_CTL0_RX_FLOW_CTL_EN (1 << 2) #define EMAC_MAC_CTL0_TX_FLOW_CTL_EN (1 << 3) @@ -87,8 +102,11 @@ #define EMAC_MAC_CLRT_RM (0x0f) #define EMAC_MAC_MAXF_REG (0x70) #define EMAC_MAC_SUPP_REG (0x74) +#define EMAC_MAC_SUPP_100M (0x1 << 8) #define EMAC_MAC_TEST_REG (0x78) #define EMAC_MAC_MCFG_REG (0x7c) +#define EMAC_MAC_MCFG_MII_CLKD_MASK (0xff << 2) +#define EMAC_MAC_MCFG_MII_CLKD_72 (0x0d << 2) #define EMAC_MAC_A0_REG (0x98) #define EMAC_MAC_A1_REG (0x9c) #define EMAC_MAC_A2_REG (0xa0) From 23e7b1bfed61e301853b5e35472820d919498278 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 10 Jan 2022 14:43:06 +0100 Subject: [PATCH 040/149] xfrm: Don't accidentally set RTO_ONLINK in decode_session4() Similar to commit 94e2238969e8 ("xfrm4: strip ECN bits from tos field"), clear the ECN bits from iph->tos when setting ->flowi4_tos. This ensures that the last bit of ->flowi4_tos is cleared, so ip_route_output_key_hash() isn't going to restrict the scope of the route lookup. Use ~INET_ECN_MASK instead of IPTOS_RT_MASK, because we have no reason to clear the high order bits. Found by code inspection, compile tested only. Fixes: 4da3089f2b58 ("[IPSEC]: Use TOS when doing tunnel lookups") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/xfrm/xfrm_policy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dccb8f3318ef..04d1ce9b510f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -3295,7 +3296,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) fl4->flowi4_proto = iph->protocol; fl4->daddr = reverse ? iph->saddr : iph->daddr; fl4->saddr = reverse ? iph->daddr : iph->saddr; - fl4->flowi4_tos = iph->tos; + fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK; if (!ip_is_fragment(iph)) { switch (iph->protocol) { From f7716b318568b22fbf0e3be99279a979e217cf71 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 10 Jan 2022 14:43:09 +0100 Subject: [PATCH 041/149] gre: Don't accidentally set RTO_ONLINK in gre_fill_metadata_dst() Mask the ECN bits before initialising ->flowi4_tos. The tunnel key may have the last ECN bit set, which will interfere with the route lookup process as ip_route_output_key_hash() interpretes this bit specially (to restrict the route scope). Found by code inspection, compile tested only. Fixes: 962924fa2b7a ("ip_gre: Refactor collect metatdata mode tunnel xmit to ip_md_tunnel_xmit") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/ipv4/ip_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2ac2b95c5694..99db2e41ed10 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -604,8 +604,9 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) key = &info->key; ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, - tunnel_id_to_key32(key->tun_id), key->tos, 0, - skb->mark, skb_get_hash(skb)); + tunnel_id_to_key32(key->tun_id), + key->tos & ~INET_ECN_MASK, 0, skb->mark, + skb_get_hash(skb)); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); From a915deaa9abe4fb3a440312c954253a6a733608e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 10 Jan 2022 14:43:11 +0100 Subject: [PATCH 042/149] libcxgb: Don't accidentally set RTO_ONLINK in cxgb_find_route() Mask the ECN bits before calling ip_route_output_ports(). The tos variable might be passed directly from an IPv4 header, so it may have the last ECN bit set. This interferes with the route lookup process as ip_route_output_key_hash() interpretes this bit specially (to restrict the route scope). Found by code inspection, compile tested only. Fixes: 804c2f3e36ef ("libcxgb,iw_cxgb4,cxgbit: add cxgb_find_route()") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c index d04a6c163445..da8d10475a08 100644 --- a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c +++ b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -99,7 +100,7 @@ cxgb_find_route(struct cxgb4_lld_info *lldi, rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, peer_port, local_port, IPPROTO_TCP, - tos, 0); + tos & ~INET_ECN_MASK, 0); if (IS_ERR(rt)) return NULL; n = dst_neigh_lookup(&rt->dst, &peer_ip); From 48d67543e01d73292e0bb66d3f10fc422e79e031 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 10 Jan 2022 14:43:14 +0100 Subject: [PATCH 043/149] mlx5: Don't accidentally set RTO_ONLINK before mlx5e_route_lookup_ipv4_get() Mask the ECN bits before calling mlx5e_route_lookup_ipv4_get(). The tunnel key might have the last ECN bit set. This interferes with the route lookup process as ip_route_output_key_hash() interpretes this bit specially (to restrict the route scope). Found by code inspection, compile tested only. Fixes: c7b9038d8af6 ("net/mlx5e: TC preparation refactoring for routing update event") Fixes: 9a941117fb76 ("net/mlx5e: Maximize ip tunnel key usage on the TC offloading path") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 33815246fead..378fc8e3bd97 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* Copyright (c) 2018 Mellanox Technologies. */ +#include #include #include #include @@ -235,7 +236,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos; + attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; @@ -350,7 +351,7 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos; + attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; From cb963a19d99fc42d9abf4238968ef85fcc2ef3e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Jan 2022 01:47:50 -0800 Subject: [PATCH 044/149] net: sched: do not allocate a tracker in tcf_exts_init() While struct tcf_exts has a net pointer, it is not refcounted until tcf_exts_get_net() is called. Fixes: dbdcda634ce3 ("net: sched: add netns refcount tracker to struct tcf_exts") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220110094750.236478-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/pkt_cls.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index ebef45e821af..676cb8ea9e15 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -218,8 +218,10 @@ static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, #ifdef CONFIG_NET_CLS_ACT exts->type = 0; exts->nr_actions = 0; + /* Note: we do not own yet a reference on net. + * This reference might be taken later from tcf_exts_get_net(). + */ exts->net = net; - netns_tracker_alloc(net, &exts->ns_tracker, GFP_KERNEL); exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), GFP_KERNEL); if (!exts->actions) From e110978d6e060a3e633e3e5e427770a66b330d48 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 9 Jan 2022 20:24:18 +0000 Subject: [PATCH 045/149] nfc: pn544: make array rset_cmd static const Don't populate the read-only array rset_cmd on the stack but instead it static const. Also makes the object code a little smaller. Signed-off-by: Colin Ian King Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220109202418.50641-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- drivers/nfc/pn544/i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nfc/pn544/i2c.c b/drivers/nfc/pn544/i2c.c index 37d26f01986b..62a0f1a010cb 100644 --- a/drivers/nfc/pn544/i2c.c +++ b/drivers/nfc/pn544/i2c.c @@ -188,7 +188,7 @@ do { \ static void pn544_hci_i2c_platform_init(struct pn544_i2c_phy *phy) { int polarity, retry, ret; - char rset_cmd[] = { 0x05, 0xF9, 0x04, 0x00, 0xC3, 0xE5 }; + static const char rset_cmd[] = { 0x05, 0xF9, 0x04, 0x00, 0xC3, 0xE5 }; int count = sizeof(rset_cmd); nfc_info(&phy->i2c_dev->dev, "Detecting nfc_en polarity\n"); From edcb501e543cecaff5e3d079c798911e680fea65 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 9 Jan 2022 23:17:16 +0000 Subject: [PATCH 046/149] net: phy: at803x: make array offsets static Don't populate the read-only const array offsets on the stack but instead make it static. Also makes the object code a little smaller. Signed-off-by: Colin Ian King Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220109231716.59012-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/at803x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index dae95d9a07e8..5b6c0d120e09 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -421,7 +421,7 @@ static int at803x_set_wol(struct phy_device *phydev, const u8 *mac; int ret, irq_enabled; unsigned int i; - const unsigned int offsets[] = { + static const unsigned int offsets[] = { AT803X_LOC_MAC_ADDR_32_47_OFFSET, AT803X_LOC_MAC_ADDR_16_31_OFFSET, AT803X_LOC_MAC_ADDR_0_15_OFFSET, From 3486eb774f9d6c2cafcfed31936c9a9b7adf8f05 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 11 Jan 2022 09:12:06 +0100 Subject: [PATCH 047/149] Revert "of: net: support NVMEM cells with MAC in text format" This reverts commit 9ed319e411915e882bb4ed99be3ae78667a70022. We can already post process a nvmem cell value in a particular driver. Instead of having yet another place to convert the values, the post processing hook of the nvmem provider should be used in this case. Signed-off-by: Michael Walle Signed-off-by: David S. Miller --- net/core/of_net.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/net/core/of_net.c b/net/core/of_net.c index 95a64c813ae5..f1a9bf7578e7 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -61,7 +61,7 @@ static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) { struct platform_device *pdev = of_find_device_by_node(np); struct nvmem_cell *cell; - const void *buf; + const void *mac; size_t len; int ret; @@ -78,32 +78,21 @@ static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) if (IS_ERR(cell)) return PTR_ERR(cell); - buf = nvmem_cell_read(cell, &len); + mac = nvmem_cell_read(cell, &len); nvmem_cell_put(cell); - if (IS_ERR(buf)) - return PTR_ERR(buf); + if (IS_ERR(mac)) + return PTR_ERR(mac); - ret = 0; - if (len == ETH_ALEN) { - if (is_valid_ether_addr(buf)) - memcpy(addr, buf, ETH_ALEN); - else - ret = -EINVAL; - } else if (len == 3 * ETH_ALEN - 1) { - u8 mac[ETH_ALEN]; - - if (mac_pton(buf, mac)) - memcpy(addr, mac, ETH_ALEN); - else - ret = -EINVAL; - } else { - ret = -EINVAL; + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; } - kfree(buf); + memcpy(addr, mac, ETH_ALEN); + kfree(mac); - return ret; + return 0; } /** From 2a4d75bfe41232608f5596a6d1369f92ccb20817 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Tue, 11 Jan 2022 16:10:53 +0100 Subject: [PATCH 048/149] net: fix sock_timestamping_bind_phc() to release device Don't forget to release the device in sock_timestamping_bind_phc() after it was used to get the vclock indices. Fixes: d463126e23f1 ("net: sock: extend SO_TIMESTAMPING for PHC binding") Signed-off-by: Miroslav Lichvar Cc: Yangbo Lu Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index e21485ab285d..f32ec08a0c37 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -844,6 +844,8 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) } num = ethtool_get_phc_vclocks(dev, &vclock_index); + dev_put(dev); + for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; From 4fbcc1a4cb20fe26ad0225679c536c80f1648221 Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Tue, 11 Jan 2022 17:44:51 +0100 Subject: [PATCH 049/149] nfc: st21nfca: Fix potential buffer overflows in EVT_TRANSACTION It appears that there are some buffer overflows in EVT_TRANSACTION. This happens because the length parameters that are passed to memcpy come directly from skb->data and are not guarded in any way. Signed-off-by: Jordy Zomer Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/nfc/st21nfca/se.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index a43fc4117fa5..c922f10d0d7b 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -316,6 +316,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, return -ENOMEM; transaction->aid_len = skb->data[1]; + + /* Checking if the length of the AID is valid */ + if (transaction->aid_len > sizeof(transaction->aid)) + return -EINVAL; + memcpy(transaction->aid, &skb->data[2], transaction->aid_len); @@ -325,6 +330,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, return -EPROTO; transaction->params_len = skb->data[transaction->aid_len + 3]; + + /* Total size is allocated (skb->len - 2) minus fixed array members */ + if (transaction->params_len > ((skb->len - 2) - sizeof(struct nfc_evt_transaction))) + return -EINVAL; + memcpy(transaction->params, skb->data + transaction->aid_len + 4, transaction->params_len); From c12837d1bb31032bead9060dec99ef310d5b9fb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jan 2022 03:14:45 -0800 Subject: [PATCH 050/149] ref_tracker: use __GFP_NOFAIL more carefully syzbot was able to trigger this warning from new_slab() /* * All existing users of the __GFP_NOFAIL are blockable, so warn * of any new users that actually require GFP_NOWAIT */ if (WARN_ON_ONCE(!can_direct_reclaim)) goto fail; Indeed, we should use __GFP_NOFAIL if direct reclaim is possible. Hopefully in the future we will be able to use SLAB_NOFAILSLAB option so that syzbot can benefit from full ref_tracker even in the presence of memory fault injections. WARNING: CPU: 0 PID: 13 at mm/page_alloc.c:5081 __alloc_pages_slowpath.constprop.0+0x1b7b/0x20d0 mm/page_alloc.c:5081 mm/page_alloc.c:5081 Modules linked in: CPU: 0 PID: 13 Comm: ksoftirqd/0 Not tainted 5.16.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__alloc_pages_slowpath.constprop.0+0x1b7b/0x20d0 mm/page_alloc.c:5081 mm/page_alloc.c:5081 Code: 90 08 00 00 48 81 c7 d8 04 00 00 48 89 f8 48 c1 e8 03 42 80 3c 30 00 0f 84 f0 ea ff ff e8 3d 82 09 00 e9 e6 ea ff ff 4d 89 fd <0f> 0b 48 b8 00 00 00 00 00 fc ff df 48 8b 54 24 30 48 c1 ea 03 80 RSP: 0018:ffffc90000d272b8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88813fffc300 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff88813fffc348 RBP: ffff88813fffc300 R08: 00000000000013dc R09: 00000000000013c8 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffffc90000d274e8 R14: dffffc0000000000 R15: ffffc90000d274e8 FS: 0000000000000000(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffefe6000f8 CR3: 000000001d21e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __alloc_pages+0x412/0x500 mm/page_alloc.c:5382 mm/page_alloc.c:5382 alloc_pages+0x1a7/0x300 mm/mempolicy.c:2191 mm/mempolicy.c:2191 alloc_slab_page mm/slub.c:1793 [inline] allocate_slab mm/slub.c:1938 [inline] alloc_slab_page mm/slub.c:1793 [inline] mm/slub.c:1993 allocate_slab mm/slub.c:1938 [inline] mm/slub.c:1993 new_slab+0x349/0x4a0 mm/slub.c:1993 mm/slub.c:1993 ___slab_alloc+0x918/0xfe0 mm/slub.c:3022 mm/slub.c:3022 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3109 mm/slub.c:3109 slab_alloc_node mm/slub.c:3200 [inline] slab_alloc mm/slub.c:3242 [inline] slab_alloc_node mm/slub.c:3200 [inline] mm/slub.c:3259 slab_alloc mm/slub.c:3242 [inline] mm/slub.c:3259 kmem_cache_alloc_trace+0x289/0x2c0 mm/slub.c:3259 mm/slub.c:3259 kmalloc include/linux/slab.h:590 [inline] kzalloc include/linux/slab.h:724 [inline] kmalloc include/linux/slab.h:590 [inline] lib/ref_tracker.c:74 kzalloc include/linux/slab.h:724 [inline] lib/ref_tracker.c:74 ref_tracker_alloc+0xe1/0x430 lib/ref_tracker.c:74 lib/ref_tracker.c:74 netdev_tracker_alloc include/linux/netdevice.h:3855 [inline] dev_hold_track include/linux/netdevice.h:3872 [inline] netdev_tracker_alloc include/linux/netdevice.h:3855 [inline] net/core/dst.c:52 dev_hold_track include/linux/netdevice.h:3872 [inline] net/core/dst.c:52 dst_init+0xe0/0x520 net/core/dst.c:52 net/core/dst.c:52 dst_alloc+0x16b/0x1f0 net/core/dst.c:96 net/core/dst.c:96 rt_dst_alloc+0x73/0x450 net/ipv4/route.c:1614 net/ipv4/route.c:1614 ip_route_input_mc net/ipv4/route.c:1720 [inline] ip_route_input_mc net/ipv4/route.c:1720 [inline] net/ipv4/route.c:2465 ip_route_input_rcu.part.0+0x4fe/0xcc0 net/ipv4/route.c:2465 net/ipv4/route.c:2465 ip_route_input_rcu net/ipv4/route.c:2420 [inline] ip_route_input_rcu net/ipv4/route.c:2420 [inline] net/ipv4/route.c:2416 ip_route_input_noref+0x1b8/0x2a0 net/ipv4/route.c:2416 net/ipv4/route.c:2416 ip_rcv_finish_core.constprop.0+0x288/0x1e90 net/ipv4/ip_input.c:354 net/ipv4/ip_input.c:354 ip_rcv_finish+0x135/0x2f0 net/ipv4/ip_input.c:427 net/ipv4/ip_input.c:427 NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] net/ipv4/ip_input.c:540 NF_HOOK include/linux/netfilter.h:301 [inline] net/ipv4/ip_input.c:540 ip_rcv+0xaa/0xd0 net/ipv4/ip_input.c:540 net/ipv4/ip_input.c:540 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5350 net/core/dev.c:5350 __netif_receive_skb+0x24/0x1b0 net/core/dev.c:5464 net/core/dev.c:5464 process_backlog+0x2a5/0x6c0 net/core/dev.c:5796 net/core/dev.c:5796 __napi_poll+0xaf/0x440 net/core/dev.c:6364 net/core/dev.c:6364 napi_poll net/core/dev.c:6431 [inline] napi_poll net/core/dev.c:6431 [inline] net/core/dev.c:6518 net_rx_action+0x801/0xb40 net/core/dev.c:6518 net/core/dev.c:6518 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 kernel/softirq.c:558 run_ksoftirqd kernel/softirq.c:921 [inline] run_ksoftirqd kernel/softirq.c:921 [inline] kernel/softirq.c:913 run_ksoftirqd+0x2d/0x60 kernel/softirq.c:913 kernel/softirq.c:913 smpboot_thread_fn+0x645/0x9c0 kernel/smpboot.c:164 kernel/smpboot.c:164 kthread+0x405/0x4f0 kernel/kthread.c:327 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 arch/x86/entry/entry_64.S:295 Fixes: 4e66934eaadc ("lib: add reference counting tracking infrastructure") Signed-off-by: Eric Dumazet Cc: Dmitry Vyukov Reported-by: syzbot Signed-off-by: David S. Miller --- lib/ref_tracker.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c index 0ae2e66dcf0f..a6789c0c626b 100644 --- a/lib/ref_tracker.c +++ b/lib/ref_tracker.c @@ -69,9 +69,12 @@ int ref_tracker_alloc(struct ref_tracker_dir *dir, unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; + gfp_t gfp_mask = gfp; unsigned long flags; - *trackerp = tracker = kzalloc(sizeof(*tracker), gfp | __GFP_NOFAIL); + if (gfp & __GFP_DIRECT_RECLAIM) + gfp_mask |= __GFP_NOFAIL; + *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); From 6c0e3b5ce94947b311348c367db9e11dcb2ccc93 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 12 Jan 2022 07:30:10 -0600 Subject: [PATCH 051/149] net: ipa: fix atomic update in ipa_endpoint_replenish() In ipa_endpoint_replenish(), if an error occurs when attempting to replenish a receive buffer, we just quit and try again later. In that case we increment the backlog count to reflect that the attempt was unsuccessful. Then, if the add_one flag was true we increment the backlog again. This second increment is not included in the backlog local variable though, and its value determines whether delayed work should be scheduled. This is a bug. Fix this by determining whether 1 or 2 should be added to the backlog before adding it in a atomic_add_return() call. Reviewed-by: Matthias Kaehlcke Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 49d9a077d037..8b055885cf3c 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1080,6 +1080,7 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one) { struct gsi *gsi; u32 backlog; + int delta; if (!endpoint->replenish_enabled) { if (add_one) @@ -1097,10 +1098,8 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one) try_again_later: /* The last one didn't succeed, so fix the backlog */ - backlog = atomic_inc_return(&endpoint->replenish_backlog); - - if (add_one) - atomic_inc(&endpoint->replenish_backlog); + delta = add_one ? 2 : 1; + backlog = atomic_add_return(delta, &endpoint->replenish_backlog); /* Whenever a receive buffer transaction completes we'll try to * replenish again. It's unlikely, but if we fail to supply even From c1aaa01dbf4cef95af3e04a5a43986c290e06ea3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 12 Jan 2022 07:30:11 -0600 Subject: [PATCH 052/149] net: ipa: use a bitmap for endpoint replenish_enabled Define a new replenish_flags bitmap to contain Boolean flags associated with an endpoint's replenishing state. Replace the replenish_enabled field with a flag in that bitmap. This is to prepare for the next patch, which adds another flag. Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.c | 8 ++++---- drivers/net/ipa/ipa_endpoint.h | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 8b055885cf3c..cddddcedaf72 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1082,7 +1082,7 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one) u32 backlog; int delta; - if (!endpoint->replenish_enabled) { + if (!test_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags)) { if (add_one) atomic_inc(&endpoint->replenish_saved); return; @@ -1119,7 +1119,7 @@ static void ipa_endpoint_replenish_enable(struct ipa_endpoint *endpoint) u32 max_backlog; u32 saved; - endpoint->replenish_enabled = true; + set_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags); while ((saved = atomic_xchg(&endpoint->replenish_saved, 0))) atomic_add(saved, &endpoint->replenish_backlog); @@ -1133,7 +1133,7 @@ static void ipa_endpoint_replenish_disable(struct ipa_endpoint *endpoint) { u32 backlog; - endpoint->replenish_enabled = false; + clear_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags); while ((backlog = atomic_xchg(&endpoint->replenish_backlog, 0))) atomic_add(backlog, &endpoint->replenish_saved); } @@ -1690,7 +1690,7 @@ static void ipa_endpoint_setup_one(struct ipa_endpoint *endpoint) /* RX transactions require a single TRE, so the maximum * backlog is the same as the maximum outstanding TREs. */ - endpoint->replenish_enabled = false; + clear_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags); atomic_set(&endpoint->replenish_saved, gsi_channel_tre_max(gsi, endpoint->channel_id)); atomic_set(&endpoint->replenish_backlog, 0); diff --git a/drivers/net/ipa/ipa_endpoint.h b/drivers/net/ipa/ipa_endpoint.h index 0a859d10312d..07d5c20e5f00 100644 --- a/drivers/net/ipa/ipa_endpoint.h +++ b/drivers/net/ipa/ipa_endpoint.h @@ -40,6 +40,17 @@ enum ipa_endpoint_name { #define IPA_ENDPOINT_MAX 32 /* Max supported by driver */ +/** + * enum ipa_replenish_flag: RX buffer replenish flags + * + * @IPA_REPLENISH_ENABLED: Whether receive buffer replenishing is enabled + * @IPA_REPLENISH_COUNT: Number of defined replenish flags + */ +enum ipa_replenish_flag { + IPA_REPLENISH_ENABLED, + IPA_REPLENISH_COUNT, /* Number of flags (must be last) */ +}; + /** * struct ipa_endpoint - IPA endpoint information * @ipa: IPA pointer @@ -51,7 +62,7 @@ enum ipa_endpoint_name { * @trans_tre_max: Maximum number of TRE descriptors per transaction * @evt_ring_id: GSI event ring used by the endpoint * @netdev: Network device pointer, if endpoint uses one - * @replenish_enabled: Whether receive buffer replenishing is enabled + * @replenish_flags: Replenishing state flags * @replenish_ready: Number of replenish transactions without doorbell * @replenish_saved: Replenish requests held while disabled * @replenish_backlog: Number of buffers needed to fill hardware queue @@ -72,7 +83,7 @@ struct ipa_endpoint { struct net_device *netdev; /* Receive buffer replenishing for RX endpoints */ - bool replenish_enabled; + DECLARE_BITMAP(replenish_flags, IPA_REPLENISH_COUNT); u32 replenish_ready; atomic_t replenish_saved; atomic_t replenish_backlog; From 998c0bd2b3715244da7639cc4e6a2062cb79c3f4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 12 Jan 2022 07:30:12 -0600 Subject: [PATCH 053/149] net: ipa: prevent concurrent replenish We have seen cases where an endpoint RX completion interrupt arrives while replenishing for the endpoint is underway. This causes another instance of replenishing to begin as part of completing the receive transaction. If this occurs it can lead to transaction corruption. Use a new flag to ensure only one replenish instance for an endpoint executes at a time. Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.c | 13 +++++++++++++ drivers/net/ipa/ipa_endpoint.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index cddddcedaf72..68291a3efd04 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1088,15 +1088,27 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one) return; } + /* If already active, just update the backlog */ + if (test_and_set_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags)) { + if (add_one) + atomic_inc(&endpoint->replenish_backlog); + return; + } + while (atomic_dec_not_zero(&endpoint->replenish_backlog)) if (ipa_endpoint_replenish_one(endpoint)) goto try_again_later; + + clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags); + if (add_one) atomic_inc(&endpoint->replenish_backlog); return; try_again_later: + clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags); + /* The last one didn't succeed, so fix the backlog */ delta = add_one ? 2 : 1; backlog = atomic_add_return(delta, &endpoint->replenish_backlog); @@ -1691,6 +1703,7 @@ static void ipa_endpoint_setup_one(struct ipa_endpoint *endpoint) * backlog is the same as the maximum outstanding TREs. */ clear_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags); + clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags); atomic_set(&endpoint->replenish_saved, gsi_channel_tre_max(gsi, endpoint->channel_id)); atomic_set(&endpoint->replenish_backlog, 0); diff --git a/drivers/net/ipa/ipa_endpoint.h b/drivers/net/ipa/ipa_endpoint.h index 07d5c20e5f00..0313cdc607de 100644 --- a/drivers/net/ipa/ipa_endpoint.h +++ b/drivers/net/ipa/ipa_endpoint.h @@ -44,10 +44,12 @@ enum ipa_endpoint_name { * enum ipa_replenish_flag: RX buffer replenish flags * * @IPA_REPLENISH_ENABLED: Whether receive buffer replenishing is enabled + * @IPA_REPLENISH_ACTIVE: Whether replenishing is underway * @IPA_REPLENISH_COUNT: Number of defined replenish flags */ enum ipa_replenish_flag { IPA_REPLENISH_ENABLED, + IPA_REPLENISH_ACTIVE, IPA_REPLENISH_COUNT, /* Number of flags (must be last) */ }; From fcfb894d5952786db3b2b2b9cd3a95cdcedb6ff0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jan 2022 04:53:00 -0800 Subject: [PATCH 054/149] net: bridge: fix net device refcount tracking issue in error path I left one dev_put() in br_add_if() error path and sure enough syzbot found its way. As the tracker is allocated in new_nbp(), we must make sure to properly free it. We have to call dev_put_track(dev, &p->dev_tracker) before @p object is freed, of course. This is not an issue because br_add_if() owns a reference on @dev. Fixes: b2dcdc7f731d ("net: bridge: add net device refcount tracker") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/bridge/br_if.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a52ad81596b7..55f47cadb114 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -615,6 +615,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, err = dev_set_allmulti(dev, 1); if (err) { br_multicast_del_port(p); + dev_put_track(dev, &p->dev_tracker); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } @@ -724,10 +725,10 @@ err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: br_multicast_del_port(p); + dev_put_track(dev, &p->dev_tracker); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: - dev_put(dev); return err; } From 7b9b1d449a7cef675d6926c4538edee8e601d823 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jan 2022 04:59:39 -0800 Subject: [PATCH 055/149] net/smc: fix possible NULL deref in smc_pnet_add_eth() I missed that @ndev value can be NULL. I prefer not factorizing this NULL check, and instead clearly document where a NULL might be expected. general protection fault, probably for non-canonical address 0xdffffc00000000ba: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x00000000000005d0-0x00000000000005d7] CPU: 0 PID: 19875 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__lock_acquire+0xd7a/0x5470 kernel/locking/lockdep.c:4897 Code: 14 0e 41 bf 01 00 00 00 0f 86 c8 00 00 00 89 05 5c 20 14 0e e9 bd 00 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 9f 2e 00 00 49 81 3e 20 c5 1a 8f 0f 84 52 f3 ff RSP: 0018:ffffc900057071d0 EFLAGS: 00010002 RAX: dffffc0000000000 RBX: 1ffff92000ae0e65 RCX: 1ffff92000ae0e4c RDX: 00000000000000ba RSI: 0000000000000000 RDI: 0000000000000001 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001 R10: fffffbfff1b24ae2 R11: 000000000008808a R12: 0000000000000000 R13: ffff888040ca4000 R14: 00000000000005d0 R15: 0000000000000000 FS: 00007fbd683e0700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2be22000 CR3: 0000000013fea000 CR4: 00000000003526f0 Call Trace: lock_acquire kernel/locking/lockdep.c:5637 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5602 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x39/0x50 kernel/locking/spinlock.c:162 ref_tracker_alloc+0x182/0x440 lib/ref_tracker.c:84 netdev_tracker_alloc include/linux/netdevice.h:3859 [inline] smc_pnet_add_eth net/smc/smc_pnet.c:372 [inline] smc_pnet_enter net/smc/smc_pnet.c:492 [inline] smc_pnet_add+0x49a/0x14d0 net/smc/smc_pnet.c:555 genl_family_rcv_msg_doit+0x228/0x320 net/netlink/genetlink.c:731 genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] genl_rcv_msg+0x328/0x580 net/netlink/genetlink.c:792 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2496 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: b60645248af3 ("net/smc: add net device tracker to struct smc_pnetentry") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index db9825c01e0a..291f1484a1b7 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -369,7 +369,8 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); new_pe->ndev = ndev; - netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_KERNEL); + if (ndev) + netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_KERNEL); rc = -EEXIST; new_netdev = true; write_lock(&pnettable->lock); From 4e5bd03ae34652cd932ab4c91c71c511793df75c Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Wed, 12 Jan 2022 20:54:18 +0800 Subject: [PATCH 056/149] net: bonding: fix bond_xmit_broadcast return value error bug In Linux bonding scenario, one packet is copied to several copies and sent by all slave device of bond0 in mode 3(broadcast mode). The mode 3 xmit function bond_xmit_broadcast() only ueses the last slave device's tx result as the final result. In this case, if the last slave device is down, then it always return NET_XMIT_DROP, even though the other slave devices xmit success. It may cause the tx statistics error, and cause the application (e.g. scp) consider the network is unreachable. For example, use the following command to configure server A. echo 3 > /sys/class/net/bond0/bonding/mode ifconfig bond0 up ifenslave bond0 eth0 eth1 ifconfig bond0 192.168.1.125 ifconfig eth0 up ifconfig eth1 down The slave device eth0 and eth1 are connected to server B(192.168.1.107). Run the ping 192.168.1.107 -c 3 -i 0.2 command, the following information is displayed. PING 192.168.1.107 (192.168.1.107) 56(84) bytes of data. 64 bytes from 192.168.1.107: icmp_seq=1 ttl=64 time=0.077 ms 64 bytes from 192.168.1.107: icmp_seq=2 ttl=64 time=0.056 ms 64 bytes from 192.168.1.107: icmp_seq=3 ttl=64 time=0.051 ms 192.168.1.107 ping statistics 0 packets transmitted, 3 received Actually, the slave device eth0 of the bond successfully sends three ICMP packets, but the result shows that 0 packets are transmitted. Also if we use scp command to get remote files, the command end with the following printings. ssh_exchange_identification: read: Connection timed out So this patch modifies the bond_xmit_broadcast to return NET_XMIT_SUCCESS if one slave device in the bond sends packets successfully. If all slave devices send packets fail, the discarded packets stats is increased. The skb is released when there is no slave device in the bond or the last slave device is down. Fixes: ae46f184bc1f ("bonding: propagate transmit status") Signed-off-by: Jie Wang Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 07fc603c2fa7..fce80b57f15b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4884,25 +4884,39 @@ static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb, struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; + bool xmit_suc = false; + bool skb_used = false; bond_for_each_slave_rcu(bond, slave, iter) { - if (bond_is_last_slave(bond, slave)) - break; - if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + struct sk_buff *skb2; + if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)) + continue; + + if (bond_is_last_slave(bond, slave)) { + skb2 = skb; + skb_used = true; + } else { + skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { net_err_ratelimited("%s: Error: %s: skb_clone() failed\n", bond_dev->name, __func__); continue; } - bond_dev_queue_xmit(bond, skb2, slave->dev); } - } - if (slave && bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) - return bond_dev_queue_xmit(bond, skb, slave->dev); - return bond_tx_drop(bond_dev, skb); + if (bond_dev_queue_xmit(bond, skb2, slave->dev) == NETDEV_TX_OK) + xmit_suc = true; + } + + if (!skb_used) + dev_kfree_skb_any(skb); + + if (xmit_suc) + return NETDEV_TX_OK; + + atomic_long_inc(&bond_dev->tx_dropped); + return NET_XMIT_DROP; } /*------------------------- Device initialization ---------------------------*/ From 8973d7b8638f1c2615eec495dfe70122a01a9e1b Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 4 Jan 2022 15:56:44 +0100 Subject: [PATCH 057/149] dt-bindings: net: oxnas-dwmac: Add bindings for OX810SE Add SoC specific bindings for OX810SE support. Signed-off-by: Neil Armstrong Acked-by: Rob Herring Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/oxnas-dwmac.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/net/oxnas-dwmac.txt b/Documentation/devicetree/bindings/net/oxnas-dwmac.txt index d7117a22fd87..27db496f1ce8 100644 --- a/Documentation/devicetree/bindings/net/oxnas-dwmac.txt +++ b/Documentation/devicetree/bindings/net/oxnas-dwmac.txt @@ -9,6 +9,9 @@ Required properties on all platforms: - compatible: For the OX820 SoC, it should be : - "oxsemi,ox820-dwmac" to select glue - "snps,dwmac-3.512" to select IP version. + For the OX810SE SoC, it should be : + - "oxsemi,ox810se-dwmac" to select glue + - "snps,dwmac-3.512" to select IP version. - clocks: Should contain phandles to the following clocks - clock-names: Should contain the following: From 72f1f7e46c6e96b2ae300d750de01ac75d625b4e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 4 Jan 2022 15:56:45 +0100 Subject: [PATCH 058/149] net: stmmac: dwmac-oxnas: Add support for OX810SE Add support for OX810SE dwmac glue setup, which is a simplified version of the OX820 introduced later with more control on the PHY interface. Signed-off-by: Neil Armstrong Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-oxnas.c | 115 +++++++++++++----- 1 file changed, 86 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c index adfeb8d3293d..62a69a91ab22 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -48,16 +49,75 @@ #define DWMAC_RX_VARDELAY(d) ((d) << DWMAC_RX_VARDELAY_SHIFT) #define DWMAC_RXN_VARDELAY(d) ((d) << DWMAC_RXN_VARDELAY_SHIFT) +struct oxnas_dwmac; + +struct oxnas_dwmac_data { + int (*setup)(struct oxnas_dwmac *dwmac); +}; + struct oxnas_dwmac { struct device *dev; struct clk *clk; struct regmap *regmap; + const struct oxnas_dwmac_data *data; }; +static int oxnas_dwmac_setup_ox810se(struct oxnas_dwmac *dwmac) +{ + unsigned int value; + int ret; + + ret = regmap_read(dwmac->regmap, OXNAS_DWMAC_CTRL_REGOFFSET, &value); + if (ret < 0) + return ret; + + /* Enable GMII_GTXCLK to follow GMII_REFCLK, required for gigabit PHY */ + value |= BIT(DWMAC_CKEN_GTX) | + /* Use simple mux for 25/125 Mhz clock switching */ + BIT(DWMAC_SIMPLE_MUX); + + regmap_write(dwmac->regmap, OXNAS_DWMAC_CTRL_REGOFFSET, value); + + return 0; +} + +static int oxnas_dwmac_setup_ox820(struct oxnas_dwmac *dwmac) +{ + unsigned int value; + int ret; + + ret = regmap_read(dwmac->regmap, OXNAS_DWMAC_CTRL_REGOFFSET, &value); + if (ret < 0) + return ret; + + /* Enable GMII_GTXCLK to follow GMII_REFCLK, required for gigabit PHY */ + value |= BIT(DWMAC_CKEN_GTX) | + /* Use simple mux for 25/125 Mhz clock switching */ + BIT(DWMAC_SIMPLE_MUX) | + /* set auto switch tx clock source */ + BIT(DWMAC_AUTO_TX_SOURCE) | + /* enable tx & rx vardelay */ + BIT(DWMAC_CKEN_TX_OUT) | + BIT(DWMAC_CKEN_TXN_OUT) | + BIT(DWMAC_CKEN_TX_IN) | + BIT(DWMAC_CKEN_RX_OUT) | + BIT(DWMAC_CKEN_RXN_OUT) | + BIT(DWMAC_CKEN_RX_IN); + regmap_write(dwmac->regmap, OXNAS_DWMAC_CTRL_REGOFFSET, value); + + /* set tx & rx vardelay */ + value = DWMAC_TX_VARDELAY(4) | + DWMAC_TXN_VARDELAY(2) | + DWMAC_RX_VARDELAY(10) | + DWMAC_RXN_VARDELAY(8); + regmap_write(dwmac->regmap, OXNAS_DWMAC_DELAY_REGOFFSET, value); + + return 0; +} + static int oxnas_dwmac_init(struct platform_device *pdev, void *priv) { struct oxnas_dwmac *dwmac = priv; - unsigned int value; int ret; /* Reset HW here before changing the glue configuration */ @@ -69,35 +129,11 @@ static int oxnas_dwmac_init(struct platform_device *pdev, void *priv) if (ret) return ret; - ret = regmap_read(dwmac->regmap, OXNAS_DWMAC_CTRL_REGOFFSET, &value); - if (ret < 0) { + ret = dwmac->data->setup(dwmac); + if (ret) clk_disable_unprepare(dwmac->clk); - return ret; - } - /* Enable GMII_GTXCLK to follow GMII_REFCLK, required for gigabit PHY */ - value |= BIT(DWMAC_CKEN_GTX) | - /* Use simple mux for 25/125 Mhz clock switching */ - BIT(DWMAC_SIMPLE_MUX) | - /* set auto switch tx clock source */ - BIT(DWMAC_AUTO_TX_SOURCE) | - /* enable tx & rx vardelay */ - BIT(DWMAC_CKEN_TX_OUT) | - BIT(DWMAC_CKEN_TXN_OUT) | - BIT(DWMAC_CKEN_TX_IN) | - BIT(DWMAC_CKEN_RX_OUT) | - BIT(DWMAC_CKEN_RXN_OUT) | - BIT(DWMAC_CKEN_RX_IN); - regmap_write(dwmac->regmap, OXNAS_DWMAC_CTRL_REGOFFSET, value); - - /* set tx & rx vardelay */ - value = DWMAC_TX_VARDELAY(4) | - DWMAC_TXN_VARDELAY(2) | - DWMAC_RX_VARDELAY(10) | - DWMAC_RXN_VARDELAY(8); - regmap_write(dwmac->regmap, OXNAS_DWMAC_DELAY_REGOFFSET, value); - - return 0; + return ret; } static void oxnas_dwmac_exit(struct platform_device *pdev, void *priv) @@ -128,6 +164,12 @@ static int oxnas_dwmac_probe(struct platform_device *pdev) goto err_remove_config_dt; } + dwmac->data = (const struct oxnas_dwmac_data *)of_device_get_match_data(&pdev->dev); + if (!dwmac->data) { + ret = -EINVAL; + goto err_remove_config_dt; + } + dwmac->dev = &pdev->dev; plat_dat->bsp_priv = dwmac; plat_dat->init = oxnas_dwmac_init; @@ -166,8 +208,23 @@ err_remove_config_dt: return ret; } +static const struct oxnas_dwmac_data ox810se_dwmac_data = { + .setup = oxnas_dwmac_setup_ox810se, +}; + +static const struct oxnas_dwmac_data ox820_dwmac_data = { + .setup = oxnas_dwmac_setup_ox820, +}; + static const struct of_device_id oxnas_dwmac_match[] = { - { .compatible = "oxsemi,ox820-dwmac" }, + { + .compatible = "oxsemi,ox810se-dwmac", + .data = &ox810se_dwmac_data, + }, + { + .compatible = "oxsemi,ox820-dwmac", + .data = &ox820_dwmac_data, + }, { } }; MODULE_DEVICE_TABLE(of, oxnas_dwmac_match); From 9c0c2c7aa23cbf78277557cfe8ec8bb7689225fe Mon Sep 17 00:00:00 2001 From: Yevhen Orlov Date: Tue, 11 Jan 2022 03:08:26 +0200 Subject: [PATCH 059/149] net: marvell: prestera: Cleanup router struct Field "aborted" was added in 69204174cc5c ("net: marvell: prestera: Add prestera router infra"). It will not be used. So remove. Signed-off-by: Yevhen Orlov Link: https://lore.kernel.org/r/20220111010826.3779-1-yevhen.orlov@plvision.eu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/prestera.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h index a0a5a8e6bd8c..2fd9ef2fe5d6 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera.h +++ b/drivers/net/ethernet/marvell/prestera/prestera.h @@ -283,7 +283,6 @@ struct prestera_router { struct list_head rif_entry_list; struct notifier_block inetaddr_nb; struct notifier_block inetaddr_valid_nb; - bool aborted; }; struct prestera_rxtx_params { From 6a1ba8758f67e37a48fd2f940032a2b186bcf021 Mon Sep 17 00:00:00 2001 From: Yevhen Orlov Date: Tue, 11 Jan 2022 03:10:14 +0200 Subject: [PATCH 060/149] net: marvell: prestera: Refactor get/put VR functions * Use refcount, instead of uint * Increment/decrement recount inside get/put * Fix error path in __prestera_vr_create. Remove unnecessary kfree. * Make __prestera_vr_destroy symmetric to "create" Fixes: bca5859bc6c6 ("net: marvell: prestera: add hardware router objects accounting") Signed-off-by: Yevhen Orlov Link: https://lore.kernel.org/r/20220111011014.4418-1-yevhen.orlov@plvision.eu Signed-off-by: Jakub Kicinski --- .../marvell/prestera/prestera_router_hw.c | 32 +++++++++---------- .../marvell/prestera/prestera_router_hw.h | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c index 5866a4be50f5..d5befd1d1440 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c @@ -47,13 +47,8 @@ static struct prestera_vr *__prestera_vr_create(struct prestera_switch *sw, struct netlink_ext_ack *extack) { struct prestera_vr *vr; - u16 hw_vr_id; int err; - err = prestera_hw_vr_create(sw, &hw_vr_id); - if (err) - return ERR_PTR(-ENOMEM); - vr = kzalloc(sizeof(*vr), GFP_KERNEL); if (!vr) { err = -ENOMEM; @@ -61,23 +56,26 @@ static struct prestera_vr *__prestera_vr_create(struct prestera_switch *sw, } vr->tb_id = tb_id; - vr->hw_vr_id = hw_vr_id; + + err = prestera_hw_vr_create(sw, &vr->hw_vr_id); + if (err) + goto err_hw_create; list_add(&vr->router_node, &sw->router->vr_list); return vr; -err_alloc_vr: - prestera_hw_vr_delete(sw, hw_vr_id); +err_hw_create: kfree(vr); +err_alloc_vr: return ERR_PTR(err); } static void __prestera_vr_destroy(struct prestera_switch *sw, struct prestera_vr *vr) { - prestera_hw_vr_delete(sw, vr->hw_vr_id); list_del(&vr->router_node); + prestera_hw_vr_delete(sw, vr->hw_vr_id); kfree(vr); } @@ -87,17 +85,22 @@ static struct prestera_vr *prestera_vr_get(struct prestera_switch *sw, u32 tb_id struct prestera_vr *vr; vr = __prestera_vr_find(sw, tb_id); - if (!vr) + if (vr) { + refcount_inc(&vr->refcount); + } else { vr = __prestera_vr_create(sw, tb_id, extack); - if (IS_ERR(vr)) - return ERR_CAST(vr); + if (IS_ERR(vr)) + return ERR_CAST(vr); + + refcount_set(&vr->refcount, 1); + } return vr; } static void prestera_vr_put(struct prestera_switch *sw, struct prestera_vr *vr) { - if (!vr->ref_cnt) + if (refcount_dec_and_test(&vr->refcount)) __prestera_vr_destroy(sw, vr); } @@ -158,7 +161,6 @@ void prestera_rif_entry_destroy(struct prestera_switch *sw, iface.vr_id = e->vr->hw_vr_id; prestera_hw_rif_delete(sw, e->hw_id, &iface); - e->vr->ref_cnt--; prestera_vr_put(sw, e->vr); kfree(e); } @@ -183,7 +185,6 @@ prestera_rif_entry_create(struct prestera_switch *sw, if (IS_ERR(e->vr)) goto err_vr_get; - e->vr->ref_cnt++; memcpy(&e->addr, addr, sizeof(e->addr)); /* HW */ @@ -198,7 +199,6 @@ prestera_rif_entry_create(struct prestera_switch *sw, return e; err_hw_create: - e->vr->ref_cnt--; prestera_vr_put(sw, e->vr); err_vr_get: err_key_copy: diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h index fed53595f7bb..ab5e013ac3ad 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h +++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h @@ -6,7 +6,7 @@ struct prestera_vr { struct list_head router_node; - unsigned int ref_cnt; + refcount_t refcount; u32 tb_id; /* key (kernel fib table id) */ u16 hw_vr_id; /* virtual router ID */ u8 __pad[2]; From 32d098bb2e494cf282968ae654dadf7299bb73b0 Mon Sep 17 00:00:00 2001 From: Yevhen Orlov Date: Tue, 11 Jan 2022 03:10:51 +0200 Subject: [PATCH 061/149] net: marvell: prestera: Refactor router functions * Reverse xmas tree variables order * User friendly messages on error paths * Refactor __prestera_inetaddr_event to use early return Signed-off-by: Yevhen Orlov Link: https://lore.kernel.org/r/20220111011051.4941-1-yevhen.orlov@plvision.eu Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/prestera/prestera_hw.c | 4 ++-- .../marvell/prestera/prestera_router.c | 20 ++++++++++--------- .../marvell/prestera/prestera_router_hw.c | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c index 51fc841b1e7a..e6bfadc874c5 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c @@ -1831,8 +1831,8 @@ static int prestera_iface_to_msg(struct prestera_iface *iface, int prestera_hw_rif_create(struct prestera_switch *sw, struct prestera_iface *iif, u8 *mac, u16 *rif_id) { - struct prestera_msg_rif_req req; struct prestera_msg_rif_resp resp; + struct prestera_msg_rif_req req; int err; memcpy(req.mac, mac, ETH_ALEN); @@ -1868,9 +1868,9 @@ int prestera_hw_rif_delete(struct prestera_switch *sw, u16 rif_id, int prestera_hw_vr_create(struct prestera_switch *sw, u16 *vr_id) { - int err; struct prestera_msg_vr_resp resp; struct prestera_msg_vr_req req; + int err; err = prestera_cmd_ret(sw, PRESTERA_CMD_TYPE_ROUTER_VR_CREATE, &req.cmd, sizeof(req), &resp.ret, sizeof(resp)); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c index 8a3b7b664358..607b88bfa451 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c @@ -25,10 +25,10 @@ static int __prestera_inetaddr_port_event(struct net_device *port_dev, struct netlink_ext_ack *extack) { struct prestera_port *port = netdev_priv(port_dev); - int err; - struct prestera_rif_entry *re; struct prestera_rif_entry_key re_key = {}; + struct prestera_rif_entry *re; u32 kern_tb_id; + int err; err = prestera_is_valid_mac_addr(port, port_dev->dev_addr); if (err) { @@ -45,21 +45,21 @@ static int __prestera_inetaddr_port_event(struct net_device *port_dev, switch (event) { case NETDEV_UP: if (re) { - NL_SET_ERR_MSG_MOD(extack, "rif_entry already exist"); + NL_SET_ERR_MSG_MOD(extack, "RIF already exist"); return -EEXIST; } re = prestera_rif_entry_create(port->sw, &re_key, prestera_fix_tb_id(kern_tb_id), port_dev->dev_addr); if (!re) { - NL_SET_ERR_MSG_MOD(extack, "Can't create rif_entry"); + NL_SET_ERR_MSG_MOD(extack, "Can't create RIF"); return -EINVAL; } dev_hold(port_dev); break; case NETDEV_DOWN: if (!re) { - NL_SET_ERR_MSG_MOD(extack, "rif_entry not exist"); + NL_SET_ERR_MSG_MOD(extack, "Can't find RIF"); return -EEXIST; } prestera_rif_entry_destroy(port->sw, re); @@ -75,11 +75,11 @@ static int __prestera_inetaddr_event(struct prestera_switch *sw, unsigned long event, struct netlink_ext_ack *extack) { - if (prestera_netdev_check(dev) && !netif_is_bridge_port(dev) && - !netif_is_lag_port(dev) && !netif_is_ovs_port(dev)) - return __prestera_inetaddr_port_event(dev, event, extack); + if (!prestera_netdev_check(dev) || netif_is_bridge_port(dev) || + netif_is_lag_port(dev) || netif_is_ovs_port(dev)) + return 0; - return 0; + return __prestera_inetaddr_port_event(dev, event, extack); } static int __prestera_inetaddr_cb(struct notifier_block *nb, @@ -126,6 +126,8 @@ static int __prestera_inetaddr_valid_cb(struct notifier_block *nb, goto out; if (ipv4_is_multicast(ivi->ivi_addr)) { + NL_SET_ERR_MSG_MOD(ivi->extack, + "Multicast addr on RIF is not supported"); err = -EINVAL; goto out; } diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c index d5befd1d1440..490e9b61fd8d 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c @@ -123,7 +123,7 @@ __prestera_rif_entry_key_copy(const struct prestera_rif_entry_key *in, out->iface.vlan_id = in->iface.vlan_id; break; default: - pr_err("Unsupported iface type"); + WARN(1, "Unsupported iface type"); return -EINVAL; } From e179f045f90d8947d3c2addebd620341b7daa4a1 Mon Sep 17 00:00:00 2001 From: Yevhen Orlov Date: Tue, 11 Jan 2022 03:11:29 +0200 Subject: [PATCH 062/149] net: marvell: prestera: Fix deinit sequence for router * Add missed call prestera_router_fini in prestera_switch_fini * Add prestera_router_hw_fini, which verify lists are empty Fixes: 69204174cc5c ("net: marvell: prestera: Add prestera router infra") Signed-off-by: Yevhen Orlov Link: https://lore.kernel.org/r/20220111011129.5457-1-yevhen.orlov@plvision.eu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/prestera_main.c | 1 + drivers/net/ethernet/marvell/prestera/prestera_router.c | 4 ++-- drivers/net/ethernet/marvell/prestera/prestera_router_hw.c | 6 ++++++ drivers/net/ethernet/marvell/prestera/prestera_router_hw.h | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 08fdd1e50388..cad93f747d0c 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -982,6 +982,7 @@ static void prestera_switch_fini(struct prestera_switch *sw) prestera_event_handlers_unregister(sw); prestera_rxtx_switch_fini(sw); prestera_switchdev_fini(sw); + prestera_router_fini(sw); prestera_netdev_event_handler_unregister(sw); prestera_hw_switch_fini(sw); } diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c index 607b88bfa451..6ef4d32b8fdd 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c @@ -168,7 +168,7 @@ int prestera_router_init(struct prestera_switch *sw) err_register_inetaddr_notifier: unregister_inetaddr_validator_notifier(&router->inetaddr_valid_nb); err_register_inetaddr_validator_notifier: - /* prestera_router_hw_fini */ + prestera_router_hw_fini(sw); err_router_lib_init: kfree(sw->router); return err; @@ -178,7 +178,7 @@ void prestera_router_fini(struct prestera_switch *sw) { unregister_inetaddr_notifier(&sw->router->inetaddr_nb); unregister_inetaddr_validator_notifier(&sw->router->inetaddr_valid_nb); - /* router_hw_fini */ + prestera_router_hw_fini(sw); kfree(sw->router); sw->router = NULL; } diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c index 490e9b61fd8d..e5592b69ad37 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c @@ -29,6 +29,12 @@ int prestera_router_hw_init(struct prestera_switch *sw) return 0; } +void prestera_router_hw_fini(struct prestera_switch *sw) +{ + WARN_ON(!list_empty(&sw->router->vr_list)); + WARN_ON(!list_empty(&sw->router->rif_entry_list)); +} + static struct prestera_vr *__prestera_vr_find(struct prestera_switch *sw, u32 tb_id) { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h index ab5e013ac3ad..b6b028551868 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h +++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h @@ -32,5 +32,6 @@ prestera_rif_entry_create(struct prestera_switch *sw, struct prestera_rif_entry_key *k, u32 tb_id, const unsigned char *addr); int prestera_router_hw_init(struct prestera_switch *sw); +void prestera_router_hw_fini(struct prestera_switch *sw); #endif /* _PRESTERA_ROUTER_HW_H_ */ From ed6ae5ca437d9d238117d90e95f7f2cc27da1b31 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Fri, 7 Jan 2022 12:38:42 +0000 Subject: [PATCH 063/149] sit: allow encapsulated IPv6 traffic to be delivered locally While experimenting with FOU encapsulation Amir noticed that encapsulated IPv6 traffic fails to be delivered, if the peer IP address is configured locally. It can be easily verified by creating a sit interface like below: $ sudo ip link add name fou_test type sit remote 127.0.0.1 encap fou encap-sport auto encap-dport 1111 $ sudo ip link set fou_test up and sending some IPv4 and IPv6 traffic to it $ ping -I fou_test -c 1 1.1.1.1 $ ping6 -I fou_test -c 1 fe80::d0b0:dfff:fe4c:fcbc "tcpdump -i any udp dst port 1111" will confirm that only the first IPv4 ping was encapsulated and attempted to be delivered. This seems like a limitation: for example, in a cloud environment the "peer" service may be arbitrarily scheduled on any server within the cluster, where all nodes are trying to send encapsulated traffic. And the unlucky node will not be able to. Moreover, delivering encapsulated IPv4 traffic locally is allowed. But I may not have all the context about this restriction and this code predates the observable git history. Reported-by: Amir Razmjou Signed-off-by: Ignat Korchagin Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220107123842.211335-1-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv6/sit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a618dce7e0bc..c0b138c20992 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -956,7 +956,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } - if (rt->rt_type != RTN_UNICAST) { + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); dev->stats.tx_carrier_errors++; goto tx_error_icmp; From 078c6a1cbd4cd7496048786beec2e312577bebbf Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Tue, 11 Jan 2022 23:11:32 +0100 Subject: [PATCH 064/149] net: qmi_wwan: add ZTE MF286D modem 19d2:1485 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modem from ZTE MF286D is an Qualcomm MDM9250 based 3G/4G modem. T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 3 Spd=5000 MxCh= 0 D: Ver= 3.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=19d2 ProdID=1485 Rev=52.87 S: Manufacturer=ZTE,Incorporated S: Product=ZTE Technologies MSM S: SerialNumber=MF286DZTED000000 C:* #Ifs= 7 Cfg#= 1 Atr=80 MxPwr=896mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=06 Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=ff Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Pawel Dembicki Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index f510e8219470..0d5bc26c9f2e 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1316,6 +1316,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x1426, 2)}, /* ZTE MF91 */ {QMI_FIXED_INTF(0x19d2, 0x1428, 2)}, /* Telewell TW-LTE 4G v2 */ {QMI_FIXED_INTF(0x19d2, 0x1432, 3)}, /* ZTE ME3620 */ + {QMI_FIXED_INTF(0x19d2, 0x1485, 5)}, /* ZTE MF286D */ {QMI_FIXED_INTF(0x19d2, 0x2002, 4)}, /* ZTE (Vodafone) K3765-Z */ {QMI_FIXED_INTF(0x2001, 0x7e16, 3)}, /* D-Link DWM-221 */ {QMI_FIXED_INTF(0x2001, 0x7e19, 4)}, /* D-Link DWM-221 B1 */ From de2d807b294d3d2ce5e59043ae2634016765d076 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 12 Jan 2022 12:28:05 +0200 Subject: [PATCH 065/149] sch_api: Don't skip qdisc attach on ingress The attach callback of struct Qdisc_ops is used by only a few qdiscs: mq, mqprio and htb. qdisc_graft() contains the following logic (pseudocode): if (!qdisc->ops->attach) { if (ingress) do ingress stuff; else do egress stuff; } if (!ingress) { ... if (qdisc->ops->attach) qdisc->ops->attach(qdisc); } else { ... } As we see, the attach callback is not called if the qdisc is being attached to ingress (TC_H_INGRESS). That wasn't a problem for mq and mqprio, since they contain a check that they are attached to TC_H_ROOT, and they can't be attached to TC_H_INGRESS anyway. However, the commit cited below added the attach callback to htb. It is needed for the hardware offload, but in the non-offload mode it simulates the "do egress stuff" part of the pseudocode above. The problem is that when htb is attached to ingress, neither "do ingress stuff" nor attach() is called. It results in an inconsistency, and the following message is printed to dmesg: unregister_netdevice: waiting for lo to become free. Usage count = 2 This commit addresses the issue by running "do ingress stuff" in the ingress flow even in the attach callback is present, which is fine, because attach isn't going to be called afterwards. The bug was found by syzbot and reported by Eric. Fixes: d03b195b5aa0 ("sch_htb: Hierarchical QoS hardware offload") Signed-off-by: Maxim Mikityanskiy Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c9c6f49f9c28..2cb496c84878 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1062,7 +1062,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, qdisc_offload_graft_root(dev, new, old, extack); - if (new && new->ops->attach) + if (new && new->ops->attach && !ingress) goto skip; for (i = 0; i < num_q; i++) { From 2d7c86a8f9cdce1408c4f3c69d94d007eff2f179 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:50 +0530 Subject: [PATCH 066/149] libceph: generalize addr/ip parsing based on delimiter ... and remove hardcoded function name in ceph_parse_ips(). [ idryomov: delim parameter, drop CEPH_ADDR_PARSE_DEFAULT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 3 ++- fs/ceph/super.c | 2 +- include/linux/ceph/libceph.h | 2 +- include/linux/ceph/messenger.h | 2 +- net/ceph/ceph_common.c | 9 ++++----- net/ceph/messenger.c | 15 ++++++++------- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 953fa134cd3d..909dbe6111bf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6497,7 +6497,8 @@ static int rbd_add_parse_args(const char *buf, pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; pctx.opts->trim = RBD_TRIM_DEFAULT; - ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL); + ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL, + ','); if (ret) goto out_err; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bab61232dc5a..c444371ebc38 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -272,7 +272,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dout("server path '%s'\n", fsopt->server_path); ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log); + pctx->copts, fc->log.log, ','); if (ret) return ret; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..c72285d8594e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -301,7 +301,7 @@ struct fs_parameter; struct fc_log; struct ceph_options *ceph_alloc_options(void); int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l); + struct fc_log *l, char delim); int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, struct fc_log *l); int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 0e6e9ad3c3bf..ff99ce094cfa 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -532,7 +532,7 @@ extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count); + int max_count, int *count, char delim); extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97d6ea763e32..851b0c4c5730 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -422,14 +422,14 @@ out: } int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l) + struct fc_log *l, char delim) { struct p_log log = {.prefix = "libceph", .log = l}; int ret; - /* ip1[:port1][,ip2[:port2]...] */ + /* ip1[:port1][ip2[:port2]...] */ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON, - &opt->num_mon); + &opt->num_mon, delim); if (ret) { error_plog(&log, "Failed to parse monitor IPs: %d", ret); return ret; @@ -455,8 +455,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_ip: err = ceph_parse_ips(param->string, param->string + param->size, - &opt->my_addr, - 1, NULL); + &opt->my_addr, 1, NULL, ','); if (err) { error_plog(&log, "Failed to parse ip: %d", err); return err; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57d043b382ed..929ed91f2ec3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1267,30 +1267,31 @@ static int ceph_parse_server_name(const char *name, size_t namelen, */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count) + int max_count, int *count, char delim) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { + char cur_delim = delim; const char *ipend; int port; - char delim = ','; if (*p == '[') { - delim = ']'; + cur_delim = ']'; p++; } - ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend); + ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, + &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; - if (delim == ']') { + if (cur_delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; @@ -1326,11 +1327,11 @@ int ceph_parse_ips(const char *c, const char *end, addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; addr[i].nonce = 0; - dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); + dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); if (p == end) break; - if (*p != ',') + if (*p != delim) goto bad; p++; } From 4153c7fc937a2afa077dbdb9fe3189b9981f423c Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:51 +0530 Subject: [PATCH 067/149] libceph: rename parse_fsid() to ceph_parse_fsid() and export ... as it is too generic. also, use __func__ when logging rather than hardcoding the function name. Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 1 + net/ceph/ceph_common.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index c72285d8594e..644f224eccf7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -296,6 +296,7 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern void *ceph_kvmalloc(size_t size, gfp_t flags); +extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid); struct fs_parameter; struct fc_log; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 851b0c4c5730..decae43b4262 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -217,14 +217,14 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return p; } -static int parse_fsid(const char *str, struct ceph_fsid *fsid) +int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; char tmp[3]; int err = -EINVAL; int d; - dout("parse_fsid '%s'\n", str); + dout("%s '%s'\n", __func__, str); tmp[2] = 0; while (*str && i < 16) { if (ispunct(*str)) { @@ -244,9 +244,10 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid) if (i == 16) err = 0; - dout("parse_fsid ret %d got fsid %pU\n", err, fsid); + dout("%s ret %d got fsid %pU\n", __func__, err, fsid); return err; } +EXPORT_SYMBOL(ceph_parse_fsid); /* * ceph options @@ -464,7 +465,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, break; case Opt_fsid: - err = parse_fsid(param->string, &opt->fsid); + err = ceph_parse_fsid(param->string, &opt->fsid); if (err) { error_plog(&log, "Failed to parse fsid: %d", err); return err; From 7b19b4db5add8d9f50e854907a82a10ba4d27c42 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:52 +0530 Subject: [PATCH 068/149] ceph: new device mount syntax Old mount device syntax (source) has the following problems: - mounts to the same cluster but with different fsnames and/or creds have identical device string which can confuse xfstests. - Userspace mount helper tool resolves monitor addresses and fill in mon addrs automatically, but that means the device shown in /proc/mounts is different than what was used for mounting. New device syntax is as follows: cephuser@fsid.mycephfs2=/path Note, there is no "monitor address" in the device string. That gets passed in as mount option. This keeps the device string same when monitor addresses change (on remounts). Also note that the userspace mount helper tool is backward compatible. I.e., the mount helper will fallback to using old syntax after trying to mount with the new syntax. [ idryomov: drop CEPH_MON_ADDR_MNTOPT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 141 ++++++++++++++++++++++++++++++++++++++++++++---- fs/ceph/super.h | 3 ++ 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c444371ebc38..e2c40d055711 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -146,6 +146,7 @@ enum { Opt_mds_namespace, Opt_recover_session, Opt_source, + Opt_mon_addr, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -197,6 +198,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), {} @@ -228,9 +230,92 @@ static void canonicalize_path(char *path) } /* - * Parse the source parameter. Distinguish the server list from the path. + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + +static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + int r; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + if (*dev_name_end != ':') + return invalfc(fc, "separator ':' missing in source"); + + r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, + pctx->copts, fc->log.log, ','); + if (r) + return r; + + fsopt->new_dev_syntax = false; + return 0; +} + +static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + size_t len; + struct ceph_fsid fsid; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *fsid_start, *fs_name_start; + + if (*dev_name_end != '=') { + dout("separator '=' missing in source"); + return -EINVAL; + } + + fsid_start = strchr(dev_name, '@'); + if (!fsid_start) + return invalfc(fc, "missing cluster fsid"); + ++fsid_start; /* start of cluster fsid */ + + fs_name_start = strchr(fsid_start, '.'); + if (!fs_name_start) + return invalfc(fc, "missing file system name"); + + if (ceph_parse_fsid(fsid_start, &fsid)) + return invalfc(fc, "Invalid FSID"); + + ++fs_name_start; /* start of file system name */ + len = dev_name_end - fs_name_start; + + if (!namespace_equals(fsopt, fs_name_start, len)) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; + dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); + + fsopt->new_dev_syntax = true; + return 0; +} + +/* + * Parse the source parameter for new device format. Distinguish the device + * spec from the path. Try parsing new device format and fallback to old + * format if needed. * - * The source will look like: + * New device syntax will looks like: + * =/ + * where + * is name@fsid.fsname + * is optional, but if present must begin with '/' + * (monitor addresses are passed via mount option) + * + * Old device syntax is: * [,...]:[] * where * is [:] @@ -263,24 +348,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = dev_name + strlen(dev_name); } - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') - return invalfc(fc, "No path or : separator in source"); + dev_name_end--; /* back up to separator */ + if (dev_name_end < dev_name) + return invalfc(fc, "Path missing in source"); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); if (fsopt->server_path) dout("server path '%s'\n", fsopt->server_path); - ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log, ','); - if (ret) - return ret; + dout("trying new device syntax"); + ret = ceph_parse_new_source(dev_name, dev_name_end, fc); + if (ret) { + if (ret != -EINVAL) + return ret; + dout("trying old device syntax"); + ret = ceph_parse_old_source(dev_name, dev_name_end, fc); + if (ret) + return ret; + } fc->source = param->string; param->string = NULL; return 0; } +static int ceph_parse_mon_addr(struct fs_parameter *param, + struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + kfree(fsopt->mon_addr); + fsopt->mon_addr = param->string; + param->string = NULL; + + return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), + pctx->copts, fc->log.log, '/'); +} + static int ceph_parse_mount_param(struct fs_context *fc, struct fs_parameter *param) { @@ -306,6 +411,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, param->string = NULL; break; case Opt_mds_namespace: + if (!namespace_equals(fsopt, param->string, strlen(param->string))) + return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = param->string; param->string = NULL; @@ -323,6 +430,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, if (fc->source) return invalfc(fc, "Multiple sources specified"); return ceph_parse_source(param, fc); + case Opt_mon_addr: + return ceph_parse_mon_addr(param, fc); case Opt_wsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_WRITE_SIZE) @@ -474,6 +583,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->mds_namespace); kfree(args->server_path); kfree(args->fscache_uniq); + kfree(args->mon_addr); kfree(args); } @@ -517,6 +627,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } @@ -572,9 +686,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) seq_puts(m, ",copyfrom"); - if (fsopt->mds_namespace) + /* dump mds_namespace when old device syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); @@ -1060,6 +1178,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; @@ -1071,6 +1190,8 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ac331aa07cfa..ec6b221e5b62 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -89,6 +89,8 @@ struct ceph_mount_options { unsigned int max_readdir; /* max readdir result (entries) */ unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + bool new_dev_syntax; + /* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options() @@ -98,6 +100,7 @@ struct ceph_mount_options { char *mds_namespace; /* default NULL */ char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ + char *mon_addr; }; struct ceph_fs_client { From 2167f2cc686a97911a0b06ba9c97cec304b7c432 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:53 +0530 Subject: [PATCH 069/149] ceph: record updated mon_addr on remount Note that the new monitors are just shown in /proc/mounts. Ceph does not (re)connect to new monitors yet. [ jlayton: s/printk\(KERN_NOTICE/pr_notice(/ s/strcmp/strcmp_null/ ] Signed-off-by: Venky Shankar Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e2c40d055711..31b5786cd98f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1277,6 +1277,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + } + sync_filesystem(fc->root->d_sb); return 0; } From e1b9eb50763d108166651ca67aae09893332c6b0 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:54 +0530 Subject: [PATCH 070/149] doc: document new CephFS mount device syntax Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- Documentation/filesystems/ceph.rst | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/ceph.rst b/Documentation/filesystems/ceph.rst index 7d2ef4e27273..4942e018db85 100644 --- a/Documentation/filesystems/ceph.rst +++ b/Documentation/filesystems/ceph.rst @@ -82,7 +82,7 @@ Mount Syntax The basic mount syntax is:: - # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt + # mount -t ceph user@fsid.fs_name=/[subdir] mnt -o mon_addr=monip1[:port][/monip2[:port]] You only need to specify a single monitor, as the client will get the full list when it connects. (However, if the monitor you specify @@ -90,16 +90,35 @@ happens to be down, the mount won't succeed.) The port can be left off if the monitor is using the default. So if the monitor is at 1.2.3.4:: - # mount -t ceph 1.2.3.4:/ /mnt/ceph + # mount -t ceph cephuser@07fe3187-00d9-42a3-814b-72a4d5e7d5be.cephfs=/ /mnt/ceph -o mon_addr=1.2.3.4 is sufficient. If /sbin/mount.ceph is installed, a hostname can be -used instead of an IP address. +used instead of an IP address and the cluster FSID can be left out +(as the mount helper will fill it in by reading the ceph configuration +file):: + # mount -t ceph cephuser@cephfs=/ /mnt/ceph -o mon_addr=mon-addr +Multiple monitor addresses can be passed by separating each address with a slash (`/`):: + + # mount -t ceph cephuser@cephfs=/ /mnt/ceph -o mon_addr=192.168.1.100/192.168.1.101 + +When using the mount helper, monitor address can be read from ceph +configuration file if available. Note that, the cluster FSID (passed as part +of the device string) is validated by checking it with the FSID reported by +the monitor. Mount Options ============= + mon_addr=ip_address[:port][/ip_address[:port]] + Monitor address to the cluster. This is used to bootstrap the + connection to the cluster. Once connection is established, the + monitor addresses in the monitor map are followed. + + fsid=cluster-id + FSID of the cluster (from `ceph fsid` command). + ip=A.B.C.D[:N] Specify the IP and/or port the client should bind to locally. There is normally not much reason to do this. If the IP is not From adbed05ed62d1f3b6f6c5cb88ec52c1ffafc0fd9 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 3 Nov 2021 10:30:39 +0530 Subject: [PATCH 071/149] ceph: mount syntax module parameter Add read-only module parameters for supported mount syntaxes. Primary user is the user-space mount helper for catching v2 syntax bugs during testing by cross verifying if the kernel supports v2 syntax on mount failure. Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 31b5786cd98f..2166fc2c1625 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1461,6 +1461,14 @@ bool disable_send_metrics = false; module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); + module_init(init_ceph); module_exit(exit_ceph); From 8e55ba8caae5cd380b1c9c81a426602a667e110e Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Wed, 10 Nov 2021 23:30:21 +0530 Subject: [PATCH 072/149] ceph: Fix incorrect statfs report for small quota Problem: The statfs reports incorrect free/available space for quota less then CEPH_BLOCK size (4M). Solution: For quota less than CEPH_BLOCK size, smaller block size of 4K is used. But if quota is less than 4K, it is decided to go with binary use/free of 4K block. For quota size less than 4K size, report the total=used=4K,free=0 when quota is full and total=free=4K,used=0 otherwise. Signed-off-by: Kotresh HR Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 14 ++++++++++++++ fs/ceph/super.h | 1 + 2 files changed, 15 insertions(+) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 620c691af40e..24ae13ea2241 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -494,10 +494,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) if (ci->i_max_bytes) { total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* For quota size less than 4MB, use 4KB block size */ + if (!total) { + total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } /* It is possible for a quota to be exceeded. * Report 'zero' in that case */ free = total > used ? total - used : 0; + /* For quota size less than 4KB, report the + * total=used=4KB,free=0 when quota is full + * and total=free=4KB, used=0 otherwise */ + if (!total) { + total = 1; + free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } } spin_unlock(&ci->i_ceph_lock); if (total) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ec6b221e5b62..533189b537b2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -32,6 +32,7 @@ * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ From 435a120a47eed0b3a1ac7b86cf1f7707bf2242ce Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 27 Nov 2021 17:21:04 +0000 Subject: [PATCH 073/149] rbd: make const pointer spaces a static const array Don't populate the const array spaces on the stack but make it static const and make the pointer an array to remove a dereference. Shrinks object code a little too. Also clean up intent, currently it is spaces and should be a tab. Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 909dbe6111bf..1bf1595420a8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6191,7 +6191,7 @@ static inline size_t next_token(const char **buf) * These are the characters that produce nonzero for * isspace() in the "C" and "POSIX" locales. */ - const char *spaces = " \f\n\r\t\v"; + static const char spaces[] = " \f\n\r\t\v"; *buf += strspn(*buf, spaces); /* Find start of token */ From af9ceae83cd26c9319bb2cdab23bb16d39300cbd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 26 Jan 2021 13:41:38 -0500 Subject: [PATCH 074/149] ceph: drop send metrics debug message This pops every second and isn't very useful. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/metric.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index c57699d8408d..0fcba68f9a99 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -160,8 +160,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("client%llu send metrics to mds%d\n", - ceph_client_gid(mdsc->fsc->client), s->s_mds); ceph_con_send(&s->s_con, msg); return true; From 0078ea3b0566e3da09ae8e1e4fbfd708702f2876 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Nov 2021 09:54:49 -0500 Subject: [PATCH 075/149] ceph: don't check for quotas on MDS stray dirs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 玮文 胡 reported seeing the WARN_RATELIMIT pop when writing to an inode that had been transplanted into the stray dir. The client was trying to look up the quotarealm info from the parent and that tripped the warning. Change the ceph_vino_is_reserved helper to not throw a warning for MDS stray directories (0x100 - 0x1ff), only for reserved dirs that are not in that range. Also, fix ceph_has_realms_with_quotas to return false when encountering a reserved inode. URL: https://tracker.ceph.com/issues/53180 Reported-by: Hu Weiwen Signed-off-by: Jeff Layton Reviewed-by: Luis Henriques Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 3 +++ fs/ceph/super.h | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 24ae13ea2241..a338a3ec0dc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) /* if root is the real CephFS root, we don't have quota realms */ if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; + /* MDS stray dirs have no quota realms */ + if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino)) + return false; /* otherwise, we can't know for sure */ return true; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 533189b537b2..5ec465a1350a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -539,19 +539,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) * * These come from src/mds/mdstypes.h in the ceph sources. */ -#define CEPH_MAX_MDS 0x100 -#define CEPH_NUM_STRAY 10 +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { - if (vino.ino < CEPH_INO_SYSTEM_BASE && - vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { - WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); - return true; - } - return false; + if (vino.ino >= CEPH_INO_SYSTEM_BASE || + vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) + return false; + + /* Don't warn on mdsdirs */ + WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, + "Attempt to access reserved inode number 0x%llx", + vino.ino); + return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, From 94cc0877cad0bc6ca84686c4fa874bf530eb8b88 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 30 Nov 2021 14:12:13 -0500 Subject: [PATCH 076/149] ceph: add new "nopagecache" option CephFS is a bit unlike most other filesystems in that it only conditionally does buffered I/O based on the caps that it gets from the MDS. In most cases, unless there is contended access for an inode the MDS does give Fbc caps to the client, so the unbuffered codepaths are only infrequently traveled and are difficult to test. At one time, the "-o sync" mount option would give you this behavior, but that was removed in commit 7ab9b3807097 ("ceph: Don't use ceph-sync-mode for synchronous-fs."). Add a new mount option to tell the client to ignore Fbc caps when doing I/O, and to use the synchronous codepaths exclusively, even on non-O_DIRECT file descriptors. We already have an ioctl that forces this behavior on a per-file basis, so we can just always set the CEPH_F_SYNC flag in the file description on such mounts. Additionally, this patch also changes the client to not request Fbc when doing direct I/O. We aren't using the cache with O_DIRECT so we don't have any need for those caps. Signed-off-by: Jeff Layton Acked-by: Greg Farnum Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 24 +++++++++++++++--------- fs/ceph/super.c | 11 +++++++++++ fs/ceph/super.h | 1 + 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c138e8126286..7de5db51c3d0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (!fi) return -ENOMEM; + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + file->private_data = fi; } @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; - int want, got = 0; + int want = 0, got = 0; int retry_op = 0, read = 0; again: @@ -1551,13 +1556,14 @@ again: else ceph_start_io_read(inode); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; + want |= CEPH_CAP_FILE_LAZYIO; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; + int err, want = 0, got; bool direct_lock = false; u32 map_flags; u64 pool_flags; @@ -1766,10 +1772,10 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2166fc2c1625..eb255d24e321 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -160,6 +160,7 @@ enum { Opt_quotadf, Opt_copyfrom, Opt_wsync, + Opt_pagecache, }; enum ceph_recover_session_mode { @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), {} }; @@ -564,6 +566,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; + case Opt_pagecache: + if (result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; + break; default: BUG(); } @@ -699,6 +707,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5ec465a1350a..0d489a973d5e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -46,6 +46,7 @@ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ +#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ From 76bdbc7ac777adb6bc316bfe3f57b3de93c50985 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 6 Jan 2022 09:35:52 +0800 Subject: [PATCH 077/149] ceph: remove redundant Lsx caps check The newcaps has already included the Ls, no need to check it again. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c447fa2e2d1f..62448691608f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3375,8 +3375,7 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); - if (inode->i_nlink == 0 && - (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + if (inode->i_nlink == 0) deleted_inode = true; } From a0b3a15eab6bc2e90008460b646d53e7d9dcdbbb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:28:33 -0500 Subject: [PATCH 078/149] ceph: move CEPH_SUPER_MAGIC definition to magic.h The uapi headers are missing the ceph definition. Move it there so userland apps can ID cephfs. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 2 ++ fs/ceph/super.h | 3 --- include/uapi/linux/magic.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eb255d24e321..dbcf9b743c88 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,8 @@ #include #include +#include + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0d489a973d5e..dc9fe7ed3fa8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -25,9 +25,6 @@ #include #endif -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 35687dcb1a42..53a3c20394cf 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -6,6 +6,7 @@ #define AFFS_SUPER_MAGIC 0xadff #define AFS_SUPER_MAGIC 0x5346414F #define AUTOFS_SUPER_MAGIC 0x0187 +#define CEPH_SUPER_MAGIC 0x00c36400 #define CODA_SUPER_MAGIC 0x73757245 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ From d7b4303411025b675e8c0927307c578a0ce3f75c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 12 Jan 2022 23:55:33 +0000 Subject: [PATCH 079/149] atm: iphase: remove redundant pointer skb The pointer skb is redundant, it is assigned a value that is never read and hence can be removed. Cleans up clang scan warning: drivers/atm/iphase.c:205:18: warning: Although the value stored to 'skb' is used in the enclosing expression, the value is never actually read from 'skb' [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/atm/iphase.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index bc8e8d9f176b..3e726ee91fdc 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -178,7 +178,6 @@ static void ia_hack_tcq(IADEV *dev) { static u16 get_desc (IADEV *dev, struct ia_vcc *iavcc) { u_short desc_num, i; - struct sk_buff *skb; struct ia_vcc *iavcc_r = NULL; unsigned long delta; static unsigned long timer = 0; @@ -202,8 +201,7 @@ static u16 get_desc (IADEV *dev, struct ia_vcc *iavcc) { else dev->ffL.tcq_rd -= 2; *(u_short *)(dev->seg_ram + dev->ffL.tcq_rd) = i+1; - if (!(skb = dev->desc_tbl[i].txskb) || - !(iavcc_r = dev->desc_tbl[i].iavcc)) + if (!dev->desc_tbl[i].txskb || !(iavcc_r = dev->desc_tbl[i].iavcc)) printk("Fatal err, desc table vcc or skb is NULL\n"); else iavcc_r->vc_desc_cnt--; From 33cb0ff30cff104e753f7882c99e54cf67ea7903 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 12 Jan 2022 22:21:27 +0200 Subject: [PATCH 080/149] net: mscc: ocelot: don't let phylink re-enable TX PAUSE on the NPI port Since commit b39648079db4 ("net: mscc: ocelot: disable flow control on NPI interface"), flow control should be disabled on the DSA CPU port when used in NPI mode. However, the commit blamed in the Fixes: tag below broke this, because it allowed felix_phylink_mac_link_up() to overwrite SYS_PAUSE_CFG_PAUSE_ENA for the DSA CPU port. This issue became noticeable since the device tree update from commit 8fcea7be5736 ("arm64: dts: ls1028a: mark internal links between Felix and ENETC as capable of flow control"). The solution is to check whether this is the currently configured NPI port from ocelot_phylink_mac_link_up(), and to not modify the statically disabled PAUSE frame transmission if it is. When the port is configured for lossless mode as opposed to tail drop mode, but the link partner (DSA master) doesn't observe the transmitted PAUSE frames, the switch termination throughput is much worse, as can be seen below. Before: root@debian:~# iperf3 -c 192.168.100.2 Connecting to host 192.168.100.2, port 5201 [ 5] local 192.168.100.1 port 37504 connected to 192.168.100.2 port 5201 [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 28.4 MBytes 238 Mbits/sec 357 22.6 KBytes [ 5] 1.00-2.00 sec 33.6 MBytes 282 Mbits/sec 426 19.8 KBytes [ 5] 2.00-3.00 sec 34.0 MBytes 285 Mbits/sec 343 21.2 KBytes [ 5] 3.00-4.00 sec 32.9 MBytes 276 Mbits/sec 354 22.6 KBytes [ 5] 4.00-5.00 sec 32.3 MBytes 271 Mbits/sec 297 18.4 KBytes ^C[ 5] 5.00-5.06 sec 2.05 MBytes 270 Mbits/sec 45 19.8 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-5.06 sec 163 MBytes 271 Mbits/sec 1822 sender [ 5] 0.00-5.06 sec 0.00 Bytes 0.00 bits/sec receiver After: root@debian:~# iperf3 -c 192.168.100.2 Connecting to host 192.168.100.2, port 5201 [ 5] local 192.168.100.1 port 49470 connected to 192.168.100.2 port 5201 [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 112 MBytes 941 Mbits/sec 259 143 KBytes [ 5] 1.00-2.00 sec 110 MBytes 920 Mbits/sec 329 144 KBytes [ 5] 2.00-3.00 sec 112 MBytes 936 Mbits/sec 255 144 KBytes [ 5] 3.00-4.00 sec 110 MBytes 927 Mbits/sec 355 105 KBytes [ 5] 4.00-5.00 sec 110 MBytes 926 Mbits/sec 350 156 KBytes [ 5] 5.00-6.00 sec 110 MBytes 925 Mbits/sec 305 148 KBytes [ 5] 6.00-7.00 sec 110 MBytes 924 Mbits/sec 320 143 KBytes [ 5] 7.00-8.00 sec 110 MBytes 925 Mbits/sec 273 97.6 KBytes [ 5] 8.00-9.00 sec 109 MBytes 913 Mbits/sec 299 141 KBytes [ 5] 9.00-10.00 sec 110 MBytes 922 Mbits/sec 287 146 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-10.00 sec 1.08 GBytes 926 Mbits/sec 3032 sender [ 5] 0.00-10.00 sec 1.08 GBytes 925 Mbits/sec receiver Fixes: de274be32cb2 ("net: dsa: felix: set TX flow control according to the phylink_mac_link_up resolution") Reported-by: Xiaoliang Yang Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index b1311b656e17..455293aa6343 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -771,7 +771,10 @@ void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port, ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, port); - ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, tx_pause); + /* Don't attempt to send PAUSE frames on the NPI port, it's broken */ + if (port != ocelot->npi) + ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, + tx_pause); /* Undo the effects of ocelot_phylink_mac_link_down: * enable MAC module From de0e444706ed98f2a13275a988d8595184be6d5c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 13 Jan 2022 15:28:59 +0800 Subject: [PATCH 081/149] kselftests/net: adapt the timeout to the largest runtime timeout in settings is used by each case under the same directory, so it should adapt to the maximum runtime. A normally running net/fib_nexthops.sh may be killed by this unsuitable timeout. Furthermore, since the defect[1] of kselftests framework, net/fib_nexthops.sh which might take at least (300 * 4) seconds would block the whole kselftests framework previously. $ git grep -w 'sleep 300' tools/testing/selftests/net tools/testing/selftests/net/fib_nexthops.sh: sleep 300 tools/testing/selftests/net/fib_nexthops.sh: sleep 300 tools/testing/selftests/net/fib_nexthops.sh: sleep 300 tools/testing/selftests/net/fib_nexthops.sh: sleep 300 Enlarge the timeout by plus 300 based on the obvious largest runtime to avoid the blocking. [1]: https://www.spinics.net/lists/kernel/msg4185370.html Signed-off-by: Zhou Jie Signed-off-by: Li Zhijian Signed-off-by: David S. Miller --- tools/testing/selftests/net/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/settings b/tools/testing/selftests/net/settings index 694d70710ff0..dfc27cdc6c05 100644 --- a/tools/testing/selftests/net/settings +++ b/tools/testing/selftests/net/settings @@ -1 +1 @@ -timeout=300 +timeout=1500 From 61f434b0280ed65495831f1b6e1a5c21a90f47c6 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 13 Jan 2022 16:36:40 +0800 Subject: [PATCH 082/149] net/smc: Resolve the race between link group access and termination We encountered some crashes caused by the race between the access and the termination of link groups. Here are some of panic stacks we met: 1) Race between smc_clc_wait_msg() and __smc_lgr_terminate() BUG: kernel NULL pointer dereference, address: 00000000000002f0 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_wait_msg+0x3eb/0x5c0 [smc] Call Trace: ? smc_clc_send_accept+0x45/0xa0 [smc] ? smc_clc_send_accept+0x45/0xa0 [smc] smc_listen_work+0x783/0x1220 [smc] ? finish_task_switch+0xc4/0x2e0 ? process_one_work+0x1ad/0x3c0 process_one_work+0x1ad/0x3c0 worker_thread+0x4c/0x390 ? rescuer_thread+0x320/0x320 kthread+0x149/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 smc_listen_work() abnormal case like port error --------------------------------------------------------------- | __smc_lgr_terminate() | |- smc_conn_kill() | |- smc_lgr_unregister_conn() | |- set conn->lgr = NULL smc_clc_wait_msg() | |- access conn->lgr (panic) | 2) Race between smc_setsockopt() and __smc_lgr_terminate() BUG: kernel NULL pointer dereference, address: 00000000000002e8 RIP: 0010:smc_setsockopt+0x17a/0x280 [smc] Call Trace: __sys_setsockopt+0xfc/0x190 __x64_sys_setsockopt+0x20/0x30 do_syscall_64+0x34/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae smc_setsockopt() abnormal case like port error -------------------------------------------------------------- | __smc_lgr_terminate() | |- smc_conn_kill() | |- smc_lgr_unregister_conn() | |- set conn->lgr = NULL mod_delayed_work() | |- access conn->lgr (panic) | There are some other panic places and they are caused by the similar reason as described above, which is accessing link group after termination, thus getting a NULL pointer or invalid resource. Currently, there seems to be no synchronization between the link group access and a sudden termination of it. This patch tries to fix this by introducing reference count of link group and not freeing link group until reference count is zero. Link group might be referred to by links or smc connections. So the operation to the link group reference count can be concluded as follows: object [hold or initialized as 1] [put] ------------------------------------------------------------------- link group smc_lgr_create() smc_lgr_free() connections smc_conn_create() smc_conn_free() links smcr_link_init() smcr_link_clear() Througth this way, we extend the life cycle of link group and ensure it is longer than the life cycle of connections and links above it, so that avoid invalid access to link group after its termination. Signed-off-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/smc.h | 1 + net/smc/smc_core.c | 60 +++++++++++++++++++++++++++++++++++++--------- net/smc/smc_core.h | 3 +++ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 1a4fc1c6c4ab..3d0b8e300deb 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -221,6 +221,7 @@ struct smc_connection { */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 freed : 1; /* normal termiation */ u8 out_of_sync : 1; /* out of sync with peer */ }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8935ef4811b0..fcb5e15bfa95 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -218,7 +218,6 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - conn->lgr = NULL; } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) @@ -752,6 +751,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; + smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); @@ -806,6 +806,7 @@ out: lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold above */ return rc; } @@ -844,6 +845,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; + refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ mutex_init(&lgr->sndbufs_lock); mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -1130,8 +1132,19 @@ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!lgr) + if (!lgr || conn->freed) + /* Connection has never been registered in a + * link group, or has already been freed. + */ return; + + conn->freed = 1; + if (!conn->alert_token_local) + /* Connection has already unregistered from + * link group. + */ + goto lgr_put; + if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); @@ -1148,6 +1161,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); +lgr_put: + smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } /* unregister a link from a buf_desc */ @@ -1206,9 +1221,10 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { + struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) + if (!lgr || lnk->state == SMC_LNK_UNUSED) return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); @@ -1226,6 +1242,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log) lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -1290,6 +1307,21 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) __smc_lgr_free_bufs(lgr, true); } +/* won't be freed until no one accesses to lgr anymore */ +static void __smc_lgr_free(struct smc_link_group *lgr) +{ + smc_lgr_free_bufs(lgr); + if (lgr->is_smcd) { + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { + smc_wr_free_lgr_mem(lgr); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); + } + kfree(lgr); +} + /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { @@ -1305,19 +1337,23 @@ static void smc_lgr_free(struct smc_link_group *lgr) smc_llc_lgr_clear(lgr); } - smc_lgr_free_bufs(lgr); destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); - if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) - wake_up(&lgr->smcd->lgrs_deleted); - } else { - smc_wr_free_lgr_mem(lgr); - if (!atomic_dec_return(&lgr_cnt)) - wake_up(&lgrs_deleted); } - kfree(lgr); + smc_lgr_put(lgr); /* theoretically last lgr_put */ +} + +void smc_lgr_hold(struct smc_link_group *lgr) +{ + refcount_inc(&lgr->refcnt); +} + +void smc_lgr_put(struct smc_link_group *lgr) +{ + if (refcount_dec_and_test(&lgr->refcnt)) + __smc_lgr_free(lgr); } static void smc_sk_wake_ups(struct smc_sock *smc) @@ -1856,6 +1892,8 @@ create: goto out; } } + smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ + conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 521c64a3d8d3..f8b5397a9dbe 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -249,6 +249,7 @@ struct smc_link_group { u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ + refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; @@ -487,6 +488,8 @@ struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); +void smc_lgr_hold(struct smc_link_group *lgr); +void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, From 91341fa0003befd097e190ec2a4bf63ad957c49a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Jan 2022 01:22:29 -0800 Subject: [PATCH 083/149] inet: frags: annotate races around fqdir->dead and fqdir->high_thresh Both fields can be read/written without synchronization, add proper accessors and documentation. Fixes: d5dd88794a13 ("inet: fix various use-after-free in defrags units") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 11 +++++++++-- include/net/ipv6_frag.h | 3 ++- net/ipv4/inet_fragment.c | 8 +++++--- net/ipv4/ip_fragment.c | 3 ++- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 48cc5795ceda..63540be0fc34 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -117,8 +117,15 @@ int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); static inline void fqdir_pre_exit(struct fqdir *fqdir) { - fqdir->high_thresh = 0; /* prevent creation of new frags */ - fqdir->dead = true; + /* Prevent creation of new frags. + * Pairs with READ_ONCE() in inet_frag_find(). + */ + WRITE_ONCE(fqdir->high_thresh, 0); + + /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() + * and ip6frag_expire_frag_queue(). + */ + WRITE_ONCE(fqdir->dead, true); } void fqdir_exit(struct fqdir *fqdir); diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 851029ecff13..0a4779175a52 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -67,7 +67,8 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) struct sk_buff *head; rcu_read_lock(); - if (fq->q.fqdir->dead) + /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ + if (READ_ONCE(fq->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&fq->q.lock); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 05cd198d7a6b..341096807100 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -235,9 +235,9 @@ void inet_frag_kill(struct inet_frag_queue *fq) /* The RCU read lock provides a memory barrier * guaranteeing that if fqdir->dead is false then * the hash table destruction will not start until - * after we unlock. Paired with inet_frags_exit_net(). + * after we unlock. Paired with fqdir_pre_exit(). */ - if (!fqdir->dead) { + if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); refcount_dec(&fq->refcnt); @@ -352,9 +352,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { + /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ + long high_thresh = READ_ONCE(fqdir->high_thresh); struct inet_frag_queue *fq = NULL, *prev; - if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh) + if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; rcu_read_lock(); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cfeb8890f94e..fad803d2d711 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -144,7 +144,8 @@ static void ip_expire(struct timer_list *t) rcu_read_lock(); - if (qp->q.fqdir->dead) + /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ + if (READ_ONCE(qp->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&qp->q.lock); From ea89c6c0983c39702a4a52ccaa4702e0cb71179b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 13 Jan 2022 16:36:41 +0800 Subject: [PATCH 084/149] net/smc: Introduce a new conn->lgr validity check helper It is no longer suitable to identify whether a smc connection is registered in a link group through checking if conn->lgr is NULL, because conn->lgr won't be reset even the connection is unregistered from a link group. So this patch introduces a new helper smc_conn_lgr_valid() and replaces all the check of conn->lgr in original implementation with the new helper to judge if conn->lgr is valid to use. Signed-off-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 +++++- net/smc/smc_cdc.c | 3 ++- net/smc/smc_clc.c | 2 +- net/smc/smc_core.c | 14 ++++++++------ net/smc/smc_core.h | 5 +++++ net/smc/smc_diag.c | 6 +++--- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index aa3bcaaeabf7..961854e56736 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -634,9 +634,13 @@ static void smc_conn_abort(struct smc_sock *smc, int local_first) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; + bool lgr_valid = false; + + if (smc_conn_lgr_valid(conn)) + lgr_valid = true; smc_conn_free(conn); - if (local_first) + if (local_first && lgr_valid) smc_lgr_cleanup_early(lgr); } diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 84c8a4374fdd..9d5a97168969 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -197,7 +197,8 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; - if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + if (!smc_conn_lgr_valid(conn) || + (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) return -EPIPE; if (conn->lgr->is_smcd) { diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 6be95a2a7b25..ce27399b38b1 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -774,7 +774,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? SMC_FIRST_CONTACT_MASK : 0; - if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) && + if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) && smc_ib_is_valid_local_systemid()) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fcb5e15bfa95..2096841d91ef 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -211,7 +211,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!lgr) + if (!smc_conn_lgr_valid(conn)) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { @@ -1139,7 +1139,7 @@ void smc_conn_free(struct smc_connection *conn) return; conn->freed = 1; - if (!conn->alert_token_local) + if (!smc_conn_lgr_valid(conn)) /* Connection has already unregistered from * link group. */ @@ -2278,14 +2278,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -2294,7 +2296,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; - if (!conn->lgr || conn->lgr->is_smcd) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) @@ -2308,7 +2310,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { int i; - if (!conn->lgr || conn->lgr->is_smcd) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f8b5397a9dbe..599455d00020 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -410,6 +410,11 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline bool smc_conn_lgr_valid(struct smc_connection *conn) +{ + return conn->lgr && conn->alert_token_local; +} + /* * Returns true if the specified link is usable. * diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 7c8dad28c18d..b8898c787d23 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -89,7 +89,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; - else if (smc->conn.lgr && smc->conn.lgr->is_smcd) + else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; @@ -142,7 +142,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; @@ -164,7 +164,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } - if (smc->conn.lgr && smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; From 20c9398d3309d170300d67643b851fd26783af24 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 13 Jan 2022 16:36:42 +0800 Subject: [PATCH 085/149] net/smc: Resolve the race between SMC-R link access and clear We encountered some crashes caused by the race between SMC-R link access and link clear that triggered by abnormal link group termination, such as port error. Here is an example of this kind of crashes: BUG: kernel NULL pointer dereference, address: 0000000000000000 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_llc_flow_initiate+0x44/0x190 [smc] Call Trace: ? __smc_buf_create+0x75a/0x950 [smc] smcr_lgr_reg_rmbs+0x2a/0xbf [smc] smc_listen_work+0xf72/0x1230 [smc] ? process_one_work+0x25c/0x600 process_one_work+0x25c/0x600 worker_thread+0x4f/0x3a0 ? process_one_work+0x600/0x600 kthread+0x15d/0x1a0 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 smc_listen_work() __smc_lgr_terminate() --------------------------------------------------------------- | smc_lgr_free() | |- smcr_link_clear() | |- memset(lnk, 0) smc_listen_rdma_reg() | |- smcr_lgr_reg_rmbs() | |- smc_llc_flow_initiate() | |- access lnk->lgr (panic) | These crashes are similarly caused by clearing SMC-R link resources when some functions is still accessing to them. This patch tries to fix the issue by introducing reference count of SMC-R links and ensuring that the sensitive resources of links won't be cleared until reference count reaches zero. The operation to the SMC-R link reference count can be concluded as follows: object [hold or initialized as 1] [put] -------------------------------------------------------------------- links smcr_link_init() smcr_link_clear() connections smc_conn_create() smc_conn_free() Through this way, the clear of SMC-R links is later than the free of all the smc connections above it, thus avoiding the unsafe reference to SMC-R links. Signed-off-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 52 +++++++++++++++++++++++++++++++++++----------- net/smc/smc_core.h | 4 ++++ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2096841d91ef..7199eeeb1667 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -748,6 +748,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); + refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ + lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; @@ -998,8 +1000,12 @@ void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); + /* link_hold in smc_conn_create() */ + smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); + /* link_put in smc_conn_free() */ + smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, @@ -1162,6 +1168,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: + if (!lgr->is_smcd) + smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } @@ -1218,22 +1226,11 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } -/* must be called under lgr->llc_conf_mutex lock */ -void smcr_link_clear(struct smc_link *lnk, bool log) +static void __smcr_link_clear(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; - if (!lgr || lnk->state == SMC_LNK_UNUSED) - return; - lnk->peer_qpn = 0; - smc_llc_link_clear(lnk, log); - smcr_buf_unmap_lgr(lnk); - smcr_rtoken_clear_link(lnk); - smc_ib_modify_qp_error(lnk); - smc_wr_free_link(lnk); - smc_ib_destroy_queue_pair(lnk); - smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); @@ -1245,6 +1242,35 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_clear(struct smc_link *lnk, bool log) +{ + if (!lnk->lgr || lnk->clearing || + lnk->state == SMC_LNK_UNUSED) + return; + lnk->clearing = 1; + lnk->peer_qpn = 0; + smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); + smc_ib_modify_qp_error(lnk); + smc_wr_free_link(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); + smcr_link_put(lnk); /* theoretically last link_put */ +} + +void smcr_link_hold(struct smc_link *lnk) +{ + refcount_inc(&lnk->refcnt); +} + +void smcr_link_put(struct smc_link *lnk) +{ + if (refcount_dec_and_test(&lnk->refcnt)) + __smcr_link_clear(lnk); +} + static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { @@ -1893,6 +1919,8 @@ create: } } smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ + if (!conn->lgr->is_smcd) + smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 599455d00020..4cb03e942364 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -137,6 +137,8 @@ struct smc_link { u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ + u8 clearing : 1; /* link is being cleared */ + refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ @@ -526,6 +528,8 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); +void smcr_link_hold(struct smc_link *lnk); +void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); From a6fadfd757ceb2f206948b1b54b9c13e77292644 Mon Sep 17 00:00:00 2001 From: Kyoungkyu Park Date: Thu, 13 Jan 2022 13:17:08 +0900 Subject: [PATCH 086/149] net: qmi_wwan: Add Hucom Wireless HM-211S/K MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Hucom Wireless HM-211S/K is an LTE module based on Qualcomm MDM9207. This module supports LTE Band 1, 3, 5, 7, 8 and WCDMA Band 1. Manual testing showed that only interface number two replies to QMI messages. T: Bus=01 Lev=02 Prnt=02 Port=01 Cnt=01 Dev#= 3 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=22de ProdID=9051 Rev= 3.18 S: Manufacturer=Android S: Product=Android S: SerialNumber=0123456789ABCDEF C:* #Ifs= 4 Cfg#= 1 Atr=80 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=85(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Kyoungkyu Park Acked-by: Bjørn Mork Link: https://lore.kernel.org/r/Yd+nxAA6KorDpQFv@choryu-tfx5470h Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 0d5bc26c9f2e..37e5f3495362 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1402,6 +1402,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x413c, 0x81e0, 0)}, /* Dell Wireless 5821e with eSIM support*/ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)}, /* HP lt4120 Snapdragon X5 LTE */ + {QMI_QUIRK_SET_DTR(0x22de, 0x9051, 2)}, /* Hucom Wireless HM-211S/K */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_QUIRK_SET_DTR(0x1e0e, 0x9001, 5)}, /* SIMCom 7100E, 7230E, 7600E ++ */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ From fb80445c438c78b40b547d12b8d56596ce4ccfeb Mon Sep 17 00:00:00 2001 From: Kevin Bracey Date: Wed, 12 Jan 2022 19:02:10 +0200 Subject: [PATCH 087/149] net_sched: restore "mpu xxx" handling commit 56b765b79e9a ("htb: improved accuracy at high rates") broke "overhead X", "linklayer atm" and "mpu X" attributes. "overhead X" and "linklayer atm" have already been fixed. This restores the "mpu X" handling, as might be used by DOCSIS or Ethernet shaping: tc class add ... htb rate X overhead 4 mpu 64 The code being fixed is used by htb, tbf and act_police. Cake has its own mpu handling. qdisc_calculate_pkt_len still uses the size table containing values adjusted for mpu by user space. iproute2 tc has always passed mpu into the kernel via a tc_ratespec structure, but the kernel never directly acted on it, merely stored it so that it could be read back by `tc class show`. Rather, tc would generate length-to-time tables that included the mpu (and linklayer) in their construction, and the kernel used those tables. Since v3.7, the tables were no longer used. Along with "mpu", this also broke "overhead" and "linklayer" which were fixed in 01cb71d2d47b ("net_sched: restore "overhead xxx" handling", v3.10) and 8a8e3d84b171 ("net_sched: restore "linklayer atm" handling", v3.11). "overhead" was fixed by simply restoring use of tc_ratespec::overhead - this had originally been used by the kernel but was initially omitted from the new non-table-based calculations. "linklayer" had been handled in the table like "mpu", but the mode was not originally passed in tc_ratespec. The new implementation was made to handle it by getting new versions of tc to pass the mode in an extended tc_ratespec, and for older versions of tc the table contents were analysed at load time to deduce linklayer. As "mpu" has always been given to the kernel in tc_ratespec, accompanying the mpu-based table, we can restore system functionality with no userspace change by making the kernel act on the tc_ratespec value. Fixes: 56b765b79e9a ("htb: improved accuracy at high rates") Signed-off-by: Kevin Bracey Cc: Eric Dumazet Cc: Jiri Pirko Cc: Vimalkumar Link: https://lore.kernel.org/r/20220112170210.1014351-1-kevin@bracey.fi Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 5 +++++ net/sched/sch_generic.c | 1 + 2 files changed, 6 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c11dbac5abb2..472843eedbae 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1244,6 +1244,7 @@ struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; + u16 mpu; u8 linklayer; u8 shift; }; @@ -1253,6 +1254,9 @@ static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, { len += r->overhead; + if (len < r->mpu) + len = r->mpu; + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; @@ -1275,6 +1279,7 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->rate = min_t(u64, r->rate_bytes_ps, ~0U); res->overhead = r->overhead; + res->mpu = r->mpu; res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index b07bd1c7330f..f893d9a81b01 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1529,6 +1529,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; + r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); From 9deb48b53e7f4056c2eaa2dc2ee3338df619e4f6 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Thu, 13 Jan 2022 22:46:07 +0300 Subject: [PATCH 088/149] bcmgenet: add WOL IRQ check The driver neglects to check the result of platform_get_irq_optional()'s call and blithely passes the negative error codes to devm_request_irq() (which takes *unsigned* IRQ #), causing it to fail with -EINVAL. Stop calling devm_request_irq() with the invalid IRQ #s. Fixes: 8562056f267d ("net: bcmgenet: request Wake-on-LAN interrupt") Signed-off-by: Sergey Shtylyov Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 226f4403cfed..87f1056e29ff 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -4020,10 +4020,12 @@ static int bcmgenet_probe(struct platform_device *pdev) /* Request the WOL interrupt and advertise suspend if available */ priv->wol_irq_disabled = true; - err = devm_request_irq(&pdev->dev, priv->wol_irq, bcmgenet_wol_isr, 0, - dev->name, priv); - if (!err) - device_set_wakeup_capable(&pdev->dev, 1); + if (priv->wol_irq > 0) { + err = devm_request_irq(&pdev->dev, priv->wol_irq, + bcmgenet_wol_isr, 0, dev->name, priv); + if (!err) + device_set_wakeup_capable(&pdev->dev, 1); + } /* Set the needed headroom to account for any possible * features enabling/disabling at runtime From 0bf3885324a8599e3af4c7379b8d4f621c9bbffa Mon Sep 17 00:00:00 2001 From: Markus Reichl Date: Thu, 13 Jan 2022 21:01:11 +0100 Subject: [PATCH 089/149] net: usb: Correct reset handling of smsc95xx On boards with LAN9514 and no preconfigured MAC address we don't get an ip address from DHCP after commit a049a30fc27c ("net: usb: Correct PHY handling of smsc95xx") anymore. Adding an explicit reset before starting the phy fixes the issue. [1] https://lore.kernel.org/netdev/199eebbd6b97f52b9119c9fa4fd8504f8a34de18.camel@collabora.com/ From: Gabriel Hojda Fixes: a049a30fc27c ("net: usb: Correct PHY handling of smsc95xx") Signed-off-by: Gabriel Hojda Signed-off-by: Markus Reichl Tested-by: Alexander Stein Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index abe0149ed917..bc1e3dd67c04 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1962,7 +1962,8 @@ static const struct driver_info smsc95xx_info = { .bind = smsc95xx_bind, .unbind = smsc95xx_unbind, .link_reset = smsc95xx_link_reset, - .reset = smsc95xx_start_phy, + .reset = smsc95xx_reset, + .check_connect = smsc95xx_start_phy, .stop = smsc95xx_stop, .rx_fixup = smsc95xx_rx_fixup, .tx_fixup = smsc95xx_tx_fixup, From 2255634100bffe0abc1a300f20aee26980891ba4 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 14 Jan 2022 11:02:46 +0800 Subject: [PATCH 090/149] kselftests/net: list all available tests in usage() So that users can run/query them easily. $ ./fcnal-test.sh -h usage: fcnal-test.sh OPTS -4 IPv4 tests only -6 IPv6 tests only -t Test name/set to run -p Pause on fail -P Pause after each test -v Be verbose Tests: ipv4_ping ipv4_tcp ipv4_udp ipv4_bind ipv4_runtime ipv4_netfilter ipv6_ping ipv6_tcp ipv6_udp ipv6_bind ipv6_runtime ipv6_netfilter use_cases Suggested-by: David Ahern Signed-off-by: Li Zhijian Signed-off-by: David S. Miller --- tools/testing/selftests/net/fcnal-test.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 412d85205546..3f4c8cfe7aca 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -4059,6 +4059,9 @@ usage: ${0##*/} OPTS -p Pause on fail -P Pause after each test -v Be verbose + +Tests: + $TESTS_IPV4 $TESTS_IPV6 $TESTS_OTHER EOF } From 6c8dc12cd925e5fa8c152633338b2b35c4c89258 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jan 2022 14:12:52 +1100 Subject: [PATCH 091/149] net: apple: mace: Fix build since dev_addr constification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit adeef3e32146 ("net: constify netdev->dev_addr") the mace driver no longer builds with various errors (pmac32_defconfig): linux/drivers/net/ethernet/apple/mace.c: In function ‘mace_probe’: linux/drivers/net/ethernet/apple/mace.c:170:20: error: assignment of read-only location ‘*(dev->dev_addr + (sizetype)j)’ 170 | dev->dev_addr[j] = rev ? bitrev8(addr[j]): addr[j]; | ^ linux/drivers/net/ethernet/apple/mace.c: In function ‘mace_reset’: linux/drivers/net/ethernet/apple/mace.c:349:32: warning: passing argument 2 of ‘__mace_set_address’ discards ‘const’ qualifier from pointer target type 349 | __mace_set_address(dev, dev->dev_addr); | ~~~^~~~~~~~~~ linux/drivers/net/ethernet/apple/mace.c:93:62: note: expected ‘void *’ but argument is of type ‘const unsigned char *’ 93 | static void __mace_set_address(struct net_device *dev, void *addr); | ~~~~~~^~~~ linux/drivers/net/ethernet/apple/mace.c: In function ‘__mace_set_address’: linux/drivers/net/ethernet/apple/mace.c:388:36: error: assignment of read-only location ‘*(dev->dev_addr + (sizetype)i)’ 388 | out_8(&mb->padr, dev->dev_addr[i] = p[i]); | ^ Fix it by making the modifications to a local macaddr variable and then passing that to eth_hw_addr_set(), as well as adding some missing const qualifiers. Signed-off-by: Michael Ellerman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/apple/mace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/apple/mace.c b/drivers/net/ethernet/apple/mace.c index 4b80e3a52a19..6f8c91eb1263 100644 --- a/drivers/net/ethernet/apple/mace.c +++ b/drivers/net/ethernet/apple/mace.c @@ -90,7 +90,7 @@ static void mace_set_timeout(struct net_device *dev); static void mace_tx_timeout(struct timer_list *t); static inline void dbdma_reset(volatile struct dbdma_regs __iomem *dma); static inline void mace_clean_rings(struct mace_data *mp); -static void __mace_set_address(struct net_device *dev, void *addr); +static void __mace_set_address(struct net_device *dev, const void *addr); /* * If we can't get a skbuff when we need it, we use this area for DMA. @@ -112,6 +112,7 @@ static int mace_probe(struct macio_dev *mdev, const struct of_device_id *match) struct net_device *dev; struct mace_data *mp; const unsigned char *addr; + u8 macaddr[ETH_ALEN]; int j, rev, rc = -EBUSY; if (macio_resource_count(mdev) != 3 || macio_irq_count(mdev) != 3) { @@ -167,8 +168,9 @@ static int mace_probe(struct macio_dev *mdev, const struct of_device_id *match) rev = addr[0] == 0 && addr[1] == 0xA0; for (j = 0; j < 6; ++j) { - dev->dev_addr[j] = rev ? bitrev8(addr[j]): addr[j]; + macaddr[j] = rev ? bitrev8(addr[j]): addr[j]; } + eth_hw_addr_set(dev, macaddr); mp->chipid = (in_8(&mp->mace->chipid_hi) << 8) | in_8(&mp->mace->chipid_lo); @@ -369,11 +371,12 @@ static void mace_reset(struct net_device *dev) out_8(&mb->plscc, PORTSEL_GPSI + ENPLSIO); } -static void __mace_set_address(struct net_device *dev, void *addr) +static void __mace_set_address(struct net_device *dev, const void *addr) { struct mace_data *mp = netdev_priv(dev); volatile struct mace __iomem *mb = mp->mace; - unsigned char *p = addr; + const unsigned char *p = addr; + u8 macaddr[ETH_ALEN]; int i; /* load up the hardware address */ @@ -385,7 +388,10 @@ static void __mace_set_address(struct net_device *dev, void *addr) ; } for (i = 0; i < 6; ++i) - out_8(&mb->padr, dev->dev_addr[i] = p[i]); + out_8(&mb->padr, macaddr[i] = p[i]); + + eth_hw_addr_set(dev, macaddr); + if (mp->chipid != BROKEN_ADDRCHG_REV) out_8(&mb->iac, 0); } From ea938248557a52e231a31f338eac4baee36a8626 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jan 2022 14:13:16 +1100 Subject: [PATCH 092/149] net: apple: bmac: Fix build since dev_addr constification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit adeef3e32146 ("net: constify netdev->dev_addr") the bmac driver no longer builds with the following errors (pmac32_defconfig): linux/drivers/net/ethernet/apple/bmac.c: In function ‘bmac_probe’: linux/drivers/net/ethernet/apple/bmac.c:1287:20: error: assignment of read-only location ‘*(dev->dev_addr + (sizetype)j)’ 1287 | dev->dev_addr[j] = rev ? bitrev8(addr[j]): addr[j]; | ^ Fix it by making the modifications to a local macaddr variable and then passing that to eth_hw_addr_set(). We don't use the existing addr variable because the bitrev8() would mutate it, but it is already used unreversed later in the function. Signed-off-by: Michael Ellerman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/apple/bmac.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c index 9a650d1c1bdd..4d2ba30c2fbd 100644 --- a/drivers/net/ethernet/apple/bmac.c +++ b/drivers/net/ethernet/apple/bmac.c @@ -1237,6 +1237,7 @@ static int bmac_probe(struct macio_dev *mdev, const struct of_device_id *match) struct bmac_data *bp; const unsigned char *prop_addr; unsigned char addr[6]; + u8 macaddr[6]; struct net_device *dev; int is_bmac_plus = ((int)match->data) != 0; @@ -1284,7 +1285,9 @@ static int bmac_probe(struct macio_dev *mdev, const struct of_device_id *match) rev = addr[0] == 0 && addr[1] == 0xA0; for (j = 0; j < 6; ++j) - dev->dev_addr[j] = rev ? bitrev8(addr[j]): addr[j]; + macaddr[j] = rev ? bitrev8(addr[j]): addr[j]; + + eth_hw_addr_set(dev, macaddr); /* Enable chip without interrupts for now */ bmac_enable_and_reset_chip(dev); From 99218cbf81bf21355a3de61cd46a706d36e900e6 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 14 Jan 2022 06:57:24 +0000 Subject: [PATCH 093/149] lib82596: Fix IRQ check in sni_82596_probe platform_get_irq() returns negative error number instead 0 on failure. And the doc of platform_get_irq() provides a usage example: int irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; Fix the check of return value to catch errors correctly. Fixes: 115978859272 ("i825xx: Move the Intel 82586/82593/82596 based drivers") Signed-off-by: Miaoqian Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/i825xx/sni_82596.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/i825xx/sni_82596.c b/drivers/net/ethernet/i825xx/sni_82596.c index 27937c5d7956..daec9ce04531 100644 --- a/drivers/net/ethernet/i825xx/sni_82596.c +++ b/drivers/net/ethernet/i825xx/sni_82596.c @@ -117,9 +117,10 @@ static int sni_82596_probe(struct platform_device *dev) netdevice->dev_addr[5] = readb(eth_addr + 0x06); iounmap(eth_addr); - if (!netdevice->irq) { + if (netdevice->irq < 0) { printk(KERN_ERR "%s: IRQ not found for i82596 at 0x%lx\n", __FILE__, netdevice->base_addr); + retval = netdevice->irq; goto probe_failed; } From d90d0c175cf2982789d336dda928c0f69d3e8a9d Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 14 Jan 2022 15:14:30 +0800 Subject: [PATCH 094/149] net: stmmac: Fix "Unbalanced pm_runtime_enable!" warning If the device is PCI based like intel-eth-pci, pm_runtime_enable() is already called by pci_pm_init(). So only pm_runtime_enable() when it's not already enabled. Signed-off-by: Kai-Heng Feng Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 63ff2dad8c85..6708ca2aa4f7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7159,7 +7159,8 @@ int stmmac_dvr_probe(struct device *device, pm_runtime_get_noresume(device); pm_runtime_set_active(device); - pm_runtime_enable(device); + if (!pm_runtime_enabled(device)) + pm_runtime_enable(device); if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) { From 9d6d7f1cb67cdee15f1a0e85aacfb924e0e02435 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jan 2022 08:43:28 -0800 Subject: [PATCH 095/149] af_unix: annote lockless accesses to unix_tot_inflight & gc_in_progress wait_for_unix_gc() reads unix_tot_inflight & gc_in_progress without synchronization. Adds READ_ONCE()/WRITE_ONCE() and their associated comments to better document the intent. BUG: KCSAN: data-race in unix_inflight / wait_for_unix_gc write to 0xffffffff86e2b7c0 of 4 bytes by task 9380 on cpu 0: unix_inflight+0x1e8/0x260 net/unix/scm.c:63 unix_attach_fds+0x10c/0x1e0 net/unix/scm.c:121 unix_scm_to_skb net/unix/af_unix.c:1674 [inline] unix_dgram_sendmsg+0x679/0x16b0 net/unix/af_unix.c:1817 unix_seqpacket_sendmsg+0xcc/0x110 net/unix/af_unix.c:2258 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2409 ___sys_sendmsg net/socket.c:2463 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2549 __do_sys_sendmmsg net/socket.c:2578 [inline] __se_sys_sendmmsg net/socket.c:2575 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2575 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffffffff86e2b7c0 of 4 bytes by task 9375 on cpu 1: wait_for_unix_gc+0x24/0x160 net/unix/garbage.c:196 unix_dgram_sendmsg+0x8e/0x16b0 net/unix/af_unix.c:1772 unix_seqpacket_sendmsg+0xcc/0x110 net/unix/af_unix.c:2258 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2409 ___sys_sendmsg net/socket.c:2463 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2549 __do_sys_sendmmsg net/socket.c:2578 [inline] __se_sys_sendmmsg net/socket.c:2575 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2575 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000002 -> 0x00000004 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 9375 Comm: syz-executor.1 Not tainted 5.16.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 9915672d4127 ("af_unix: limit unix_tot_inflight") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220114164328.2038499-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 14 +++++++++++--- net/unix/scm.c | 6 ++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 12e2ddaf887f..d45d5366115a 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -192,8 +192,11 @@ void wait_for_unix_gc(void) { /* If number of inflight sockets is insane, * force a garbage collect right now. + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight() and gc_in_progress(). */ - if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } @@ -213,7 +216,9 @@ void unix_gc(void) if (gc_in_progress) goto out; - gc_in_progress = true; + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, true); + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -299,7 +304,10 @@ void unix_gc(void) /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); - gc_in_progress = false; + + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, false); + wake_up(&unix_gc_wait); out: diff --git a/net/unix/scm.c b/net/unix/scm.c index 052ae709ce28..aa27a02478dc 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -60,7 +60,8 @@ void unix_inflight(struct user_struct *user, struct file *fp) } else { BUG_ON(list_empty(&u->link)); } - unix_tot_inflight++; + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } user->unix_inflight++; spin_unlock(&unix_gc_lock); @@ -80,7 +81,8 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); - unix_tot_inflight--; + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); } user->unix_inflight--; spin_unlock(&unix_gc_lock); From 80f15f3bef9e9c2cc29888a6773df44de0a0c65f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 14 Jan 2022 15:36:37 +0200 Subject: [PATCH 096/149] net: mscc: ocelot: don't dereference NULL pointers with shared tc filters The following command sequence: tc qdisc del dev swp0 clsact tc qdisc add dev swp0 ingress_block 1 clsact tc qdisc add dev swp1 ingress_block 1 clsact tc filter add block 1 flower action drop tc qdisc del dev swp0 clsact produces the following NPD: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000014 pc : vcap_entry_set+0x14/0x70 lr : ocelot_vcap_filter_del+0x198/0x234 Call trace: vcap_entry_set+0x14/0x70 ocelot_vcap_filter_del+0x198/0x234 ocelot_cls_flower_destroy+0x94/0xe4 felix_cls_flower_del+0x70/0x84 dsa_slave_setup_tc_block_cb+0x13c/0x60c dsa_slave_setup_tc_block_cb_ig+0x20/0x30 tc_setup_cb_reoffload+0x44/0x120 fl_reoffload+0x280/0x320 tcf_block_playback_offloads+0x6c/0x184 tcf_block_unbind+0x80/0xe0 tcf_block_setup+0x174/0x214 tcf_block_offload_cmd.isra.0+0x100/0x13c tcf_block_offload_unbind+0x5c/0xa0 __tcf_block_put+0x54/0x174 tcf_block_put_ext+0x5c/0x74 clsact_destroy+0x40/0x60 qdisc_destroy+0x4c/0x150 qdisc_put+0x70/0x90 qdisc_graft+0x3f0/0x4c0 tc_get_qdisc+0x1cc/0x364 rtnetlink_rcv_msg+0x124/0x340 The reason is that the driver isn't prepared to receive two tc filters with the same cookie. It unconditionally creates a new struct ocelot_vcap_filter for each tc filter, and it adds all filters with the same identifier (cookie) to the ocelot_vcap_block. The problem is here, in ocelot_vcap_filter_del(): /* Gets index of the filter */ index = ocelot_vcap_block_get_filter_index(block, filter); if (index < 0) return index; /* Delete filter */ ocelot_vcap_block_remove_filter(ocelot, block, filter); /* Move up all the blocks over the deleted filter */ for (i = index; i < block->count; i++) { struct ocelot_vcap_filter *tmp; tmp = ocelot_vcap_block_find_filter_by_index(block, i); vcap_entry_set(ocelot, i, tmp); } what will happen is ocelot_vcap_block_get_filter_index() will return the index (@index) of the first filter found with that cookie. This is _not_ the index of _this_ filter, but the other one with the same cookie, because ocelot_vcap_filter_equal() gets fooled. Then later, ocelot_vcap_block_remove_filter() is coded to remove all filters that are ocelot_vcap_filter_equal() with the passed @filter. So unexpectedly, both filters get deleted from the list. Then ocelot_vcap_filter_del() will attempt to move all the other filters up, again finding them by index (@i). The block count is 2, @index was 0, so it will attempt to move up filter @i=0 and @i=1. It assigns tmp = ocelot_vcap_block_find_filter_by_index(block, i), which is now a NULL pointer because ocelot_vcap_block_remove_filter() has removed more than one filter. As far as I can see, this problem has been there since the introduction of tc offload support, however I cannot test beyond the blamed commit due to hardware availability. In any case, any fix cannot be backported that far, due to lots of changes to the code base. Therefore, let's go for the correct solution, which is to not call ocelot_vcap_filter_add() and ocelot_vcap_filter_del(), unless the filter is actually unique and not shared. For the shared filters, we should just modify the ingress port mask and call ocelot_vcap_filter_replace(), a function introduced by commit 95706be13b9f ("net: mscc: ocelot: create a function that replaces an existing VCAP filter"). This way, block->rules will only contain filters with unique cookies, by design. Fixes: 07d985eef073 ("net: dsa: felix: Wire up the ocelot cls_flower methods") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_flower.c | 29 ++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index beb9379424c0..4a0fda22d343 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -805,13 +805,34 @@ int ocelot_cls_flower_replace(struct ocelot *ocelot, int port, struct netlink_ext_ack *extack = f->common.extack; struct ocelot_vcap_filter *filter; int chain = f->common.chain_index; - int ret; + int block_id, ret; if (chain && !ocelot_find_vcap_filter_that_points_at(ocelot, chain)) { NL_SET_ERR_MSG_MOD(extack, "No default GOTO action points to this chain"); return -EOPNOTSUPP; } + block_id = ocelot_chain_to_block(chain, ingress); + if (block_id < 0) { + NL_SET_ERR_MSG_MOD(extack, "Cannot offload to this chain"); + return -EOPNOTSUPP; + } + + filter = ocelot_vcap_block_find_filter_by_id(&ocelot->block[block_id], + f->cookie, true); + if (filter) { + /* Filter already exists on other ports */ + if (!ingress) { + NL_SET_ERR_MSG_MOD(extack, "VCAP ES0 does not support shared filters"); + return -EOPNOTSUPP; + } + + filter->ingress_port_mask |= BIT(port); + + return ocelot_vcap_filter_replace(ocelot, filter); + } + + /* Filter didn't exist, create it now */ filter = ocelot_vcap_filter_create(ocelot, port, ingress, f); if (!filter) return -ENOMEM; @@ -874,6 +895,12 @@ int ocelot_cls_flower_destroy(struct ocelot *ocelot, int port, if (filter->type == OCELOT_VCAP_FILTER_DUMMY) return ocelot_vcap_dummy_filter_del(ocelot, filter); + if (ingress) { + filter->ingress_port_mask &= ~BIT(port); + if (filter->ingress_port_mask) + return ocelot_vcap_filter_replace(ocelot, filter); + } + return ocelot_vcap_filter_del(ocelot, filter); } EXPORT_SYMBOL_GPL(ocelot_cls_flower_destroy); From 214b3369ab9b0a6f28d6c970220c209417edbc65 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 15 Jan 2022 09:49:18 -0800 Subject: [PATCH 097/149] net: ethernet: mtk_eth_soc: fix error checking in mtk_mac_config() Clang static analysis reports this problem mtk_eth_soc.c:394:7: warning: Branch condition evaluates to a garbage value if (err) ^~~ err is not initialized and only conditionally set. So intitialize err. Fixes: 7e538372694b ("net: ethernet: mediatek: Re-add support SGMII") Signed-off-by: Tom Rix Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index b67b4323cff0..f02d07ec5ccb 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -267,7 +267,7 @@ static void mtk_mac_config(struct phylink_config *config, unsigned int mode, phylink_config); struct mtk_eth *eth = mac->hw; u32 mcr_cur, mcr_new, sid, i; - int val, ge_mode, err; + int val, ge_mode, err = 0; /* MT76x8 has no hardware settings between for the MAC */ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && From 9a9acdccdfa430457986db608ee4ae4840107057 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 15 Jan 2022 13:45:03 +0100 Subject: [PATCH 098/149] net: ethernet: sun4i-emac: Fix an error handling path in emac_probe() A dma_request_chan() call is hidden in emac_configure_dma(). It must be released in the probe if an error occurs, as already done in the remove function. Add the corresponding dma_release_channel() call. Fixes: 47869e82c8b8 ("sun4i-emac.c: add dma support") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/allwinner/sun4i-emac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index 74635a6fa8ca..621ce742ad21 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -1072,6 +1072,7 @@ out_clk_disable_unprepare: clk_disable_unprepare(db->clk); out_dispose_mapping: irq_dispose_mapping(ndev->irq); + dma_release_channel(db->rx_chan); out_iounmap: iounmap(db->membase); out: From 020a45aff1190c32b1087cd75b57fbf6bff46ea6 Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Sat, 15 Jan 2022 17:25:15 +0800 Subject: [PATCH 099/149] net: phy: marvell: add Marvell specific PHY loopback Existing genphy_loopback() is not applicable for Marvell PHY. Besides configuring bit-6 and bit-13 in Page 0 Register 0 (Copper Control Register), it is also required to configure same bits in Page 2 Register 21 (MAC Specific Control Register 2) according to speed of the loopback is operating. Tested working on Marvell88E1510 PHY for all speeds (1000/100/10Mbps). FIXME: Based on trial and error test, it seem 1G need to have delay between soft reset and loopback enablement. Fixes: 014068dcb5b1 ("net: phy: genphy_loopback: add link speed configuration") Cc: # 5.15.x Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 56 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 739859c0dfb1..fa71fb7a66b5 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -189,6 +189,8 @@ #define MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII_SGMII 0x4 #define MII_88E1510_GEN_CTRL_REG_1_RESET 0x8000 /* Soft reset */ +#define MII_88E1510_MSCR_2 0x15 + #define MII_VCT5_TX_RX_MDI0_COUPLING 0x10 #define MII_VCT5_TX_RX_MDI1_COUPLING 0x11 #define MII_VCT5_TX_RX_MDI2_COUPLING 0x12 @@ -1932,6 +1934,58 @@ static void marvell_get_stats(struct phy_device *phydev, data[i] = marvell_get_stat(phydev, i); } +static int m88e1510_loopback(struct phy_device *phydev, bool enable) +{ + int err; + + if (enable) { + u16 bmcr_ctl = 0, mscr2_ctl = 0; + + if (phydev->speed == SPEED_1000) + bmcr_ctl = BMCR_SPEED1000; + else if (phydev->speed == SPEED_100) + bmcr_ctl = BMCR_SPEED100; + + if (phydev->duplex == DUPLEX_FULL) + bmcr_ctl |= BMCR_FULLDPLX; + + err = phy_write(phydev, MII_BMCR, bmcr_ctl); + if (err < 0) + return err; + + if (phydev->speed == SPEED_1000) + mscr2_ctl = BMCR_SPEED1000; + else if (phydev->speed == SPEED_100) + mscr2_ctl = BMCR_SPEED100; + + err = phy_modify_paged(phydev, MII_MARVELL_MSCR_PAGE, + MII_88E1510_MSCR_2, BMCR_SPEED1000 | + BMCR_SPEED100, mscr2_ctl); + if (err < 0) + return err; + + /* Need soft reset to have speed configuration takes effect */ + err = genphy_soft_reset(phydev); + if (err < 0) + return err; + + /* FIXME: Based on trial and error test, it seem 1G need to have + * delay between soft reset and loopback enablement. + */ + if (phydev->speed == SPEED_1000) + msleep(1000); + + return phy_modify(phydev, MII_BMCR, BMCR_LOOPBACK, + BMCR_LOOPBACK); + } else { + err = phy_modify(phydev, MII_BMCR, BMCR_LOOPBACK, 0); + if (err < 0) + return err; + + return phy_config_aneg(phydev); + } +} + static int marvell_vct5_wait_complete(struct phy_device *phydev) { int i; @@ -3078,7 +3132,7 @@ static struct phy_driver marvell_drivers[] = { .get_sset_count = marvell_get_sset_count, .get_strings = marvell_get_strings, .get_stats = marvell_get_stats, - .set_loopback = genphy_loopback, + .set_loopback = m88e1510_loopback, .get_tunable = m88e1011_get_tunable, .set_tunable = m88e1011_set_tunable, .cable_test_start = marvell_vct7_cable_test_start, From f542cdfa3083a309e3caafbbdf41490c4935492a Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Sat, 15 Jan 2022 10:34:30 +0800 Subject: [PATCH 100/149] net: wwan: Fix MRU mismatch issue which may lead to data connection lost In pci_generic.c there is a 'mru_default' in struct mhi_pci_dev_info. This value shall be used for whole mhi if it's given a value for a specific product. But in function mhi_net_rx_refill_work(), it's still using hard code value MHI_DEFAULT_MRU. 'mru_default' shall have higher priority than MHI_DEFAULT_MRU. And after checking, this change could help fix a data connection lost issue. Fixes: 5c2c85315948 ("bus: mhi: pci-generic: configurable network interface MRU") Signed-off-by: Shujun Wang Signed-off-by: Slark Xiao Reviewed-by: Loic Poulain Signed-off-by: David S. Miller --- drivers/net/wwan/mhi_wwan_mbim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c index 71bf9b4f769f..6872782e8dd8 100644 --- a/drivers/net/wwan/mhi_wwan_mbim.c +++ b/drivers/net/wwan/mhi_wwan_mbim.c @@ -385,13 +385,13 @@ static void mhi_net_rx_refill_work(struct work_struct *work) int err; while (!mhi_queue_is_full(mdev, DMA_FROM_DEVICE)) { - struct sk_buff *skb = alloc_skb(MHI_DEFAULT_MRU, GFP_KERNEL); + struct sk_buff *skb = alloc_skb(mbim->mru, GFP_KERNEL); if (unlikely(!skb)) break; err = mhi_queue_skb(mdev, DMA_FROM_DEVICE, skb, - MHI_DEFAULT_MRU, MHI_EOT); + mbim->mru, MHI_EOT); if (unlikely(err)) { kfree_skb(skb); break; From 9404bc1e58e4bed41f629da9e3b890b2ca1f8b9c Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 14 Jan 2022 21:35:45 +0800 Subject: [PATCH 101/149] net/smc: Remove unused function declaration The declaration of smc_wr_tx_dismiss_slots() is unused. So remove it. Fixes: 349d43127dac ("net/smc: fix kernel panic caused by race of smc_sock") Signed-off-by: Wen Gu Reviewed-by: Dust Li Signed-off-by: David S. Miller --- net/smc/smc_wr.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 47512ccce5ef..a54e90a1110f 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -125,10 +125,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); -void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, - smc_wr_tx_filter filter, - smc_wr_tx_dismisser dismisser, - unsigned long data); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); From 0a6e6b3c7db6c34e3d149f09cd714972f8753e3f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Jan 2022 01:02:20 -0800 Subject: [PATCH 102/149] ipv4: update fib_info_cnt under spinlock protection In the past, free_fib_info() was supposed to be called under RTNL protection. This eventually was no longer the case. Instead of enforcing RTNL it seems we simply can move fib_info_cnt changes to occur when fib_info_lock is held. v2: David Laight suggested to update fib_info_cnt only when an entry is added/deleted to/from the hash table, as fib_info_cnt is used to make sure hash table size is optimal. BUG: KCSAN: data-race in fib_create_info / free_fib_info write to 0xffffffff86e243a0 of 4 bytes by task 26429 on cpu 0: fib_create_info+0xe78/0x3440 net/ipv4/fib_semantics.c:1428 fib_table_insert+0x148/0x10c0 net/ipv4/fib_trie.c:1224 fib_magic+0x195/0x1e0 net/ipv4/fib_frontend.c:1087 fib_add_ifaddr+0xd0/0x2e0 net/ipv4/fib_frontend.c:1109 fib_netdev_event+0x178/0x510 net/ipv4/fib_frontend.c:1466 notifier_call_chain kernel/notifier.c:83 [inline] raw_notifier_call_chain+0x53/0xb0 kernel/notifier.c:391 __dev_notify_flags+0x1d3/0x3b0 dev_change_flags+0xa2/0xc0 net/core/dev.c:8872 do_setlink+0x810/0x2410 net/core/rtnetlink.c:2719 rtnl_group_changelink net/core/rtnetlink.c:3242 [inline] __rtnl_newlink net/core/rtnetlink.c:3396 [inline] rtnl_newlink+0xb10/0x13b0 net/core/rtnetlink.c:3506 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5571 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2496 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5589 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x5fc/0x6c0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x726/0x840 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2409 ___sys_sendmsg net/socket.c:2463 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2492 __do_sys_sendmsg net/socket.c:2501 [inline] __se_sys_sendmsg net/socket.c:2499 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2499 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffffffff86e243a0 of 4 bytes by task 31505 on cpu 1: free_fib_info+0x35/0x80 net/ipv4/fib_semantics.c:252 fib_info_put include/net/ip_fib.h:575 [inline] nsim_fib4_rt_destroy drivers/net/netdevsim/fib.c:294 [inline] nsim_fib4_rt_replace drivers/net/netdevsim/fib.c:403 [inline] nsim_fib4_rt_insert drivers/net/netdevsim/fib.c:431 [inline] nsim_fib4_event drivers/net/netdevsim/fib.c:461 [inline] nsim_fib_event drivers/net/netdevsim/fib.c:881 [inline] nsim_fib_event_work+0x15ca/0x2cf0 drivers/net/netdevsim/fib.c:1477 process_one_work+0x3fc/0x980 kernel/workqueue.c:2298 process_scheduled_works kernel/workqueue.c:2361 [inline] worker_thread+0x7df/0xa70 kernel/workqueue.c:2447 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x00000d2d -> 0x00000d2e Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 31505 Comm: kworker/1:21 Not tainted 5.16.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events nsim_fib_event_work Fixes: 48bb9eb47b27 ("netdevsim: fib: Add dummy implementation for FIB offload") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: David Laight Cc: Ido Schimmel Cc: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 828de171708f..45619c005b8d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -249,7 +249,6 @@ void free_fib_info(struct fib_info *fi) pr_warn("Freeing alive fib_info %p\n", fi); return; } - fib_info_cnt--; call_rcu(&fi->rcu, free_fib_info_rcu); } @@ -260,6 +259,10 @@ void fib_release_info(struct fib_info *fi) spin_lock_bh(&fib_info_lock); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); + + /* Paired with READ_ONCE() in fib_create_info(). */ + WRITE_ONCE(fib_info_cnt, fib_info_cnt - 1); + if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); if (fi->nh) { @@ -1430,7 +1433,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg, #endif err = -ENOBUFS; - if (fib_info_cnt >= fib_info_hash_size) { + + /* Paired with WRITE_ONCE() in fib_release_info() */ + if (READ_ONCE(fib_info_cnt) >= fib_info_hash_size) { unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; @@ -1462,7 +1467,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, return ERR_PTR(err); } - fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; @@ -1591,6 +1595,7 @@ link_it: refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); spin_lock_bh(&fib_info_lock); + fib_info_cnt++; hlist_add_head(&fi->fib_hash, &fib_info_hash[fib_info_hashfn(fi)]); if (fi->fib_prefsrc) { From 56d99e81ecbc997a5f984684d0eeb583992b2072 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Sun, 16 Jan 2022 15:43:42 +0800 Subject: [PATCH 103/149] net/smc: Fix hung_task when removing SMC-R devices A hung_task is observed when removing SMC-R devices. Suppose that a link group has two active links(lnk_A, lnk_B) associated with two different SMC-R devices(dev_A, dev_B). When dev_A is removed, the link group will be removed from smc_lgr_list and added into lgr_linkdown_list. lnk_A will be cleared and smcibdev(A)->lnk_cnt will reach to zero. However, when dev_B is removed then, the link group can't be found in smc_lgr_list and lnk_B won't be cleared, making smcibdev->lnk_cnt never reaches zero, which causes a hung_task. This patch fixes this issue by restoring the implementation of smc_smcr_terminate_all() to what it was before commit 349d43127dac ("net/smc: fix kernel panic caused by race of smc_sock"). The original implementation also satisfies the intention that make sure QP destroy earlier than CQ destroy because we will always wait for smcibdev->lnk_cnt reaches zero, which guarantees QP has been destroyed. Fixes: 349d43127dac ("net/smc: fix kernel panic caused by race of smc_sock") Signed-off-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7199eeeb1667..29525d03b253 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1531,16 +1531,11 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd) /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. - * - * We must wait here for QPs been destroyed before we destroy the CQs, - * or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus - * smc_sock cannot be released. */ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); - LIST_HEAD(lgr_linkdown_list); int i; spin_lock_bh(&smc_lgr_list.lock); @@ -1552,7 +1547,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) - list_move_tail(&lgr->list, &lgr_linkdown_list); + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } @@ -1564,16 +1559,6 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) __smc_lgr_terminate(lgr, false); } - list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].smcibdev == smcibdev) { - mutex_lock(&lgr->llc_conf_mutex); - smcr_link_down_cond(&lgr->lnk[i]); - mutex_unlock(&lgr->llc_conf_mutex); - } - } - } - if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, From 429e3d123d9a50cc9882402e40e0ac912d88cfcf Mon Sep 17 00:00:00 2001 From: Moshe Tal Date: Sun, 16 Jan 2022 19:39:29 +0200 Subject: [PATCH 104/149] bonding: Fix extraction of ports from the packet headers Wrong hash sends single stream to multiple output interfaces. The offset calculation was relative to skb->head, fix it to be relative to skb->data. Fixes: a815bde56b15 ("net, bonding: Refactor bond_xmit_hash for use with xdp_buff") Reviewed-by: Jussi Maki Reviewed-by: Saeed Mahameed Reviewed-by: Gal Pressman Signed-off-by: Moshe Tal Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index fce80b57f15b..ec498ce70f35 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3874,8 +3874,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) skb->l4_hash) return skb->hash; - return __bond_xmit_hash(bond, skb, skb->head, skb->protocol, - skb->mac_header, skb->network_header, + return __bond_xmit_hash(bond, skb, skb->data, skb->protocol, + skb_mac_offset(skb), skb_network_offset(skb), skb_headlen(skb)); } From c0b7f7d7e0ad44f35745c01964b3fa2833e298cb Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 17 Jan 2022 13:53:00 +0100 Subject: [PATCH 105/149] net: ocelot: Fix the call to switchdev_bridge_port_offload In the blamed commit, the call to the function switchdev_bridge_port_offload was passing the wrong argument for atomic_nb. It was ocelot_netdevice_nb instead of ocelot_swtchdev_nb. This patch fixes this issue. Fixes: 4e51bf44a03af6 ("net: bridge: move the switchdev object replay helpers to "push" mode") Signed-off-by: Horatiu Vultur Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 8115c3db252e..e271b6225b72 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1187,7 +1187,7 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev, ocelot_port_bridge_join(ocelot, port, bridge); err = switchdev_bridge_port_offload(brport_dev, dev, priv, - &ocelot_netdevice_nb, + &ocelot_switchdev_nb, &ocelot_switchdev_blocking_nb, false, extack); if (err) @@ -1201,7 +1201,7 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev, err_switchdev_sync: switchdev_bridge_port_unoffload(brport_dev, priv, - &ocelot_netdevice_nb, + &ocelot_switchdev_nb, &ocelot_switchdev_blocking_nb); err_switchdev_offload: ocelot_port_bridge_leave(ocelot, port, bridge); @@ -1214,7 +1214,7 @@ static void ocelot_netdevice_pre_bridge_leave(struct net_device *dev, struct ocelot_port_private *priv = netdev_priv(dev); switchdev_bridge_port_unoffload(brport_dev, priv, - &ocelot_netdevice_nb, + &ocelot_switchdev_nb, &ocelot_switchdev_blocking_nb); } From db094aa8140e6da25ba77345b24a8af0c220665e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 17 Jan 2022 11:27:32 +0200 Subject: [PATCH 106/149] net/tls: Fix another skb memory leak when running kTLS traffic This patch is a followup to commit ffef737fd037 ("net/tls: Fix skb memory leak when running kTLS traffic") Which was missing another sk_defer_free_flush() call in tls_sw_splice_read(). Fixes: f35f821935d8 ("tcp: defer skb freeing after socket lock is released") Signed-off-by: Gal Pressman Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 95e774f1b91f..efc84845bb6b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2059,6 +2059,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, splice_read_end: release_sock(sk); + sk_defer_free_flush(sk); return copied ? : err; } From 79074a72d335dbd021a716d8cc65cba3b2f706ab Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 17 Jan 2022 11:27:33 +0200 Subject: [PATCH 107/149] net: Flush deferred skb free on socket destroy The cited Fixes patch moved to a deferred skb approach where the skbs are not freed immediately under the socket lock. Add a WARN_ON_ONCE() to verify the deferred list is empty on socket destroy, and empty it to prevent potential memory leaks. Fixes: f35f821935d8 ("tcp: defer skb freeing after socket lock is released") Signed-off-by: Gal Pressman Signed-off-by: David S. Miller --- net/core/sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index f32ec08a0c37..4ff806d71921 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2049,6 +2049,9 @@ void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); + WARN_ON_ONCE(!llist_empty(&sk->defer_list)); + sk_defer_free_flush(sk); + if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; From a87b0fd4f9003f8521226e226cf92b18147b3519 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 13 Jan 2022 12:31:16 +0100 Subject: [PATCH 108/149] s390/cpumf: Support for CPU Measurement Facility CSVN 7 Adds support for the CPU Measurement Counter Facility second version number 7. Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/kernel/perf_cpum_cf_common.c | 4 ++-- arch/s390/kernel/perf_cpum_cf_events.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 30f0242de4a5..8ee48672233f 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -178,7 +178,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, case CPUMF_CTR_SET_CRYPTO: if (info->csvn >= 1 && info->csvn <= 5) ctrset_size = 16; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: @@ -188,7 +188,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, ctrset_size = 48; else if (info->csvn >= 3 && info->csvn <= 5) ctrset_size = 128; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 37265f551a11..52c1fe23b823 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -344,7 +344,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -715,8 +715,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 1 ... 5: csvn = cpumcf_svn_12345_pmu_event_attr; break; - case 6: - csvn = cpumcf_svn_6_pmu_event_attr; + case 6 ... 7: + csvn = cpumcf_svn_67_pmu_event_attr; break; default: csvn = none; From 745f5d20e7936931f924410f32d8b0e599b5990e Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 13 Jan 2022 12:51:56 +0100 Subject: [PATCH 109/149] s390/cpumf: Support for CPU Measurement Sampling Facility LS bit Adds support for the CPU Measurement Sampling Facility limit sampling bit in the sampling device driver. Limited samples have no valueable information are not collected. Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/include/asm/cpu_mf.h | 4 +++- arch/s390/kernel/perf_cpum_sf.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 0d90cbeb89b4..e3f12db46cfc 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -109,7 +109,9 @@ struct hws_basic_entry { unsigned int AS:2; /* 29-30 PSW address-space control */ unsigned int I:1; /* 31 entry valid or invalid */ unsigned int CL:2; /* 32-33 Configuration Level */ - unsigned int:14; + unsigned int H:1; /* 34 Host Indicator */ + unsigned int LS:1; /* 35 Limited Sampling */ + unsigned int:12; unsigned int prim_asn:16; /* primary ASN */ unsigned long long ia; /* Instruction Address */ unsigned long long gpp; /* Guest Program Parameter */ diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index db62def4ef28..332a49965130 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1179,7 +1179,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, sample = (struct hws_basic_entry *) *sdbt; while ((unsigned long *) sample < (unsigned long *) te) { /* Check for an empty sample */ - if (!sample->def) + if (!sample->def || sample->LS) break; /* Update perf event period */ From 012a224e1fa31fc256aab921f691598e03db6018 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Tue, 11 Jan 2022 11:00:03 +0100 Subject: [PATCH 110/149] s390/uaccess: introduce bit field for OAC specifier Previously, we've used magic values to specify the OAC (operand-access control) for mvcos. Instead we introduce a bit field for it. When using a bit field, we cannot use an immediate value with K constraint anymore, since GCC older than 10 doesn't recognize the bit field union as a compile time constant. To make things work with older compilers, load the OAC value through a register. Bloat-o-meter reports a slight increase in kernel size with this change: Total: Before=15692135, After=15693015, chg +0.01% Signed-off-by: Nico Boehr Co-developed-by: Janis Schoetterl-Glausch Signed-off-by: Janis Schoetterl-Glausch Link: https://lore.kernel.org/r/20220111100003.743116-1-scgl@linux.ibm.com Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Vasily Gorbik Cc: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/include/asm/uaccess.h | 120 ++++++++++++++++++++------------ arch/s390/lib/uaccess.c | 24 +++++-- 2 files changed, 95 insertions(+), 49 deletions(-) diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index ce550d06abc3..147cb3534ce4 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -49,51 +49,85 @@ int __get_user_bad(void) __attribute__((noreturn)); #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES -#define __put_get_user_asm(to, from, size, insn) \ -({ \ - int __rc; \ - \ - asm volatile( \ - insn " 0,%[spec]\n" \ - "0: mvcos %[_to],%[_from],%[_size]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - ".pushsection .fixup, \"ax\"\n" \ - "3: lhi %[rc],%[retval]\n" \ - " jg 2b\n" \ - ".popsection\n" \ - EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ - : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ - : [_size] "d" (size), [_from] "Q" (*(from)), \ - [retval] "K" (-EFAULT), [spec] "K" (0x81UL) \ - : "cc", "0"); \ - __rc; \ +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + +#define __put_get_user_asm(to, from, size, oac_spec) \ +({ \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos %[_to],%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: lhi %[rc],%[retval]\n" \ + " jg 2b\n" \ + ".popsection\n" \ + EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ + : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [retval] "K" (-EFAULT), [spec] "d" (oac_spec.val) \ + : "cc", "0"); \ + __rc; \ }) +#define __put_user_asm(to, from, size) \ + __put_get_user_asm(to, from, size, ((union oac) { \ + .oac1.as = PSW_BITS_AS_SECONDARY, \ + .oac1.a = 1 \ + })) + +#define __get_user_asm(to, from, size) \ + __put_get_user_asm(to, from, size, ((union oac) { \ + .oac2.as = PSW_BITS_AS_SECONDARY, \ + .oac2.a = 1 \ + })) \ + static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { int rc; switch (size) { case 1: - rc = __put_get_user_asm((unsigned char __user *)ptr, - (unsigned char *)x, - size, "llilh"); + rc = __put_user_asm((unsigned char __user *)ptr, + (unsigned char *)x, + size); break; case 2: - rc = __put_get_user_asm((unsigned short __user *)ptr, - (unsigned short *)x, - size, "llilh"); + rc = __put_user_asm((unsigned short __user *)ptr, + (unsigned short *)x, + size); break; case 4: - rc = __put_get_user_asm((unsigned int __user *)ptr, - (unsigned int *)x, - size, "llilh"); + rc = __put_user_asm((unsigned int __user *)ptr, + (unsigned int *)x, + size); break; case 8: - rc = __put_get_user_asm((unsigned long __user *)ptr, - (unsigned long *)x, - size, "llilh"); + rc = __put_user_asm((unsigned long __user *)ptr, + (unsigned long *)x, + size); break; default: __put_user_bad(); @@ -108,24 +142,24 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign switch (size) { case 1: - rc = __put_get_user_asm((unsigned char *)x, - (unsigned char __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned char *)x, + (unsigned char __user *)ptr, + size); break; case 2: - rc = __put_get_user_asm((unsigned short *)x, - (unsigned short __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned short *)x, + (unsigned short __user *)ptr, + size); break; case 4: - rc = __put_get_user_asm((unsigned int *)x, - (unsigned int __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned int *)x, + (unsigned int __user *)ptr, + size); break; case 8: - rc = __put_get_user_asm((unsigned long *)x, - (unsigned long __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned long *)x, + (unsigned long __user *)ptr, + size); break; default: __get_user_bad(); diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index a596e69d3c47..8a5d21461889 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -62,10 +62,14 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.a = 1, + }; tmp1 = -4096UL; asm volatile( - " lghi 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" "6: jz 4f\n" "1: algr %0,%3\n" @@ -84,7 +88,7 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : [spec] "K" (0x81UL) + : [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } @@ -135,10 +139,14 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; tmp1 = -4096UL; asm volatile( - " llilh 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" "6: jz 4f\n" "1: algr %0,%3\n" @@ -157,7 +165,7 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : [spec] "K" (0x81UL) + : [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } @@ -207,10 +215,14 @@ EXPORT_SYMBOL(raw_copy_to_user); static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; tmp1 = -4096UL; asm volatile( - " llilh 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n" " jz 4f\n" "1: algr %0,%2\n" @@ -228,7 +240,7 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : "a" (empty_zero_page), [spec] "K" (0x81UL) + : "a" (empty_zero_page), [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } From 5754f9084f261f6fbfdcc6e57dae97c86a6ff688 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 Jan 2022 13:31:40 +0100 Subject: [PATCH 111/149] s390: add Sven Schnelle as reviewer Sven Schnelle will help reviewing s390 architecture code. Acked-by: Sven Schnelle Acked-by: Alexander Gordeev Acked-by: Vasily Gorbik Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1f7658bab9e7..253abb28795b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16611,6 +16611,7 @@ M: Heiko Carstens M: Vasily Gorbik M: Christian Borntraeger R: Alexander Gordeev +R: Sven Schnelle L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ From 0b39536cc699db6850c426db7f9cb45923de40c5 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 14 Jan 2022 06:48:20 +0000 Subject: [PATCH 112/149] gpio: mpc8xxx: Fix IRQ check in mpc8xxx_probe platform_get_irq() returns negative error number instead 0 on failure. And the doc of platform_get_irq() provides a usage example: int irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; Fix the check of return value to catch errors correctly. Fixes: 76c47d1449fc ("gpio: mpc8xxx: Add ACPI support") Signed-off-by: Miaoqian Lin Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mpc8xxx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 70d6ae20b1da..01634c8d27b3 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -388,8 +388,8 @@ static int mpc8xxx_probe(struct platform_device *pdev) } mpc8xxx_gc->irqn = platform_get_irq(pdev, 0); - if (!mpc8xxx_gc->irqn) - return 0; + if (mpc8xxx_gc->irqn < 0) + return mpc8xxx_gc->irqn; mpc8xxx_gc->irq = irq_domain_create_linear(fwnode, MPC8XXX_GPIO_PINS, From 30fee1d7462a446ade399c0819717a830cbdca69 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 14 Jan 2022 06:51:24 +0000 Subject: [PATCH 113/149] gpio: idt3243x: Fix IRQ check in idt_gpio_probe platform_get_irq() returns negative error number instead 0 on failure. And the doc of platform_get_irq() provides a usage example: int irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; Fix the check of return value to catch errors correctly. Fixes: 4195926aedca ("gpio: Add support for IDT 79RC3243x GPIO controller") Signed-off-by: Miaoqian Lin Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-idt3243x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-idt3243x.c b/drivers/gpio/gpio-idt3243x.c index 50003ad2e589..08493b05be2d 100644 --- a/drivers/gpio/gpio-idt3243x.c +++ b/drivers/gpio/gpio-idt3243x.c @@ -164,8 +164,8 @@ static int idt_gpio_probe(struct platform_device *pdev) return PTR_ERR(ctrl->pic); parent_irq = platform_get_irq(pdev, 0); - if (!parent_irq) - return -EINVAL; + if (parent_irq < 0) + return parent_irq; girq = &ctrl->gc.irq; girq->chip = &idt_gpio_irqchip; From 5765cee119bf5a36c94d20eceb37c445508934be Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 17 Jan 2022 14:52:33 +0000 Subject: [PATCH 114/149] net: sfp: fix high power modules without diagnostic monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7cfa9c92d0a3 ("net: sfp: avoid power switch on address-change modules") unintetionally changed the semantics for high power modules without the digital diagnostics monitoring. We repeatedly attempt to read the power status from the non-existing 0xa2 address in a futile hope this failure is temporary: [ 8.856051] sfp sfp-eth3: module NTT 0000000000000000 rev 0000 sn 0000000000000000 dc 160408 [ 8.865843] mvpp2 f4000000.ethernet eth3: switched to inband/1000base-x link mode [ 8.873469] sfp sfp-eth3: Failed to read EEPROM: -5 [ 8.983251] sfp sfp-eth3: Failed to read EEPROM: -5 [ 9.103250] sfp sfp-eth3: Failed to read EEPROM: -5 We previosuly assumed such modules were powered up in the correct mode, continuing without further configuration as long as the required power class was supported by the host. Restore this behaviour, while preserving the intent of subsequent patches to avoid the "Address Change Sequence not supported" warning if we are not going to be accessing the DDM address. Fixes: 7cfa9c92d0a3 ("net: sfp: avoid power switch on address-change modules") Reported-by: 照山周一郎 Tested-by: 照山周一郎 Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/sfp.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index ab77a9f439ef..4720b24ca51b 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -1641,17 +1641,20 @@ static int sfp_sm_probe_for_phy(struct sfp *sfp) static int sfp_module_parse_power(struct sfp *sfp) { u32 power_mW = 1000; + bool supports_a2; if (sfp->id.ext.options & cpu_to_be16(SFP_OPTIONS_POWER_DECL)) power_mW = 1500; if (sfp->id.ext.options & cpu_to_be16(SFP_OPTIONS_HIGH_POWER_LEVEL)) power_mW = 2000; + supports_a2 = sfp->id.ext.sff8472_compliance != + SFP_SFF8472_COMPLIANCE_NONE || + sfp->id.ext.diagmon & SFP_DIAGMON_DDM; + if (power_mW > sfp->max_power_mW) { /* Module power specification exceeds the allowed maximum. */ - if (sfp->id.ext.sff8472_compliance == - SFP_SFF8472_COMPLIANCE_NONE && - !(sfp->id.ext.diagmon & SFP_DIAGMON_DDM)) { + if (!supports_a2) { /* The module appears not to implement bus address * 0xa2, so assume that the module powers up in the * indicated mode. @@ -1668,11 +1671,25 @@ static int sfp_module_parse_power(struct sfp *sfp) } } + if (power_mW <= 1000) { + /* Modules below 1W do not require a power change sequence */ + sfp->module_power_mW = power_mW; + return 0; + } + + if (!supports_a2) { + /* The module power level is below the host maximum and the + * module appears not to implement bus address 0xa2, so assume + * that the module powers up in the indicated mode. + */ + return 0; + } + /* If the module requires a higher power mode, but also requires * an address change sequence, warn the user that the module may * not be functional. */ - if (sfp->id.ext.diagmon & SFP_DIAGMON_ADDRMODE && power_mW > 1000) { + if (sfp->id.ext.diagmon & SFP_DIAGMON_ADDRMODE) { dev_warn(sfp->dev, "Address Change Sequence not supported but module requires %u.%uW, module may not be functional\n", power_mW / 1000, (power_mW / 100) % 10); From 9dec0368b9640c09ef5af48214e097245e57a204 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 11 Jan 2022 14:05:02 -0800 Subject: [PATCH 115/149] xfs: remove the XFS_IOC_FSSETDM definitions Remove the definitions for these ioctls, since the functionality (and, weirdly, the 32-bit compat ioctl definitions) were removed from the kernel in November 2019. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_fs.h | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index c43877c8a279..52b48b24ff66 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -92,21 +92,6 @@ struct getbmapx { #define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */ #define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ -/* - * Structure for XFS_IOC_FSSETDM. - * For use by backup and restore programs to set the XFS on-disk inode - * fields di_dmevmask and di_dmstate. These must be set to exactly and - * only values previously obtained via xfs_bulkstat! (Specifically the - * struct xfs_bstat fields bs_dmevmask and bs_dmstate.) - */ -#ifndef HAVE_FSDMIDATA -struct fsdmidata { - __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ - __u16 fsd_padding; - __u16 fsd_dmstate; /* corresponds to di_dmstate */ -}; -#endif - /* * File segment locking set data type for 64 bit access. * Also used for all the RESV/FREE interfaces. @@ -562,16 +547,10 @@ typedef struct xfs_fsop_handlereq { /* * Compound structures for passing args through Handle Request interfaces - * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle - * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and - * XFS_IOC_ATTRMULTI_BY_HANDLE + * xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE */ -typedef struct xfs_fsop_setdm_handlereq { - struct xfs_fsop_handlereq hreq; /* handle information */ - struct fsdmidata __user *data; /* DMAPI data */ -} xfs_fsop_setdm_handlereq_t; - /* * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. * @@ -789,7 +768,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) -#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +/* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) #define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) #define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) @@ -831,7 +810,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ #define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ -#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */ #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) From 4d1b97f9ce7c0d2af2bb85b12d48e6902172a28e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 7 Jan 2022 17:45:51 -0800 Subject: [PATCH 116/149] xfs: kill the XFS_IOC_{ALLOC,FREE}SP* ioctls According to the glibc compat header for Irix 4, these ioctls originated in April 1991 as a (somewhat clunky) way to preallocate space at the end of a file on an EFS filesystem. XFS, which was released in Irix 5.3 in December 1993, picked up these ioctls to maintain compatibility and they were ported to Linux in the early 2000s. Recently it was pointed out to me they still lurk in the kernel, even though the Linux fallocate syscall supplanted the functionality a long time ago. fstests doesn't seem to include any real functional or stress tests for these ioctls, which means that the code quality is ... very questionable. Most notably, it was a stale disk block exposure vector for 21 years and nobody noticed or complained. As mature programmers say, "If you're not testing it, it's broken." Given all that, let's withdraw these ioctls from the XFS userspace API. Normally we'd set a long deprecation process, but I estimate that there aren't any real users, so let's trigger a warning in dmesg and return -ENOTTY. See: CVE-2021-4155 Augments: 983d8e60f508 ("xfs: map unwritten blocks in XFS_IOC_{ALLOC,FREE}SP just like fallocate") Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen Reviewed-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 7 ++-- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_file.c | 3 +- fs/xfs/xfs_ioctl.c | 92 +++--------------------------------------- fs/xfs/xfs_ioctl.h | 6 --- fs/xfs/xfs_ioctl32.c | 27 ------------- 6 files changed, 10 insertions(+), 127 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73a36b7be3bd..575060a7c768 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -771,8 +771,7 @@ int xfs_alloc_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, - int alloc_type) + xfs_off_t len) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; @@ -865,8 +864,8 @@ xfs_alloc_file_space( goto error; error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, 0, imapp, - &nimaps); + allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, + &nimaps); if (error) goto error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 9f993168b55b..24b37d211f1d 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, int alloc_type); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 27594738b0d1..d81a28cada35 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1052,8 +1052,7 @@ xfs_file_fallocate( } if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); + error = xfs_alloc_file_space(ip, offset, len); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 29231a8c8a45..64a7ef4a7298 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -627,86 +627,6 @@ xfs_attrmulti_by_handle( return error; } -int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf) -{ - struct inode *inode = file_inode(filp); - struct xfs_inode *ip = XFS_I(inode); - struct iattr iattr; - enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR; - uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - int error; - - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) - return -EPERM; - - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - if (xfs_is_always_cow_inode(ip)) - return -EOPNOTSUPP; - - if (filp->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - if (filp->f_mode & FMODE_NOCMTIME) - flags |= XFS_PREALLOC_INVISIBLE; - - error = mnt_want_write_file(filp); - if (error) - return error; - - xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); - if (error) - goto out_unlock; - inode_dio_wait(inode); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += filp->f_pos; - break; - case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); - break; - default: - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) { - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start > XFS_ISIZE(ip)) { - error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); - if (error) - goto out_unlock; - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = bf->l_start; - error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp), - &iattr); - if (error) - goto out_unlock; - - error = xfs_update_prealloc_flags(ip, flags); - -out_unlock: - xfs_iunlock(ip, iolock); - mnt_drop_write_file(filp); - return error; -} - /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( @@ -1964,13 +1884,11 @@ xfs_file_ioctl( case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: { - xfs_flock64_t bf; - - if (copy_from_user(&bf, arg, sizeof(bf))) - return -EFAULT; - return xfs_ioc_space(filp, &bf); - } + case XFS_IOC_FREESP64: + xfs_warn_once(mp, + "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported", + current->comm); + return -ENOTTY; case XFS_IOC_DIOINFO: { struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 845d3bcab74b..d4abba2c13c1 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -10,12 +10,6 @@ struct xfs_bstat; struct xfs_ibulk; struct xfs_inogrp; - -extern int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf); - int xfs_ioc_swapext( xfs_swapext_t *sxp); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 8783af203cfc..004ed2a251e8 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -27,22 +27,6 @@ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) #ifdef BROKEN_X86_ALIGNMENT -STATIC int -xfs_compat_flock64_copyin( - xfs_flock64_t *bf, - compat_xfs_flock64_t __user *arg32) -{ - if (get_user(bf->l_type, &arg32->l_type) || - get_user(bf->l_whence, &arg32->l_whence) || - get_user(bf->l_start, &arg32->l_start) || - get_user(bf->l_len, &arg32->l_len) || - get_user(bf->l_sysid, &arg32->l_sysid) || - get_user(bf->l_pid, &arg32->l_pid) || - copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) - return -EFAULT; - return 0; -} - STATIC int xfs_compat_ioc_fsgeometry_v1( struct xfs_mount *mp, @@ -445,17 +429,6 @@ xfs_file_compat_ioctl( switch (cmd) { #if defined(BROKEN_X86_ALIGNMENT) - case XFS_IOC_ALLOCSP_32: - case XFS_IOC_FREESP_32: - case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: { - struct xfs_flock64 bf; - - if (xfs_compat_flock64_copyin(&bf, arg)) - return -EFAULT; - cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(filp, &bf); - } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg); case XFS_IOC_FSGROWFSDATA_32: { From b3bb9413e717b44e4aea833d07f14e90fb91cf97 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Dec 2021 13:07:38 -0800 Subject: [PATCH 117/149] xfs: remove the XFS_IOC_{ALLOC,FREE}SP* definitions Now that we've made these ioctls defunct, move them from xfs_fs.h to xfs_ioctl.c, which effectively removes them from the publicly supported ioctl interfaces for XFS. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Eric Sandeen --- fs/xfs/libxfs/xfs_fs.h | 8 ++++---- fs/xfs/xfs_ioctl.c | 9 +++++++++ fs/xfs/xfs_ioctl32.h | 4 ---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 52b48b24ff66..505533c43a92 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -760,13 +760,13 @@ struct xfs_scrub_metadata { * For 'documentation' purposed more than anything else, * the "cmd #" field reflects the IRIX fcntl number. */ -#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) -#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +/* XFS_IOC_ALLOCSP ------- deprecated 10 */ +/* XFS_IOC_FREESP -------- deprecated 11 */ #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) #define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) -#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */ +/* XFS_IOC_FREESP64 ------ deprecated 37 */ #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) /* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 64a7ef4a7298..03a6198c97f6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1854,6 +1854,15 @@ xfs_fs_eofblocks_from_user( return 0; } +/* + * These long-unused ioctls were removed from the official ioctl API in 5.17, + * but retain these definitions so that we can log warnings about them. + */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 9929482bf358..fc5a91f3a5e0 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -154,10 +154,6 @@ typedef struct compat_xfs_flock64 { __s32 l_pad[4]; /* reserve area */ } compat_xfs_flock64_t; -#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) -#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) -#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) -#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) #define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) #define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) #define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) From 2836615aa22de55b8fca5e32fe1b27a67cda625e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 18 Jan 2022 03:43:40 -0800 Subject: [PATCH 118/149] netns: add schedule point in ops_exit_list() When under stress, cleanup_net() can have to dismantle netns in big numbers. ops_exit_list() currently calls many helpers [1] that have no schedule point, and we can end up with soft lockups, particularly on hosts with many cpus. Even for moderate amount of netns processed by cleanup_net() this patch avoids latency spikes. [1] Some of these helpers like fib_sync_up() and fib_sync_down_dev() are very slow because net/ipv4/fib_semantics.c uses host-wide hash tables, and ifindex is used as the only input of two hash functions. ifindexes tend to be the same for all netns (lo.ifindex==1 per instance) This will be fixed in a separate patch. Fixes: 72ad937abd0a ("net: Add support for batching network namespace cleanups") Signed-off-by: Eric Dumazet Cc: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9b7171c40434..a5b5bb99c644 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -164,8 +164,10 @@ static void ops_exit_list(const struct pernet_operations *ops, { struct net *net; if (ops->exit) { - list_for_each_entry(net, net_exit_list, exit_list) + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); + cond_resched(); + } } if (ops->exit_batch) ops->exit_batch(net_exit_list); From 2fd5dcb1c8ef96c9f0fa8bda53ca480524b80ae7 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 13 Jan 2022 09:51:39 +0900 Subject: [PATCH 119/149] ksmbd: smbd: fix missing client's memory region invalidation if the Channel of a SMB2 WRITE request is SMB2_CHANNEL_RDMA_V1_INVALIDTE, a client does not invalidate its memory regions but ksmbd must do it by sending a SMB2 WRITE response with IB_WR_SEND_WITH_INV. But if errors occur while processing a SMB2 READ/WRITE request, ksmbd sends a response with IB_WR_SEND. So a client could use memory regions already in use. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 75 +++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 867ed982f729..15f331dbe17a 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6124,6 +6124,26 @@ out: return err; } +static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, + struct smb2_buffer_desc_v1 *desc, + __le32 Channel, + __le16 ChannelInfoOffset, + __le16 ChannelInfoLength) +{ + if (work->conn->dialect == SMB30_PROT_ID && + Channel != SMB2_CHANNEL_RDMA_V1) + return -EINVAL; + + if (ChannelInfoOffset == 0 || + le16_to_cpu(ChannelInfoLength) < sizeof(*desc)) + return -EINVAL; + + work->need_invalidate_rkey = + (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); + work->remote_key = le32_to_cpu(desc->token); + return 0; +} + static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, struct smb2_read_req *req, void *data_buf, size_t length) @@ -6132,18 +6152,6 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; int err; - if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) - return -EINVAL; - - if (req->ReadChannelInfoOffset == 0 || - le16_to_cpu(req->ReadChannelInfoLength) < sizeof(*desc)) - return -EINVAL; - - work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); - err = ksmbd_conn_rdma_write(work->conn, data_buf, length, le32_to_cpu(desc->token), le64_to_cpu(desc->offset), @@ -6165,7 +6173,7 @@ int smb2_read(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_read_req *req; struct smb2_read_rsp *rsp; - struct ksmbd_file *fp; + struct ksmbd_file *fp = NULL; loff_t offset; size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; @@ -6179,6 +6187,18 @@ int smb2_read(struct ksmbd_work *work) return smb2_read_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || + req->Channel == SMB2_CHANNEL_RDMA_V1) { + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + &req->Buffer[0], + req->Channel, + req->ReadChannelInfoOffset, + req->ReadChannelInfoLength); + if (err) + goto out; + } + fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), le64_to_cpu(req->PersistentFileId)); if (!fp) { @@ -6364,21 +6384,6 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) - return -EINVAL; - - if (req->Length != 0 || req->DataOffset != 0) - return -EINVAL; - - if (req->WriteChannelInfoOffset == 0 || - le16_to_cpu(req->WriteChannelInfoLength) < sizeof(*desc)) - return -EINVAL; - - work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; @@ -6425,6 +6430,20 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || + req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + if (req->Length != 0 || req->DataOffset != 0) + return -EINVAL; + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + &req->Buffer[0], + req->Channel, + req->WriteChannelInfoOffset, + req->WriteChannelInfoLength); + if (err) + goto out; + } + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); err = -EACCES; From b207602fb04537cb21ac38fabd7577eca2fa05ae Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 15 Jan 2022 14:49:00 +0300 Subject: [PATCH 120/149] ksmbd: uninitialized variable in create_socket() The "ksmbd_socket" variable is not initialized on this error path. Cc: stable@vger.kernel.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Signed-off-by: Dan Carpenter Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index c14320e03b69..82a1429bbe12 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -404,7 +404,7 @@ static int create_socket(struct interface *iface) &ksmbd_socket); if (ret) { pr_err("Can't create socket for ipv4: %d\n", ret); - goto out_error; + goto out_clear; } sin.sin_family = PF_INET; @@ -462,6 +462,7 @@ static int create_socket(struct interface *iface) out_error: tcp_destroy_socket(ksmbd_socket); +out_clear: iface->ksmbd_socket = NULL; return ret; } From ac090d9c90b087d6fb714e54b2a6dd1e6c373ed6 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 17 Jan 2022 22:16:01 +0900 Subject: [PATCH 121/149] ksmbd: fix guest connection failure with nautilus MS-SMB2 describe session sign like the following. Session.SigningRequired MUST be set to TRUE under the following conditions: - If the SMB2_NEGOTIATE_SIGNING_REQUIRED bit is set in the SecurityMode field of the client request. - If the SMB2_SESSION_FLAG_IS_GUEST bit is not set in the SessionFlags field and Session.IsAnonymous is FALSE and either Connection.ShouldSign or global RequireMessageSigning is TRUE. When trying guest account connection using nautilus, The login failure happened on session setup. ksmbd does not allow this connection when the user is a guest and the connection sign is set. Just do not set session sign instead of error response as described in the specification. And this change improves the guest connection in Nautilus. Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 64 ++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 15f331dbe17a..1866c81c5c99 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1464,11 +1464,6 @@ static int ntlm_authenticate(struct ksmbd_work *work) } if (user_guest(sess->user)) { - if (conn->sign) { - ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n"); - return -EPERM; - } - rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE; } else { struct authenticate_message *authblob; @@ -1481,38 +1476,39 @@ static int ntlm_authenticate(struct ksmbd_work *work) ksmbd_debug(SMB, "authentication failed\n"); return -EPERM; } + } + /* + * If session state is SMB2_SESSION_VALID, We can assume + * that it is reauthentication. And the user/password + * has been verified, so return it here. + */ + if (sess->state == SMB2_SESSION_VALID) { + if (conn->binding) + goto binding_session; + return 0; + } + + if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE && + (conn->sign || server_conf.enforced_signing)) || + (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) + sess->sign = true; + + if (smb3_encryption_negotiated(conn) && + !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { + rc = conn->ops->generate_encryptionkey(sess); + if (rc) { + ksmbd_debug(SMB, + "SMB3 encryption key generation failed\n"); + return -EINVAL; + } + sess->enc = true; + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; /* - * If session state is SMB2_SESSION_VALID, We can assume - * that it is reauthentication. And the user/password - * has been verified, so return it here. + * signing is disable if encryption is enable + * on this session */ - if (sess->state == SMB2_SESSION_VALID) { - if (conn->binding) - goto binding_session; - return 0; - } - - if ((conn->sign || server_conf.enforced_signing) || - (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) - sess->sign = true; - - if (smb3_encryption_negotiated(conn) && - !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { - rc = conn->ops->generate_encryptionkey(sess); - if (rc) { - ksmbd_debug(SMB, - "SMB3 encryption key generation failed\n"); - return -EINVAL; - } - sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; - /* - * signing is disable if encryption is enable - * on this session - */ - sess->sign = false; - } + sess->sign = false; } binding_session: From be80a1d3f9dbe5aee79a325964f7037fe2d92f30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 14:05:49 +0000 Subject: [PATCH 122/149] bpf: Generalize check_ctx_reg for reuse with other types Generalize the check_ctx_reg() helper function into a more generic named one so that it can be reused for other register types as well to check whether their offset is non-zero. No functional change. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/btf.c | 2 +- kernel/bpf/verifier.c | 21 +++++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 143401d4c9d9..e9993172f892 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -519,8 +519,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 33bb8ae4a804..e16dafeb2450 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5686,7 +5686,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - if (check_ctx_reg(env, reg, regno)) + if (check_ptr_off_reg(env, reg, regno)) return -EINVAL; } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { const struct btf_type *reg_ref_t; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8587210907d..9b8334068e71 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3969,16 +3969,16 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. */ if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3986,7 +3986,8 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } @@ -4437,7 +4438,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; @@ -5305,7 +5306,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; } @@ -9651,7 +9652,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; From d400a6cf1c8a57cdf10f35220ead3284320d85ff Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Jan 2022 13:58:36 +0000 Subject: [PATCH 123/149] bpf: Mark PTR_TO_FUNC register initially with zero offset Similar as with other pointer types where we use ldimm64, clear the register content to zero first, and then populate the PTR_TO_FUNC type and subprogno number. Currently this is not done, and leads to reuse of stale register tracking data. Given for special ldimm64 cases we always clear the register offset, make it common for all cases, so it won't be forgotten in future. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b8334068e71..ffec0baaf2b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9508,9 +9508,13 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. + */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; switch (base_type(dst_reg->type)) { case PTR_TO_MEM: @@ -9548,7 +9552,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || From 6788ab23508bddb0a9d88e104284922cb2c22b77 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 14:40:40 +0000 Subject: [PATCH 124/149] bpf: Generally fix helper register offset check Right now the assertion on check_ptr_off_reg() is only enforced for register types PTR_TO_CTX (and open coded also for PTR_TO_BTF_ID), however, this is insufficient since many other PTR_TO_* register types such as PTR_TO_FUNC do not handle/expect register offsets when passed to helper functions. Given this can slip-through easily when adding new types, make this an explicit allow-list and reject all other current and future types by default if this is encountered. Also, extend check_ptr_off_reg() to handle PTR_TO_BTF_ID as well instead of duplicating it. For PTR_TO_BTF_ID, reg->off is used for BTF to match expected BTF ids if struct offset is used. This part still needs to be allowed, but the dynamic off from the tnum must be rejected. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ffec0baaf2b6..e0b3f4d683eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3969,14 +3969,15 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper * is only allowed in its original, unmodified form. */ - if (reg->off) { + if (!fixed_off_ok && reg->off) { verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", reg_type_str(env, reg->type), regno, reg->off); return -EACCES; @@ -3994,6 +3995,12 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -5245,12 +5252,6 @@ found: kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } return 0; @@ -5305,10 +5306,26 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ptr_off_reg(env, reg, regno); + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + break; + /* All the rest must be rejected: */ + default: + err = __check_ptr_off_reg(env, reg, regno, + type == PTR_TO_BTF_ID); if (err < 0) return err; + break; } skip_type_check: From 64620e0a1e712a778095bd35cbb277dc2259281f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Jan 2022 14:43:41 +0000 Subject: [PATCH 125/149] bpf: Fix out of bounds access for ringbuf helpers Both bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument. They both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. Meaning, after a NULL check in the code, the verifier will promote the register type in the non-NULL branch to a PTR_TO_MEM and in the NULL branch to a known zero scalar. Generally, pointer arithmetic on PTR_TO_MEM is allowed, so the latter could have an offset. The ARG_PTR_TO_ALLOC_MEM expects a PTR_TO_MEM register type. However, the non- zero result from bpf_ringbuf_reserve() must be fed into either bpf_ringbuf_submit() or bpf_ringbuf_discard() but with the original offset given it will then read out the struct bpf_ringbuf_hdr mapping. The verifier missed to enforce a zero offset, so that out of bounds access can be triggered which could be used to escalate privileges if unprivileged BPF was enabled (disabled by default in kernel). Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: (SecCoder Security Lab) Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e0b3f4d683eb..c72c57a6684f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5318,9 +5318,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type == ARG_PTR_TO_ALLOC_MEM) + goto force_off_check; break; /* All the rest must be rejected: */ default: +force_off_check: err = __check_ptr_off_reg(env, reg, regno, type == PTR_TO_BTF_ID); if (err < 0) From a672b2e36a648afb04ad3bda93b6bda947a479a5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jan 2022 11:11:30 +0000 Subject: [PATCH 126/149] bpf: Fix ringbuf memory type confusion when passing to helpers The bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument, and thus both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. While the non-NULL memory from bpf_ringbuf_reserve() can be passed to other helpers, the two sinks (bpf_ringbuf_submit(), bpf_ringbuf_discard()) right now only enforce a register type of PTR_TO_MEM. This can lead to potential type confusion since it would allow other PTR_TO_MEM memory to be passed into the two sinks which did not come from bpf_ringbuf_reserve(). Add a new MEM_ALLOC composable type attribute for PTR_TO_MEM, and enforce that: - bpf_ringbuf_reserve() returns NULL or PTR_TO_MEM | MEM_ALLOC - bpf_ringbuf_submit() and bpf_ringbuf_discard() only take PTR_TO_MEM | MEM_ALLOC but not plain PTR_TO_MEM arguments via ARG_PTR_TO_ALLOC_MEM - however, other helpers might treat PTR_TO_MEM | MEM_ALLOC as plain PTR_TO_MEM to populate the memory area when they use ARG_PTR_TO_{UNINIT_,}MEM in their func proto description Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/verifier.c | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e947cd91152..fa517ae604ad 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -316,7 +316,12 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_RDONLY, + /* MEM was "allocated" from a different helper, and cannot be mixed + * with regular non-MEM_ALLOC'ed MEM types. + */ + MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_ALLOC, }; /* Max number of base types. */ @@ -400,7 +405,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c72c57a6684f..a39eedecc93a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -570,6 +570,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, if (type & MEM_RDONLY) strncpy(prefix, "rdonly_", 16); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 16); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -5135,6 +5137,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, + PTR_TO_MEM | MEM_ALLOC, PTR_TO_BUF, }, }; @@ -5152,7 +5155,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5315,6 +5318,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: From 722e4db3ae0d52b2e3801280afbe19cf2d188e91 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 13:44:18 +0000 Subject: [PATCH 127/149] bpf, selftests: Add various ringbuf tests with invalid offset Assert that the verifier is rejecting invalid offsets on the ringbuf entries: # ./test_verifier | grep ring #947/u ringbuf: invalid reservation offset 1 OK #947/p ringbuf: invalid reservation offset 1 OK #948/u ringbuf: invalid reservation offset 2 OK #948/p ringbuf: invalid reservation offset 2 OK Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- .../testing/selftests/bpf/verifier/ringbuf.c | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 tools/testing/selftests/bpf/verifier/ringbuf.c diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c new file mode 100644 index 000000000000..68cae6947cc4 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -0,0 +1,64 @@ +{ + "ringbuf: invalid reservation offset 1", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xcafe), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "dereference of modified mem ptr R1", +}, +{ + "ringbuf: invalid reservation offset 2", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0xcafe), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R7 min value is outside of the allowed memory range", +}, From 37c8d4807d1b8b521b30310dce97f6695dc2c2c6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 12 Jan 2022 12:39:48 +0000 Subject: [PATCH 128/149] bpf, selftests: Add ringbuf memory type confusion test Add two tests, one which asserts that ring buffer memory can be passed to other helpers for populating its entry area, and another one where verifier rejects different type of memory passed to bpf_ringbuf_submit(). Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/d_path.c | 14 ++++++++ .../bpf/progs/test_d_path_check_types.c | 32 ++++++++++++++++++ .../testing/selftests/bpf/verifier/ringbuf.c | 33 ++++++++++++++++++- .../selftests/bpf/verifier/spill_fill.c | 2 +- 4 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_types.c diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index 32fc5b3b5cf6..911345c526e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -10,6 +10,7 @@ #include "test_d_path.skel.h" #include "test_d_path_check_rdonly_mem.skel.h" +#include "test_d_path_check_types.skel.h" static int duration; @@ -167,6 +168,16 @@ static void test_d_path_check_rdonly_mem(void) test_d_path_check_rdonly_mem__destroy(skel); } +static void test_d_path_check_types(void) +{ + struct test_d_path_check_types *skel; + + skel = test_d_path_check_types__open_and_load(); + ASSERT_ERR_PTR(skel, "unexpected_load_passing_wrong_type"); + + test_d_path_check_types__destroy(skel); +} + void test_d_path(void) { if (test__start_subtest("basic")) @@ -174,4 +185,7 @@ void test_d_path(void) if (test__start_subtest("check_rdonly_mem")) test_d_path_check_rdonly_mem(); + + if (test__start_subtest("check_alloc_mem")) + test_d_path_check_types(); } diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_types.c b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c new file mode 100644 index 000000000000..7e02b7361307 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +extern const int bpf_prog_active __ksym; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf SEC(".maps"); + +SEC("fentry/security_inode_getattr") +int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat, + __u32 request_mask, unsigned int query_flags) +{ + void *active; + u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* FAIL here! 'active' points to 'regular' memory. It + * cannot be submitted to ring buffer. + */ + bpf_ringbuf_submit(active, 0); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c index 68cae6947cc4..b64d33e4833c 100644 --- a/tools/testing/selftests/bpf/verifier/ringbuf.c +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -28,7 +28,7 @@ }, .fixup_map_ringbuf = { 1 }, .result = REJECT, - .errstr = "dereference of modified mem ptr R1", + .errstr = "dereference of modified alloc_mem ptr R1", }, { "ringbuf: invalid reservation offset 2", @@ -62,3 +62,34 @@ .result = REJECT, .errstr = "R7 min value is outside of the allowed memory range", }, +{ + "ringbuf: check passing rb mem to helpers", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + /* pass allocated ring buffer memory to fib lookup */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 8), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_fib_lookup), + /* submit the ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 2 }, + .prog_type = BPF_PROG_TYPE_XDP, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 1a8eb9672bd1..8cfc5349d2a8 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -84,7 +84,7 @@ }, .fixup_map_ringbuf = { 1 }, .result = REJECT, - .errstr = "R0 pointer arithmetic on mem_or_null prohibited", + .errstr = "R0 pointer arithmetic on alloc_mem_or_null prohibited", }, { "check corrupted spill/fill", From 2e5644b1bab2ccea9cfc7a9520af95b94eb0dbf1 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:24 -0600 Subject: [PATCH 129/149] net: axienet: increase reset timeout The previous timeout of 1ms was too short to handle some cases where the core is reset just after the input clocks were started, which will be introduced in an upcoming patch. Increase the timeout to 50ms. Also simplify the reset timeout checking to use read_poll_timeout. Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Robert Hancock Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- .../net/ethernet/xilinx/xilinx_axienet_main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 23ac353b35fe..9c5b24af61fa 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -496,7 +496,8 @@ static void axienet_setoptions(struct net_device *ndev, u32 options) static int __axienet_device_reset(struct axienet_local *lp) { - u32 timeout; + u32 value; + int ret; /* Reset Axi DMA. This would reset Axi Ethernet core as well. The reset * process of Axi DMA takes a while to complete as all pending @@ -506,15 +507,13 @@ static int __axienet_device_reset(struct axienet_local *lp) * they both reset the entire DMA core, so only one needs to be used. */ axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, XAXIDMA_CR_RESET_MASK); - timeout = DELAY_OF_ONE_MILLISEC; - while (axienet_dma_in32(lp, XAXIDMA_TX_CR_OFFSET) & - XAXIDMA_CR_RESET_MASK) { - udelay(1); - if (--timeout == 0) { - netdev_err(lp->ndev, "%s: DMA reset timeout!\n", - __func__); - return -ETIMEDOUT; - } + ret = read_poll_timeout(axienet_dma_in32, value, + !(value & XAXIDMA_CR_RESET_MASK), + DELAY_OF_ONE_MILLISEC, 50000, false, lp, + XAXIDMA_TX_CR_OFFSET); + if (ret) { + dev_err(lp->dev, "%s: DMA reset timeout!\n", __func__); + return ret; } return 0; From b400c2f4f4c53c86594dd57098970d97d488bfde Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:25 -0600 Subject: [PATCH 130/149] net: axienet: Wait for PhyRstCmplt after core reset When resetting the device, wait for the PhyRstCmplt bit to be set in the interrupt status register before continuing initialization, to ensure that the core is actually ready. When using an external PHY, this also ensures we do not start trying to access the PHY while it is still in reset. The PHY reset is initiated by the core reset which is triggered just above, but remains asserted for 5ms after the core is reset according to the documentation. The MgtRdy bit could also be waited for, but unfortunately when using 7-series devices, the bit does not appear to work as documented (it seems to behave as some sort of link state indication and not just an indication the transceiver is ready) so it can't really be relied on for this purpose. Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Robert Hancock Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 9c5b24af61fa..3a2d7e8c3f66 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -516,6 +516,16 @@ static int __axienet_device_reset(struct axienet_local *lp) return ret; } + /* Wait for PhyRstCmplt bit to be set, indicating the PHY reset has finished */ + ret = read_poll_timeout(axienet_ior, value, + value & XAE_INT_PHYRSTCMPLT_MASK, + DELAY_OF_ONE_MILLISEC, 50000, false, lp, + XAE_IS_OFFSET); + if (ret) { + dev_err(lp->dev, "%s: timeout waiting for PhyRstCmplt\n", __func__); + return ret; + } + return 0; } From 04cc2da39698efd7eb2e30c112538922d26f848e Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:26 -0600 Subject: [PATCH 131/149] net: axienet: reset core on initialization prior to MDIO access In some cases where the Xilinx Ethernet core was used in 1000Base-X or SGMII modes, which use the internal PCS/PMA PHY, and the MGT transceiver clock source for the PCS was not running at the time the FPGA logic was loaded, the core would come up in a state where the PCS could not be found on the MDIO bus. To fix this, the Ethernet core (including the PCS) should be reset after enabling the clocks, prior to attempting to access the PCS using of_mdio_find_device. Fixes: 1a02556086fc (net: axienet: Properly handle PCS/PMA PHY for 1000BaseX mode) Signed-off-by: Robert Hancock Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 3a2d7e8c3f66..53ff38cbc37b 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2036,6 +2036,11 @@ static int axienet_probe(struct platform_device *pdev) lp->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD; lp->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD; + /* Reset core now that clocks are enabled, prior to accessing MDIO */ + ret = __axienet_device_reset(lp); + if (ret) + goto cleanup_clk; + lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (lp->phy_node) { ret = axienet_mdio_setup(lp); From 95978df6fa328df619c15312e65ece469c2be2d2 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:27 -0600 Subject: [PATCH 132/149] net: axienet: add missing memory barriers This driver was missing some required memory barriers: Use dma_rmb to ensure we see all updates to the descriptor after we see that an entry has been completed. Use wmb and rmb to avoid stale descriptor status between the TX path and TX complete IRQ path. Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 53ff38cbc37b..fb486a457c76 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -632,6 +632,8 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd, if (nr_bds == -1 && !(status & XAXIDMA_BD_STS_COMPLETE_MASK)) break; + /* Ensure we see complete descriptor update */ + dma_rmb(); phys = desc_get_phys_addr(lp, cur_p); dma_unmap_single(ndev->dev.parent, phys, (cur_p->cntrl & XAXIDMA_BD_CTRL_LENGTH_MASK), @@ -645,8 +647,10 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd, cur_p->app1 = 0; cur_p->app2 = 0; cur_p->app4 = 0; - cur_p->status = 0; cur_p->skb = NULL; + /* ensure our transmit path and device don't prematurely see status cleared */ + wmb(); + cur_p->status = 0; if (sizep) *sizep += status & XAXIDMA_BD_STS_ACTUAL_LEN_MASK; @@ -704,6 +708,9 @@ static inline int axienet_check_tx_bd_space(struct axienet_local *lp, int num_frag) { struct axidma_bd *cur_p; + + /* Ensure we see all descriptor updates from device or TX IRQ path */ + rmb(); cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num]; if (cur_p->status & XAXIDMA_BD_STS_ALL_MASK) return NETDEV_TX_BUSY; @@ -843,6 +850,8 @@ static void axienet_recv(struct net_device *ndev) tail_p = lp->rx_bd_p + sizeof(*lp->rx_bd_v) * lp->rx_bd_ci; + /* Ensure we see complete descriptor update */ + dma_rmb(); phys = desc_get_phys_addr(lp, cur_p); dma_unmap_single(ndev->dev.parent, phys, lp->max_frm_size, DMA_FROM_DEVICE); From 70f5817deddbc6ef3faa35841cab83c280cc653a Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:28 -0600 Subject: [PATCH 133/149] net: axienet: limit minimum TX ring size The driver will not work properly if the TX ring size is set to below MAX_SKB_FRAGS + 1 since it needs to hold at least one full maximally fragmented packet in the TX ring. Limit setting the ring size to below this value. Fixes: 8b09ca823ffb4 ("net: axienet: Make RX/TX ring sizes configurable") Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index fb486a457c76..3f92001bacaf 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -43,6 +43,7 @@ /* Descriptors defines for Tx and Rx DMA */ #define TX_BD_NUM_DEFAULT 64 #define RX_BD_NUM_DEFAULT 1024 +#define TX_BD_NUM_MIN (MAX_SKB_FRAGS + 1) #define TX_BD_NUM_MAX 4096 #define RX_BD_NUM_MAX 4096 @@ -1370,7 +1371,8 @@ axienet_ethtools_set_ringparam(struct net_device *ndev, if (ering->rx_pending > RX_BD_NUM_MAX || ering->rx_mini_pending || ering->rx_jumbo_pending || - ering->rx_pending > TX_BD_NUM_MAX) + ering->tx_pending < TX_BD_NUM_MIN || + ering->tx_pending > TX_BD_NUM_MAX) return -EINVAL; if (netif_running(ndev)) From 996defd7f8b5dafc1d480b7585c7c62437f80c3c Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:29 -0600 Subject: [PATCH 134/149] net: axienet: Fix TX ring slot available check The check for whether a TX ring slot was available was incorrect, since a slot which had been loaded with transmit data but the device had not started transmitting would be treated as available, potentially causing non-transmitted slots to be overwritten. The control field in the descriptor should be checked, rather than the status field (which may only be updated when the device completes the entry). Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 3f92001bacaf..85fe2b3bd37a 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -643,7 +643,6 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd, if (cur_p->skb && (status & XAXIDMA_BD_STS_COMPLETE_MASK)) dev_consume_skb_irq(cur_p->skb); - cur_p->cntrl = 0; cur_p->app0 = 0; cur_p->app1 = 0; cur_p->app2 = 0; @@ -651,6 +650,7 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd, cur_p->skb = NULL; /* ensure our transmit path and device don't prematurely see status cleared */ wmb(); + cur_p->cntrl = 0; cur_p->status = 0; if (sizep) @@ -713,7 +713,7 @@ static inline int axienet_check_tx_bd_space(struct axienet_local *lp, /* Ensure we see all descriptor updates from device or TX IRQ path */ rmb(); cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num]; - if (cur_p->status & XAXIDMA_BD_STS_ALL_MASK) + if (cur_p->cntrl) return NETDEV_TX_BUSY; return 0; } From aba57a823d2985a2cc8c74a2535f3a88e68d9424 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:30 -0600 Subject: [PATCH 135/149] net: axienet: fix number of TX ring slots for available check The check for the number of available TX ring slots was off by 1 since a slot is required for the skb header as well as each fragment. This could result in overwriting a TX ring slot that was still in use. Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 85fe2b3bd37a..8dc9e92e05d2 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -747,7 +747,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) num_frag = skb_shinfo(skb)->nr_frags; cur_p = &lp->tx_bd_v[lp->tx_bd_tail]; - if (axienet_check_tx_bd_space(lp, num_frag)) { + if (axienet_check_tx_bd_space(lp, num_frag + 1)) { if (netif_queue_stopped(ndev)) return NETDEV_TX_BUSY; @@ -757,7 +757,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) smp_mb(); /* Space might have just been freed - check again */ - if (axienet_check_tx_bd_space(lp, num_frag)) + if (axienet_check_tx_bd_space(lp, num_frag + 1)) return NETDEV_TX_BUSY; netif_wake_queue(ndev); From bb193e3db8b86a63f26889c99e14fd30c9ebd72a Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:31 -0600 Subject: [PATCH 136/149] net: axienet: fix for TX busy handling Network driver documentation indicates we should be avoiding returning NETDEV_TX_BUSY from ndo_start_xmit in normal cases, since it requires the packets to be requeued. Instead the queue should be stopped after a packet is added to the TX ring when there may not be enough room for an additional one. Also, when TX ring entries are completed, we should only wake the queue if we know there is room for another full maximally fragmented packet. Print a warning if there is insufficient space at the start of start_xmit, since this should no longer happen. Combined with increasing the default TX ring size (in a subsequent patch), this appears to recover the TX performance lost by previous changes to actually manage the TX ring state properly. Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- .../net/ethernet/xilinx/xilinx_axienet_main.c | 86 ++++++++++--------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 8dc9e92e05d2..b4f42ee9b75d 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -660,6 +660,32 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd, return i; } +/** + * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy + * @lp: Pointer to the axienet_local structure + * @num_frag: The number of BDs to check for + * + * Return: 0, on success + * NETDEV_TX_BUSY, if any of the descriptors are not free + * + * This function is invoked before BDs are allocated and transmission starts. + * This function returns 0 if a BD or group of BDs can be allocated for + * transmission. If the BD or any of the BDs are not free the function + * returns a busy status. This is invoked from axienet_start_xmit. + */ +static inline int axienet_check_tx_bd_space(struct axienet_local *lp, + int num_frag) +{ + struct axidma_bd *cur_p; + + /* Ensure we see all descriptor updates from device or TX IRQ path */ + rmb(); + cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num]; + if (cur_p->cntrl) + return NETDEV_TX_BUSY; + return 0; +} + /** * axienet_start_xmit_done - Invoked once a transmit is completed by the * Axi DMA Tx channel. @@ -689,33 +715,8 @@ static void axienet_start_xmit_done(struct net_device *ndev) /* Matches barrier in axienet_start_xmit */ smp_mb(); - netif_wake_queue(ndev); -} - -/** - * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy - * @lp: Pointer to the axienet_local structure - * @num_frag: The number of BDs to check for - * - * Return: 0, on success - * NETDEV_TX_BUSY, if any of the descriptors are not free - * - * This function is invoked before BDs are allocated and transmission starts. - * This function returns 0 if a BD or group of BDs can be allocated for - * transmission. If the BD or any of the BDs are not free the function - * returns a busy status. This is invoked from axienet_start_xmit. - */ -static inline int axienet_check_tx_bd_space(struct axienet_local *lp, - int num_frag) -{ - struct axidma_bd *cur_p; - - /* Ensure we see all descriptor updates from device or TX IRQ path */ - rmb(); - cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num]; - if (cur_p->cntrl) - return NETDEV_TX_BUSY; - return 0; + if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) + netif_wake_queue(ndev); } /** @@ -748,19 +749,14 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) cur_p = &lp->tx_bd_v[lp->tx_bd_tail]; if (axienet_check_tx_bd_space(lp, num_frag + 1)) { - if (netif_queue_stopped(ndev)) - return NETDEV_TX_BUSY; - + /* Should not happen as last start_xmit call should have + * checked for sufficient space and queue should only be + * woken when sufficient space is available. + */ netif_stop_queue(ndev); - - /* Matches barrier in axienet_start_xmit_done */ - smp_mb(); - - /* Space might have just been freed - check again */ - if (axienet_check_tx_bd_space(lp, num_frag + 1)) - return NETDEV_TX_BUSY; - - netif_wake_queue(ndev); + if (net_ratelimit()) + netdev_warn(ndev, "TX ring unexpectedly full\n"); + return NETDEV_TX_BUSY; } if (skb->ip_summed == CHECKSUM_PARTIAL) { @@ -821,6 +817,18 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (++lp->tx_bd_tail >= lp->tx_bd_num) lp->tx_bd_tail = 0; + /* Stop queue if next transmit may not have space */ + if (axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) { + netif_stop_queue(ndev); + + /* Matches barrier in axienet_start_xmit_done */ + smp_mb(); + + /* Space might have just been freed - check again */ + if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) + netif_wake_queue(ndev); + } + return NETDEV_TX_OK; } From 2d19c3fd80178160dd505ccd7fed1643831227a5 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 18 Jan 2022 15:41:32 -0600 Subject: [PATCH 137/149] net: axienet: increase default TX ring size to 128 With previous changes to make the driver handle the TX ring size more correctly, the default TX ring size of 64 appears to significantly bottleneck TX performance to around 600 Mbps on a 1 Gbps link on ZynqMP. Increasing this to 128 seems to bring performance up to near line rate and shouldn't cause excess bufferbloat (this driver doesn't yet support modern byte-based queue management). Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index b4f42ee9b75d..377c94ec2486 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -41,7 +41,7 @@ #include "xilinx_axienet.h" /* Descriptors defines for Tx and Rx DMA */ -#define TX_BD_NUM_DEFAULT 64 +#define TX_BD_NUM_DEFAULT 128 #define RX_BD_NUM_DEFAULT 1024 #define TX_BD_NUM_MIN (MAX_SKB_FRAGS + 1) #define TX_BD_NUM_MAX 4096 From dded08927ca3c31a5c37f8e7f95fe98770475dd4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 19 Jan 2022 08:48:16 +0100 Subject: [PATCH 138/149] nfc: llcp: fix NULL error pointer dereference on sendmsg() after failed bind() Syzbot detected a NULL pointer dereference of nfc_llcp_sock->dev pointer (which is a 'struct nfc_dev *') with calls to llcp_sock_sendmsg() after a failed llcp_sock_bind(). The message being sent is a SOCK_DGRAM. KASAN report: BUG: KASAN: null-ptr-deref in nfc_alloc_send_skb+0x2d/0xc0 Read of size 4 at addr 00000000000005c8 by task llcp_sock_nfc_a/899 CPU: 5 PID: 899 Comm: llcp_sock_nfc_a Not tainted 5.16.0-rc6-next-20211224-00001-gc6437fbf18b0 #125 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x59 ? nfc_alloc_send_skb+0x2d/0xc0 __kasan_report.cold+0x117/0x11c ? mark_lock+0x480/0x4f0 ? nfc_alloc_send_skb+0x2d/0xc0 kasan_report+0x38/0x50 nfc_alloc_send_skb+0x2d/0xc0 nfc_llcp_send_ui_frame+0x18c/0x2a0 ? nfc_llcp_send_i_frame+0x230/0x230 ? __local_bh_enable_ip+0x86/0xe0 ? llcp_sock_connect+0x470/0x470 ? llcp_sock_connect+0x470/0x470 sock_sendmsg+0x8e/0xa0 ____sys_sendmsg+0x253/0x3f0 ... The issue was visible only with multiple simultaneous calls to bind() and sendmsg(), which resulted in most of the bind() calls to fail. The bind() was failing on checking if there is available WKS/SDP/SAP (respective bit in 'struct nfc_llcp_local' fields). When there was no available WKS/SDP/SAP, the bind returned error but the sendmsg() to such socket was able to trigger mentioned NULL pointer dereference of nfc_llcp_sock->dev. The code looks simply racy and currently it protects several paths against race with checks for (!nfc_llcp_sock->local) which is NULL-ified in error paths of bind(). The llcp_sock_sendmsg() did not have such check but called function nfc_llcp_send_ui_frame() had, although not protected with lock_sock(). Therefore the race could look like (same socket is used all the time): CPU0 CPU1 ==== ==== llcp_sock_bind() - lock_sock() - success - release_sock() - return 0 llcp_sock_sendmsg() - lock_sock() - release_sock() llcp_sock_bind(), same socket - lock_sock() - error - nfc_llcp_send_ui_frame() - if (!llcp_sock->local) - llcp_sock->local = NULL - nfc_put_device(dev) - dereference llcp_sock->dev - release_sock() - return -ERRNO The nfc_llcp_send_ui_frame() checked llcp_sock->local outside of the lock, which is racy and ineffective check. Instead, its caller llcp_sock_sendmsg(), should perform the check inside lock_sock(). Reported-and-tested-by: syzbot+7f23bcddf626e0593a39@syzkaller.appspotmail.com Fixes: b874dec21d1c ("NFC: Implement LLCP connection less Tx path") Cc: Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 6cfd30fc0798..0b93a17b9f11 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -789,6 +789,11 @@ static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); + if (!llcp_sock->local) { + release_sock(sk); + return -ENODEV; + } + if (sk->sk_type == SOCK_DGRAM) { DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr, msg->msg_name); From 1771afd47430f5e95c9c3a2e3a8a63e67402d3fe Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Jan 2022 11:22:04 +0100 Subject: [PATCH 139/149] net: cpsw: avoid alignment faults by taking NET_IP_ALIGN into account Both versions of the CPSW driver declare a CPSW_HEADROOM_NA macro that takes NET_IP_ALIGN into account, but fail to use it appropriately when storing incoming packets in memory. This results in the IPv4 source and destination addresses to appear misaligned in memory, which causes aligment faults that need to be fixed up in software. So let's switch from CPSW_HEADROOM to CPSW_HEADROOM_NA where needed. This gets rid of any alignment faults on the RX path on a Beaglebone White. Fixes: 9ed4050c0d75 ("net: ethernet: ti: cpsw: add XDP support") Cc: Grygorii Strashko Cc: Ilias Apalodimas Signed-off-by: Ard Biesheuvel Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 6 +++--- drivers/net/ethernet/ti/cpsw_new.c | 6 +++--- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 33142d505fc8..03575c017500 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -349,7 +349,7 @@ static void cpsw_rx_handler(void *token, int len, int status) struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev); int pkt_size = cpsw->rx_packet_max; int ret = 0, port, ch = xmeta->ch; - int headroom = CPSW_HEADROOM; + int headroom = CPSW_HEADROOM_NA; struct net_device *ndev = xmeta->ndev; struct cpsw_priv *priv; struct page_pool *pool; @@ -392,7 +392,7 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { - int headroom = CPSW_HEADROOM, size = len; + int size = len; xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); if (status & CPDMA_RX_VLAN_ENCAP) { @@ -442,7 +442,7 @@ requeue: xmeta->ndev = ndev; xmeta->ch = ch; - dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM; + dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM_NA; ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma, pkt_size, 0); if (ret < 0) { diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 279e261e4720..bd4b1528cf99 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -283,7 +283,7 @@ static void cpsw_rx_handler(void *token, int len, int status) { struct page *new_page, *page = token; void *pa = page_address(page); - int headroom = CPSW_HEADROOM; + int headroom = CPSW_HEADROOM_NA; struct cpsw_meta_xdp *xmeta; struct cpsw_common *cpsw; struct net_device *ndev; @@ -336,7 +336,7 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { - int headroom = CPSW_HEADROOM, size = len; + int size = len; xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); if (status & CPDMA_RX_VLAN_ENCAP) { @@ -386,7 +386,7 @@ requeue: xmeta->ndev = ndev; xmeta->ch = ch; - dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM; + dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM_NA; ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma, pkt_size, 0); if (ret < 0) { diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 3537502e5e8b..ba220593e6db 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1122,7 +1122,7 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv) xmeta->ndev = priv->ndev; xmeta->ch = ch; - dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM; + dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM_NA; ret = cpdma_chan_idle_submit_mapped(cpsw->rxv[ch].ch, page, dma, cpsw->rx_packet_max, From f1131b9c23fb4a3540a774828ff49f421619f902 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Tue, 18 Jan 2022 13:08:12 +0200 Subject: [PATCH 140/149] net: phy: micrel: use kszphy_suspend()/kszphy_resume for irq aware devices On a setup with KSZ9131 and MACB drivers it happens on suspend path, from time to time, that the PHY interrupt arrives after PHY and MACB were suspended (PHY via genphy_suspend(), MACB via macb_suspend()). In this case the phy_read() at the beginning of kszphy_handle_interrupt() will fail (as MACB driver is suspended at this time) leading to phy_error() being called and a stack trace being displayed on console. To solve this .suspend/.resume functions for all KSZ devices implementing .handle_interrupt were replaced with kszphy_suspend()/kszphy_resume() which disable/enable interrupt before/after calling genphy_suspend()/genphy_resume(). The fix has been adapted for all KSZ devices which implements .handle_interrupt but it has been tested only on KSZ9131. Fixes: 59ca4e58b917 ("net: phy: micrel: implement generic .handle_interrupt() callback") Signed-off-by: Claudiu Beznea Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 4570cb9535b7..a7ebcdab415b 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1726,8 +1726,8 @@ static struct phy_driver ksphy_driver[] = { .config_init = kszphy_config_init, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ8021, .phy_id_mask = 0x00ffffff, @@ -1741,8 +1741,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ8031, .phy_id_mask = 0x00ffffff, @@ -1756,8 +1756,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ8041, .phy_id_mask = MICREL_PHY_ID_MASK, @@ -1788,8 +1788,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, }, { .name = "Micrel KSZ8051", /* PHY_BASIC_FEATURES */ @@ -1802,8 +1802,8 @@ static struct phy_driver ksphy_driver[] = { .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, .match_phy_device = ksz8051_match_phy_device, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ8001, .name = "Micrel KSZ8001 or KS8721", @@ -1817,8 +1817,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ8081, .name = "Micrel KSZ8081 or KSZ8091", @@ -1848,8 +1848,8 @@ static struct phy_driver ksphy_driver[] = { .config_init = ksz8061_config_init, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ9021, .phy_id_mask = 0x000ffffe, @@ -1864,8 +1864,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = genphy_resume, + .suspend = kszphy_suspend, + .resume = kszphy_resume, .read_mmd = genphy_read_mmd_unsupported, .write_mmd = genphy_write_mmd_unsupported, }, { @@ -1883,7 +1883,7 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, + .suspend = kszphy_suspend, .resume = kszphy_resume, }, { .phy_id = PHY_ID_LAN8814, @@ -1928,7 +1928,7 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, + .suspend = kszphy_suspend, .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ8873MLL, From baa59504c1cd0cca7d41954a45ee0b3dc78e41a0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 18 Jan 2022 05:41:10 -0800 Subject: [PATCH 141/149] net: mscc: ocelot: fix using match before it is set Clang static analysis reports this issue ocelot_flower.c:563:8: warning: 1st function call argument is an uninitialized value !is_zero_ether_addr(match.mask->dst)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The variable match is used before it is set. So move the block. Fixes: 75944fda1dfe ("net: mscc: ocelot: offload ingress skbedit and vlan actions to VCAP IS1") Signed-off-by: Tom Rix Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_flower.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 4a0fda22d343..949858891973 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -559,13 +559,6 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, return -EOPNOTSUPP; } - if (filter->block_id == VCAP_IS1 && - !is_zero_ether_addr(match.mask->dst)) { - NL_SET_ERR_MSG_MOD(extack, - "Key type S1_NORMAL cannot match on destination MAC"); - return -EOPNOTSUPP; - } - /* The hw support mac matches only for MAC_ETYPE key, * therefore if other matches(port, tcp flags, etc) are added * then just bail out @@ -580,6 +573,14 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, return -EOPNOTSUPP; flow_rule_match_eth_addrs(rule, &match); + + if (filter->block_id == VCAP_IS1 && + !is_zero_ether_addr(match.mask->dst)) { + NL_SET_ERR_MSG_MOD(extack, + "Key type S1_NORMAL cannot match on destination MAC"); + return -EOPNOTSUPP; + } + filter->key_type = OCELOT_VCAP_KEY_ETYPE; ether_addr_copy(filter->key.etype.dmac.value, match.key->dst); From 6198c722019774d38018457a8bfb9ba3ed8c931e Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 18 Jan 2022 22:50:50 +0100 Subject: [PATCH 142/149] net/fsl: xgmac_mdio: Add workaround for erratum A-009885 Once an MDIO read transaction is initiated, we must read back the data register within 16 MDC cycles after the transaction completes. Outside of this window, reads may return corrupt data. Therefore, disable local interrupts in the critical section, to maximize the probability that we can satisfy this requirement. Fixes: d55ad2967d89 ("powerpc/mpc85xx: Create dts components for the FSL QorIQ DPAA FMan") Signed-off-by: Tobias Waldekranz Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/xgmac_mdio.c | 25 ++++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/xgmac_mdio.c b/drivers/net/ethernet/freescale/xgmac_mdio.c index 5b8b9bcf41a2..bf566ac3195b 100644 --- a/drivers/net/ethernet/freescale/xgmac_mdio.c +++ b/drivers/net/ethernet/freescale/xgmac_mdio.c @@ -51,6 +51,7 @@ struct tgec_mdio_controller { struct mdio_fsl_priv { struct tgec_mdio_controller __iomem *mdio_base; bool is_little_endian; + bool has_a009885; bool has_a011043; }; @@ -186,10 +187,10 @@ static int xgmac_mdio_read(struct mii_bus *bus, int phy_id, int regnum) { struct mdio_fsl_priv *priv = (struct mdio_fsl_priv *)bus->priv; struct tgec_mdio_controller __iomem *regs = priv->mdio_base; + unsigned long flags; uint16_t dev_addr; uint32_t mdio_stat; uint32_t mdio_ctl; - uint16_t value; int ret; bool endian = priv->is_little_endian; @@ -221,12 +222,18 @@ static int xgmac_mdio_read(struct mii_bus *bus, int phy_id, int regnum) return ret; } + if (priv->has_a009885) + /* Once the operation completes, i.e. MDIO_STAT_BSY clears, we + * must read back the data register within 16 MDC cycles. + */ + local_irq_save(flags); + /* Initiate the read */ xgmac_write32(mdio_ctl | MDIO_CTL_READ, ®s->mdio_ctl, endian); ret = xgmac_wait_until_done(&bus->dev, regs, endian); if (ret) - return ret; + goto irq_restore; /* Return all Fs if nothing was there */ if ((xgmac_read32(®s->mdio_stat, endian) & MDIO_STAT_RD_ER) && @@ -234,13 +241,17 @@ static int xgmac_mdio_read(struct mii_bus *bus, int phy_id, int regnum) dev_dbg(&bus->dev, "Error while reading PHY%d reg at %d.%hhu\n", phy_id, dev_addr, regnum); - return 0xffff; + ret = 0xffff; + } else { + ret = xgmac_read32(®s->mdio_data, endian) & 0xffff; + dev_dbg(&bus->dev, "read %04x\n", ret); } - value = xgmac_read32(®s->mdio_data, endian) & 0xffff; - dev_dbg(&bus->dev, "read %04x\n", value); +irq_restore: + if (priv->has_a009885) + local_irq_restore(flags); - return value; + return ret; } static int xgmac_mdio_probe(struct platform_device *pdev) @@ -287,6 +298,8 @@ static int xgmac_mdio_probe(struct platform_device *pdev) priv->is_little_endian = device_property_read_bool(&pdev->dev, "little-endian"); + priv->has_a009885 = device_property_read_bool(&pdev->dev, + "fsl,erratum-a009885"); priv->has_a011043 = device_property_read_bool(&pdev->dev, "fsl,erratum-a011043"); From ea11fc509ff26039046744d1bdfdf2747dda558c Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 18 Jan 2022 22:50:51 +0100 Subject: [PATCH 143/149] dt-bindings: net: Document fsl,erratum-a009885 Update FMan binding documentation with the newly added workaround for erratum A-009885. Signed-off-by: Tobias Waldekranz Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/fsl-fman.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/net/fsl-fman.txt b/Documentation/devicetree/bindings/net/fsl-fman.txt index c00fb0d22c7b..020337f3c05f 100644 --- a/Documentation/devicetree/bindings/net/fsl-fman.txt +++ b/Documentation/devicetree/bindings/net/fsl-fman.txt @@ -410,6 +410,15 @@ PROPERTIES The settings and programming routines for internal/external MDIO are different. Must be included for internal MDIO. +- fsl,erratum-a009885 + Usage: optional + Value type: + Definition: Indicates the presence of the A009885 + erratum describing that the contents of MDIO_DATA may + become corrupt unless it is read within 16 MDC cycles + of MDIO_CFG[BSY] being cleared, when performing an + MDIO read operation. + - fsl,erratum-a011043 Usage: optional Value type: From 0d375d610fa96524e2ee2b46830a46a7bfa92a9f Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 18 Jan 2022 22:50:52 +0100 Subject: [PATCH 144/149] powerpc/fsl/dts: Enable WA for erratum A-009885 on fman3l MDIO buses This block is used in (at least) T1024 and T1040, including their variants like T1023 etc. Fixes: d55ad2967d89 ("powerpc/mpc85xx: Create dts components for the FSL QorIQ DPAA FMan") Signed-off-by: Tobias Waldekranz Signed-off-by: Jakub Kicinski --- arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi index c90702b04a53..48e5cd61599c 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi @@ -79,6 +79,7 @@ fman0: fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xfc000 0x1000>; + fsl,erratum-a009885; }; xmdio0: mdio@fd000 { @@ -86,6 +87,7 @@ fman0: fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xfd000 0x1000>; + fsl,erratum-a009885; }; }; From 3f7c239c7844d2044ed399399d97a5f1c6008e1b Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 18 Jan 2022 22:50:53 +0100 Subject: [PATCH 145/149] net/fsl: xgmac_mdio: Fix incorrect iounmap when removing module As reported by sparse: In the remove path, the driver would attempt to unmap its own priv pointer - instead of the io memory that it mapped in probe. Fixes: 9f35a7342cff ("net/fsl: introduce Freescale 10G MDIO driver") Signed-off-by: Tobias Waldekranz Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/xgmac_mdio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/xgmac_mdio.c b/drivers/net/ethernet/freescale/xgmac_mdio.c index bf566ac3195b..266e562bd67a 100644 --- a/drivers/net/ethernet/freescale/xgmac_mdio.c +++ b/drivers/net/ethernet/freescale/xgmac_mdio.c @@ -331,9 +331,10 @@ err_ioremap: static int xgmac_mdio_remove(struct platform_device *pdev) { struct mii_bus *bus = platform_get_drvdata(pdev); + struct mdio_fsl_priv *priv = bus->priv; mdiobus_unregister(bus); - iounmap(bus->priv); + iounmap(priv->mdio_base); mdiobus_free(bus); return 0; From d07418afea8f1d9896aaf9dc5ae47ac4f45b220c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jan 2022 02:04:12 -0800 Subject: [PATCH 146/149] ipv4: avoid quadratic behavior in netns dismantle net/ipv4/fib_semantics.c uses an hash table of 256 slots, keyed by device ifindexes: fib_info_devhash[DEVINDEX_HASHSIZE] Problem is that with network namespaces, devices tend to use the same ifindex. lo device for instance has a fixed ifindex of one, for all network namespaces. This means that hosts with thousands of netns spend a lot of time looking at some hash buckets with thousands of elements, notably at netns dismantle. Simply add a per netns perturbation (net_hash_mix()) to spread elements more uniformely. Also change fib_devindex_hashfn() to use more entropy. Fixes: aa79e66eee5d ("net: Make ifindex generation per-net namespace") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 45619c005b8d..9813949da104 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -319,11 +320,15 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) static inline unsigned int fib_devindex_hashfn(unsigned int val) { - unsigned int mask = DEVINDEX_HASHSIZE - 1; + return hash_32(val, DEVINDEX_HASHBITS); +} - return (val ^ - (val >> DEVINDEX_HASHBITS) ^ - (val >> (DEVINDEX_HASHBITS * 2))) & mask; +static struct hlist_head * +fib_info_devhash_bucket(const struct net_device *dev) +{ + u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex; + + return &fib_info_devhash[fib_devindex_hashfn(val)]; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, @@ -433,12 +438,11 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; struct fib_nh *nh; - unsigned int hash; spin_lock(&fib_info_lock); - hash = fib_devindex_hashfn(dev->ifindex); - head = &fib_info_devhash[hash]; + head = fib_info_devhash_bucket(dev); + hlist_for_each_entry(nh, head, nh_hash) { if (nh->fib_nh_dev == dev && nh->fib_nh_gw4 == gw && @@ -1609,12 +1613,10 @@ link_it: } else { change_nexthops(fi) { struct hlist_head *head; - unsigned int hash; if (!nexthop_nh->fib_nh_dev) continue; - hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); - head = &fib_info_devhash[hash]; + head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev); hlist_add_head(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } @@ -1966,8 +1968,7 @@ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { - unsigned int hash = fib_devindex_hashfn(dev->ifindex); - struct hlist_head *head = &fib_info_devhash[hash]; + struct hlist_head *head = fib_info_devhash_bucket(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { @@ -1986,12 +1987,11 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { - int ret = 0; - int scope = RT_SCOPE_NOWHERE; + struct hlist_head *head = fib_info_devhash_bucket(dev); struct fib_info *prev_fi = NULL; - unsigned int hash = fib_devindex_hashfn(dev->ifindex); - struct hlist_head *head = &fib_info_devhash[hash]; + int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; + int ret = 0; if (force) scope = -1; @@ -2136,7 +2136,6 @@ out: int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; - unsigned int hash; struct hlist_head *head; struct fib_nh *nh; int ret; @@ -2152,8 +2151,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) } prev_fi = NULL; - hash = fib_devindex_hashfn(dev->ifindex); - head = &fib_info_devhash[hash]; + head = fib_info_devhash_bucket(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { From 79eb15da3cd68f04b06edf73f9bbafa70a06871f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jan 2022 02:04:13 -0800 Subject: [PATCH 147/149] ipv4: add net_hash_mix() dispersion to fib_info_laddrhash keys net/ipv4/fib_semantics.c uses a hash table (fib_info_laddrhash) in which fib_sync_down_addr() can locate fib_info based on IPv4 local address. This hash table is resized based on total number of hashed fib_info, but the hash function is only using the local address. For hosts having many active network namespaces, all fib_info for loopback devices (IPv4 address 127.0.0.1) are hashed into a single bucket, making netns dismantles very slow. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9813949da104..b4589861b84c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; static unsigned int fib_info_hash_size; +static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; #define DEVINDEX_HASHBITS 8 @@ -1247,13 +1248,13 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static inline unsigned int fib_laddr_hashfn(__be32 val) +static struct hlist_head * +fib_info_laddrhash_bucket(const struct net *net, __be32 val) { - unsigned int mask = (fib_info_hash_size - 1); + u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, + fib_info_hash_bits); - return ((__force u32)val ^ - ((__force u32)val >> 7) ^ - ((__force u32)val >> 14)) & mask; + return &fib_info_laddrhash[slot]; } static struct hlist_head *fib_info_hash_alloc(int bytes) @@ -1289,6 +1290,7 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; fib_info_hash_size = new_size; + fib_info_hash_bits = ilog2(new_size); for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; @@ -1306,21 +1308,20 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, } fib_info_hash = new_info_hash; + fib_info_laddrhash = new_laddrhash; for (i = 0; i < old_size; i++) { - struct hlist_head *lhead = &fib_info_laddrhash[i]; + struct hlist_head *lhead = &old_laddrhash[i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { struct hlist_head *ldest; - unsigned int new_hash; - new_hash = fib_laddr_hashfn(fi->fib_prefsrc); - ldest = &new_laddrhash[new_hash]; + ldest = fib_info_laddrhash_bucket(fi->fib_net, + fi->fib_prefsrc); hlist_add_head(&fi->fib_lhash, ldest); } } - fib_info_laddrhash = new_laddrhash; spin_unlock_bh(&fib_info_lock); @@ -1605,7 +1606,7 @@ link_it: if (fi->fib_prefsrc) { struct hlist_head *head; - head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; + head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc); hlist_add_head(&fi->fib_lhash, head); } if (fi->nh) { @@ -1877,16 +1878,16 @@ nla_put_failure: */ int fib_sync_down_addr(struct net_device *dev, __be32 local) { - int ret = 0; - unsigned int hash = fib_laddr_hashfn(local); - struct hlist_head *head = &fib_info_laddrhash[hash]; int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); + struct hlist_head *head; struct fib_info *fi; + int ret = 0; if (!fib_info_laddrhash || local == 0) return 0; + head = fib_info_laddrhash_bucket(net, local); hlist_for_each_entry(fi, head, fib_lhash) { if (!net_eq(fi->fib_net, net) || fi->fib_tb_id != tb_id) From 7c1cf55577782725ea2bc24687767c8fe8e57486 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 19 Jan 2022 09:04:31 +0800 Subject: [PATCH 148/149] gpio: idt3243x: Fix an ignored error return from platform_get_irq() The return from the call to platform_get_irq() is int, it can be a negative error code, however this is being assigned to an unsigned int variable 'parent_irq', so making 'parent_irq' an int. Eliminate the following coccicheck warning: ./drivers/gpio/gpio-idt3243x.c:167:6-16: WARNING: Unsigned expression compared with zero: parent_irq < 0 Reported-by: Abaci Robot Fixes: 30fee1d7462a ("gpio: idt3243x: Fix IRQ check in idt_gpio_probe") Signed-off-by: Yang Li Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-idt3243x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-idt3243x.c b/drivers/gpio/gpio-idt3243x.c index 08493b05be2d..52b8b72ded77 100644 --- a/drivers/gpio/gpio-idt3243x.c +++ b/drivers/gpio/gpio-idt3243x.c @@ -132,7 +132,7 @@ static int idt_gpio_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; struct idt_gpio_ctrl *ctrl; - unsigned int parent_irq; + int parent_irq; int ngpios; int ret; From 9f51ce0b9e73f83bab2442b36d5e247a81bd3401 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 19 Jan 2022 09:04:32 +0800 Subject: [PATCH 149/149] gpio: mpc8xxx: Fix an ignored error return from platform_get_irq() The return from the call to platform_get_irq() is int, it can be a negative error code, however this is being assigned to an unsigned int variable 'irqn', so making 'irqn' an int. Eliminate the following coccicheck warning: ./drivers/gpio/gpio-mpc8xxx.c:391:5-21: WARNING: Unsigned expression compared with zero: mpc8xxx_gc -> irqn < 0 Reported-by: Abaci Robot Fixes: 0b39536cc699 ("gpio: mpc8xxx: Fix IRQ check in mpc8xxx_probe") Signed-off-by: Yang Li Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mpc8xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 01634c8d27b3..a964e25ea620 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -47,7 +47,7 @@ struct mpc8xxx_gpio_chip { unsigned offset, int value); struct irq_domain *irq; - unsigned int irqn; + int irqn; }; /*