From 5a24be76af79d9aa4d84cef2ca5a1147d35be019 Mon Sep 17 00:00:00 2001 From: Tom Chung Date: Mon, 29 May 2023 18:00:09 +0800 Subject: [PATCH 01/98] drm/amd/display: fix the system hang while disable PSR [ Upstream commit ea2062dd1f0384ae1b136d333ee4ced15bedae38 ] [Why] When the PSR enabled. If you try to adjust the timing parameters, it may cause system hang. Because the timing mismatch with the DMCUB settings. [How] Disable the PSR before adjusting timing parameters. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Acked-by: Stylon Wang Signed-off-by: Tom Chung Reviewed-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 938aa11acb2d..5acd088f34f3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9218,6 +9218,12 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, if (acrtc_state->abm_level != dm_old_crtc_state->abm_level) bundle->stream_update.abm_level = &acrtc_state->abm_level; + mutex_lock(&dm->dc_lock); + if ((acrtc_state->update_type > UPDATE_TYPE_FAST) && + acrtc_state->stream->link->psr_settings.psr_allow_active) + amdgpu_dm_psr_disable(acrtc_state->stream); + mutex_unlock(&dm->dc_lock); + /* * If FreeSync state on the stream has changed then we need to * re-adjust the min/max bounds now that DC doesn't handle this @@ -9231,9 +9237,6 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, spin_unlock_irqrestore(&pcrtc->dev->event_lock, flags); } mutex_lock(&dm->dc_lock); - if ((acrtc_state->update_type > UPDATE_TYPE_FAST) && - acrtc_state->stream->link->psr_settings.psr_allow_active) - amdgpu_dm_psr_disable(acrtc_state->stream); dc_commit_updates_for_stream(dm->dc, bundle->surface_updates, From 39e787253720f8983cb5e0a370f0e3fe6d1dbac1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 23 Nov 2022 14:25:57 -0500 Subject: [PATCH 02/98] tracing: Add tracing_reset_all_online_cpus_unlocked() function commit e18eb8783ec4949adebc7d7b0fdb65f65bfeefd9 upstream. Currently the tracing_reset_all_online_cpus() requires the trace_types_lock held. But only one caller of this function actually has that lock held before calling it, and the other just takes the lock so that it can call it. More users of this function is needed where the lock is not held. Add a tracing_reset_all_online_cpus_unlocked() function for the one use case that calls it without being held, and also add a lockdep_assert to make sure it is held when called. Then have tracing_reset_all_online_cpus() take the lock internally, such that callers do not need to worry about taking it. Link: https://lkml.kernel.org/r/20221123192741.658273220@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Zheng Yejian Signed-off-by: Steven Rostedt (Google) Signed-off-by: Zheng Yejian Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 11 ++++++++++- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 2 +- kernel/trace/trace_events_synth.c | 2 -- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8b5abd6e36c..0202f23ae960 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2175,10 +2175,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf) } /* Must have trace_types_lock held */ -void tracing_reset_all_online_cpus(void) +void tracing_reset_all_online_cpus_unlocked(void) { struct trace_array *tr; + lockdep_assert_held(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->clear_trace) continue; @@ -2190,6 +2192,13 @@ void tracing_reset_all_online_cpus(void) } } +void tracing_reset_all_online_cpus(void) +{ + mutex_lock(&trace_types_lock); + tracing_reset_all_online_cpus_unlocked(); + mutex_unlock(&trace_types_lock); +} + /* * The tgid_map array maps from pid to tgid; i.e. the value stored at index i * is the tgid last observed corresponding to pid=i. diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 66b6c8395fbc..2c3d9b6ce148 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -580,6 +580,7 @@ int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); +void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1aadc9a6487b..160298d285c0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2974,7 +2974,7 @@ static void trace_module_remove_events(struct module *mod) * over from this module may be passed to the new module events and * unexpected results may occur. */ - tracing_reset_all_online_cpus(); + tracing_reset_all_online_cpus_unlocked(); } static int trace_module_notify(struct notifier_block *self, diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 2fdf3fd591e1..08c7df42ade7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1416,7 +1416,6 @@ int synth_event_delete(const char *event_name) mutex_unlock(&event_mutex); if (mod) { - mutex_lock(&trace_types_lock); /* * It is safest to reset the ring buffer if the module * being unloaded registered any events that were @@ -1428,7 +1427,6 @@ int synth_event_delete(const char *event_name) * occur. */ tracing_reset_all_online_cpus(); - mutex_unlock(&trace_types_lock); } return ret; From b5f0e898f674713397e096afc933ddafbe7be963 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Thu, 24 Nov 2022 14:55:35 +0100 Subject: [PATCH 03/98] tpm, tpm_tis: Claim locality in interrupt handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0e069265bce5a40c4eee52e2364bbbd4dabee94a upstream. Writing the TPM_INT_STATUS register in the interrupt handler to clear the interrupts only has effect if a locality is held. Since this is not guaranteed at the time the interrupt is fired, claim the locality explicitly in the handler. Signed-off-by: Lino Sanfilippo Tested-by: Michael Niewöhner Tested-by: Jarkko Sakkinen Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Greg Kroah-Hartman --- drivers/char/tpm/tpm_tis_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 512c867495ea..365761055df3 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -731,7 +731,9 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id) wake_up_interruptible(&priv->int_queue); /* Clear interrupts handled with TPM_EOI */ + tpm_tis_request_locality(chip, 0); rc = tpm_tis_write32(priv, TPM_INT_STATUS(priv->locality), interrupt); + tpm_tis_relinquish_locality(chip, 0); if (rc < 0) return IRQ_NONE; From fb7c68bbccad1a92d9078a5b2160d448da542b7a Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 20 Jun 2022 16:37:07 -0400 Subject: [PATCH 04/98] drm/amd/display: Add minimal pipe split transition state commit 97ca308925a50aa80711ccfaf814fa3898374862 upstream. [WHY?] When adding/removing a plane to some configurations, unsupported pipe programming can occur when moving to a new plane. Such cases include pipe split on multi-display, with MPO, and/or ODM. [HOW?] Add a safe transistion state that minimizes pipe usage before programming new configuration. When adding a plane, the current state has the least pipes required so it is applied without splitting. This must be applied prior to updating the plane_state for seamless transition. When removing a plane, the new state has the least pieps required so it is applied without splitting. Signed-off-by: Rodrigo Siqueira Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/dc/core/dc.c | 277 +++++++++++++++++++++ drivers/gpu/drm/amd/display/dc/dc_stream.h | 20 +- 2 files changed, 296 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index eca882438f6e..220a26e45a28 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2534,6 +2534,137 @@ static void copy_stream_update_to_stream(struct dc *dc, } } +void dc_reset_state(struct dc *dc, struct dc_state *context) +{ + dc_resource_state_destruct(context); + + /* clear the structure, but don't reset the reference count */ + memset(context, 0, offsetof(struct dc_state, refcount)); + + init_state(dc, context); +} + +static bool update_planes_and_stream_state(struct dc *dc, + struct dc_surface_update *srf_updates, int surface_count, + struct dc_stream_state *stream, + struct dc_stream_update *stream_update, + enum surface_update_type *new_update_type, + struct dc_state **new_context) +{ + struct dc_state *context; + int i, j; + enum surface_update_type update_type; + const struct dc_stream_status *stream_status; + struct dc_context *dc_ctx = dc->ctx; + + stream_status = dc_stream_get_status(stream); + + if (!stream_status) { + if (surface_count) /* Only an error condition if surf_count non-zero*/ + ASSERT(false); + + return false; /* Cannot commit surface to stream that is not committed */ + } + + context = dc->current_state; + + update_type = dc_check_update_surfaces_for_stream( + dc, srf_updates, surface_count, stream_update, stream_status); + + /* update current stream with the new updates */ + copy_stream_update_to_stream(dc, context, stream, stream_update); + + /* do not perform surface update if surface has invalid dimensions + * (all zero) and no scaling_info is provided + */ + if (surface_count > 0) { + for (i = 0; i < surface_count; i++) { + if ((srf_updates[i].surface->src_rect.width == 0 || + srf_updates[i].surface->src_rect.height == 0 || + srf_updates[i].surface->dst_rect.width == 0 || + srf_updates[i].surface->dst_rect.height == 0) && + (!srf_updates[i].scaling_info || + srf_updates[i].scaling_info->src_rect.width == 0 || + srf_updates[i].scaling_info->src_rect.height == 0 || + srf_updates[i].scaling_info->dst_rect.width == 0 || + srf_updates[i].scaling_info->dst_rect.height == 0)) { + DC_ERROR("Invalid src/dst rects in surface update!\n"); + return false; + } + } + } + + if (update_type >= update_surface_trace_level) + update_surface_trace(dc, srf_updates, surface_count); + + if (update_type >= UPDATE_TYPE_FULL) { + struct dc_plane_state *new_planes[MAX_SURFACES] = {0}; + + for (i = 0; i < surface_count; i++) + new_planes[i] = srf_updates[i].surface; + + /* initialize scratch memory for building context */ + context = dc_create_state(dc); + if (context == NULL) { + DC_ERROR("Failed to allocate new validate context!\n"); + return false; + } + + dc_resource_state_copy_construct( + dc->current_state, context); + + /*remove old surfaces from context */ + if (!dc_rem_all_planes_for_stream(dc, stream, context)) { + + BREAK_TO_DEBUGGER(); + goto fail; + } + + /* add surface to context */ + if (!dc_add_all_planes_for_stream(dc, stream, new_planes, surface_count, context)) { + + BREAK_TO_DEBUGGER(); + goto fail; + } + } + + /* save update parameters into surface */ + for (i = 0; i < surface_count; i++) { + struct dc_plane_state *surface = srf_updates[i].surface; + + copy_surface_update_to_plane(surface, &srf_updates[i]); + + if (update_type >= UPDATE_TYPE_MED) { + for (j = 0; j < dc->res_pool->pipe_count; j++) { + struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[j]; + + if (pipe_ctx->plane_state != surface) + continue; + + resource_build_scaling_params(pipe_ctx); + } + } + } + + if (update_type == UPDATE_TYPE_FULL) { + if (!dc->res_pool->funcs->validate_bandwidth(dc, context, false)) { + BREAK_TO_DEBUGGER(); + goto fail; + } + } + + *new_context = context; + *new_update_type = update_type; + + return true; + +fail: + dc_release_state(context); + + return false; + +} + static void commit_planes_do_stream_update(struct dc *dc, struct dc_stream_state *stream, struct dc_stream_update *stream_update, @@ -2931,6 +3062,152 @@ static void commit_planes_for_stream(struct dc *dc, } } +static bool commit_minimal_transition_state(struct dc *dc, + struct dc_state *transition_base_context) +{ + struct dc_state *transition_context = dc_create_state(dc); + enum pipe_split_policy tmp_policy; + enum dc_status ret = DC_ERROR_UNEXPECTED; + unsigned int i, j; + + if (!transition_context) + return false; + + tmp_policy = dc->debug.pipe_split_policy; + dc->debug.pipe_split_policy = MPC_SPLIT_AVOID; + + dc_resource_state_copy_construct(transition_base_context, transition_context); + + //commit minimal state + if (dc->res_pool->funcs->validate_bandwidth(dc, transition_context, false)) { + for (i = 0; i < transition_context->stream_count; i++) { + struct dc_stream_status *stream_status = &transition_context->stream_status[i]; + + for (j = 0; j < stream_status->plane_count; j++) { + struct dc_plane_state *plane_state = stream_status->plane_states[j]; + + /* force vsync flip when reconfiguring pipes to prevent underflow + * and corruption + */ + plane_state->flip_immediate = false; + } + } + + ret = dc_commit_state_no_check(dc, transition_context); + } + + //always release as dc_commit_state_no_check retains in good case + dc_release_state(transition_context); + + //restore previous pipe split policy + dc->debug.pipe_split_policy = tmp_policy; + + if (ret != DC_OK) { + //this should never happen + BREAK_TO_DEBUGGER(); + return false; + } + + //force full surface update + for (i = 0; i < dc->current_state->stream_count; i++) { + for (j = 0; j < dc->current_state->stream_status[i].plane_count; j++) { + dc->current_state->stream_status[i].plane_states[j]->update_flags.raw = 0xFFFFFFFF; + } + } + + return true; +} + +bool dc_update_planes_and_stream(struct dc *dc, + struct dc_surface_update *srf_updates, int surface_count, + struct dc_stream_state *stream, + struct dc_stream_update *stream_update) +{ + struct dc_state *context; + enum surface_update_type update_type; + int i; + + /* In cases where MPO and split or ODM are used transitions can + * cause underflow. Apply stream configuration with minimal pipe + * split first to avoid unsupported transitions for active pipes. + */ + bool force_minimal_pipe_splitting = false; + bool is_plane_addition = false; + + struct dc_stream_status *cur_stream_status = stream_get_status(dc->current_state, stream); + + if (cur_stream_status && + dc->current_state->stream_count > 0 && + dc->debug.pipe_split_policy != MPC_SPLIT_AVOID) { + /* determine if minimal transition is required */ + if (cur_stream_status->plane_count > surface_count) { + force_minimal_pipe_splitting = true; + } else if (cur_stream_status->plane_count < surface_count) { + force_minimal_pipe_splitting = true; + is_plane_addition = true; + } + } + + /* on plane addition, minimal state is the current one */ + if (force_minimal_pipe_splitting && is_plane_addition && + !commit_minimal_transition_state(dc, dc->current_state)) + return false; + + if (!update_planes_and_stream_state( + dc, + srf_updates, + surface_count, + stream, + stream_update, + &update_type, + &context)) + return false; + + /* on plane addition, minimal state is the new one */ + if (force_minimal_pipe_splitting && !is_plane_addition) { + if (!commit_minimal_transition_state(dc, context)) { + dc_release_state(context); + return false; + } + + update_type = UPDATE_TYPE_FULL; + } + + commit_planes_for_stream( + dc, + srf_updates, + surface_count, + stream, + stream_update, + update_type, + context); + + if (dc->current_state != context) { + + /* Since memory free requires elevated IRQL, an interrupt + * request is generated by mem free. If this happens + * between freeing and reassigning the context, our vsync + * interrupt will call into dc and cause a memory + * corruption BSOD. Hence, we first reassign the context, + * then free the old context. + */ + + struct dc_state *old = dc->current_state; + + dc->current_state = context; + dc_release_state(old); + + // clear any forced full updates + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i]; + + if (pipe_ctx->plane_state && pipe_ctx->stream == stream) + pipe_ctx->plane_state->force_full_update = false; + } + } + return true; +} + void dc_commit_updates_for_stream(struct dc *dc, struct dc_surface_update *srf_updates, int surface_count, diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h index 7644f0e747d3..3e606faff58f 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_stream.h +++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h @@ -287,6 +287,25 @@ bool dc_is_stream_unchanged( bool dc_is_stream_scaling_unchanged( struct dc_stream_state *old_stream, struct dc_stream_state *stream); +/* + * Setup stream attributes if no stream updates are provided + * there will be no impact on the stream parameters + * + * Set up surface attributes and associate to a stream + * The surfaces parameter is an absolute set of all surface active for the stream. + * If no surfaces are provided, the stream will be blanked; no memory read. + * Any flip related attribute changes must be done through this interface. + * + * After this call: + * Surfaces attributes are programmed and configured to be composed into stream. + * This does not trigger a flip. No surface address is programmed. + * + */ +bool dc_update_planes_and_stream(struct dc *dc, + struct dc_surface_update *surface_updates, int surface_count, + struct dc_stream_state *dc_stream, + struct dc_stream_update *stream_update); + /* * Set up surface attributes and associate to a stream * The surfaces parameter is an absolute set of all surface active for the stream. @@ -297,7 +316,6 @@ bool dc_is_stream_scaling_unchanged( * Surfaces attributes are programmed and configured to be composed into stream. * This does not trigger a flip. No surface address is programmed. */ - void dc_commit_updates_for_stream(struct dc *dc, struct dc_surface_update *srf_updates, int surface_count, From eea850c025b5c86a16533056ce871a863b5d726c Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 6 Oct 2022 16:40:55 -0400 Subject: [PATCH 05/98] drm/amd/display: Use dc_update_planes_and_stream commit f7511289821ffccc07579406d6ab520aa11049f5 upstream. [Why & How] The old dc_commit_updates_for_stream lacks manipulation for many corner cases where the DC feature requires special attention; as a result, it starts to show its limitation (e.g., the SubVP feature is not supported by it, among other cases). To modernize and unify our internal API, this commit replaces the old dc_commit_updates_for_stream with dc_update_planes_and_stream, which has more features. Reviewed-by: Harry Wentland Acked-by: Qingqing Zhuo Signed-off-by: Rodrigo Siqueira Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5acd088f34f3..6d963761c2d1 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2550,10 +2550,12 @@ static void dm_gpureset_commit_state(struct dc_state *dc_state, bundle->surface_updates[m].surface->force_full_update = true; } - dc_commit_updates_for_stream( - dm->dc, bundle->surface_updates, + + dc_update_planes_and_stream(dm->dc, + bundle->surface_updates, dc_state->stream_status->plane_count, - dc_state->streams[k], &bundle->stream_update, dc_state); + dc_state->streams[k], + &bundle->stream_update); } cleanup: @@ -9238,12 +9240,11 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, } mutex_lock(&dm->dc_lock); - dc_commit_updates_for_stream(dm->dc, - bundle->surface_updates, - planes_count, - acrtc_state->stream, - &bundle->stream_update, - dc_state); + dc_update_planes_and_stream(dm->dc, + bundle->surface_updates, + planes_count, + acrtc_state->stream, + &bundle->stream_update); /** * Enable or disable the interrupts on the backend. @@ -9669,12 +9670,11 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) mutex_lock(&dm->dc_lock); - dc_commit_updates_for_stream(dm->dc, - dummy_updates, - status->plane_count, - dm_new_crtc_state->stream, - &stream_update, - dc_state); + dc_update_planes_and_stream(dm->dc, + dummy_updates, + status->plane_count, + dm_new_crtc_state->stream, + &stream_update); mutex_unlock(&dm->dc_lock); } From 3c1aa91b37f9ecee11747e6dc2341ede7b60d526 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 23 Feb 2023 11:36:08 -0700 Subject: [PATCH 06/98] drm/amd/display: Add wrapper to call planes and stream update commit 81f743a08f3b214638aa389e252ae5e6c3592e7c upstream. [Why & How] This commit is part of a sequence of changes that replaces the commit sequence used in the DC with a new one. As a result of this transition, we moved some specific parts from the commit sequence and brought them to amdgpu_dm. This commit adds a wrapper inside DM that enable our drivers to do any necessary preparation or change before we offload the plane/stream update to DC. Reviewed-by: Harry Wentland Acked-by: Qingqing Zhuo Signed-off-by: Rodrigo Siqueira Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 51 +++++++++++++++---- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6d963761c2d1..be863af956bb 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -353,6 +353,35 @@ static inline bool is_dc_timing_adjust_needed(struct dm_crtc_state *old_state, return false; } +/** + * update_planes_and_stream_adapter() - Send planes to be updated in DC + * + * DC has a generic way to update planes and stream via + * dc_update_planes_and_stream function; however, DM might need some + * adjustments and preparation before calling it. This function is a wrapper + * for the dc_update_planes_and_stream that does any required configuration + * before passing control to DC. + */ +static inline bool update_planes_and_stream_adapter(struct dc *dc, + int update_type, + int planes_count, + struct dc_stream_state *stream, + struct dc_stream_update *stream_update, + struct dc_surface_update *array_of_surface_update) +{ + /* + * Previous frame finished and HW is ready for optimization. + */ + if (update_type == UPDATE_TYPE_FAST) + dc_post_update_surfaces_to_stream(dc); + + return dc_update_planes_and_stream(dc, + array_of_surface_update, + planes_count, + stream, + stream_update); +} + /** * dm_pflip_high_irq() - Handle pageflip interrupt * @interrupt_params: ignored @@ -2551,11 +2580,12 @@ static void dm_gpureset_commit_state(struct dc_state *dc_state, true; } - dc_update_planes_and_stream(dm->dc, - bundle->surface_updates, - dc_state->stream_status->plane_count, - dc_state->streams[k], - &bundle->stream_update); + update_planes_and_stream_adapter(dm->dc, + UPDATE_TYPE_FULL, + dc_state->stream_status->plane_count, + dc_state->streams[k], + &bundle->stream_update, + bundle->surface_updates); } cleanup: @@ -9240,11 +9270,12 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, } mutex_lock(&dm->dc_lock); - dc_update_planes_and_stream(dm->dc, - bundle->surface_updates, - planes_count, - acrtc_state->stream, - &bundle->stream_update); + update_planes_and_stream_adapter(dm->dc, + acrtc_state->update_type, + planes_count, + acrtc_state->stream, + &bundle->stream_update, + bundle->surface_updates); /** * Enable or disable the interrupts on the backend. From 024a24e5d4dd77b785af139897fbcb77b4e420fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jun 2023 11:18:30 +0200 Subject: [PATCH 07/98] tick/common: Align tick period during sched_timer setup commit 13bb06f8dd42071cb9a49f6e21099eea05d4b856 upstream. The tick period is aligned very early while the first clock_event_device is registered. At that point the system runs in periodic mode and switches later to one-shot mode if possible. The next wake-up event is programmed based on the aligned value (tick_next_period) but the delta value, that is used to program the clock_event_device, is computed based on ktime_get(). With the subtracted offset, the device fires earlier than the exact time frame. With a large enough offset the system programs the timer for the next wake-up and the remaining time left is too small to make any boot progress. The system hangs. Move the alignment later to the setup of tick_sched timer. At this point the system switches to oneshot mode and a high resolution clocksource is available. At this point it is safe to align tick_next_period because ktime_get() will now return accurate (not jiffies based) time. [bigeasy: Patch description + testing]. Fixes: e9523a0d81899 ("tick/common: Align tick period with the HZ tick.") Reported-by: Mathias Krause Reported-by: "Bhatnagar, Rishabh" Suggested-by: Mathias Krause Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Richard W.M. Jones Tested-by: Mathias Krause Acked-by: SeongJae Park Cc: stable@vger.kernel.org Link: https://lore.kernel.org/5a56290d-806e-b9a5-f37c-f21958b5a8c0@grsecurity.net Link: https://lore.kernel.org/12c6f9a3-d087-b824-0d05-0d18c9bc1bf3@amazon.com Link: https://lore.kernel.org/r/20230615091830.RxMV2xf_@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/time/tick-common.c | 13 +------------ kernel/time/tick-sched.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 65b8658da829..e9138cd7a0f5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - ktime_t next_p; - u32 rem; - tick_do_timer_cpu = cpu; - - next_p = ktime_get(); - div_u64_rem(next_p, TICK_NSEC, &rem); - if (rem) { - next_p -= rem; - next_p += TICK_NSEC; - } - - tick_next_period = next_p; + tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e014927c82e7..f42d0776bc84 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); From b28fc26683b4d6f5ec8e0f2a3ad193c92799029b Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 8 Jun 2023 18:38:43 +0200 Subject: [PATCH 08/98] selftests: mptcp: lib: skip if missing symbol commit 673004821ab98c6645bd21af56a290854e88f533 upstream. Selftests are supposed to run on any kernels, including the old ones not supporting all MPTCP features. New functions are now available to easily detect if a certain feature is missing by looking at kallsyms. These new helpers are going to be used in the following commits. In order to ease the backport of such future patches, it would be good if this patch is backported up to the introduction of MPTCP selftests, hence the Fixes tag below: this type of check was supposed to be done from the beginning. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/net/mptcp/config | 1 + .../testing/selftests/net/mptcp/mptcp_lib.sh | 38 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config index 0faaccd21447..f7d33a16051c 100644 --- a/tools/testing/selftests/net/mptcp/config +++ b/tools/testing/selftests/net/mptcp/config @@ -1,3 +1,4 @@ +CONFIG_KALLSYMS=y CONFIG_MPTCP=y CONFIG_IPV6=y CONFIG_MPTCP_IPV6=y diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 3286536b79d5..29b65f4b73b2 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -38,3 +38,41 @@ mptcp_lib_check_mptcp() { exit ${KSFT_SKIP} fi } + +mptcp_lib_check_kallsyms() { + if ! mptcp_lib_has_file "/proc/kallsyms"; then + echo "SKIP: CONFIG_KALLSYMS is missing" + exit ${KSFT_SKIP} + fi +} + +# Internal: use mptcp_lib_kallsyms_has() instead +__mptcp_lib_kallsyms_has() { + local sym="${1}" + + mptcp_lib_check_kallsyms + + grep -q " ${sym}" /proc/kallsyms +} + +# $1: part of a symbol to look at, add '$' at the end for full name +mptcp_lib_kallsyms_has() { + local sym="${1}" + + if __mptcp_lib_kallsyms_has "${sym}"; then + return 0 + fi + + mptcp_lib_fail_if_expected_feature "${sym} symbol not found" +} + +# $1: part of a symbol to look at, add '$' at the end for full name +mptcp_lib_kallsyms_doesnt_have() { + local sym="${1}" + + if ! __mptcp_lib_kallsyms_has "${sym}"; then + return 0 + fi + + mptcp_lib_fail_if_expected_feature "${sym} symbol has been found" +} From 726d033133e7edf8326aeebe52951e9569c70e39 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sat, 10 Jun 2023 18:11:36 +0200 Subject: [PATCH 09/98] selftests: mptcp: lib: skip if not below kernel version commit b1a6a38ab8a633546cefae890da842f19e006c74 upstream. Selftests are supposed to run on any kernels, including the old ones not supporting all MPTCP features. A new function is now available to easily detect if a feature is missing by looking at the kernel version. That's clearly not ideal and this kind of check should be avoided as soon as possible. But sometimes, there are no external sign that a "feature" is available or not: internal behaviours can change without modifying the uAPI and these selftests are verifying the internal behaviours. Sometimes, the only (easy) way to verify if the feature is present is to run the test but then the validation cannot determine if there is a failure with the feature or if the feature is missing. Then it looks better to check the kernel version instead of having tests that can never fail. In any case, we need a solution not to have a whole selftest being marked as failed just because one sub-test has failed. Note that this env var car be set to 1 not to do such check and run the linked sub-test: SELFTESTS_MPTCP_LIB_NO_KVERSION_CHECK. This new helper is going to be used in the following commits. In order to ease the backport of such future patches, it would be good if this patch is backported up to the introduction of MPTCP selftests, hence the Fixes tag below: this type of check was supposed to be done from the beginning. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- .../testing/selftests/net/mptcp/mptcp_lib.sh | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 29b65f4b73b2..f32045b23b89 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -76,3 +76,29 @@ mptcp_lib_kallsyms_doesnt_have() { mptcp_lib_fail_if_expected_feature "${sym} symbol has been found" } + +# !!!AVOID USING THIS!!! +# Features might not land in the expected version and features can be backported +# +# $1: kernel version, e.g. 6.3 +mptcp_lib_kversion_ge() { + local exp_maj="${1%.*}" + local exp_min="${1#*.}" + local v maj min + + # If the kernel has backported features, set this env var to 1: + if [ "${SELFTESTS_MPTCP_LIB_NO_KVERSION_CHECK:-}" = "1" ]; then + return 0 + fi + + v=$(uname -r | cut -d'.' -f1,2) + maj=${v%.*} + min=${v#*.} + + if [ "${maj}" -gt "${exp_maj}" ] || + { [ "${maj}" -eq "${exp_maj}" ] && [ "${min}" -ge "${exp_min}" ]; }; then + return 0 + fi + + mptcp_lib_fail_if_expected_feature "kernel version ${1} lower than ${v}" +} From ac65930751c4dadef6b4f747cda001ecf27f2fb9 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 13 Feb 2023 11:20:07 -0700 Subject: [PATCH 10/98] selftests/mount_setattr: fix redefine struct mount_attr build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d8e45bf1aed2e5fddd8985b5bb1aaf774a97aba8 upstream. Fix the following build error due to redefining struct mount_attr by removing duplicate define from mount_setattr_test.c gcc -g -isystem .../tools/testing/selftests/../../../usr/include -Wall -O2 -pthread mount_setattr_test.c -o .../tools/testing/selftests/mount_setattr/mount_setattr_test mount_setattr_test.c:107:8: error: redefinition of ‘struct mount_attr’ 107 | struct mount_attr { | ^~~~~~~~~~ In file included from /usr/include/x86_64-linux-gnu/sys/mount.h:32, from mount_setattr_test.c:10: .../usr/include/linux/mount.h:129:8: note: originally defined here 129 | struct mount_attr { | ^~~~~~~~~~ make: *** [../lib.mk:145: .../tools/testing/selftests/mount_setattr/mount_setattr_test] Error 1 Signed-off-by: Shuah Khan Cc: Hardik Garg Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/mount_setattr/mount_setattr_test.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 969647228817..c6a8c732b802 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -104,13 +104,6 @@ #else #define __NR_mount_setattr 442 #endif - -struct mount_attr { - __u64 attr_set; - __u64 attr_clr; - __u64 propagation; - __u64 userns_fd; -}; #endif #ifndef __NR_open_tree From 979a941d7ed30577d93555bb028fc6c9b366f7d5 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 22 Jun 2023 13:55:38 +0200 Subject: [PATCH 11/98] selftests: mptcp: pm nl: remove hardcoded default limits commit 2177d0b08e421971e035672b70f3228d9485c650 upstream. Selftests are supposed to run on any kernels, including the old ones not supporting all MPTCP features. One of them is the checks of the default limits returned by the MPTCP in-kernel path-manager. The default values have been modified by commit 72bcbc46a5c3 ("mptcp: increase default max additional subflows to 2"). Instead of comparing with hardcoded values, we can get the default one and compare with them. Note that if we expect to have the latest version, we continue to check the hardcoded values to avoid unexpected behaviour changes. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: eedbc685321b ("selftests: add PM netlink functional tests") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski Signed-off-by: Matthieu Baerts Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 7fb06c854852..306372b1526a 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -73,8 +73,12 @@ check() } check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" -check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 + +default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)" +if mptcp_lib_expect_all_features; then + check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 subflows 0" "defaults limits" +fi ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo @@ -121,12 +125,10 @@ ip netns exec $ns1 ./pm_nl_ctl flush check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" ip netns exec $ns1 ./pm_nl_ctl limits 9 1 -check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 -subflows 0" "rcv addrs above hard limit" +check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit" ip netns exec $ns1 ./pm_nl_ctl limits 1 9 -check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 -subflows 0" "subflows above hard limit" +check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit" ip netns exec $ns1 ./pm_nl_ctl limits 8 8 check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 From 64cb73ea77abfdacae2a111194989b726b9ddb83 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 22 Jun 2023 15:34:31 +0200 Subject: [PATCH 12/98] selftests: mptcp: join: use 'iptables-legacy' if available commit 0c4cd3f86a40028845ad6f8af5b37165666404cd upstream. IPTables commands using 'iptables-nft' fail on old kernels, at least 5.15 because it doesn't see the default IPTables chains: $ iptables -L iptables/1.8.2 Failed to initialize nft: Protocol not supported As a first step before switching to NFTables, we can use iptables-legacy if available. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: 8d014eaa9254 ("selftests: mptcp: add ADD_ADDR timeout test case") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski Signed-off-by: Matthieu Baerts Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 7330e32bb9ab..e6391e234e97 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -12,6 +12,8 @@ cinfail="" cinsent="" cout="" ksft_skip=4 +iptables="iptables" +ip6tables="ip6tables" timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) mptcp_connect="" @@ -126,9 +128,9 @@ reset_with_add_addr_timeout() local ip="${1:-4}" local tables - tables="iptables" + tables="${iptables}" if [ $ip -eq 6 ]; then - tables="ip6tables" + tables="${ip6tables}" fi reset @@ -171,8 +173,10 @@ if [ $? -ne 0 ];then exit $ksft_skip fi -iptables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then +if iptables-legacy -V &> /dev/null; then + iptables="iptables-legacy" + ip6tables="ip6tables-legacy" +elif ! iptables -V &> /dev/null; then echo "SKIP: Could not run all tests without iptables tool" exit $ksft_skip fi From 485f6be2549c5d54e85373ba3cd43215df85b643 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 22 Jun 2023 15:35:37 +0200 Subject: [PATCH 13/98] selftests: mptcp: join: skip check if MIB counter not supported commit 47867f0a7e831e24e5eab3330667ce9682d50fb1 upstream. Selftests are supposed to run on any kernels, including the old ones not supporting all MPTCP features. One of them is the MPTCP MIB counters introduced in commit fc518953bc9c ("mptcp: add and use MIB counter infrastructure") and more later. The MPTCP Join selftest heavily relies on these counters. If a counter is not supported by the kernel, it is not displayed when using 'nstat -z'. We can then detect that and skip the verification. A new helper (get_counter()) has been added to do the required checks and return an error if the counter is not available. Note that if we expect to have these features available and if SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES env var is set to 1, the tests will be marked as failed instead of skipped. This new helper also makes sure we get the exact counter we want to avoid issues we had in the past, e.g. with MPTcpExtRmAddr and MPTcpExtRmAddrDrop sharing the same prefix. While at it, we uniform the way we fetch a MIB counter. Note for the backports: we rarely change these modified blocks so if there is are conflicts, it is very likely because a counter is not used in the older kernels and we don't need that chunk. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: b08fbf241064 ("selftests: add test-cases for MPTCP MP_JOIN") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski Signed-off-by: Matthieu Baerts Signed-off-by: Greg Kroah-Hartman --- .../testing/selftests/net/mptcp/mptcp_join.sh | 169 ++++++++++-------- 1 file changed, 99 insertions(+), 70 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e6391e234e97..368fae525fea 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -247,6 +247,22 @@ is_v6() [ -z "${1##*:*}" ] } +# $1: ns ; $2: counter +get_counter() +{ + local ns="${1}" + local counter="${2}" + local count + + count=$(ip netns exec ${ns} nstat -asz "${counter}" | awk 'NR==1 {next} {print $2}') + if [ -z "${count}" ]; then + mptcp_lib_fail_if_expected_feature "${counter} counter" + return 1 + fi + + echo "${count}" +} + do_transfer() { listener_ns="$1" @@ -560,9 +576,10 @@ chk_csum_nr() echo -n " " fi printf " %-36s %s" "$msg" "sum" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != 0 ]; then + count=$(get_counter ${ns1} "MPTcpExtDataCsumErr") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != 0 ]; then echo "[fail] got $count data checksum error[s] expected 0" ret=1 dump_stats=1 @@ -570,9 +587,10 @@ chk_csum_nr() echo -n "[ ok ]" fi echo -n " - csum " - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != 0 ]; then + count=$(get_counter ${ns2} "MPTcpExtDataCsumErr") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != 0 ]; then echo "[fail] got $count data checksum error[s] expected 0" ret=1 dump_stats=1 @@ -595,9 +613,10 @@ chk_fail_nr() local dump_stats printf "%-39s %s" " " "ftx" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$mp_fail_nr_tx" ]; then + count=$(get_counter ${ns1} "MPTcpExtMPFailTx") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$mp_fail_nr_tx" ]; then echo "[fail] got $count MP_FAIL[s] TX expected $mp_fail_nr_tx" ret=1 dump_stats=1 @@ -606,9 +625,10 @@ chk_fail_nr() fi echo -n " - frx " - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$mp_fail_nr_rx" ]; then + count=$(get_counter ${ns2} "MPTcpExtMPFailRx") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$mp_fail_nr_rx" ]; then echo "[fail] got $count MP_FAIL[s] RX expected $mp_fail_nr_rx" ret=1 dump_stats=1 @@ -635,9 +655,10 @@ chk_join_nr() local with_cookie printf "%02u %-36s %s" "$TEST_COUNT" "$msg" "syn" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinSynRx | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$syn_nr" ]; then + count=$(get_counter ${ns1} "MPTcpExtMPJoinSynRx") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$syn_nr" ]; then echo "[fail] got $count JOIN[s] syn expected $syn_nr" ret=1 dump_stats=1 @@ -647,9 +668,10 @@ chk_join_nr() echo -n " - synack" with_cookie=`ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies` - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinSynAckRx | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$syn_ack_nr" ]; then + count=$(get_counter ${ns2} "MPTcpExtMPJoinSynAckRx") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$syn_ack_nr" ]; then # simult connections exceeding the limit with cookie enabled could go up to # synack validation as the conn limit can be enforced reliably only after # the subflow creation @@ -665,9 +687,10 @@ chk_join_nr() fi echo -n " - ack" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinAckRx | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$ack_nr" ]; then + count=$(get_counter ${ns1} "MPTcpExtMPJoinAckRx") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$ack_nr" ]; then echo "[fail] got $count JOIN[s] ack expected $ack_nr" ret=1 dump_stats=1 @@ -702,14 +725,13 @@ chk_stale_nr() local recover_nr printf "%-39s %-18s" " " "stale" - stale_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowStale | awk '{print $2}'` - [ -z "$stale_nr" ] && stale_nr=0 - recover_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowRecover | awk '{print $2}'` - [ -z "$recover_nr" ] && recover_nr=0 - - if [ $stale_nr -lt $stale_min ] || - [ $stale_max -gt 0 -a $stale_nr -gt $stale_max ] || - [ $((stale_nr - $recover_nr)) -ne $stale_delta ]; then + stale_nr=$(get_counter ${ns} "MPTcpExtSubflowStale") + recover_nr=$(get_counter ${ns} "MPTcpExtSubflowRecover") + if [ -z "$stale_nr" ] || [ -z "$recover_nr" ]; then + echo "[skip]" + elif [ $stale_nr -lt $stale_min ] || + [ $stale_max -gt 0 -a $stale_nr -gt $stale_max ] || + [ $((stale_nr - $recover_nr)) -ne $stale_delta ]; then echo "[fail] got $stale_nr stale[s] $recover_nr recover[s], " \ " expected stale in range [$stale_min..$stale_max]," \ " stale-recover delta $stale_delta " @@ -740,9 +762,10 @@ chk_add_nr() local dump_stats printf "%-39s %s" " " "add" - count=`ip netns exec $ns2 nstat -as MPTcpExtAddAddr | grep MPTcpExtAddAddr | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$add_nr" ]; then + count=$(get_counter ${ns2} "MPTcpExtAddAddr") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$add_nr" ]; then echo "[fail] got $count ADD_ADDR[s] expected $add_nr" ret=1 dump_stats=1 @@ -751,9 +774,10 @@ chk_add_nr() fi echo -n " - echo " - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtEchoAdd | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$echo_nr" ]; then + count=$(get_counter ${ns1} "MPTcpExtEchoAdd") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$echo_nr" ]; then echo "[fail] got $count ADD_ADDR echo[s] expected $echo_nr" ret=1 dump_stats=1 @@ -763,9 +787,10 @@ chk_add_nr() if [ $port_nr -gt 0 ]; then echo -n " - pt " - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtPortAdd | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$port_nr" ]; then + count=$(get_counter ${ns2} "MPTcpExtPortAdd") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$port_nr" ]; then echo "[fail] got $count ADD_ADDR[s] with a port-number expected $port_nr" ret=1 dump_stats=1 @@ -774,10 +799,10 @@ chk_add_nr() fi printf "%-39s %s" " " "syn" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinPortSynRx | - awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$syn_nr" ]; then + count=$(get_counter ${ns1} "MPTcpExtMPJoinPortSynRx") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$syn_nr" ]; then echo "[fail] got $count JOIN[s] syn with a different \ port-number expected $syn_nr" ret=1 @@ -787,10 +812,10 @@ chk_add_nr() fi echo -n " - synack" - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinPortSynAckRx | - awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$syn_ack_nr" ]; then + count=$(get_counter ${ns2} "MPTcpExtMPJoinPortSynAckRx") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$syn_ack_nr" ]; then echo "[fail] got $count JOIN[s] synack with a different \ port-number expected $syn_ack_nr" ret=1 @@ -800,10 +825,10 @@ chk_add_nr() fi echo -n " - ack" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinPortAckRx | - awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$ack_nr" ]; then + count=$(get_counter ${ns1} "MPTcpExtMPJoinPortAckRx") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$ack_nr" ]; then echo "[fail] got $count JOIN[s] ack with a different \ port-number expected $ack_nr" ret=1 @@ -813,10 +838,10 @@ chk_add_nr() fi printf "%-39s %s" " " "syn" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMismatchPortSynRx | - awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$mis_syn_nr" ]; then + count=$(get_counter ${ns1} "MPTcpExtMismatchPortSynRx") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$mis_syn_nr" ]; then echo "[fail] got $count JOIN[s] syn with a mismatched \ port-number expected $mis_syn_nr" ret=1 @@ -826,10 +851,10 @@ chk_add_nr() fi echo -n " - ack " - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMismatchPortAckRx | - awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$mis_ack_nr" ]; then + count=$(get_counter ${ns1} "MPTcpExtMismatchPortAckRx") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$mis_ack_nr" ]; then echo "[fail] got $count JOIN[s] ack with a mismatched \ port-number expected $mis_ack_nr" ret=1 @@ -868,9 +893,10 @@ chk_rm_nr() fi printf "%-39s %s" " " "rm " - count=`ip netns exec $addr_ns nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$rm_addr_nr" ]; then + count=$(get_counter ${addr_ns} "MPTcpExtRmAddr") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$rm_addr_nr" ]; then echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr" ret=1 dump_stats=1 @@ -879,9 +905,10 @@ chk_rm_nr() fi echo -n " - sf " - count=`ip netns exec $subflow_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$rm_subflow_nr" ]; then + count=$(get_counter ${subflow_ns} "MPTcpExtRmSubflow") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$rm_subflow_nr" ]; then echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr" ret=1 dump_stats=1 @@ -905,9 +932,10 @@ chk_prio_nr() local dump_stats printf "%-39s %s" " " "ptx" - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPPrioTx | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$mp_prio_nr_tx" ]; then + count=$(get_counter ${ns1} "MPTcpExtMPPrioTx") + if [ -z "$count" ]; then + echo -n "[skip]" + elif [ "$count" != "$mp_prio_nr_tx" ]; then echo "[fail] got $count MP_PRIO[s] TX expected $mp_prio_nr_tx" ret=1 dump_stats=1 @@ -916,9 +944,10 @@ chk_prio_nr() fi echo -n " - prx " - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPPrioRx | awk '{print $2}'` - [ -z "$count" ] && count=0 - if [ "$count" != "$mp_prio_nr_rx" ]; then + count=$(get_counter ${ns1} "MPTcpExtMPPrioRx") + if [ -z "$count" ]; then + echo "[skip]" + elif [ "$count" != "$mp_prio_nr_rx" ]; then echo "[fail] got $count MP_PRIO[s] RX expected $mp_prio_nr_rx" ret=1 dump_stats=1 From b12011cea56bd80d52e2d9a6a320ee6ce89034c8 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 9 Jun 2023 12:57:32 +0900 Subject: [PATCH 14/98] nilfs2: fix buffer corruption due to concurrent device reads commit 679bd7ebdd315bf457a4740b306ae99f1d0a403d upstream. As a result of analysis of a syzbot report, it turned out that in three cases where nilfs2 allocates block device buffers directly via sb_getblk, concurrent reads to the device can corrupt the allocated buffers. Nilfs2 uses sb_getblk for segment summary blocks, that make up a log header, and the super root block, that is the trailer, and when moving and writing the second super block after fs resize. In any of these, since the uptodate flag is not set when storing metadata to be written in the allocated buffers, the stored metadata will be overwritten if a device read of the same block occurs concurrently before the write. This causes metadata corruption and misbehavior in the log write itself, causing warnings in nilfs_btree_assign() as reported. Fix these issues by setting an uptodate flag on the buffer head on the first or before modifying each buffer obtained with sb_getblk, and clearing the flag on failure. When setting the uptodate flag, the lock_buffer/unlock_buffer pair is used to perform necessary exclusive control, and the buffer is filled to ensure that uninitialized bytes are not mixed into the data read from others. As for buffers for segment summary blocks, they are filled incrementally, so if the uptodate flag was unset on their allocation, set the flag and zero fill the buffer once at that point. Also, regarding the superblock move routine, the starting point of the memset call to zerofill the block is incorrectly specified, which can cause a buffer overflow on file systems with block sizes greater than 4KiB. In addition, if the superblock is moved within a large block, it is necessary to assume the possibility that the data in the superblock will be destroyed by zero-filling before copying. So fix these potential issues as well. Link: https://lkml.kernel.org/r/20230609035732.20426-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+31837fe952932efc8fb9@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/00000000000030000a05e981f475@google.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/segbuf.c | 6 ++++++ fs/nilfs2/segment.c | 7 +++++++ fs/nilfs2/super.c | 25 +++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 56872e93823d..5e845eea1af0 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) if (unlikely(!bh)) return -ENOMEM; + lock_buffer(bh); + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c25f28094508..5c310eb7dd0c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -979,10 +979,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + + lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); + raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? @@ -996,6 +999,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); + set_buffer_uptodate(bh_sr); + unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1778,6 +1783,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1789,6 +1795,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) b_assoc_buffers) { clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 9d5e4ce7426b..130ffa8a9bed 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) goto out; } nsbp = (void *)nsbh->b_data + offset; - memset(nsbp, 0, nilfs->ns_blocksize); + + lock_buffer(nsbh); + if (sb2i >= 0) { + /* + * The position of the second superblock only changes by 4KiB, + * which is larger than the maximum superblock data size + * (= 1KiB), so there is no need to use memmove() to allow + * overlap between source and destination. + */ + memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + + /* + * Zero fill after copy to avoid overwriting in case of move + * within the same block. + */ + memset(nsbh->b_data, 0, offset); + memset((void *)nsbp + nilfs->ns_sbsize, 0, + nsbh->b_size - offset - nilfs->ns_sbsize); + } else { + memset(nsbh->b_data, 0, nsbh->b_size); + } + set_buffer_uptodate(nsbh); + unlock_buffer(nsbh); if (sb2i >= 0) { - memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; From 1d6c93206839b244130f4de1a8b217c321200e91 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 14 Jun 2023 17:29:21 +0200 Subject: [PATCH 15/98] ACPI: sleep: Avoid breaking S3 wakeup due to might_sleep() commit 22db06337f590d01d79f60f181d8dfe5a9ef9085 upstream. The addition of might_sleep() to down_timeout() caused the latter to enable interrupts unconditionally in some cases, which in turn broke the ACPI S3 wakeup path in acpi_suspend_enter(), where down_timeout() is called by acpi_disable_all_gpes() via acpi_ut_acquire_mutex(). Namely, if CONFIG_DEBUG_ATOMIC_SLEEP is set, might_sleep() causes might_resched() to be used and if CONFIG_PREEMPT_VOLUNTARY is set, this triggers __cond_resched() which may call preempt_schedule_common(), so __schedule() gets invoked and it ends up with enabled interrupts (in the prev == next case). Now, enabling interrupts early in the S3 wakeup path causes the kernel to crash. Address this by modifying acpi_suspend_enter() to disable GPEs without attempting to acquire the sleeping lock which is not needed in that code path anyway. Fixes: 99409b935c9a ("locking/semaphore: Add might_sleep() to down_*() family") Reported-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Cc: 5.15+ # 5.15+ Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/acpica/achware.h | 2 -- drivers/acpi/sleep.c | 16 ++++++++++++---- include/acpi/acpixf.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 810de0b4c125..9c3ad33e926a 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -101,8 +101,6 @@ acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, acpi_event_status *event_status); -acpi_status acpi_hw_disable_all_gpes(void); - acpi_status acpi_hw_enable_all_runtime_gpes(void); acpi_status acpi_hw_enable_all_wakeup_gpes(void); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index d7194047d256..b277e25b276c 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -635,11 +635,19 @@ static int acpi_suspend_enter(suspend_state_t pm_state) } /* - * Disable and clear GPE status before interrupt is enabled. Some GPEs - * (like wakeup GPE) haven't handler, this can avoid such GPE misfire. - * acpi_leave_sleep_state will reenable specific GPEs later + * Disable all GPE and clear their status bits before interrupts are + * enabled. Some GPEs (like wakeup GPEs) have no handlers and this can + * prevent them from producing spurious interrups. + * + * acpi_leave_sleep_state() will reenable specific GPEs later. + * + * Because this code runs on one CPU with disabled interrupts (all of + * the other CPUs are offline at this time), it need not acquire any + * sleeping locks which may trigger an implicit preemption point even + * if there is no contention, so avoid doing that by using a low-level + * library routine here. */ - acpi_disable_all_gpes(); + acpi_hw_disable_all_gpes(); /* Allow EC transactions to happen. */ acpi_ec_unblock_transactions(); diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index fa02e3ff0faf..9d45a6001bc0 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -749,6 +749,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_event_status *event_status)) ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number)) +ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_hw_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) From 953dd7e2df8181d5ce4117fca347992d616f0621 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 15 Jun 2023 15:42:59 +1000 Subject: [PATCH 16/98] KVM: Avoid illegal stage2 mapping on invalid memory slot commit 2230f9e1171a2e9731422a14d1bbc313c0b719d1 upstream. We run into guest hang in edk2 firmware when KSM is kept as running on the host. The edk2 firmware is waiting for status 0x80 from QEMU's pflash device (TYPE_PFLASH_CFI01) during the operation of sector erasing or buffered write. The status is returned by reading the memory region of the pflash device and the read request should have been forwarded to QEMU and emulated by it. Unfortunately, the read request is covered by an illegal stage2 mapping when the guest hang issue occurs. The read request is completed with QEMU bypassed and wrong status is fetched. The edk2 firmware runs into an infinite loop with the wrong status. The illegal stage2 mapping is populated due to same page sharing by KSM at (C) even the associated memory slot has been marked as invalid at (B) when the memory slot is requested to be deleted. It's notable that the active and inactive memory slots can't be swapped when we're in the middle of kvm_mmu_notifier_change_pte() because kvm->mn_active_invalidate_count is elevated, and kvm_swap_active_memslots() will busy loop until it reaches to zero again. Besides, the swapping from the active to the inactive memory slots is also avoided by holding &kvm->srcu in __kvm_handle_hva_range(), corresponding to synchronize_srcu_expedited() in kvm_swap_active_memslots(). CPU-A CPU-B ----- ----- ioctl(kvm_fd, KVM_SET_USER_MEMORY_REGION) kvm_vm_ioctl_set_memory_region kvm_set_memory_region __kvm_set_memory_region kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE) kvm_invalidate_memslot kvm_copy_memslot kvm_replace_memslot kvm_swap_active_memslots (A) kvm_arch_flush_shadow_memslot (B) same page sharing by KSM kvm_mmu_notifier_invalidate_range_start : kvm_mmu_notifier_change_pte kvm_handle_hva_range __kvm_handle_hva_range kvm_set_spte_gfn (C) : kvm_mmu_notifier_invalidate_range_end Fix the issue by skipping the invalid memory slot at (C) to avoid the illegal stage2 mapping so that the read request for the pflash's status is forwarded to QEMU and emulated by it. In this way, the correct pflash's status can be returned from QEMU to break the infinite loop in the edk2 firmware. We tried a git-bisect and the first problematic commit is cd4c71835228 (" KVM: arm64: Convert to the gfn-based MMU notifier callbacks"). With this, clean_dcache_guest_page() is called after the memory slots are iterated in kvm_mmu_notifier_change_pte(). clean_dcache_guest_page() is called before the iteration on the memory slots before this commit. This change literally enlarges the racy window between kvm_mmu_notifier_change_pte() and memory slot removal so that we're able to reproduce the issue in a practical test case. However, the issue exists since commit d5d8184d35c9 ("KVM: ARM: Memory virtualization setup"). Cc: stable@vger.kernel.org # v3.9+ Fixes: d5d8184d35c9 ("KVM: ARM: Memory virtualization setup") Reported-by: Shuai Hu Reported-by: Zhenyu Zhang Signed-off-by: Gavin Shan Reviewed-by: David Hildenbrand Reviewed-by: Oliver Upton Reviewed-by: Peter Xu Reviewed-by: Sean Christopherson Reviewed-by: Shaoqin Huang Message-Id: <20230615054259.14911-1-gshan@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- virt/kvm/kvm_main.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cb2530831f4..3967184b7d62 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -636,6 +636,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn return __kvm_handle_hva_range(kvm, &range); } + +static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + /* + * Skipping invalid memslots is correct if and only change_pte() is + * surrounded by invalidate_range_{start,end}(), which is currently + * guaranteed by the primary MMU. If that ever changes, KVM needs to + * unmap the memslot instead of skipping the memslot to ensure that KVM + * doesn't hold references to the old PFN. + */ + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + + if (range->slot->flags & KVM_MEMSLOT_INVALID) + return false; + + return kvm_set_spte_gfn(kvm, range); +} + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, @@ -656,7 +674,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, if (!READ_ONCE(kvm->mmu_notifier_count)) return; - kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); + kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn); } void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, From 4d31eb2e266cf222afc8c613b147e54a167804a8 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 4 May 2023 15:41:55 -0700 Subject: [PATCH 17/98] Drivers: hv: vmbus: Call hv_synic_free() if hv_synic_alloc() fails commit ec97e112985c2581ee61854a4b74f080f6cdfc2c upstream. Commit 572086325ce9 ("Drivers: hv: vmbus: Cleanup synic memory free path") says "Any memory allocations that succeeded will be freed when the caller cleans up by calling hv_synic_free()", but if the get_zeroed_page() in hv_synic_alloc() fails, currently hv_synic_free() is not really called in vmbus_bus_init(), consequently there will be a memory leak, e.g. hv_context.hv_numa_map is not freed in the error path. Fix this by updating the goto labels. Cc: stable@kernel.org Signed-off-by: Dexuan Cui Fixes: 4df4cb9e99f8 ("x86/hyperv: Initialize clockevents earlier in CPU onlining") Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20230504224155.10484-1-decui@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- drivers/hv/vmbus_drv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index b906a3a7941c..115835bd562c 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1538,7 +1538,7 @@ static int vmbus_bus_init(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", hv_synic_init, hv_synic_cleanup); if (ret < 0) - goto err_cpuhp; + goto err_alloc; hyperv_cpuhp_online = ret; ret = vmbus_connect(); @@ -1589,9 +1589,8 @@ static int vmbus_bus_init(void) err_connect: cpuhp_remove_state(hyperv_cpuhp_online); -err_cpuhp: - hv_synic_free(); err_alloc: + hv_synic_free(); if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { From 80c5d97b4aa1d960c0f19a293bcaef5cd9217e14 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 18 May 2023 08:13:52 -0700 Subject: [PATCH 18/98] Drivers: hv: vmbus: Fix vmbus_wait_for_unload() to scan present CPUs commit 320805ab61e5f1e2a5729ae266e16bec2904050c upstream. vmbus_wait_for_unload() may be called in the panic path after other CPUs are stopped. vmbus_wait_for_unload() currently loops through online CPUs looking for the UNLOAD response message. But the values of CONFIG_KEXEC_CORE and crash_kexec_post_notifiers affect the path used to stop the other CPUs, and in one of the paths the stopped CPUs are removed from cpu_online_mask. This removal happens in both x86/x64 and arm64 architectures. In such a case, vmbus_wait_for_unload() only checks the panic'ing CPU, and misses the UNLOAD response message except when the panic'ing CPU is CPU 0. vmbus_wait_for_unload() eventually times out, but only after waiting 100 seconds. Fix this by looping through *present* CPUs in vmbus_wait_for_unload(). The cpu_present_mask is not modified by stopping the other CPUs in the panic path, nor should it be. Also, in a CoCo VM the synic_message_page is not allocated in hv_synic_alloc(), but is set and cleared in hv_synic_enable_regs() and hv_synic_disable_regs() such that it is set only when the CPU is online. If not all present CPUs are online when vmbus_wait_for_unload() is called, the synic_message_page might be NULL. Add a check for this. Fixes: cd95aad55793 ("Drivers: hv: vmbus: handle various crash scenarios") Cc: stable@vger.kernel.org Reported-by: John Starks Signed-off-by: Michael Kelley Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/1684422832-38476-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel_mgmt.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index d8dc5cc5e3a8..62c864f8d991 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -827,11 +827,22 @@ static void vmbus_wait_for_unload(void) if (completion_done(&vmbus_connection.unload_event)) goto completed; - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); + /* + * In a CoCo VM the synic_message_page is not allocated + * in hv_synic_alloc(). Instead it is set/cleared in + * hv_synic_enable_regs() and hv_synic_disable_regs() + * such that it is set only when the CPU is online. If + * not all present CPUs are online, the message page + * might be NULL, so skip such CPUs. + */ page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; @@ -865,11 +876,14 @@ completed: * maybe-pending messages on all CPUs to be able to receive new * messages after we reconnect. */ - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; msg->header.message_type = HVMSG_NONE; } From ac0df91c7d98fc269f3734768194018e566384c1 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:47 -0700 Subject: [PATCH 19/98] PCI: hv: Fix a race condition bug in hv_pci_query_relations() commit 440b5e3663271b0ffbd4908115044a6a51fb938b upstream. Since day 1 of the driver, there has been a race between hv_pci_query_relations() and survey_child_resources(): during fast device hotplug, hv_pci_query_relations() may error out due to device-remove and the stack variable 'comp' is no longer valid; however, pci_devices_present_work() -> survey_child_resources() -> complete() may be running on another CPU and accessing the no-longer-valid 'comp'. Fix the race by flushing the workqueue before we exit from hv_pci_query_relations(). Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-2-decui@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-hyperv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 9b54715a4b63..1f5e0a12e3de 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2959,6 +2959,24 @@ static int hv_pci_query_relations(struct hv_device *hdev) if (!ret) ret = wait_for_response(hdev, &comp); + /* + * In the case of fast device addition/removal, it's possible that + * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we + * already got a PCI_BUS_RELATIONS* message from the host and the + * channel callback already scheduled a work to hbus->wq, which can be + * running pci_devices_present_work() -> survey_child_resources() -> + * complete(&hbus->survey_event), even after hv_pci_query_relations() + * exits and the stack variable 'comp' is no longer valid; as a result, + * a hang or a page fault may happen when the complete() calls + * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from + * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is + * -ENODEV, there can't be any more work item scheduled to hbus->wq + * after the flush_workqueue(): see vmbus_onoffer_rescind() -> + * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() -> + * channel->rescind = true. + */ + flush_workqueue(hbus->wq); + return ret; } From 5e0d33cc7813f59b352626d8b1ed922fb2bcc581 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:50 -0700 Subject: [PATCH 20/98] Revert "PCI: hv: Fix a timing issue which causes kdump to fail occasionally" commit a847234e24d03d01a9566d1d9dcce018cc018d67 upstream. This reverts commit d6af2ed29c7c1c311b96dac989dcb991e90ee195. The statement "the hv_pci_bus_exit() call releases structures of all its child devices" in commit d6af2ed29c7c is not true: in the path hv_pci_probe() -> hv_pci_enter_d0() -> hv_pci_bus_exit(hdev, true): the parameter "keep_devs" is true, so hv_pci_bus_exit() does *not* release the child "struct hv_pci_dev *hpdev" that is created earlier in pci_devices_present_work() -> new_pcichild_device(). The commit d6af2ed29c7c was originally made in July 2020 for RHEL 7.7, where the old version of hv_pci_bus_exit() was used; when the commit was rebased and merged into the upstream, people didn't notice that it's not really necessary. The commit itself doesn't cause any issue, but it makes hv_pci_probe() more complicated. Revert it to facilitate some upcoming changes to hv_pci_probe(). Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Wei Hu Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-5-decui@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-hyperv.c | 71 ++++++++++++++--------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 1f5e0a12e3de..2ddb558035d6 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2889,8 +2889,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev) struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; + bool retry = true; int ret; +enter_d0_retry: /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region @@ -2917,6 +2919,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev) if (ret) goto exit; + /* + * In certain case (Kdump) the pci device of interest was + * not cleanly shut down and resource is still held on host + * side, the host could return invalid device status. + * We need to explicitly request host to release the resource + * and try to enter D0 again. + */ + if (comp_pkt.completion_status < 0 && retry) { + retry = false; + + dev_err(&hdev->device, "Retrying D0 Entry\n"); + + /* + * Hv_pci_bus_exit() calls hv_send_resource_released() + * to free up resources of its child devices. + * In the kdump kernel we need to set the + * wslot_res_allocated to 255 so it scans all child + * devices to release resources allocated in the + * normal kernel before panic happened. + */ + hbus->wslot_res_allocated = 255; + + ret = hv_pci_bus_exit(hdev, true); + + if (ret == 0) { + kfree(pkt); + goto enter_d0_retry; + } + dev_err(&hdev->device, + "Retrying D0 failed with ret %d\n", ret); + } + if (comp_pkt.completion_status < 0) { dev_err(&hdev->device, "PCI Pass-through VSP failed D0 Entry with status %x\n", @@ -3162,7 +3196,6 @@ static int hv_pci_probe(struct hv_device *hdev, struct hv_pcibus_device *hbus; u16 dom_req, dom; char *name; - bool enter_d0_retry = true; int ret; /* @@ -3298,47 +3331,11 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_fwnode; -retry: ret = hv_pci_query_relations(hdev); if (ret) goto free_irq_domain; ret = hv_pci_enter_d0(hdev); - /* - * In certain case (Kdump) the pci device of interest was - * not cleanly shut down and resource is still held on host - * side, the host could return invalid device status. - * We need to explicitly request host to release the resource - * and try to enter D0 again. - * Since the hv_pci_bus_exit() call releases structures - * of all its child devices, we need to start the retry from - * hv_pci_query_relations() call, requesting host to send - * the synchronous child device relations message before this - * information is needed in hv_send_resources_allocated() - * call later. - */ - if (ret == -EPROTO && enter_d0_retry) { - enter_d0_retry = false; - - dev_err(&hdev->device, "Retrying D0 Entry\n"); - - /* - * Hv_pci_bus_exit() calls hv_send_resources_released() - * to free up resources of its child devices. - * In the kdump kernel we need to set the - * wslot_res_allocated to 255 so it scans all child - * devices to release resources allocated in the - * normal kernel before panic happened. - */ - hbus->wslot_res_allocated = 255; - ret = hv_pci_bus_exit(hdev, true); - - if (ret == 0) - goto retry; - - dev_err(&hdev->device, - "Retrying D0 failed with ret %d\n", ret); - } if (ret) goto free_irq_domain; From 7d852ca7af3785851684f21c4a71cc3aae232bdb Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:49 -0700 Subject: [PATCH 21/98] PCI: hv: Remove the useless hv_pcichild_state from struct hv_pci_dev commit add9195e69c94b32e96f78c2f9cea68f0e850b3f upstream. The hpdev->state is never really useful. The only use in hv_pci_eject_device() and hv_eject_device_work() is not really necessary. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-4-decui@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-hyperv.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 2ddb558035d6..c2039d3214e0 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -547,19 +547,10 @@ struct hv_dr_state { struct hv_pcidev_description func[]; }; -enum hv_pcichild_state { - hv_pcichild_init = 0, - hv_pcichild_requirements, - hv_pcichild_resourced, - hv_pcichild_ejecting, - hv_pcichild_maximum -}; - struct hv_pci_dev { /* List protected by pci_rescan_remove_lock */ struct list_head list_entry; refcount_t refs; - enum hv_pcichild_state state; struct pci_slot *pci_slot; struct hv_pcidev_description desc; bool reported_missing; @@ -2430,8 +2421,6 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; - WARN_ON(hpdev->state != hv_pcichild_ejecting); - /* * Ejection can come before or after the PCI bus has been set up, so * attempt to find it and tear down the bus state, if it exists. This @@ -2488,7 +2477,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev) return; } - hpdev->state = hv_pcichild_ejecting; get_pcichild(hpdev); INIT_WORK(&hpdev->wrk, hv_eject_device_work); queue_work(hbus->wq, &hpdev->wrk); From 34e21b8ff3e6445baedb813b2ba8d306d6f312c8 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:48 -0700 Subject: [PATCH 22/98] PCI: hv: Fix a race condition in hv_irq_unmask() that can cause panic commit 2738d5ab7929a845b654cd171a1e275c37eb428e upstream. When the host tries to remove a PCI device, the host first sends a PCI_EJECT message to the guest, and the guest is supposed to gracefully remove the PCI device and send a PCI_EJECTION_COMPLETE message to the host; the host then sends a VMBus message CHANNELMSG_RESCIND_CHANNELOFFER to the guest (when the guest receives this message, the device is already unassigned from the guest) and the guest can do some final cleanup work; if the guest fails to respond to the PCI_EJECT message within one minute, the host sends the VMBus message CHANNELMSG_RESCIND_CHANNELOFFER and removes the PCI device forcibly. In the case of fast device addition/removal, it's possible that the PCI device driver is still configuring MSI-X interrupts when the guest receives the PCI_EJECT message; the channel callback calls hv_pci_eject_device(), which sets hpdev->state to hv_pcichild_ejecting, and schedules a work hv_eject_device_work(); if the PCI device driver is calling pci_alloc_irq_vectors() -> ... -> hv_compose_msi_msg(), we can break the while loop in hv_compose_msi_msg() due to the updated hpdev->state, and leave data->chip_data with its default value of NULL; later, when the PCI device driver calls request_irq() -> ... -> hv_irq_unmask(), the guest crashes in hv_arch_irq_unmask() due to data->chip_data being NULL. Fix the issue by not testing hpdev->state in the while loop: when the guest receives PCI_EJECT, the device is still assigned to the guest, and the guest has one minute to finish the device removal gracefully. We don't really need to (and we should not) test hpdev->state in the loop. Fixes: de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-3-decui@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-hyperv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index c2039d3214e0..6e91f1ac4dc3 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1252,6 +1252,11 @@ static void hv_irq_unmask(struct irq_data *data) pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); int_desc = data->chip_data; + if (!int_desc) { + dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", + __func__, data->irq); + return; + } spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); @@ -1601,12 +1606,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) hv_pci_onchannelcallback(hbus); spin_unlock_irqrestore(&channel->sched_lock, flags); - if (hpdev->state == hv_pcichild_ejecting) { - dev_err_once(&hbus->hdev->device, - "the device is being ejected\n"); - goto enable_tasklet; - } - udelay(100); } From 815b2440116534385c89cfc28227a49c2f22eaba Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:51 -0700 Subject: [PATCH 23/98] PCI: hv: Add a per-bus mutex state_lock commit 067d6ec7ed5b49380688e06c1e5f883a71bef4fe upstream. In the case of fast device addition/removal, it's possible that hv_eject_device_work() can start to run before create_root_hv_pci_bus() starts to run; as a result, the pci_get_domain_bus_and_slot() in hv_eject_device_work() can return a 'pdev' of NULL, and hv_eject_device_work() can remove the 'hpdev', and immediately send a message PCI_EJECTION_COMPLETE to the host, and the host immediately unassigns the PCI device from the guest; meanwhile, create_root_hv_pci_bus() and the PCI device driver can be probing the dead PCI device and reporting timeout errors. Fix the issue by adding a per-bus mutex 'state_lock' and grabbing the mutex before powering on the PCI bus in hv_pci_enter_d0(): when hv_eject_device_work() starts to run, it's able to find the 'pdev' and call pci_stop_and_remove_bus_device(pdev): if the PCI device driver has loaded, the PCI device driver's probe() function is already called in create_root_hv_pci_bus() -> pci_bus_add_devices(), and now hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to call the PCI device driver's remove() function and remove the device reliably; if the PCI device driver hasn't loaded yet, the function call hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to remove the PCI device reliably and the PCI device driver's probe() function won't be called; if the PCI device driver's probe() is already running (e.g., systemd-udev is loading the PCI device driver), it must be holding the per-device lock, and after the probe() finishes and releases the lock, hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to proceed to remove the device reliably. Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-6-decui@microsoft.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-hyperv.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 6e91f1ac4dc3..9b15c0130bbb 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -483,7 +483,10 @@ struct hv_pcibus_device { struct fwnode_handle *fwnode; /* Protocol version negotiated with the host */ enum pci_protocol_version_t protocol_version; + + struct mutex state_lock; enum hv_pcibus_state state; + struct hv_device *hdev; resource_size_t low_mmio_space; resource_size_t high_mmio_space; @@ -2191,6 +2194,8 @@ static void pci_devices_present_work(struct work_struct *work) if (!dr) return; + mutex_lock(&hbus->state_lock); + /* First, mark all existing children as reported missing. */ spin_lock_irqsave(&hbus->device_list_lock, flags); list_for_each_entry(hpdev, &hbus->children, list_entry) { @@ -2272,6 +2277,8 @@ static void pci_devices_present_work(struct work_struct *work) break; } + mutex_unlock(&hbus->state_lock); + kfree(dr); } @@ -2420,6 +2427,8 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; + mutex_lock(&hbus->state_lock); + /* * Ejection can come before or after the PCI bus has been set up, so * attempt to find it and tear down the bus state, if it exists. This @@ -2456,6 +2465,8 @@ static void hv_eject_device_work(struct work_struct *work) put_pcichild(hpdev); put_pcichild(hpdev); /* hpdev has been freed. Do not use it any more. */ + + mutex_unlock(&hbus->state_lock); } /** @@ -3218,6 +3229,7 @@ static int hv_pci_probe(struct hv_device *hdev, return -ENOMEM; hbus->bridge = bridge; + mutex_init(&hbus->state_lock); hbus->state = hv_pcibus_init; hbus->wslot_res_allocated = -1; @@ -3322,9 +3334,11 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_irq_domain; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto free_irq_domain; + goto release_state_lock; ret = hv_pci_allocate_bridge_windows(hbus); if (ret) @@ -3342,12 +3356,15 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_windows; + mutex_unlock(&hbus->state_lock); return 0; free_windows: hv_pci_free_bridge_windows(hbus); exit_d0: (void) hv_pci_bus_exit(hdev, true); +release_state_lock: + mutex_unlock(&hbus->state_lock); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: @@ -3580,20 +3597,26 @@ static int hv_pci_resume(struct hv_device *hdev) if (ret) goto out; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto out; + goto release_state_lock; ret = hv_send_resources_allocated(hdev); if (ret) - goto out; + goto release_state_lock; prepopulate_bars(hbus); hv_pci_restore_msi_state(hbus); hbus->state = hv_pcibus_installed; + mutex_unlock(&hbus->state_lock); return 0; + +release_state_lock: + mutex_unlock(&hbus->state_lock); out: vmbus_close(hdev->channel); return ret; From 4a557910bbed3e2111bee221f63de126f1b6df5c Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 10 Jun 2023 17:26:43 +0800 Subject: [PATCH 24/98] cgroup: Do not corrupt task iteration when rebinding subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6f363f5aa845561f7ea496d8b1175e3204470486 upstream. We found a refcount UAF bug as follows: refcount_t: addition on 0; use-after-free. WARNING: CPU: 1 PID: 342 at lib/refcount.c:25 refcount_warn_saturate+0xa0/0x148 Workqueue: events cpuset_hotplug_workfn Call trace: refcount_warn_saturate+0xa0/0x148 __refcount_add.constprop.0+0x5c/0x80 css_task_iter_advance_css_set+0xd8/0x210 css_task_iter_advance+0xa8/0x120 css_task_iter_next+0x94/0x158 update_tasks_root_domain+0x58/0x98 rebuild_root_domains+0xa0/0x1b0 rebuild_sched_domains_locked+0x144/0x188 cpuset_hotplug_workfn+0x138/0x5a0 process_one_work+0x1e8/0x448 worker_thread+0x228/0x3e0 kthread+0xe0/0xf0 ret_from_fork+0x10/0x20 then a kernel panic will be triggered as below: Unable to handle kernel paging request at virtual address 00000000c0000010 Call trace: cgroup_apply_control_disable+0xa4/0x16c rebind_subsystems+0x224/0x590 cgroup_destroy_root+0x64/0x2e0 css_free_rwork_fn+0x198/0x2a0 process_one_work+0x1d4/0x4bc worker_thread+0x158/0x410 kthread+0x108/0x13c ret_from_fork+0x10/0x18 The race that cause this bug can be shown as below: (hotplug cpu) | (umount cpuset) mutex_lock(&cpuset_mutex) | mutex_lock(&cgroup_mutex) cpuset_hotplug_workfn | rebuild_root_domains | rebind_subsystems update_tasks_root_domain | spin_lock_irq(&css_set_lock) css_task_iter_start | list_move_tail(&cset->e_cset_node[ss->id] while(css_task_iter_next) | &dcgrp->e_csets[ss->id]); css_task_iter_end | spin_unlock_irq(&css_set_lock) mutex_unlock(&cpuset_mutex) | mutex_unlock(&cgroup_mutex) Inside css_task_iter_start/next/end, css_set_lock is hold and then released, so when iterating task(left side), the css_set may be moved to another list(right side), then it->cset_head points to the old list head and it->cset_pos->next points to the head node of new list, which can't be used as struct css_set. To fix this issue, switch from all css_sets to only scgrp's css_sets to patch in-flight iterators to preserve correct iteration, and then update it->cset_head as well. Reported-by: Gaosheng Cui Link: https://www.spinics.net/lists/cgroups/msg37935.html Suggested-by: Michal Koutný Link: https://lore.kernel.org/all/20230526114139.70274-1-xiujianfeng@huaweicloud.com/ Signed-off-by: Xiu Jianfeng Fixes: 2d8f243a5e6e ("cgroup: implement cgroup->e_csets[]") Cc: stable@vger.kernel.org # v3.16+ Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup/cgroup.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index edae2f49f73f..6ccdbce17399 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1741,7 +1741,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1785,7 +1785,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1803,9 +1804,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { From 00010b52c7051cf93fd4ba623924de082819af15 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Thu, 18 May 2023 11:39:36 +0200 Subject: [PATCH 25/98] mmc: sdhci-msm: Disable broken 64-bit DMA on MSM8916 commit e6f9e590b72e12bbb86b1b8be7e1981f357392ad upstream. While SDHCI claims to support 64-bit DMA on MSM8916 it does not seem to be properly functional. It is not immediately obvious because SDHCI is usually used with IOMMU bypassed on this SoC, and all physical memory has 32-bit addresses. But when trying to enable the IOMMU it quickly fails with an error such as the following: arm-smmu 1e00000.iommu: Unhandled context fault: fsr=0x402, iova=0xfffff200, fsynr=0xe0000, cbfrsynra=0x140, cb=3 mmc1: ADMA error: 0x02000000 mmc1: sdhci: ============ SDHCI REGISTER DUMP =========== mmc1: sdhci: Sys addr: 0x00000000 | Version: 0x00002e02 mmc1: sdhci: Blk size: 0x00000008 | Blk cnt: 0x00000000 mmc1: sdhci: Argument: 0x00000000 | Trn mode: 0x00000013 mmc1: sdhci: Present: 0x03f80206 | Host ctl: 0x00000019 mmc1: sdhci: Power: 0x0000000f | Blk gap: 0x00000000 mmc1: sdhci: Wake-up: 0x00000000 | Clock: 0x00000007 mmc1: sdhci: Timeout: 0x0000000a | Int stat: 0x00000001 mmc1: sdhci: Int enab: 0x03ff900b | Sig enab: 0x03ff100b mmc1: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00000000 mmc1: sdhci: Caps: 0x322dc8b2 | Caps_1: 0x00008007 mmc1: sdhci: Cmd: 0x0000333a | Max curr: 0x00000000 mmc1: sdhci: Resp[0]: 0x00000920 | Resp[1]: 0x5b590000 mmc1: sdhci: Resp[2]: 0xe6487f80 | Resp[3]: 0x0a404094 mmc1: sdhci: Host ctl2: 0x00000008 mmc1: sdhci: ADMA Err: 0x00000001 | ADMA Ptr: 0x0000000ffffff224 mmc1: sdhci_msm: ----------- VENDOR REGISTER DUMP ----------- mmc1: sdhci_msm: DLL sts: 0x00000000 | DLL cfg: 0x60006400 | DLL cfg2: 0x00000000 mmc1: sdhci_msm: DLL cfg3: 0x00000000 | DLL usr ctl: 0x00000000 | DDR cfg: 0x00000000 mmc1: sdhci_msm: Vndr func: 0x00018a9c | Vndr func2 : 0xf88018a8 Vndr func3: 0x00000000 mmc1: sdhci: ============================================ mmc1: sdhci: fffffffff200: DMA 0x0000ffffffffe100, LEN 0x0008, Attr=0x21 mmc1: sdhci: fffffffff20c: DMA 0x0000000000000000, LEN 0x0000, Attr=0x03 Looking closely it's obvious that only the 32-bit part of the address (0xfffff200) arrives at the SMMU, the higher 16-bit (0xffff...) get lost somewhere. This might not be a limitation of the SDHCI itself but perhaps the bus/interconnect it is connected to, or even the connection to the SMMU. Work around this by setting SDHCI_QUIRK2_BROKEN_64_BIT_DMA to avoid using 64-bit addresses. Signed-off-by: Stephan Gerhold Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230518-msm8916-64bit-v1-1-5694b0f35211@gerhold.net Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-msm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 83d38e44fc25..6537a845266d 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -2482,6 +2482,9 @@ static inline void sdhci_msm_get_of_property(struct platform_device *pdev, msm_host->ddr_config = DDR_CONFIG_POR_VAL; of_property_read_u32(node, "qcom,dll-config", &msm_host->dll_config); + + if (of_device_is_compatible(node, "qcom,msm8916-sdhci")) + host->quirks2 |= SDHCI_QUIRK2_BROKEN_64_BIT_DMA; } static int sdhci_msm_gcc_reset(struct device *dev, struct sdhci_host *host) From 999173f295cc6da8e60c444070d1372f297ccd10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Hundeb=C3=B8ll?= Date: Wed, 7 Jun 2023 10:27:12 +0200 Subject: [PATCH 26/98] mmc: meson-gx: remove redundant mmc_request_done() call from irq context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3c40eb8145325b0f5b93b8a169146078cb2c49d6 upstream. The call to mmc_request_done() can schedule, so it must not be called from irq context. Wake the irq thread if it needs to be called, and let its existing logic do its work. Fixes the following kernel bug, which appears when running an RT patched kernel on the AmLogic Meson AXG A113X SoC: [ 11.111407] BUG: scheduling while atomic: kworker/0:1H/75/0x00010001 [ 11.111438] Modules linked in: [ 11.111451] CPU: 0 PID: 75 Comm: kworker/0:1H Not tainted 6.4.0-rc3-rt2-rtx-00081-gfd07f41ed6b4-dirty #1 [ 11.111461] Hardware name: RTX AXG A113X Linux Platform Board (DT) [ 11.111469] Workqueue: kblockd blk_mq_run_work_fn [ 11.111492] Call trace: [ 11.111497] dump_backtrace+0xac/0xe8 [ 11.111510] show_stack+0x18/0x28 [ 11.111518] dump_stack_lvl+0x48/0x60 [ 11.111530] dump_stack+0x18/0x24 [ 11.111537] __schedule_bug+0x4c/0x68 [ 11.111548] __schedule+0x80/0x574 [ 11.111558] schedule_loop+0x2c/0x50 [ 11.111567] schedule_rtlock+0x14/0x20 [ 11.111576] rtlock_slowlock_locked+0x468/0x730 [ 11.111587] rt_spin_lock+0x40/0x64 [ 11.111596] __wake_up_common_lock+0x5c/0xc4 [ 11.111610] __wake_up+0x18/0x24 [ 11.111620] mmc_blk_mq_req_done+0x68/0x138 [ 11.111633] mmc_request_done+0x104/0x118 [ 11.111644] meson_mmc_request_done+0x38/0x48 [ 11.111654] meson_mmc_irq+0x128/0x1f0 [ 11.111663] __handle_irq_event_percpu+0x70/0x114 [ 11.111674] handle_irq_event_percpu+0x18/0x4c [ 11.111683] handle_irq_event+0x80/0xb8 [ 11.111691] handle_fasteoi_irq+0xa4/0x120 [ 11.111704] handle_irq_desc+0x20/0x38 [ 11.111712] generic_handle_domain_irq+0x1c/0x28 [ 11.111721] gic_handle_irq+0x8c/0xa8 [ 11.111735] call_on_irq_stack+0x24/0x4c [ 11.111746] do_interrupt_handler+0x88/0x94 [ 11.111757] el1_interrupt+0x34/0x64 [ 11.111769] el1h_64_irq_handler+0x18/0x24 [ 11.111779] el1h_64_irq+0x64/0x68 [ 11.111786] __add_wait_queue+0x0/0x4c [ 11.111795] mmc_blk_rw_wait+0x84/0x118 [ 11.111804] mmc_blk_mq_issue_rq+0x5c4/0x654 [ 11.111814] mmc_mq_queue_rq+0x194/0x214 [ 11.111822] blk_mq_dispatch_rq_list+0x3ac/0x528 [ 11.111834] __blk_mq_sched_dispatch_requests+0x340/0x4d0 [ 11.111847] blk_mq_sched_dispatch_requests+0x38/0x70 [ 11.111858] blk_mq_run_work_fn+0x3c/0x70 [ 11.111865] process_one_work+0x17c/0x1f0 [ 11.111876] worker_thread+0x1d4/0x26c [ 11.111885] kthread+0xe4/0xf4 [ 11.111894] ret_from_fork+0x10/0x20 Fixes: 51c5d8447bd7 ("MMC: meson: initial support for GX platforms") Cc: stable@vger.kernel.org Signed-off-by: Martin Hundebøll Link: https://lore.kernel.org/r/20230607082713.517157-1-martin@geanix.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/meson-gx-mmc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 753f9ea254d4..39640906d333 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -981,11 +981,8 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id) if (status & (IRQ_END_OF_CHAIN | IRQ_RESP_STATUS)) { if (data && !cmd->error) data->bytes_xfered = data->blksz * data->blocks; - if (meson_mmc_bounce_buf_read(data) || - meson_mmc_get_next_command(cmd)) - ret = IRQ_WAKE_THREAD; - else - ret = IRQ_HANDLED; + + return IRQ_WAKE_THREAD; } out: @@ -997,9 +994,6 @@ out: writel(start, host->regs + SD_EMMC_START); } - if (ret == IRQ_HANDLED) - meson_mmc_request_done(host->mmc, cmd->mrq); - return ret; } From acee272283f411cf0ed692e6ec92aee95f7f9084 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Tue, 13 Jun 2023 15:41:46 +0200 Subject: [PATCH 27/98] mmc: mmci: stm32: fix max busy timeout calculation commit 47b3ad6b7842f49d374a01b054a4b1461a621bdc upstream. The way that the timeout is currently calculated could lead to a u64 timeout value in mmci_start_command(). This value is then cast in a u32 register that leads to mmc erase failed issue with some SD cards. Fixes: 8266c585f489 ("mmc: mmci: add hardware busy timeout feature") Signed-off-by: Yann Gautier Signed-off-by: Christophe Kerello Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230613134146.418016-1-yann.gautier@foss.st.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/mmci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 2c4eda83ca18..090246aa6f03 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1729,7 +1729,8 @@ static void mmci_set_max_busy_timeout(struct mmc_host *mmc) return; if (host->variant->busy_timeout && mmc->actual_clock) - max_busy_timeout = ~0UL / (mmc->actual_clock / MSEC_PER_SEC); + max_busy_timeout = U32_MAX / DIV_ROUND_UP(mmc->actual_clock, + MSEC_PER_SEC); mmc->max_busy_timeout = max_busy_timeout; } From ba9952e2f50b7cc52103092442f36578a29ae6ab Mon Sep 17 00:00:00 2001 From: Matthias May Date: Thu, 21 Jul 2022 22:27:19 +0200 Subject: [PATCH 28/98] ip_tunnels: allow VXLAN/GENEVE to inherit TOS/TTL from VLAN commit 7074732c8faee201a245a6f983008a5789c0be33 upstream. The current code allows for VXLAN and GENEVE to inherit the TOS respective the TTL when skb-protocol is ETH_P_IP or ETH_P_IPV6. However when the payload is VLAN encapsulated, then this inheriting does not work, because the visible skb-protocol is of type ETH_P_8021Q or ETH_P_8021AD. Instead of skb->protocol use skb_protocol(). Signed-off-by: Matthias May Link: https://lore.kernel.org/r/20220721202718.10092-1-matthias.may@westermo.com Signed-off-by: Jakub Kicinski Cc: Nicolas Dichtel Signed-off-by: Greg Kroah-Hartman --- include/net/ip_tunnels.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 37d5d4968e20..4eb0edc6640d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -377,9 +377,11 @@ static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { - if (skb->protocol == htons(ETH_P_IP)) + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IP)) return iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; @@ -388,9 +390,11 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { - if (skb->protocol == htons(ETH_P_IP)) + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; From 4796d9b06917dee629d04b871f5fc038bd223164 Mon Sep 17 00:00:00 2001 From: Teresa Remmet Date: Wed, 14 Jun 2023 14:52:40 +0200 Subject: [PATCH 29/98] regulator: pca9450: Fix LDO3OUT and LDO4OUT MASK [ Upstream commit 7257d930aadcd62d1c7971ab14f3b1126356abdc ] L3_OUT and L4_OUT Bit fields range from Bit 0:4 and thus the mask should be 0x1F instead of 0x0F. Fixes: 0935ff5f1f0a ("regulator: pca9450: add pca9450 pmic driver") Signed-off-by: Teresa Remmet Reviewed-by: Frieder Schrempf Link: https://lore.kernel.org/r/20230614125240.3946519-1-t.remmet@phytec.de Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- include/linux/regulator/pca9450.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 71902f41c919..0c3edff6bdff 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -196,11 +196,11 @@ enum { /* PCA9450_REG_LDO3_VOLT bits */ #define LDO3_EN_MASK 0xC0 -#define LDO3OUT_MASK 0x0F +#define LDO3OUT_MASK 0x1F /* PCA9450_REG_LDO4_VOLT bits */ #define LDO4_EN_MASK 0xC0 -#define LDO4OUT_MASK 0x0F +#define LDO4OUT_MASK 0x1F /* PCA9450_REG_LDO5_VOLT bits */ #define LDO5L_EN_MASK 0xC0 From 3c46a240ddbacd3e17b52dd68bf0cbf1e53270d2 Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Tue, 20 Jun 2023 13:28:24 -0700 Subject: [PATCH 30/98] regmap: spi-avmm: Fix regmap_bus max_raw_write [ Upstream commit c8e796895e2310b6130e7577248da1d771431a77 ] The max_raw_write member of the regmap_spi_avmm_bus structure is defined as: .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT SPI_AVMM_VAL_SIZE == 4 and MAX_WRITE_CNT == 1 so this results in a maximum write transfer size of 4 bytes which provides only enough space to transfer the address of the target register. It provides no space for the value to be transferred. This bug became an issue (divide-by-zero in _regmap_raw_write()) after the following was accepted into mainline: commit 3981514180c9 ("regmap: Account for register length when chunking") Change max_raw_write to include space (4 additional bytes) for both the register address and value: .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT Fixes: 7f9fb67358a2 ("regmap: add Intel SPI Slave to AVMM Bus Bridge support") Reviewed-by: Matthew Gerlach Signed-off-by: Russ Weight Link: https://lore.kernel.org/r/20230620202824.380313-1-russell.h.weight@intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/base/regmap/regmap-spi-avmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-spi-avmm.c b/drivers/base/regmap/regmap-spi-avmm.c index ad1da83e849f..67f89937219c 100644 --- a/drivers/base/regmap/regmap-spi-avmm.c +++ b/drivers/base/regmap/regmap-spi-avmm.c @@ -666,7 +666,7 @@ static const struct regmap_bus regmap_spi_avmm_bus = { .reg_format_endian_default = REGMAP_ENDIAN_NATIVE, .val_format_endian_default = REGMAP_ENDIAN_NATIVE, .max_raw_read = SPI_AVMM_VAL_SIZE * MAX_READ_CNT, - .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, + .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, .free_context = spi_avmm_bridge_ctx_free, }; From b07bb2914adac63d9a10c2878d4a0a0edf27edfe Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 6 Jun 2023 19:36:13 -0400 Subject: [PATCH 31/98] writeback: fix dereferencing NULL mapping->host on writeback_page_template commit 54abe19e00cfcc5a72773d15cd00ed19ab763439 upstream. When commit 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()") repurposed the writeback_dirty_page trace event as a template to create its new wait_on_page_writeback trace event, it ended up opening a window to NULL pointer dereference crashes due to the (infrequent) occurrence of a race where an access to a page in the swap-cache happens concurrently with the moment this page is being written to disk and the tracepoint is enabled: BUG: kernel NULL pointer dereference, address: 0000000000000040 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 800000010ec0a067 P4D 800000010ec0a067 PUD 102353067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 1320 Comm: shmem-worker Kdump: loaded Not tainted 6.4.0-rc5+ #13 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20230301gitf80f052277c8-1.fc37 03/01/2023 RIP: 0010:trace_event_raw_event_writeback_folio_template+0x76/0xf0 Code: 4d 85 e4 74 5c 49 8b 3c 24 e8 06 98 ee ff 48 89 c7 e8 9e 8b ee ff ba 20 00 00 00 48 89 ef 48 89 c6 e8 fe d4 1a 00 49 8b 04 24 <48> 8b 40 40 48 89 43 28 49 8b 45 20 48 89 e7 48 89 43 30 e8 a2 4d RSP: 0000:ffffaad580b6fb60 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff90e38035c01c RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff90e38035c044 RBP: ffff90e38035c024 R08: 0000000000000002 R09: 0000000000000006 R10: ffff90e38035c02e R11: 0000000000000020 R12: ffff90e380bac000 R13: ffffe3a7456d9200 R14: 0000000000001b81 R15: ffffe3a7456d9200 FS: 00007f2e4e8a15c0(0000) GS:ffff90e3fbc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000040 CR3: 00000001150c6003 CR4: 0000000000170ee0 Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x76/0x170 ? kernelmode_fixup_or_oops+0x84/0x110 ? exc_page_fault+0x65/0x150 ? asm_exc_page_fault+0x22/0x30 ? trace_event_raw_event_writeback_folio_template+0x76/0xf0 folio_wait_writeback+0x6b/0x80 shmem_swapin_folio+0x24a/0x500 ? filemap_get_entry+0xe3/0x140 shmem_get_folio_gfp+0x36e/0x7c0 ? find_busiest_group+0x43/0x1a0 shmem_fault+0x76/0x2a0 ? __update_load_avg_cfs_rq+0x281/0x2f0 __do_fault+0x33/0x130 do_read_fault+0x118/0x160 do_pte_missing+0x1ed/0x2a0 __handle_mm_fault+0x566/0x630 handle_mm_fault+0x91/0x210 do_user_addr_fault+0x22c/0x740 exc_page_fault+0x65/0x150 asm_exc_page_fault+0x22/0x30 This problem arises from the fact that the repurposed writeback_dirty_page trace event code was written assuming that every pointer to mapping (struct address_space) would come from a file-mapped page-cache object, thus mapping->host would always be populated, and that was a valid case before commit 19343b5bdd16. The swap-cache address space (swapper_spaces), however, doesn't populate its ->host (struct inode) pointer, thus leading to the crashes in the corner-case aforementioned. commit 19343b5bdd16 ended up breaking the assignment of __entry->name and __entry->ino for the wait_on_page_writeback tracepoint -- both dependent on mapping->host carrying a pointer to a valid inode. The assignment of __entry->name was fixed by commit 68f23b89067f ("memcg: fix a crash in wb_workfn when a device disappears"), and this commit fixes the remaining case, for __entry->ino. Link: https://lkml.kernel.org/r/20230606233613.1290819-1-aquini@redhat.com Fixes: 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()") Signed-off-by: Rafael Aquini Reviewed-by: Yafang Shao Cc: Aristeu Rozanski Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/trace/events/writeback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 840d1ba84cf5..9d8303a93a36 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); - __entry->ino = mapping ? mapping->host->i_ino : 0; + __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = page->index; ), From 34a7e5021a437bf813e1b52fc9043b43d48e2325 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 07:38:14 -0600 Subject: [PATCH 32/98] io_uring/net: save msghdr->msg_control for retries Commit cac9e4418f4cbd548ccb065b3adcafe073f7f7d2 upstream. If the application sets ->msg_control and we have to later retry this command, or if it got queued with IOSQE_ASYNC to begin with, then we need to retain the original msg_control value. This is due to the net stack overwriting this field with an in-kernel pointer, to copy it in. Hitting that path for the second time will now fail the copy from user, as it's attempting to copy from a non-user address. Cc: stable@vger.kernel.org # 5.10+ Link: https://github.com/axboe/liburing/issues/880 Reported-and-tested-by: Marek Majkowski Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/io_uring.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1d8adc57a44a..f7c41d3d7752 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -581,6 +581,7 @@ struct io_sr_msg { size_t len; size_t done_io; struct io_buffer *kbuf; + void __user *msg_control; }; struct io_open { @@ -4864,10 +4865,16 @@ static int io_setup_async_msg(struct io_kiocb *req, static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { + struct io_sr_msg *sr = &req->sr_msg; + int ret; + iomsg->msg.msg_name = &iomsg->addr; iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, + ret = sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, req->sr_msg.msg_flags, &iomsg->free_iov); + /* save msg_control as sys_sendmsg() overwrites it */ + sr->msg_control = iomsg->msg.msg_control; + return ret; } static int io_sendmsg_prep_async(struct io_kiocb *req) @@ -4924,6 +4931,8 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret) return ret; kmsg = &iomsg; + } else { + kmsg->msg.msg_control = sr->msg_control; } flags = req->sr_msg.msg_flags; From 25a543ca3005f86cb8eee034510046ddb5537e83 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 07:39:42 -0600 Subject: [PATCH 33/98] io_uring/net: clear msg_controllen on partial sendmsg retry Commit b1dc492087db0f2e5a45f1072a743d04618dd6be upstream. If we have cmsg attached AND we transferred partial data at least, clear msg_controllen on retry so we don't attempt to send that again. Cc: stable@vger.kernel.org # 5.10+ Fixes: cac9e4418f4c ("io_uring/net: save msghdr->msg_control for retries") Reported-by: Stefan Metzmacher Reviewed-by: Stefan Metzmacher Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f7c41d3d7752..1d18aa17e71b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4949,6 +4949,8 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg); From 96987c383c2b69eac66b5e193447d5bbdbbc8a86 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jun 2023 07:41:10 -0600 Subject: [PATCH 34/98] io_uring/net: disable partial retries for recvmsg with cmsg Commit 78d0d2063bab954d19a1696feae4c7706a626d48 upstream. We cannot sanely handle partial retries for recvmsg if we have cmsg attached. If we don't, then we'd just be overwriting the initial cmsg header on retries. Alternatively we could increment and handle this appropriately, but it doesn't seem worth the complication. Move the MSG_WAITALL check into the non-multishot case while at it, since MSG_WAITALL is explicitly disabled for multishot anyway. Link: https://lore.kernel.org/io-uring/0b0d4411-c8fd-4272-770b-e030af6919a0@kernel.dk/ Cc: stable@vger.kernel.org # 5.10+ Reported-by: Stefan Metzmacher Reviewed-by: Stefan Metzmacher Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1d18aa17e71b..cbfc9bbe87b0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -5201,7 +5201,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) flags = req->sr_msg.msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) + if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) min_ret = iov_iter_count(&kmsg->msg.msg_iter); ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, From 793d0224bb601e97ea2c01c5c25ab798abf118d8 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 12 Jun 2023 11:14:56 +0900 Subject: [PATCH 35/98] nilfs2: prevent general protection fault in nilfs_clear_dirty_page() commit 782e53d0c14420858dbf0f8f797973c150d3b6d7 upstream. In a syzbot stress test that deliberately causes file system errors on nilfs2 with a corrupted disk image, it has been reported that nilfs_clear_dirty_page() called from nilfs_clear_dirty_pages() can cause a general protection fault. In nilfs_clear_dirty_pages(), when looking up dirty pages from the page cache and calling nilfs_clear_dirty_page() for each dirty page/folio retrieved, the back reference from the argument page to "mapping" may have been changed to NULL (and possibly others). It is necessary to check this after locking the page/folio. So, fix this issue by not calling nilfs_clear_dirty_page() on a page/folio after locking it in nilfs_clear_dirty_pages() if the back reference "mapping" from the page/folio is different from the "mapping" that held the page/folio just before. Link: https://lkml.kernel.org/r/20230612021456.3682-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+53369d11851d8f26735c@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/000000000000da4f6b05eb9bf593@google.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/page.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index d1a148f0cae3..81992b9a219b 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -369,7 +369,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) struct page *page = pvec.pages[i]; lock_page(page); - nilfs_clear_dirty_page(page, silent); + + /* + * This page may have been removed from the address + * space by truncation or invalidation when the lock + * was acquired. Skip processing in that case. + */ + if (likely(page->mapping == mapping)) + nilfs_clear_dirty_page(page, silent); + unlock_page(page); } pagevec_release(&pvec); From 103734b429b9984571a92ce1a4184de79e9f9d13 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 14 Jun 2023 17:38:54 +0100 Subject: [PATCH 36/98] x86/mm: Avoid using set_pgd() outside of real PGD pages commit d082d48737c75d2b3cc1f972b8c8674c25131534 upstream. KPTI keeps around two PGDs: one for userspace and another for the kernel. Among other things, set_pgd() contains infrastructure to ensure that updates to the kernel PGD are reflected in the user PGD as well. One side-effect of this is that set_pgd() expects to be passed whole pages. Unfortunately, init_trampoline_kaslr() passes in a single entry: 'trampoline_pgd_entry'. When KPTI is on, set_pgd() will update 'trampoline_pgd_entry' (an 8-Byte globally stored [.bss] variable) and will then proceed to replicate that value into the non-existent neighboring user page (located +4k away), leading to the corruption of other global [.bss] stored variables. Fix it by directly assigning 'trampoline_pgd_entry' and avoiding set_pgd(). [ dhansen: tweak subject and changelog ] Fixes: 0925dda5962e ("x86/mm/KASLR: Use only one PUD entry for real mode trampoline") Suggested-by: Dave Hansen Signed-off-by: Lee Jones Signed-off-by: Dave Hansen Cc: Link: https://lore.kernel.org/all/20230614163859.924309-1-lee@kernel.org/g Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaslr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 557f0fe25dff..37db264866b6 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -172,10 +172,10 @@ void __meminit init_trampoline_kaslr(void) set_p4d(p4d_tramp, __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)); } else { - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } } From 9bac4a2b7326411a05aca952dde5c453798b62ac Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 7 Jun 2023 15:24:27 +0200 Subject: [PATCH 37/98] memfd: check for non-NULL file_seals in memfd_create() syscall [ Upstream commit 935d44acf621aa0688fef8312dec3e5940f38f4e ] Ensure that file_seals is non-NULL before using it in the memfd_create() syscall. One situation in which memfd_file_seals_ptr() could return a NULL pointer when CONFIG_SHMEM=n, oopsing the kernel. Link: https://lkml.kernel.org/r/20230607132427.2867435-1-roberto.sassu@huaweicloud.com Fixes: 47b9012ecdc7 ("shmem: add sealing support to hugetlb-backed memfd") Signed-off-by: Roberto Sassu Cc: Marc-Andr Lureau Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- mm/memfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memfd.c b/mm/memfd.c index 475d095dd7f5..a73af8be9c28 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -330,7 +330,8 @@ SYSCALL_DEFINE2(memfd_create, if (flags & MFD_ALLOW_SEALING) { file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); From 29672dc47d99f0c9247c16961c4e46419cdb705f Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:12 +0300 Subject: [PATCH 38/98] mmc: meson-gx: fix deferred probing [ Upstream commit b8ada54fa1b83f3b6480d4cced71354301750153 ] The driver overrides the error codes and IRQ0 returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Since commit ce753ad1549c ("platform: finally disallow IRQ0 in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs, so we now can safely ignore it... Fixes: cbcaac6d7dd2 ("mmc: meson-gx-mmc: Fix platform_get_irq's error checking") Cc: stable@vger.kernel.org # v5.19+ Signed-off-by: Sergey Shtylyov Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20230617203622.6812-3-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/meson-gx-mmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 39640906d333..287705729064 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -1179,8 +1179,8 @@ static int meson_mmc_probe(struct platform_device *pdev) } host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto free_host; } From 0ce3d0c068d917e6ea6e3b8b5c424c97cd2bd4e0 Mon Sep 17 00:00:00 2001 From: Chen Aotian Date: Sun, 9 Apr 2023 10:20:48 +0800 Subject: [PATCH 39/98] ieee802154: hwsim: Fix possible memory leaks [ Upstream commit a61675294735570daca3779bd1dbb3715f7232bd ] After replacing e->info, it is necessary to free the old einfo. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Reviewed-by: Miquel Raynal Reviewed-by: Alexander Aring Signed-off-by: Chen Aotian Link: https://lore.kernel.org/r/20230409022048.61223-1-chenaotian2@163.com Signed-off-by: Stefan Schmidt Signed-off-by: Sasha Levin --- drivers/net/ieee802154/mac802154_hwsim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 36f1c5aa98fc..1ab1ba41c430 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -522,7 +522,7 @@ static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info) static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; - struct hwsim_edge_info *einfo; + struct hwsim_edge_info *einfo, *einfo_old; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; @@ -560,8 +560,10 @@ static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { einfo->lqi = lqi; - rcu_assign_pointer(e->info, einfo); + einfo_old = rcu_replace_pointer(e->info, einfo, + lockdep_is_held(&hwsim_phys_lock)); rcu_read_unlock(); + kfree_rcu(einfo_old, rcu); mutex_unlock(&hwsim_phys_lock); return 0; } From 491ce3c1d98ab39a1deae154d442a560f7de0708 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Wed, 10 May 2023 01:30:21 +0000 Subject: [PATCH 40/98] xfrm: Treat already-verified secpath entries as optional [ Upstream commit 1f8b6df6a997a430b0c48b504638154b520781ad ] This change allows inbound traffic through nested IPsec tunnels to successfully match policies and templates, while retaining the secpath stack trace as necessary for netfilter policies. Specifically, this patch marks secpath entries that have already matched against a relevant policy as having been verified, allowing it to be treated as optional and skipped after a tunnel decapsulation (during which the src/dst/proto/etc may have changed, and the correct policy chain no long be resolvable). This approach is taken as opposed to the iteration in b0355dbbf13c, where the secpath was cleared, since that breaks subsequent validations that rely on the existence of the secpath entries (netfilter policies, or transport-in-tunnel mode, where policies remain resolvable). Fixes: b0355dbbf13c ("Fix XFRM-I support for nested ESP tunnels") Test: Tested against Android Kernel Unit Tests Test: Tested against Android CTS Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- include/net/xfrm.h | 1 + net/xfrm/xfrm_input.c | 1 + net/xfrm/xfrm_policy.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 73030094c6e6..6156ed2950f9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1026,6 +1026,7 @@ struct xfrm_offload { struct sec_path { int len; int olen; + int verified_cnt; struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 7c5958a2eed4..a6861832710d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -130,6 +130,7 @@ struct sec_path *secpath_set(struct sk_buff *skb) memset(sp->ovec, 0, sizeof(sp->ovec)); sp->olen = 0; sp->len = 0; + sp->verified_cnt = 0; return sp; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 3e28a84ab922..b0a19cc92879 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3275,6 +3275,13 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { + if (idx < sp->verified_cnt) { + /* Secpath entry previously verified, consider optional and + * continue searching + */ + continue; + } + if (start == -1) start = -2-idx; break; @@ -3647,6 +3654,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. + * Upon success, marks secpath entries as having been + * verified to allow them to be skipped in future policy + * checks (e.g. nested tunnels). */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); @@ -3665,6 +3675,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_pols_put(pols, npols); + sp->verified_cnt = k; + return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); From d055ee18cab88d1ab751ead8a69f833d8382b663 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:56 +0200 Subject: [PATCH 41/98] xfrm: interface: rename xfrm_interface.c to xfrm_interface_core.c [ Upstream commit ee9a113ab63468137802898bcd2c598998c96938 ] This change allows adding additional files to the xfrm_interface module. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-2-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau Stable-dep-of: a287f5b0cfc6 ("xfrm: Ensure policies always checked on XFRM-I input path") Signed-off-by: Sasha Levin --- net/xfrm/Makefile | 2 ++ net/xfrm/{xfrm_interface.c => xfrm_interface_core.c} | 0 2 files changed, 2 insertions(+) rename net/xfrm/{xfrm_interface.c => xfrm_interface_core.c} (100%) diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 494aa744bfb9..08a2870fdd36 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -3,6 +3,8 @@ # Makefile for the XFRM subsystem. # +xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o + obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface_core.c similarity index 100% rename from net/xfrm/xfrm_interface.c rename to net/xfrm/xfrm_interface_core.c From 3229a29e95f51aec039c267a0d2b6092a8e245d6 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Wed, 10 May 2023 01:30:22 +0000 Subject: [PATCH 42/98] xfrm: Ensure policies always checked on XFRM-I input path [ Upstream commit a287f5b0cfc6804c5b12a4be13c7c9fe27869e90 ] This change adds methods in the XFRM-I input path that ensures that policies are checked prior to processing of the subsequent decapsulated packet, after which the relevant policies may no longer be resolvable (due to changing src/dst/proto/etc). Notably, raw ESP/AH packets did not perform policy checks inherently, whereas all other encapsulated packets (UDP, TCP encapsulated) do policy checks after calling xfrm_input handling in the respective encapsulation layer. Fixes: b0355dbbf13c ("Fix XFRM-I support for nested ESP tunnels") Test: Verified with additional Android Kernel Unit tests Test: Verified against Android CTS Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/xfrm/xfrm_interface_core.c | 54 +++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1e8b26eecb3f..694eec6ca147 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -207,6 +207,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -774,8 +820,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -825,8 +871,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, From 0b180495f6b06058d752ddee9941a478ddae9d7f Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 15 Feb 2023 01:20:27 +0200 Subject: [PATCH 43/98] bpf: track immediate values written to stack by BPF_ST instruction [ Upstream commit ecdf985d7615356b78241fdb159c091830ed0380 ] For aligned stack writes using BPF_ST instruction track stored values in a same way BPF_STX is handled, e.g. make sure that the following commands produce similar verifier knowledge: fp[-8] = 42; r1 = 42; fp[-8] = r1; This covers two cases: - non-null values written to stack are stored as spill of fake registers; - null values written to stack are stored as STACK_ZERO marks. Previously both cases above used STACK_MISC marks instead. Some verifier test cases relied on the old logic to obtain STACK_MISC marks for some stack values. These test cases are updated in the same commit to avoid failures during bisect. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20230214232030.1502829-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Stable-dep-of: 713274f1f2c8 ("bpf: Fix verifier id tracking of scalars on spill") Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 18 ++- .../bpf/verifier/bounds_mix_sign_unsign.c | 110 ++++++++++-------- 2 files changed, 80 insertions(+), 48 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 27fd331aefe5..9d7a6bf5f7c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2659,6 +2659,11 @@ static void save_register_state(struct bpf_func_state *state, scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]); } +static bool is_bpf_st_mem(struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM; +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -2670,8 +2675,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; + u32 dst_reg = insn->dst_reg; err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); if (err) @@ -2719,6 +2725,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && + insn->imm != 0 && env->bpf_capable) { + struct bpf_reg_state fake_reg = {}; + + __mark_reg_known(&fake_reg, (u32)insn->imm); + fake_reg.type = SCALAR_VALUE; + save_register_state(state, spi, &fake_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -2753,7 +2766,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (reg && register_is_null(reg)) { + if ((reg && register_is_null(reg)) || + (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { /* backtracking doesn't work for STACK_ZERO yet. */ err = mark_chain_precision(env, value_regno); if (err) diff --git a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c index c2aa6f26738b..bf82b923c5fe 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c +++ b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c @@ -1,13 +1,14 @@ { "bounds checks mixing signed and unsigned, positive bounds", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 2), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 3), @@ -17,20 +18,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3), @@ -40,20 +42,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 2", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5), @@ -65,20 +68,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 3", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 4), @@ -89,20 +93,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 4", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), @@ -112,19 +117,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 5", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5), @@ -135,17 +141,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 6", .insns = { + BPF_MOV64_REG(BPF_REG_9, BPF_REG_1), + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -512), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_6, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_6, 5), @@ -163,13 +172,14 @@ { "bounds checks mixing signed and unsigned, variant 7", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 1024 * 1024 * 1024), BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3), @@ -179,19 +189,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 8", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), @@ -203,20 +214,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 9", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_LD_IMM64(BPF_REG_2, -9223372036854775808ULL), BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), @@ -228,19 +240,20 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .result = ACCEPT, }, { "bounds checks mixing signed and unsigned, variant 10", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), @@ -252,20 +265,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 11", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -278,20 +292,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 12", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -6), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -303,20 +318,21 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 13", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, 2), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -331,7 +347,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -340,13 +356,14 @@ .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1, offsetof(struct __sk_buff, mark)), + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_8, 2), @@ -360,20 +377,21 @@ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, -3), BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, - .fixup_map_hash_8b = { 4 }, + .fixup_map_hash_8b = { 6 }, .errstr = "unbounded min value", .result = REJECT, }, { "bounds checks mixing signed and unsigned, variant 15", .insns = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), - BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_MOV64_IMM(BPF_REG_2, -6), BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), @@ -387,7 +405,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_8b = { 5 }, .errstr = "unbounded min value", .result = REJECT, }, From 25e89fa7b5a8d20fbd8bb753da560f9fca132228 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 7 Jun 2023 15:39:50 +0300 Subject: [PATCH 44/98] bpf: Fix verifier id tracking of scalars on spill [ Upstream commit 713274f1f2c896d37017efee333fd44149710119 ] The following scenario describes a bug in the verifier where it incorrectly concludes about equivalent scalar IDs which could lead to verifier bypass in privileged mode: 1. Prepare a 32-bit rogue number. 2. Put the rogue number into the upper half of a 64-bit register, and roll a random (unknown to the verifier) bit in the lower half. The rest of the bits should be zero (although variations are possible). 3. Assign an ID to the register by MOVing it to another arbitrary register. 4. Perform a 32-bit spill of the register, then perform a 32-bit fill to another register. Due to a bug in the verifier, the ID will be preserved, although the new register will contain only the lower 32 bits, i.e. all zeros except one random bit. At this point there are two registers with different values but the same ID, which means the integrity of the verifier state has been corrupted. 5. Compare the new 32-bit register with 0. In the branch where it's equal to 0, the verifier will believe that the original 64-bit register is also 0, because it has the same ID, but its actual value still contains the rogue number in the upper half. Some optimizations of the verifier prevent the actual bypass, so extra care is needed: the comparison must be between two registers, and both branches must be reachable (this is why one random bit is needed). Both branches are still suitable for the bypass. 6. Right shift the original register by 32 bits to pop the rogue number. 7. Use the rogue number as an offset with any pointer. The verifier will believe that the offset is 0, while in reality it's the given number. The fix is similar to the 32-bit BPF_MOV handling in check_alu_op for SCALAR_VALUE. If the spill is narrowing the actual register value, don't keep the ID, make sure it's reset to 0. Fixes: 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Tested-by: Andrii Nakryiko # Checked veristat delta Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20230607123951.558971-2-maxtram95@gmail.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9d7a6bf5f7c1..4f2271f27a1d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2725,6 +2725,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + /* Break the relation on a narrowing spill. */ + if (fls64(reg->umax_value) > BITS_PER_BYTE * size) + state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; From 964cfdfd4b4f438c289c1f40481524d46e5a3f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Mon, 5 Jun 2023 04:06:54 -0700 Subject: [PATCH 45/98] xfrm: fix inbound ipv4/udp/esp packets to UDPv6 dualstack sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1166a530a84758bb9e6b448fc8c195ed413f5ded ] Before Linux v5.8 an AF_INET6 SOCK_DGRAM (udp/udplite) socket with SOL_UDP, UDP_ENCAP, UDP_ENCAP_ESPINUDP{,_NON_IKE} enabled would just unconditionally use xfrm4_udp_encap_rcv(), afterwards such a socket would use the newly added xfrm6_udp_encap_rcv() which only handles IPv6 packets. Cc: Sabrina Dubroca Cc: Steffen Klassert Cc: Jakub Kicinski Cc: Benedict Wong Cc: Yan Yan Fixes: 0146dca70b87 ("xfrm: add support for UDPv6 encapsulation of ESP") Signed-off-by: Maciej Żenczykowski Reviewed-by: Simon Horman Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/ipv4/xfrm4_input.c | 1 + net/ipv6/xfrm6_input.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index ad2afeef4f10..eac206a290d0 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -164,6 +164,7 @@ drop: kfree_skb(skb); return 0; } +EXPORT_SYMBOL(xfrm4_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 04cbeefd8982..4907ab241d6b 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -86,6 +86,9 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) __be32 *udpdata32; __u16 encap_type = up->encap_type; + if (skb->protocol == htons(ETH_P_IP)) + return xfrm4_udp_encap_rcv(sk, skb); + /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; From d967bd7ea6cc249063e7be2cac6d9107bee51e3d Mon Sep 17 00:00:00 2001 From: Magali Lemes Date: Tue, 13 Jun 2023 09:32:22 -0300 Subject: [PATCH 46/98] selftests: net: fcnal-test: check if FIPS mode is enabled [ Upstream commit d7a2fc1437f71cb058c7b11bc33dfc19e4bf277a ] There are some MD5 tests which fail when the kernel is in FIPS mode, since MD5 is not FIPS compliant. Add a check and only run those tests if FIPS mode is not enabled. Fixes: f0bee1ebb5594 ("fcnal-test: Add TCP MD5 tests") Fixes: 5cad8bce26e01 ("fcnal-test: Add TCP MD5 tests for VRF") Reviewed-by: David Ahern Signed-off-by: Magali Lemes Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- tools/testing/selftests/net/fcnal-test.sh | 27 ++++++++++++++++------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 364c82b797c1..6ecdbbe1b54f 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -84,6 +84,13 @@ NSC_CMD="ip netns exec ${NSC}" which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) +# Check if FIPS mode is enabled +if [ -f /proc/sys/crypto/fips_enabled ]; then + fips_enabled=`cat /proc/sys/crypto/fips_enabled` +else + fips_enabled=0 +fi + ################################################################################ # utilities @@ -1202,7 +1209,7 @@ ipv4_tcp_novrf() run_cmd nettest -d ${NSA_DEV} -r ${a} log_test_addr ${a} $? 1 "No server, device client, local conn" - ipv4_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv4_tcp_md5_novrf } ipv4_tcp_vrf() @@ -1256,9 +1263,11 @@ ipv4_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv4_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv4_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server @@ -2674,7 +2683,7 @@ ipv6_tcp_novrf() log_test_addr ${a} $? 1 "No server, device client, local conn" done - ipv6_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv6_tcp_md5_novrf } ipv6_tcp_vrf() @@ -2744,9 +2753,11 @@ ipv6_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv6_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv6_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server From 156dd06fb33793d3daeb785354b9e919a1a2849b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 14 Jun 2023 12:02:02 +0200 Subject: [PATCH 47/98] xfrm: Linearize the skb after offloading if needed. [ Upstream commit f015b900bc3285322029b4a7d132d6aeb0e51857 ] With offloading enabled, esp_xmit() gets invoked very late, from within validate_xmit_xfrm() which is after validate_xmit_skb() validates and linearizes the skb if the underlying device does not support fragments. esp_output_tail() may add a fragment to the skb while adding the auth tag/ IV. Devices without the proper support will then send skb->data points to with the correct length so the packet will have garbage at the end. A pcap sniffer will claim that the proper data has been sent since it parses the skb properly. It is not affected with INET_ESP_OFFLOAD disabled. Linearize the skb after offloading if the sending hardware requires it. It was tested on v4, v6 has been adopted. Fixes: 7785bba299a8d ("esp: Add a software GRO codepath") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 2ddba1e2cf22..ab6fe94b8fd9 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -337,6 +337,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 4cc19acfc369..6b30f34c7978 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -372,6 +372,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } From aedecd013d2c16e8d9579374794809bd435a57c5 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 14 Jun 2023 23:06:56 +0200 Subject: [PATCH 48/98] net: qca_spi: Avoid high load if QCA7000 is not available [ Upstream commit 92717c2356cb62c89e8a3dc37cbbab2502562524 ] In case the QCA7000 is not available via SPI (e.g. in reset), the driver will cause a high load. The reason for this is that the synchronization is never finished and schedule() is never called. Since the synchronization is not timing critical, it's safe to drop this from the scheduling condition. Signed-off-by: Stefan Wahren Fixes: 291ab06ecf67 ("net: qualcomm: new Ethernet over SPI driver for QCA7000") Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qualcomm/qca_spi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 8427fe1b8fd1..2205bb437e68 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -582,8 +582,7 @@ qcaspi_spi_thread(void *data) while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if ((qca->intr_req == qca->intr_svc) && - (qca->txr.skb[qca->txr.head] == NULL) && - (qca->sync == QCASPI_SYNC_READY)) + !qca->txr.skb[qca->txr.head]) schedule(); set_current_state(TASK_RUNNING); From 92f73c4f927cff42e69b51b87b1860e741c9c6b0 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:13 +0300 Subject: [PATCH 49/98] mmc: mtk-sd: fix deferred probing [ Upstream commit 0c4dc0f054891a2cbde0426b0c0fdf232d89f47f ] The driver overrides the error codes returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 208489032bdd ("mmc: mediatek: Add Mediatek MMC driver") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-4-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/mtk-sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 9871c19d2b4e..6d0fc247bddb 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2525,7 +2525,7 @@ static int msdc_drv_probe(struct platform_device *pdev) host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - ret = -EINVAL; + ret = host->irq; goto host_free; } From 840deb8d1418999203b6433fcea3133a92f669f3 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:14 +0300 Subject: [PATCH 50/98] mmc: mvsdio: fix deferred probing [ Upstream commit 8d84064da0d4672e74f984e8710f27881137472c ] The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-5-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/mvsdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 629efbe639c4..b4f6a0a2fcb5 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -704,7 +704,7 @@ static int mvsd_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; mmc = mmc_alloc_host(sizeof(struct mvsd_host), &pdev->dev); if (!mmc) { From 445a9568dec1cf2b2e13c0c92ec0dc8e7a62d295 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:15 +0300 Subject: [PATCH 51/98] mmc: omap: fix deferred probing [ Upstream commit aedf4ba1ad00aaa94c1b66c73ecaae95e2564b95 ] The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-6-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 5e5af34090f1..ecf2a68d0e84 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1343,7 +1343,7 @@ static int mmc_omap_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); host->virt_base = devm_ioremap_resource(&pdev->dev, res); From b86ca9e08ca9c24ccda971c8c61eebce4ac35155 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:16 +0300 Subject: [PATCH 52/98] mmc: omap_hsmmc: fix deferred probing [ Upstream commit fb51b74a57859b707c3e8055ed0c25a7ca4f6a29 ] The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-7-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/omap_hsmmc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index eb0bd46b7e81..500c906413a7 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1832,9 +1832,11 @@ static int omap_hsmmc_probe(struct platform_device *pdev) } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - irq = platform_get_irq(pdev, 0); - if (res == NULL || irq < 0) + if (!res) return -ENXIO; + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(base)) From 41f1e8dab08d3b40ad905646b9a7c9643c09b8a0 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:17 +0300 Subject: [PATCH 53/98] mmc: owl: fix deferred probing [ Upstream commit 3c482e1e830d79b9be8afb900a965135c01f7893 ] The driver overrides the error codes returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: ff65ffe46d28 ("mmc: Add Actions Semi Owl SoCs SD/MMC driver") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-8-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/owl-mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/owl-mmc.c b/drivers/mmc/host/owl-mmc.c index 3dc143b03939..679b8b0b310e 100644 --- a/drivers/mmc/host/owl-mmc.c +++ b/drivers/mmc/host/owl-mmc.c @@ -638,7 +638,7 @@ static int owl_mmc_probe(struct platform_device *pdev) owl_host->irq = platform_get_irq(pdev, 0); if (owl_host->irq < 0) { - ret = -EINVAL; + ret = owl_host->irq; goto err_release_channel; } From 34c4906b9a06778c51b0cf104eb02b5250ebcdc6 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:18 +0300 Subject: [PATCH 54/98] mmc: sdhci-acpi: fix deferred probing [ Upstream commit b465dea5e1540c7d7b5211adaf94926980d3014b ] The driver overrides the error codes returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 1b7ba57ecc86 ("mmc: sdhci-acpi: Handle return value of platform_get_irq") Signed-off-by: Sergey Shtylyov Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20230617203622.6812-9-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 8fe65f172a61..f4e15eef7045 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -910,7 +910,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev) host->ops = &sdhci_acpi_ops_dflt; host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - err = -EINVAL; + err = host->irq; goto err_free; } From d1e08bed0307589bc4f2b30f7779122f853c2fc0 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:20 +0300 Subject: [PATCH 55/98] mmc: sh_mmcif: fix deferred probing [ Upstream commit 5b067d7f855c61df7f8e2e8ccbcee133c282415e ] The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-11-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sh_mmcif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index bcc595c70a9f..e12fe29b275c 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1398,7 +1398,7 @@ static int sh_mmcif_probe(struct platform_device *pdev) irq[0] = platform_get_irq(pdev, 0); irq[1] = platform_get_irq_optional(pdev, 1); if (irq[0] < 0) - return -ENXIO; + return irq[0]; reg = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(reg)) From 24d7d9aee03de852985fd8abd0a9439a74ad5bb3 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:22 +0300 Subject: [PATCH 56/98] mmc: usdhi60rol0: fix deferred probing [ Upstream commit 413db499730248431c1005b392e8ed82c4fa19bf ] The driver overrides the error codes returned by platform_get_irq_byname() to -ENODEV, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-13-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/usdhi6rol0.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 99515be6e5e5..2032e4e1ee68 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -1757,8 +1757,10 @@ static int usdhi6_probe(struct platform_device *pdev) irq_cd = platform_get_irq_byname(pdev, "card detect"); irq_sd = platform_get_irq_byname(pdev, "data"); irq_sdio = platform_get_irq_byname(pdev, "SDIO"); - if (irq_sd < 0 || irq_sdio < 0) - return -ENODEV; + if (irq_sd < 0) + return irq_sd; + if (irq_sdio < 0) + return irq_sdio; mmc = mmc_alloc_host(sizeof(struct usdhi6_host), dev); if (!mmc) From efea112a87b688af03df852744649d09589befc4 Mon Sep 17 00:00:00 2001 From: Terin Stock Date: Fri, 9 Jun 2023 22:58:42 +0200 Subject: [PATCH 57/98] ipvs: align inner_mac_header for encapsulation [ Upstream commit d7fce52fdf96663ddc2eb21afecff3775588612a ] When using encapsulation the original packet's headers are copied to the inner headers. This preserves the space for an inner mac header, which is not used by the inner payloads for the encapsulation types supported by IPVS. If a packet is using GUE or GRE encapsulation and needs to be segmented, flow can be passed to __skb_udp_tunnel_segment() which calculates a negative tunnel header length. A negative tunnel header length causes pskb_may_pull() to fail, dropping the packet. This can be observed by attaching probes to ip_vs_in_hook(), __dev_queue_xmit(), and __skb_udp_tunnel_segment(): perf probe --add '__dev_queue_xmit skb->inner_mac_header \ skb->inner_network_header skb->mac_header skb->network_header' perf probe --add '__skb_udp_tunnel_segment:7 tnl_hlen' perf probe -m ip_vs --add 'ip_vs_in_hook skb->inner_mac_header \ skb->inner_network_header skb->mac_header skb->network_header' These probes the headers and tunnel header length for packets which traverse the IPVS encapsulation path. A TCP packet can be forced into the segmentation path by being smaller than a calculated clamped MSS, but larger than the advertised MSS. probe:ip_vs_in_hook: inner_mac_header=0x0 inner_network_header=0x0 mac_header=0x44 network_header=0x52 probe:ip_vs_in_hook: inner_mac_header=0x44 inner_network_header=0x52 mac_header=0x44 network_header=0x32 probe:dev_queue_xmit: inner_mac_header=0x44 inner_network_header=0x52 mac_header=0x44 network_header=0x32 probe:__skb_udp_tunnel_segment_L7: tnl_hlen=-2 When using veth-based encapsulation, the interfaces are set to be mac-less, which does not preserve space for an inner mac header. This prevents this issue from occurring. In our real-world testing of sending a 32KB file we observed operation time increasing from ~75ms for veth-based encapsulation to over 1.5s using IPVS encapsulation due to retries from dropped packets. This changeset modifies the packet on the encapsulation path in ip_vs_tunnel_xmit() and ip_vs_tunnel_xmit_v6() to remove the inner mac header offset. This fixes UDP segmentation for both encapsulation types, and corrects the inner headers for any IPIP flows that may use it. Fixes: 84c0d5e96f3a ("ipvs: allow tunneling with gue encapsulation") Signed-off-by: Terin Stock Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/ipvs/ip_vs_xmit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index d2e5a8f644b8..cd2130e98836 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1225,6 +1225,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1373,6 +1374,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; From aa528e7d379f32ad1b4b65be57985b4c1d42a12b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 17 Jun 2023 09:26:45 +0300 Subject: [PATCH 58/98] net: dsa: mt7530: fix trapping frames on non-MT7621 SoC MT7530 switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4ae90f90e4909e3014e2dc6a0627964617a7b824 ] All MT7530 switch IP variants share the MT7530_MFC register, but the current driver only writes it for the switch variant that is integrated in the MT7621 SoC. Modify the code to include all MT7530 derivatives. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Suggested-by: Vladimir Oltean Signed-off-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/mt7530.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 4ec598efc333..4aa3d0cba7b8 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1015,7 +1015,7 @@ mt753x_cpu_port_enable(struct dsa_switch *ds, int port) UNU_FFP(BIT(port))); /* Set CPU port number */ - if (priv->id == ID_MT7621) + if (priv->id == ID_MT7530 || priv->id == ID_MT7621) mt7530_rmw(priv, MT7530_MFC, CPU_MASK, CPU_EN | CPU_PORT(port)); /* CPU port gets connected to all user ports of From 768f94c5f639ee4160290ec1f5382573239f29c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 17 Jun 2023 09:26:46 +0300 Subject: [PATCH 59/98] net: dsa: mt7530: fix handling of BPDUs on MT7530 switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d7c66073559386b836bded7cdc8b66ee5c049129 ] BPDUs are link-local frames, therefore they must be trapped to the CPU port. Currently, the MT7530 switch treats BPDUs as regular multicast frames, therefore flooding them to user ports. To fix this, set BPDUs to be trapped to the CPU port. Group this on mt7530_setup() and mt7531_setup_common() into mt753x_trap_frames() and call that. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/mt7530.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 4aa3d0cba7b8..f74d9fbd0817 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -993,6 +993,14 @@ unlock_exit: mutex_unlock(&priv->reg_mutex); } +static void +mt753x_trap_frames(struct mt7530_priv *priv) +{ + /* Trap BPDUs to the CPU port(s) */ + mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, + MT753X_BPDU_CPU_ONLY); +} + static int mt753x_cpu_port_enable(struct dsa_switch *ds, int port) { @@ -2194,6 +2202,8 @@ mt7530_setup(struct dsa_switch *ds) priv->p6_interface = PHY_INTERFACE_MODE_NA; + mt753x_trap_frames(priv); + /* Enable and reset MIB counters */ mt7530_mib_reset(ds); @@ -2300,8 +2310,8 @@ mt7531_setup_common(struct dsa_switch *ds) BIT(cpu_dp->index)); break; } - mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, - MT753X_BPDU_CPU_ONLY); + + mt753x_trap_frames(priv); /* Enable and reset MIB counters */ mt7530_mib_reset(ds); From 1328e8d4c3ee38c5de52dd09cbcf17b8dfab8652 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Fri, 16 Jun 2023 17:45:49 +0100 Subject: [PATCH 60/98] be2net: Extend xmit workaround to BE3 chip [ Upstream commit 7580e0a78eb29e7bb1a772eba4088250bbb70d41 ] We have seen a bug where the NIC incorrectly changes the length in the IP header of a padded packet to include the padding bytes. The driver already has a workaround for this so do the workaround for this NIC too. This resolves the issue. The NIC in question identifies itself as follows: [ 8.828494] be2net 0000:02:00.0: FW version is 10.7.110.31 [ 8.834759] be2net 0000:02:00.0: Emulex OneConnect(be3): PF FLEX10 port 1 02:00.0 Ethernet controller: Emulex Corporation OneConnect 10Gb NIC (be3) (rev 01) Fixes: ca34fe38f06d ("be2net: fix wrong usage of adapter->generation") Signed-off-by: Ross Lagerwall Link: https://lore.kernel.org/r/20230616164549.2863037-1-ross.lagerwall@citrix.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/emulex/benet/be_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 361c1c87c183..e874b907bfbd 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1136,8 +1136,8 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? VLAN_ETH_HLEN : ETH_HLEN; if (skb->len <= 60 && - (lancer_chip(adapter) || skb_vlan_tag_present(skb)) && - is_ipv4_pkt(skb)) { + (lancer_chip(adapter) || BE3_chip(adapter) || + skb_vlan_tag_present(skb)) && is_ipv4_pkt(skb)) { ip = (struct iphdr *)ip_hdr(skb); pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); } From 314a8697d08092df6d00521450d44c352c602943 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 14:45:22 +0200 Subject: [PATCH 61/98] netfilter: nf_tables: fix chain binding transaction logic [ Upstream commit 4bedf9eee016286c835e3d8fa981ddece5338795 ] Add bound flag to rule and chain transactions as in 6a0a8d10a366 ("netfilter: nf_tables: use-after-free in failing rule with bound set") to skip them in case that the chain is already bound from the abort path. This patch fixes an imbalance in the chain use refcnt that triggers a WARN_ON on the table and chain destroy path. This patch also disallows nested chain bindings, which is not supported from userspace. The logic to deal with chain binding in nft_data_hold() and nft_data_release() is not correct. The NFT_TRANS_PREPARE state needs a special handling in case a chain is bound but next expressions in the same rule fail to initialize as described by 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE"). The chain is left bound if rule construction fails, so the objects stored in this chain (and the chain itself) are released by the transaction records from the abort path, follow up patch ("netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain") completes this error handling. When deleting an existing rule, chain bound flag is set off so the rule expression .destroy path releases the objects. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 21 +++++++- net/netfilter/nf_tables_api.c | 86 +++++++++++++++++++----------- net/netfilter/nft_immediate.c | 87 +++++++++++++++++++++++++++---- 3 files changed, 153 insertions(+), 41 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8bac5a5ca0f1..1a879140f996 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -968,7 +968,10 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase); +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, @@ -1037,6 +1040,7 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, @@ -1073,11 +1077,17 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_binding(const struct nft_chain *chain) +{ + return chain->flags & NFT_CHAIN_BINDING; +} + static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } +int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); @@ -1508,6 +1518,7 @@ struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; + bool bound; }; #define nft_trans_rule(trans) \ @@ -1516,6 +1527,8 @@ struct nft_trans_rule { (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) +#define nft_trans_rule_bound(trans) \ + (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; @@ -1540,13 +1553,17 @@ struct nft_trans_set { (((struct nft_trans_set *)trans->data)->gc_int) struct nft_trans_chain { + struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; + bool bound; u32 chain_id; }; +#define nft_trans_chain(trans) \ + (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ @@ -1555,6 +1572,8 @@ struct nft_trans_chain { (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) +#define nft_trans_chain_bound(trans) \ + (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 35b9f74f0bc6..83828c530b43 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -195,6 +195,48 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) } } +static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + struct nftables_pernet *nft_net; + struct net *net = ctx->net; + struct nft_trans *trans; + + if (!nft_chain_binding(chain)) + return; + + nft_net = nft_pernet(net); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain(trans) == chain) + nft_trans_chain_bound(trans) = true; + break; + case NFT_MSG_NEWRULE: + if (trans->ctx.chain == chain) + nft_trans_rule_bound(trans) = true; + break; + } + } +} + +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + if (!nft_chain_binding(chain)) + return 0; + + if (nft_chain_binding(ctx->chain)) + return -EOPNOTSUPP; + + if (chain->bound) + return -EBUSY; + + chain->bound = true; + chain->use++; + nft_chain_trans_bind(ctx, chain); + + return 0; +} + static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { @@ -340,8 +382,9 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); } } - + nft_trans_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); + return trans; } @@ -359,8 +402,7 @@ static int nft_delchain(struct nft_ctx *ctx) return 0; } -static void nft_rule_expr_activate(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr; @@ -373,9 +415,8 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, } } -static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, - struct nft_rule *rule, - enum nft_trans_phase phase) +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase) { struct nft_expr *expr; @@ -2094,7 +2135,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, return 0; } -static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +int nft_chain_add(struct nft_table *table, struct nft_chain *chain) { int err; @@ -3220,8 +3261,7 @@ err_fill_rule_info: return err; } -static void nf_tables_rule_destroy(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr, *next; @@ -3238,7 +3278,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) +static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -6293,7 +6333,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb, void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { struct nft_chain *chain; - struct nft_rule *rule; if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { @@ -6301,15 +6340,6 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) case NFT_GOTO: chain = data->verdict.chain; chain->use++; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use++; - list_for_each_entry(rule, &chain->rules, list) - chain->use++; - - nft_chain_add(chain->table, chain); break; } } @@ -9162,7 +9192,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { - if (nft_chain_is_bound(trans->ctx.chain)) { + if (nft_trans_chain_bound(trans)) { nft_trans_destroy(trans); break; } @@ -9179,6 +9209,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: + if (nft_trans_rule_bound(trans)) { + nft_trans_destroy(trans); + break; + } trans->ctx.chain->use--; list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&trans->ctx, @@ -9737,22 +9771,12 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { struct nft_chain *chain; - struct nft_rule *rule; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; chain->use--; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use--; - list_for_each_entry(rule, &chain->rules, list) - chain->use--; - - nft_chain_del(chain); break; } } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index fcdbc5ed3f36..9d4248898ce4 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -76,11 +76,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: - if (nft_chain_is_bound(chain)) { - err = -EBUSY; - goto err1; - } - chain->bound = true; + err = nf_tables_bind_chain(ctx, chain); + if (err < 0) + return err; break; default: break; @@ -98,6 +96,31 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_activate(&chain_ctx, rule); + + nft_clear(ctx->net, chain); + break; + default: + break; + } + } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } @@ -107,6 +130,40 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_deactivate(&chain_ctx, rule, phase); + + switch (phase) { + case NFT_TRANS_PREPARE: + nft_deactivate_next(ctx->net, chain); + break; + default: + nft_chain_del(chain); + chain->bound = false; + chain->table->use--; + break; + } + break; + default: + break; + } + } if (phase == NFT_TRANS_COMMIT) return; @@ -131,15 +188,27 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, case NFT_GOTO: chain = data->verdict.chain; - if (!nft_chain_is_bound(chain)) + if (!nft_chain_binding(chain)) break; + /* Rule construction failed, but chain is already bound: + * let the transaction records release this chain and its rules. + */ + if (chain->bound) { + chain->use--; + break; + } + + /* Rule has been deleted, release chain and its rules. */ chain_ctx = *ctx; chain_ctx.chain = chain; - list_for_each_entry_safe(rule, n, &chain->rules, list) - nf_tables_rule_release(&chain_ctx, rule); - + chain->use--; + list_for_each_entry_safe(rule, n, &chain->rules, list) { + chain->use--; + list_del(&rule->list); + nf_tables_rule_destroy(&chain_ctx, rule); + } nf_tables_chain_destroy(&chain_ctx); break; default: From 4004f12aaca8e03e1a7d0e9b0d511b8a23fd6b1e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 14:45:26 +0200 Subject: [PATCH 62/98] netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain [ Upstream commit 26b5a5712eb85e253724e56a54c17f8519bd8e4e ] Add a new state to deal with rule expressions deactivation from the newrule error path, otherwise the anonymous set remains in the list in inactive state for the next generation. Mark the set/chain transaction as unbound so the abort path releases this object, set it as inactive in the next generation so it is not reachable anymore from this transaction and reference counter is dropped. Fixes: 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 45 ++++++++++++++++++++++++++----- net/netfilter/nft_immediate.c | 3 +++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 1a879140f996..603c156da210 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -863,6 +863,7 @@ struct nft_expr_type { enum nft_trans_phase { NFT_TRANS_PREPARE, + NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE @@ -1041,6 +1042,7 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 83828c530b43..120b885fb44a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -171,7 +171,8 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, + bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; @@ -185,17 +186,28 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) - nft_trans_set_bound(trans) = true; + nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) - nft_trans_elem_set_bound(trans) = true; + nft_trans_elem_set_bound(trans) = bind; break; } } } -static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain) +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, true); +} + +static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, false); +} + +static void __nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain, bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; @@ -209,16 +221,22 @@ static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *ch switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (nft_trans_chain(trans) == chain) - nft_trans_chain_bound(trans) = true; + nft_trans_chain_bound(trans) = bind; break; case NFT_MSG_NEWRULE: if (trans->ctx.chain == chain) - nft_trans_rule_bound(trans) = true; + nft_trans_rule_bound(trans) = bind; break; } } } +static void nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, true); +} + int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) { if (!nft_chain_binding(chain)) @@ -237,6 +255,11 @@ int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) return 0; } +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, false); +} + static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { @@ -3612,7 +3635,7 @@ err_destroy_flow_rule: if (flow) nft_flow_rule_destroy(flow); err_release_rule: - nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE); + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); nf_tables_rule_destroy(&ctx, rule); err_release_expr: for (i = 0; i < n; i++) { @@ -4893,6 +4916,13 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nft_set_trans_unbind(ctx, set); + if (nft_set_is_anonymous(set)) + nft_deactivate_next(ctx->net, set); + + set->use--; + break; case NFT_TRANS_PREPARE: if (nft_set_is_anonymous(set)) nft_deactivate_next(ctx->net, set); @@ -7337,6 +7367,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 9d4248898ce4..6b0efab4fad0 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -150,6 +150,9 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, nft_rule_expr_deactivate(&chain_ctx, rule, phase); switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nf_tables_unbind_chain(ctx, chain); + fallthrough; case NFT_TRANS_PREPARE: nft_deactivate_next(ctx->net, chain); break; From 45eb6944d0f55102229115de040ef3a48841434a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:20:04 +0200 Subject: [PATCH 63/98] netfilter: nft_set_pipapo: .walk does not deal with generations [ Upstream commit 2b84e215f87443c74ac0aa7f76bb172d43a87033 ] The .walk callback iterates over the current active set, but it might be useful to iterate over the next generation set. Use the generation mask to determine what set view (either current or next generation) is use for the walk iteration. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_pipapo.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 15e451dc3fc4..78e1ec8badde 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1974,12 +1974,16 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); struct nft_pipapo_match *m; struct nft_pipapo_field *f; int i, r; rcu_read_lock(); - m = rcu_dereference(priv->match); + if (iter->genmask == nft_genmask_cur(net)) + m = rcu_dereference(priv->match); + else + m = priv->clone; if (unlikely(!m)) goto out; From baa3ec1b31f512f4aa3ccd700b5304390c3b525f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:20:16 +0200 Subject: [PATCH 64/98] netfilter: nf_tables: disallow element updates of bound anonymous sets [ Upstream commit c88c535b592d3baeee74009f3eceeeaf0fdd5e1b ] Anonymous sets come with NFT_SET_CONSTANT from userspace. Although API allows to create anonymous sets without NFT_SET_CONSTANT, it makes no sense to allow to add and to delete elements for bound anonymous sets. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 120b885fb44a..c4e3aaeeb795 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6332,7 +6332,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -6602,7 +6603,9 @@ static int nf_tables_delsetelem(struct sk_buff *skb, set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); From 2938e7d582d7ae4b53ae4cd65d00000c04a11d0e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:21:33 +0200 Subject: [PATCH 65/98] netfilter: nf_tables: reject unbound anonymous set before commit phase [ Upstream commit 938154b93be8cd611ddfd7bafc1849f3c4355201 ] Add a new list to track set transaction and to check for unbound anonymous sets before entering the commit phase. Bail out at the end of the transaction handling if an anonymous set remains unbound. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 35 ++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 603c156da210..7da74b9428b9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1503,6 +1503,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) * struct nft_trans - nf_tables object update in transaction * * @list: used internally + * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context @@ -1510,6 +1511,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) */ struct nft_trans { struct list_head list; + struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; @@ -1648,6 +1650,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re struct nftables_pernet { struct list_head tables; struct list_head commit_list; + struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c4e3aaeeb795..010ef3bce9e5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -153,6 +153,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return NULL; INIT_LIST_HEAD(&trans->list); + INIT_LIST_HEAD(&trans->binding_list); trans->msg_type = msg_type; trans->ctx = *ctx; @@ -165,9 +166,15 @@ static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } -static void nft_trans_destroy(struct nft_trans *trans) +static void nft_trans_list_del(struct nft_trans *trans) { list_del(&trans->list); + list_del(&trans->binding_list); +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + nft_trans_list_del(trans); kfree(trans); } @@ -359,6 +366,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr { struct nftables_pernet *nft_net = nft_pernet(net); + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + } + list_add_tail(&trans->list, &nft_net->commit_list); } @@ -8565,7 +8580,7 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nft_commit_release(trans); } } @@ -8883,6 +8898,19 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } + list_for_each_entry(trans, &nft_net->binding_list, binding_list) { + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans)) && + !nft_trans_set_bound(trans)) { + pr_warn_once("nftables ruleset with unbound set\n"); + return -EINVAL; + } + break; + } + } + /* 0. Validate ruleset, otherwise roll back for error reporting. */ if (nf_tables_validate(net) < 0) return -EAGAIN; @@ -9356,7 +9384,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nf_tables_abort_release(trans); } @@ -10129,6 +10157,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); From 2f2f9eaa6da1b87145d1d12771acf0279ebf486d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:21:39 +0200 Subject: [PATCH 66/98] netfilter: nf_tables: reject unbound chain set before commit phase [ Upstream commit 62e1e94b246e685d89c3163aaef4b160e42ceb02 ] Use binding list to track set transaction and to check for unbound chains before entering the commit phase. Bail out if chain binding remain unused before entering the commit step. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 010ef3bce9e5..66328326ec05 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -372,6 +372,11 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr nft_set_is_anonymous(nft_trans_set(trans))) list_add_tail(&trans->binding_list, &nft_net->binding_list); break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; } list_add_tail(&trans->list, &nft_net->commit_list); @@ -8908,6 +8913,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return -EINVAL; } break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans)) && + !nft_trans_chain_bound(trans)) { + pr_warn_once("nftables ruleset with unbound chain\n"); + return -EINVAL; + } + break; } } From 53defc6ecff4fc4189faa9ddcc1ae77eb6e31252 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:22:01 +0200 Subject: [PATCH 67/98] netfilter: nf_tables: disallow updates of anonymous sets [ Upstream commit b770283c98e0eee9133c47bc03b6cc625dc94723 ] Disallow updates of set timeout and garbage collection parameters for anonymous sets. Fixes: 123b99619cca ("netfilter: nf_tables: honor set timeout and garbage collection updates") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 66328326ec05..826bd961d90c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4628,6 +4628,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; From 57130334da4ec646b33733f92be43c30bc8933ad Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 15 Jun 2023 10:14:25 +0200 Subject: [PATCH 68/98] netfilter: nfnetlink_osf: fix module autoload [ Upstream commit 62f9a68a36d4441a6c412b81faed102594bc6670 ] Move the alias from xt_osf to nfnetlink_osf. Fixes: f9324952088f ("netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from xt_osf.c") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nfnetlink_osf.c | 1 + net/netfilter/xt_osf.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index ee6840bd5933..8f1bfa6ccc2d 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -439,3 +439,4 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index e1990baf3a3b..dc9485854002 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Passive OS fingerprint matching."); MODULE_ALIAS("ipt_osf"); MODULE_ALIAS("ip6t_osf"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); From 0f8d81254fd6c5b4400b31b40f6205eb4c21ba79 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Mon, 19 Jun 2023 17:44:35 +0200 Subject: [PATCH 69/98] Revert "net: phy: dp83867: perform soft reset and retain established link" [ Upstream commit a129b41fe0a8b4da828c46b10f5244ca07a3fec3 ] This reverts commit da9ef50f545f86ffe6ff786174d26500c4db737a. This fixes a regression in which the link would come up, but no communication was possible. The reverted commit was also removing a comment about DP83867_PHYCR_FORCE_LINK_GOOD, this is not added back in this commits since it seems that this is unrelated to the original code change. Closes: https://lore.kernel.org/all/ZGuDJos8D7N0J6Z2@francesco-nb.int.toradex.com/ Fixes: da9ef50f545f ("net: phy: dp83867: perform soft reset and retain established link") Signed-off-by: Francesco Dolcini Reviewed-by: Andrew Lunn Reviewed-by: Praneeth Bajjuri Link: https://lore.kernel.org/r/20230619154435.355485-1-francesco@dolcini.it Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/phy/dp83867.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 6230dd5e2990..76ca43108d99 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -836,7 +836,7 @@ static int dp83867_phy_reset(struct phy_device *phydev) { int err; - err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESTART); + err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESET); if (err < 0) return err; From b7db41a865416c849b8140887dc9f87ca6666924 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Thu, 15 Jun 2023 16:56:07 +0200 Subject: [PATCH 70/98] bpf/btf: Accept function names that contain dots [ Upstream commit 9724160b3942b0a967b91a59f81da5593f28b8ba ] When building a kernel with LLVM=1, LLVM_IAS=0 and CONFIG_KASAN=y, LLVM leaves DWARF tags for the "asan.module_ctor" & co symbols. In turn, pahole creates BTF_KIND_FUNC entries for these and this makes the BTF metadata validation fail because they contain a dot. In a dramatic turn of event, this BTF verification failure can cause the netfilter_bpf initialization to fail, causing netfilter_core to free the netfilter_helper hashmap and netfilter_ftp to trigger a use-after-free. The risk of u-a-f in netfilter will be addressed separately but the existence of "asan.module_ctor" debug info under some build conditions sounds like a good enough reason to accept functions that contain dots in BTF. Although using only LLVM=1 is the recommended way to compile clang-based kernels, users can certainly do LLVM=1, LLVM_IAS=0 as well and we still try to support that combination according to Nick. To clarify: - > v5.10 kernel, LLVM=1 (LLVM_IAS=0 is not the default) is recommended, but user can still have LLVM=1, LLVM_IAS=0 to trigger the issue - <= 5.10 kernel, LLVM=1 (LLVM_IAS=0 is the default) is recommended in which case GNU as will be used Fixes: 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") Signed-off-by: Florent Revest Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Cc: Yonghong Song Cc: Nick Desaulniers Link: https://lore.kernel.org/bpf/20230615145607.3469985-1-revest@chromium.org Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6c7126de5c17..5d4bea53ac1f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -633,13 +633,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) return offset < btf->hdr.str_len; } -static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && - ((c == '.' && !dot_ok) || - c != '.')) + c != '.') return false; return true; } @@ -656,20 +655,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) +static bool __btf_name_valid(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; - if (!__btf_name_char_ok(*src, true, dot_ok)) + if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!__btf_name_char_ok(*src, false, dot_ok)) + if (!__btf_name_char_ok(*src, false)) return false; src++; } @@ -677,17 +676,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) return !*src; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, false); + return __btf_name_valid(btf, offset); } static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, true); + return __btf_name_valid(btf, offset); } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -3536,7 +3532,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off, true)) { + !__btf_name_valid(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } From 3138c85031e8d1ffe07ebdbc9236388b1185474b Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Tue, 20 Jun 2023 14:45:15 +0200 Subject: [PATCH 71/98] selftests: forwarding: Fix race condition in mirror installation [ Upstream commit c7c059fba6fb19c3bc924925c984772e733cb594 ] When mirroring to a gretap in hardware the device expects to be programmed with the egress port and all the encapsulating headers. This requires the driver to resolve the path the packet will take in the software data path and program the device accordingly. If the path cannot be resolved (in this case because of an unresolved neighbor), then mirror installation fails until the path is resolved. This results in a race that causes the test to sometimes fail. Fix this by setting the neighbor's state to permanent in a couple of tests, so that it is always valid. Fixes: 35c31d5c323f ("selftests: forwarding: Test mirror-to-gretap w/ UL 802.1d") Fixes: 239e754af854 ("selftests: forwarding: Test mirror-to-gretap w/ UL 802.1q") Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/268816ac729cb6028c7a34d4dda6f4ec7af55333.1687264607.git.petrm@nvidia.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh | 4 ++++ .../testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh index c5095da7f6bf..aec752a22e9e 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh @@ -93,12 +93,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh index 9ff22f28032d..0cf4c47a46f9 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh @@ -90,12 +90,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } From c1a2b52d999e9f7d31f2c7de459a2f3269d5fdf4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Jun 2023 18:44:25 +0000 Subject: [PATCH 72/98] sch_netem: acquire qdisc lock in netem_change() [ Upstream commit 2174a08db80d1efeea382e25ac41c4e7511eb6d6 ] syzbot managed to trigger a divide error [1] in netem. It could happen if q->rate changes while netem_enqueue() is running, since q->rate is read twice. It turns out netem_change() always lacked proper synchronization. [1] divide error: 0000 [#1] SMP KASAN CPU: 1 PID: 7867 Comm: syz-executor.1 Not tainted 6.1.30-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/25/2023 RIP: 0010:div64_u64 include/linux/math64.h:69 [inline] RIP: 0010:packet_time_ns net/sched/sch_netem.c:357 [inline] RIP: 0010:netem_enqueue+0x2067/0x36d0 net/sched/sch_netem.c:576 Code: 89 e2 48 69 da 00 ca 9a 3b 42 80 3c 28 00 4c 8b a4 24 88 00 00 00 74 0d 4c 89 e7 e8 c3 4f 3b fd 48 8b 4c 24 18 48 89 d8 31 d2 <49> f7 34 24 49 01 c7 4c 8b 64 24 48 4d 01 f7 4c 89 e3 48 c1 eb 03 RSP: 0018:ffffc9000dccea60 EFLAGS: 00010246 RAX: 000001a442624200 RBX: 000001a442624200 RCX: ffff888108a4f000 RDX: 0000000000000000 RSI: 000000000000070d RDI: 000000000000070d RBP: ffffc9000dcceb90 R08: ffffffff849c5e26 R09: fffffbfff10e1297 R10: 0000000000000000 R11: dffffc0000000001 R12: ffff888108a4f358 R13: dffffc0000000000 R14: 0000001a8cd9a7ec R15: 0000000000000000 FS: 00007fa73fe18700(0000) GS:ffff8881f6b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa73fdf7718 CR3: 000000011d36e000 CR4: 0000000000350ee0 Call Trace: [] __dev_xmit_skb net/core/dev.c:3931 [inline] [] __dev_queue_xmit+0xcf5/0x3370 net/core/dev.c:4290 [] dev_queue_xmit include/linux/netdevice.h:3030 [inline] [] neigh_hh_output include/net/neighbour.h:531 [inline] [] neigh_output include/net/neighbour.h:545 [inline] [] ip_finish_output2+0xb92/0x10d0 net/ipv4/ip_output.c:235 [] __ip_finish_output+0xc3/0x2b0 [] ip_finish_output+0x31/0x2a0 net/ipv4/ip_output.c:323 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip_output+0x224/0x2a0 net/ipv4/ip_output.c:437 [] dst_output include/net/dst.h:444 [inline] [] ip_local_out net/ipv4/ip_output.c:127 [inline] [] __ip_queue_xmit+0x1425/0x2000 net/ipv4/ip_output.c:542 [] ip_queue_xmit+0x4c/0x70 net/ipv4/ip_output.c:556 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Stephen Hemminger Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Reviewed-by: Jamal Hadi Salim Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230620184425.1179809-1-edumazet@google.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/sched/sch_netem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index adc5407fd5d5..be42b1196786 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -969,6 +969,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, if (ret < 0) return ret; + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; @@ -977,7 +978,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; - return ret; + goto unlock; } } else { q->loss_model = CLG_RANDOM; @@ -1044,6 +1045,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); +unlock: + sch_tree_unlock(sch); return ret; get_table_failure: @@ -1053,7 +1056,8 @@ get_table_failure: */ q->clg = old_clg; q->loss_model = old_loss_model; - return ret; + + goto unlock; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, From 7973c4b3b97de1ef33334be444024c089f1d33aa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Oct 2021 18:58:11 +0100 Subject: [PATCH 73/98] gpio: Allow per-parent interrupt data [ Upstream commit cfe6807d82e97e81c3209dca9448f091e1448a57 ] The core gpiolib code is able to deal with multiple interrupt parents for a single gpio irqchip. It however only allows a single piece of data to be conveyed to all flow handlers (either the gpio_chip or some other, driver-specific data). This means that drivers have to go through some interesting dance to find the correct context, something that isn't great in interrupt context (see aebdc8abc9db86e2bd33070fc2f961012fff74b4 for a prime example). Instead, offer an optional way for a pinctrl/gpio driver to provide an array of pointers which gets used to provide the correct context to the flow handler. Signed-off-by: Marc Zyngier Signed-off-by: Joey Gouly Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20211026175815.52703-2-joey.gouly@arm.com Signed-off-by: Linus Walleij Stable-dep-of: 8c00914e5438 ("gpiolib: Fix GPIO chip IRQ initialization restriction") Signed-off-by: Sasha Levin --- drivers/gpio/gpiolib.c | 9 +++++++-- include/linux/gpio/driver.h | 19 +++++++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 8c041a8dd9d8..099fe2e39bd6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1578,9 +1578,14 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, } if (gc->irq.parent_handler) { - void *data = gc->irq.parent_handler_data ?: gc; - for (i = 0; i < gc->irq.num_parents; i++) { + void *data; + + if (gc->irq.per_parent_data) + data = gc->irq.parent_handler_data_array[i]; + else + data = gc->irq.parent_handler_data ?: gc; + /* * The parent IRQ chip is already using the chip_data * for this IRQ chip, so our callbacks simply use the diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 11c26ae7b4fa..65df2ce96f0b 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -168,11 +168,18 @@ struct gpio_irq_chip { /** * @parent_handler_data: + * @parent_handler_data_array: * * Data associated, and passed to, the handler for the parent - * interrupt. + * interrupt. Can either be a single pointer if @per_parent_data + * is false, or an array of @num_parents pointers otherwise. If + * @per_parent_data is true, @parent_handler_data_array cannot be + * NULL. */ - void *parent_handler_data; + union { + void *parent_handler_data; + void **parent_handler_data_array; + }; /** * @num_parents: @@ -203,6 +210,14 @@ struct gpio_irq_chip { */ bool threaded; + /** + * @per_parent_data: + * + * True if parent_handler_data_array describes a @num_parents + * sized array to be used as parent data. + */ + bool per_parent_data; + /** * @init_hw: optional routine to initialize hardware before * an IRQ chip will be added. This is quite useful when From 497d40140865aae8c89757d25d7a17cd6f59d1d2 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Wed, 7 Jun 2023 16:18:03 +0800 Subject: [PATCH 74/98] gpiolib: Fix GPIO chip IRQ initialization restriction [ Upstream commit 8c00914e5438e3636f26b4f814b3297ae2a1b9ee ] In case of gpio-regmap, IRQ chip is added by regmap-irq and associated with GPIO chip by gpiochip_irqchip_add_domain(). The initialization flag was not added in gpiochip_irqchip_add_domain(), causing gpiochip_to_irq() to return -EPROBE_DEFER. Fixes: 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") Signed-off-by: Jiawen Wu Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpiolib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 099fe2e39bd6..f9fdd117c654 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1683,6 +1683,14 @@ int gpiochip_irqchip_add_domain(struct gpio_chip *gc, gc->to_irq = gpiochip_to_irq; gc->irq.domain = domain; + /* + * Using barrier() here to prevent compiler from reordering + * gc->irq.initialized before adding irqdomain. + */ + barrier(); + + gc->irq.initialized = true; + return 0; } EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_domain); From 1cc379d53b665a7b25e96e784445274f683dcb9e Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 6 Jun 2023 11:11:59 +0800 Subject: [PATCH 75/98] gpio: sifive: add missing check for platform_get_irq [ Upstream commit c1bcb976d8feb107ff2c12caaf12ac5e70f44d5f ] Add the missing check for platform_get_irq() and return error code if it fails. The returned error code will be dealed with in builtin_platform_driver(sifive_gpio_driver) and the driver will not be registered. Fixes: f52d6d8b43e5 ("gpio: sifive: To get gpio irq offset from device tree data") Signed-off-by: Jiasheng Jiang Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-sifive.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index f41123de69c5..5ffab0fc1b76 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -215,8 +215,12 @@ static int sifive_gpio_probe(struct platform_device *pdev) return -ENODEV; } - for (i = 0; i < ngpio; i++) - chip->irq_number[i] = platform_get_irq(pdev, i); + for (i = 0; i < ngpio; i++) { + ret = platform_get_irq(pdev, i); + if (ret < 0) + return ret; + chip->irq_number[i] = ret; + } ret = bgpio_init(&chip->gc, dev, 4, chip->base + SIFIVE_GPIO_INPUT_VAL, From e8bdb1f886996c66e5932181e2352af04d899d8e Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 8 May 2023 18:22:19 +0200 Subject: [PATCH 76/98] scsi: target: iscsi: Prevent login threads from racing between each other [ Upstream commit 2a737d3b8c792400118d6cf94958f559de9c5e59 ] The tpg->np_login_sem is a semaphore that is used to serialize the login process when multiple login threads run concurrently against the same target portal group. The iscsi_target_locate_portal() function finds the tpg, calls iscsit_access_np() against the np_login_sem semaphore and saves the tpg pointer in conn->tpg; If iscsi_target_locate_portal() fails, the caller will check for the conn->tpg pointer and, if it's not NULL, then it will assume that iscsi_target_locate_portal() called iscsit_access_np() on the semaphore. Make sure that conn->tpg gets initialized only if iscsit_access_np() was successful, otherwise iscsit_deaccess_np() may end up being called against a semaphore we never took, allowing more than one thread to access the same tpg. Signed-off-by: Maurizio Lombardi Link: https://lore.kernel.org/r/20230508162219.1731964-4-mlombard@redhat.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/target/iscsi/iscsi_target_nego.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c index c0ed6f8e5c5b..32a2852352db 100644 --- a/drivers/target/iscsi/iscsi_target_nego.c +++ b/drivers/target/iscsi/iscsi_target_nego.c @@ -1071,6 +1071,7 @@ int iscsi_target_locate_portal( iscsi_target_set_sock_callbacks(conn); login->np = np; + conn->tpg = NULL; login_req = (struct iscsi_login_req *) login->req; payload_length = ntoh24(login_req->dlength); @@ -1138,7 +1139,6 @@ int iscsi_target_locate_portal( */ sessiontype = strncmp(s_buf, DISCOVERY, 9); if (!sessiontype) { - conn->tpg = iscsit_global->discovery_tpg; if (!login->leading_connection) goto get_target; @@ -1155,9 +1155,11 @@ int iscsi_target_locate_portal( * Serialize access across the discovery struct iscsi_portal_group to * process login attempt. */ + conn->tpg = iscsit_global->discovery_tpg; if (iscsit_access_np(np, conn->tpg) < 0) { iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR, ISCSI_LOGIN_STATUS_SVC_UNAVAILABLE); + conn->tpg = NULL; ret = -1; goto out; } From f97b16c0a5389ceb95a81e7102b9e6586eb89891 Mon Sep 17 00:00:00 2001 From: Denis Arefev Date: Thu, 27 Apr 2023 14:47:45 +0300 Subject: [PATCH 77/98] HID: wacom: Add error check to wacom_parse_and_register() [ Upstream commit 16a9c24f24fbe4564284eb575b18cc20586b9270 ] Added a variable check and transition in case of an error Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Denis Arefev Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/wacom_sys.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index d29773a799b4..33e763e746a0 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2425,8 +2425,13 @@ static int wacom_parse_and_register(struct wacom *wacom, bool wireless) goto fail_quirks; } - if (features->device_type & WACOM_DEVICETYPE_WL_MONITOR) + if (features->device_type & WACOM_DEVICETYPE_WL_MONITOR) { error = hid_hw_open(hdev); + if (error) { + hid_err(hdev, "hw open failed\n"); + goto fail_quirks; + } + } wacom_set_shared_values(wacom_wac); devres_close_group(&hdev->dev, wacom); From 01cf989090da565b03bf787303da365397e2ad67 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 15 May 2023 21:46:00 +0100 Subject: [PATCH 78/98] arm64: Add missing Set/Way CMO encodings [ Upstream commit 8d0f019e4c4f2ee2de81efd9bf1c27e9fb3c0460 ] Add the missing Set/Way CMOs that apply to tagged memory. Signed-off-by: Marc Zyngier Reviewed-by: Cornelia Huck Reviewed-by: Steven Price Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20230515204601.1270428-2-maz@kernel.org Signed-off-by: Sasha Levin --- arch/arm64/include/asm/sysreg.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index f79f3720e4cb..543eb08fa8e5 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -109,8 +109,14 @@ #define SB_BARRIER_INSN __SYS_BARRIER_INSN(0, 7, 31) #define SYS_DC_ISW sys_insn(1, 0, 7, 6, 2) +#define SYS_DC_IGSW sys_insn(1, 0, 7, 6, 4) +#define SYS_DC_IGDSW sys_insn(1, 0, 7, 6, 6) #define SYS_DC_CSW sys_insn(1, 0, 7, 10, 2) +#define SYS_DC_CGSW sys_insn(1, 0, 7, 10, 4) +#define SYS_DC_CGDSW sys_insn(1, 0, 7, 10, 6) #define SYS_DC_CISW sys_insn(1, 0, 7, 14, 2) +#define SYS_DC_CIGSW sys_insn(1, 0, 7, 14, 4) +#define SYS_DC_CIGDSW sys_insn(1, 0, 7, 14, 6) /* * System registers, organised loosely by encoding but grouped together From 8a5ddd1430d4e00d601f2f6b32968b46db51bc53 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 24 Apr 2023 16:07:28 +0100 Subject: [PATCH 79/98] media: cec: core: don't set last_initiator if tx in progress [ Upstream commit 73af6c7511038249cad3d5f3b44bf8d78ac0f499 ] When a message was received the last_initiator is set to 0xff. This will force the signal free time for the next transmit to that for a new initiator. However, if a new transmit is already in progress, then don't set last_initiator, since that's the initiator of the current transmit. Overwriting this would cause the signal free time of a following transmit to be that of the new initiator instead of a next transmit. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/cec/core/cec-adap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c index 67776a0d31e8..99ede1417d72 100644 --- a/drivers/media/cec/core/cec-adap.c +++ b/drivers/media/cec/core/cec-adap.c @@ -1086,7 +1086,8 @@ void cec_received_msg_ts(struct cec_adapter *adap, mutex_lock(&adap->lock); dprintk(2, "%s: %*ph\n", __func__, msg->len, msg->msg); - adap->last_initiator = 0xff; + if (!adap->transmit_in_progress) + adap->last_initiator = 0xff; /* Check if this message was for us (directed or broadcast). */ if (!cec_msg_is_broadcast(msg)) From cce681383d34dae8243484b19b9feaa51587e118 Mon Sep 17 00:00:00 2001 From: Osama Muhammad Date: Thu, 25 May 2023 22:27:46 +0500 Subject: [PATCH 80/98] nfcsim.c: Fix error checking for debugfs_create_dir [ Upstream commit 9b9e46aa07273ceb96866b2e812b46f1ee0b8d2f ] This patch fixes the error checking in nfcsim.c. The DebugFS kernel API is developed in a way that the caller can safely ignore the errors that occur during the creation of DebugFS nodes. Signed-off-by: Osama Muhammad Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/nfc/nfcsim.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nfc/nfcsim.c b/drivers/nfc/nfcsim.c index 85bf8d586c70..0f6befe8be1e 100644 --- a/drivers/nfc/nfcsim.c +++ b/drivers/nfc/nfcsim.c @@ -336,10 +336,6 @@ static struct dentry *nfcsim_debugfs_root; static void nfcsim_debugfs_init(void) { nfcsim_debugfs_root = debugfs_create_dir("nfcsim", NULL); - - if (!nfcsim_debugfs_root) - pr_err("Could not create debugfs entry\n"); - } static void nfcsim_debugfs_remove(void) From e3bbc148377dff5ec1695c969257b0f9004aa7d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 25 May 2023 18:38:37 +0300 Subject: [PATCH 81/98] usb: gadget: udc: fix NULL dereference in remove() [ Upstream commit 016da9c65fec9f0e78c4909ed9a0f2d567af6775 ] The "udc" pointer was never set in the probe() function so it will lead to a NULL dereference in udc_pci_remove() when we do: usb_del_gadget_udc(&udc->gadget); Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/ZG+A/dNpFWAlCChk@kili Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/udc/amd5536udc_pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/gadget/udc/amd5536udc_pci.c b/drivers/usb/gadget/udc/amd5536udc_pci.c index c80f9bd51b75..a36913ae31f9 100644 --- a/drivers/usb/gadget/udc/amd5536udc_pci.c +++ b/drivers/usb/gadget/udc/amd5536udc_pci.c @@ -170,6 +170,9 @@ static int udc_pci_probe( retval = -ENODEV; goto err_probe; } + + udc = dev; + return 0; err_probe: From 26bde09a1512a8ad7e8f599a88f618c1b47b7cad Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Thu, 25 May 2023 12:22:02 -0600 Subject: [PATCH 82/98] nvme: double KA polling frequency to avoid KATO with TBKAS on MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ea4d453b9ec9ea279c39744cd0ecb47ef48ede35 ] With TBKAS on, the completion of one command can defer sending a keep alive for up to twice the delay between successive runs of nvme_keep_alive_work. The current delay of KATO / 2 thus makes it possible for one command to defer sending a keep alive for up to KATO, which can result in the controller detecting a KATO. The following trace demonstrates the issue, taking KATO = 8 for simplicity: 1. t = 0: run nvme_keep_alive_work, no keep-alive sent 2. t = ε: I/O completion seen, set comp_seen = true 3. t = 4: run nvme_keep_alive_work, see comp_seen == true, skip sending keep-alive, set comp_seen = false 4. t = 8: run nvme_keep_alive_work, see comp_seen == false, send a keep-alive command. Here, there is a delay of 8 - ε between receiving a command completion and sending the next command. With ε small, the controller is likely to detect a keep alive timeout. Fix this by running nvme_keep_alive_work with a delay of KATO / 4 whenever TBKAS is on. Going through the above trace now gives us a worst-case delay of 4 - ε, which is in line with the recommendation of sending a command every KATO / 2 in the NVMe specification. Reported-by: Costa Sapuntzakis Reported-by: Randy Jennings Signed-off-by: Uday Shankar Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Sasha Levin --- drivers/nvme/host/core.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e5318b38c662..98a7649a0f06 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1247,9 +1247,25 @@ EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); * The host should send Keep Alive commands at half of the Keep Alive Timeout * accounting for transport roundtrip times [..]. */ +static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl) +{ + unsigned long delay = ctrl->kato * HZ / 2; + + /* + * When using Traffic Based Keep Alive, we need to run + * nvme_keep_alive_work at twice the normal frequency, as one + * command completion can postpone sending a keep alive command + * by up to twice the delay between runs. + */ + if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) + delay /= 2; + return delay; +} + static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) { - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2); + queue_delayed_work(nvme_wq, &ctrl->ka_work, + nvme_keep_alive_work_period(ctrl)); } static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) From e10d15fdfcedba236adc4d250e6750e4a55a3873 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 11 May 2023 11:57:04 -0700 Subject: [PATCH 83/98] Input: soc_button_array - add invalid acpi_index DMI quirk handling [ Upstream commit 20a99a291d564a559cc2fd013b4824a3bb3f1db7 ] Some devices have a wrong entry in their button array which points to a GPIO which is required in another driver, so soc_button_array must not claim it. A specific example of this is the Lenovo Yoga Book X90F / X90L, where the PNP0C40 home button entry points to a GPIO which is not a home button and which is required by the lenovo-yogabook driver. Add a DMI quirk table which can specify an ACPI GPIO resource index which should be skipped; and add an entry for the Lenovo Yoga Book X90F / X90L to this new DMI quirk table. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20230414072116.4497-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- drivers/input/misc/soc_button_array.c | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c index 31c02c2019c1..67a134c8448d 100644 --- a/drivers/input/misc/soc_button_array.c +++ b/drivers/input/misc/soc_button_array.c @@ -108,6 +108,27 @@ static const struct dmi_system_id dmi_use_low_level_irq[] = { {} /* Terminating entry */ }; +/* + * Some devices have a wrong entry which points to a GPIO which is + * required in another driver, so this driver must not claim it. + */ +static const struct dmi_system_id dmi_invalid_acpi_index[] = { + { + /* + * Lenovo Yoga Book X90F / X90L, the PNP0C40 home button entry + * points to a GPIO which is not a home button and which is + * required by the lenovo-yogabook driver. + */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "CHERRYVIEW D1 PLATFORM"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "YETI-11"), + }, + .driver_data = (void *)1l, + }, + {} /* Terminating entry */ +}; + /* * Get the Nth GPIO number from the ACPI object. */ @@ -137,6 +158,8 @@ soc_button_device_create(struct platform_device *pdev, struct platform_device *pd; struct gpio_keys_button *gpio_keys; struct gpio_keys_platform_data *gpio_keys_pdata; + const struct dmi_system_id *dmi_id; + int invalid_acpi_index = -1; int error, gpio, irq; int n_buttons = 0; @@ -154,10 +177,17 @@ soc_button_device_create(struct platform_device *pdev, gpio_keys = (void *)(gpio_keys_pdata + 1); n_buttons = 0; + dmi_id = dmi_first_match(dmi_invalid_acpi_index); + if (dmi_id) + invalid_acpi_index = (long)dmi_id->driver_data; + for (info = button_info; info->name; info++) { if (info->autorepeat != autorepeat) continue; + if (info->acpi_index == invalid_acpi_index) + continue; + error = soc_button_lookup_gpio(&pdev->dev, info->acpi_index, &gpio, &irq); if (error || irq < 0) { /* From f8d9d8f1727d2fb3935b9aec4e1058ef55c9e477 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Thu, 4 May 2023 20:53:20 +0200 Subject: [PATCH 84/98] s390/cio: unregister device when the only path is gone [ Upstream commit 89c0c62e947a01e7a36b54582fd9c9e346170255 ] Currently, if the device is offline and all the channel paths are either configured or varied offline, the associated subchannel gets unregistered. Don't unregister the subchannel, instead unregister offline device. Signed-off-by: Vineeth Vijayan Reviewed-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev Signed-off-by: Sasha Levin --- drivers/s390/cio/device.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index b21fa57d1a46..a111154a9046 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1379,6 +1379,7 @@ void ccw_device_set_notoper(struct ccw_device *cdev) enum io_sch_action { IO_SCH_UNREG, IO_SCH_ORPH_UNREG, + IO_SCH_UNREG_CDEV, IO_SCH_ATTACH, IO_SCH_UNREG_ATTACH, IO_SCH_ORPH_ATTACH, @@ -1411,7 +1412,7 @@ static enum io_sch_action sch_get_action(struct subchannel *sch) } if ((sch->schib.pmcw.pam & sch->opm) == 0) { if (ccw_device_notify(cdev, CIO_NO_PATH) != NOTIFY_OK) - return IO_SCH_UNREG; + return IO_SCH_UNREG_CDEV; return IO_SCH_DISC; } if (device_is_disconnected(cdev)) @@ -1473,6 +1474,7 @@ static int io_subchannel_sch_event(struct subchannel *sch, int process) case IO_SCH_ORPH_ATTACH: ccw_device_set_disconnected(cdev); break; + case IO_SCH_UNREG_CDEV: case IO_SCH_UNREG_ATTACH: case IO_SCH_UNREG: if (!cdev) @@ -1506,6 +1508,7 @@ static int io_subchannel_sch_event(struct subchannel *sch, int process) if (rc) goto out; break; + case IO_SCH_UNREG_CDEV: case IO_SCH_UNREG_ATTACH: spin_lock_irqsave(sch->lock, flags); sch_set_cdev(sch, NULL); From bb45dc7b67c5c162f40746e513078630d9802c53 Mon Sep 17 00:00:00 2001 From: Clark Wang Date: Fri, 5 May 2023 14:35:57 +0800 Subject: [PATCH 85/98] spi: lpspi: disable lpspi module irq in DMA mode [ Upstream commit 9728fb3ce11729aa8c276825ddf504edeb00611d ] When all bits of IER are set to 0, we still can observe the lpspi irq events when using DMA mode to transfer data. So disable irq to avoid the too much irq events. Signed-off-by: Clark Wang Link: https://lore.kernel.org/r/20230505063557.3962220-1-xiaoning.wang@nxp.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-fsl-lpspi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 5d98611dd999..c5ff6e8c45be 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -906,9 +906,14 @@ static int fsl_lpspi_probe(struct platform_device *pdev) ret = fsl_lpspi_dma_init(&pdev->dev, fsl_lpspi, controller); if (ret == -EPROBE_DEFER) goto out_pm_get; - if (ret < 0) dev_err(&pdev->dev, "dma setup error %d, use pio\n", ret); + else + /* + * disable LPSPI module IRQ when enable DMA mode successfully, + * to prevent the unexpected LPSPI module IRQ events. + */ + disable_irq(irq); ret = devm_spi_register_controller(&pdev->dev, controller); if (ret < 0) { From 53ad4af4ec909f3315f897eaae5a42642cf50bed Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 23 May 2023 17:12:22 +0200 Subject: [PATCH 86/98] ASoC: simple-card: Add missing of_node_put() in case of error [ Upstream commit 8938f75a5e35c597a647c28984a0304da7a33d63 ] In the error path, a of_node_put() for platform is missing. Just add it. Signed-off-by: Herve Codina Acked-by: Kuninori Morimoto Link: https://lore.kernel.org/r/20230523151223.109551-9-herve.codina@bootlin.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/generic/simple-card.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c index bc3e24c6a28a..283aa21879aa 100644 --- a/sound/soc/generic/simple-card.c +++ b/sound/soc/generic/simple-card.c @@ -417,6 +417,7 @@ static int __simple_for_each_link(struct asoc_simple_priv *priv, if (ret < 0) { of_node_put(codec); + of_node_put(plat); of_node_put(np); goto error; } From d77eac1b14e0cf3e1280ca54dd65e91678c656c7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 15 May 2023 15:48:59 +0800 Subject: [PATCH 87/98] soundwire: dmi-quirks: add new mapping for HP Spectre x360 [ Upstream commit 700581ede41d029403feec935df4616309696fd7 ] A BIOS/DMI update seems to have broken some devices, let's add a new mapping. Link: https://github.com/thesofproject/linux/issues/4323 Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20230515074859.3097-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/soundwire/dmi-quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/soundwire/dmi-quirks.c b/drivers/soundwire/dmi-quirks.c index 2bf534632f64..39f0cc2a5b33 100644 --- a/drivers/soundwire/dmi-quirks.c +++ b/drivers/soundwire/dmi-quirks.c @@ -63,6 +63,13 @@ static const struct dmi_system_id adr_remap_quirk_table[] = { }, .driver_data = (void *)intel_tgl_bios, }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_BOARD_NAME, "8709"), + }, + .driver_data = (void *)intel_tgl_bios, + }, { /* quirk used for NUC15 'Bishop County' LAPBC510 and LAPBC710 skews */ .matches = { From 3c4d87e9fa8a9d6960acb9d849ece1b3f3f5b7c0 Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Mon, 29 May 2023 15:19:11 -0300 Subject: [PATCH 88/98] ASoC: nau8824: Add quirk to active-high jack-detect [ Upstream commit e384dba03e3294ce7ea69e4da558e9bf8f0e8946 ] Add entries for Positivo laptops: CW14Q01P, K1424G, N14ZP74G to the DMI table, so that active-high jack-detect will work properly on these laptops. Signed-off-by: Edson Juliano Drosdeck Link: https://lore.kernel.org/r/20230529181911.632851-1-edson.drosdeck@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/nau8824.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sound/soc/codecs/nau8824.c b/sound/soc/codecs/nau8824.c index 27589900f4fb..f7ff130a9485 100644 --- a/sound/soc/codecs/nau8824.c +++ b/sound/soc/codecs/nau8824.c @@ -1866,6 +1866,30 @@ static const struct dmi_system_id nau8824_quirk_table[] = { }, .driver_data = (void *)(NAU8824_JD_ACTIVE_HIGH), }, + { + /* Positivo CW14Q01P */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Positivo Tecnologia SA"), + DMI_MATCH(DMI_BOARD_NAME, "CW14Q01P"), + }, + .driver_data = (void *)(NAU8824_JD_ACTIVE_HIGH), + }, + { + /* Positivo K1424G */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Positivo Tecnologia SA"), + DMI_MATCH(DMI_BOARD_NAME, "K1424G"), + }, + .driver_data = (void *)(NAU8824_JD_ACTIVE_HIGH), + }, + { + /* Positivo N14ZP74G */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Positivo Tecnologia SA"), + DMI_MATCH(DMI_BOARD_NAME, "N14ZP74G"), + }, + .driver_data = (void *)(NAU8824_JD_ACTIVE_HIGH), + }, {} }; From 9684c4fdeeca4dac19c2618efd92b3f79132f58f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 25 May 2023 19:41:45 +0200 Subject: [PATCH 89/98] s390/purgatory: disable branch profiling [ Upstream commit 03c5c83b70dca3729a3eb488e668e5044bd9a5ea ] Avoid linker error for randomly generated config file that has CONFIG_BRANCH_PROFILE_NONE enabled and make it similar to riscv, x86 and also to commit 4bf3ec384edf ("s390: disable branch profiling for vdso"). Reviewed-by: Vasily Gorbik Signed-off-by: Alexander Gordeev Signed-off-by: Sasha Levin --- arch/s390/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index 360ada80d20c..d22ec8acb13c 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -26,6 +26,7 @@ KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += -fno-stack-protector +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) From 79b4125bce96175d740f8ea1194243e15a703cc8 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 10 May 2023 12:51:56 +0200 Subject: [PATCH 90/98] ARM: dts: Fix erroneous ADS touchscreen polarities [ Upstream commit 4a672d500bfd6bb87092c33d5a2572c3d0a1cf83 ] Several device tree files get the polarity of the pendown-gpios wrong: this signal is active low. Fix up all incorrect flags, so that operating systems can rely on the flag being correctly set. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20230510105156.1134320-1-linus.walleij@linaro.org Signed-off-by: Arnd Bergmann Signed-off-by: Sasha Levin --- arch/arm/boot/dts/am57xx-cl-som-am57x.dts | 2 +- arch/arm/boot/dts/at91sam9261ek.dts | 2 +- arch/arm/boot/dts/imx7d-pico-hobbit.dts | 2 +- arch/arm/boot/dts/imx7d-sdb.dts | 2 +- arch/arm/boot/dts/omap3-cm-t3x.dtsi | 2 +- arch/arm/boot/dts/omap3-devkit8000-lcd-common.dtsi | 2 +- arch/arm/boot/dts/omap3-lilly-a83x.dtsi | 2 +- arch/arm/boot/dts/omap3-overo-common-lcd35.dtsi | 2 +- arch/arm/boot/dts/omap3-overo-common-lcd43.dtsi | 2 +- arch/arm/boot/dts/omap3-pandora-common.dtsi | 2 +- arch/arm/boot/dts/omap5-cm-t54.dts | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm/boot/dts/am57xx-cl-som-am57x.dts b/arch/arm/boot/dts/am57xx-cl-som-am57x.dts index 2e94f32d9dfc..5de82729eb7e 100644 --- a/arch/arm/boot/dts/am57xx-cl-som-am57x.dts +++ b/arch/arm/boot/dts/am57xx-cl-som-am57x.dts @@ -527,7 +527,7 @@ interrupt-parent = <&gpio1>; interrupts = <31 0>; - pendown-gpio = <&gpio1 31 0>; + pendown-gpio = <&gpio1 31 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <0x0>; diff --git a/arch/arm/boot/dts/at91sam9261ek.dts b/arch/arm/boot/dts/at91sam9261ek.dts index beed819609e8..8f3b483bb64d 100644 --- a/arch/arm/boot/dts/at91sam9261ek.dts +++ b/arch/arm/boot/dts/at91sam9261ek.dts @@ -156,7 +156,7 @@ compatible = "ti,ads7843"; interrupts-extended = <&pioC 2 IRQ_TYPE_EDGE_BOTH>; spi-max-frequency = <3000000>; - pendown-gpio = <&pioC 2 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&pioC 2 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <150>; ti,x-max = /bits/ 16 <3830>; diff --git a/arch/arm/boot/dts/imx7d-pico-hobbit.dts b/arch/arm/boot/dts/imx7d-pico-hobbit.dts index d917dc4f2f22..6ad39dca7009 100644 --- a/arch/arm/boot/dts/imx7d-pico-hobbit.dts +++ b/arch/arm/boot/dts/imx7d-pico-hobbit.dts @@ -64,7 +64,7 @@ interrupt-parent = <&gpio2>; interrupts = <7 0>; spi-max-frequency = <1000000>; - pendown-gpio = <&gpio2 7 0>; + pendown-gpio = <&gpio2 7 GPIO_ACTIVE_LOW>; vcc-supply = <®_3p3v>; ti,x-min = /bits/ 16 <0>; ti,x-max = /bits/ 16 <4095>; diff --git a/arch/arm/boot/dts/imx7d-sdb.dts b/arch/arm/boot/dts/imx7d-sdb.dts index 4e1a6cde90fe..4e62ed2df11d 100644 --- a/arch/arm/boot/dts/imx7d-sdb.dts +++ b/arch/arm/boot/dts/imx7d-sdb.dts @@ -205,7 +205,7 @@ pinctrl-0 = <&pinctrl_tsc2046_pendown>; interrupt-parent = <&gpio2>; interrupts = <29 0>; - pendown-gpio = <&gpio2 29 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio2 29 GPIO_ACTIVE_LOW>; touchscreen-max-pressure = <255>; wakeup-source; }; diff --git a/arch/arm/boot/dts/omap3-cm-t3x.dtsi b/arch/arm/boot/dts/omap3-cm-t3x.dtsi index e61b8a2bfb7d..51baedf1603b 100644 --- a/arch/arm/boot/dts/omap3-cm-t3x.dtsi +++ b/arch/arm/boot/dts/omap3-cm-t3x.dtsi @@ -227,7 +227,7 @@ interrupt-parent = <&gpio2>; interrupts = <25 0>; /* gpio_57 */ - pendown-gpio = <&gpio2 25 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio2 25 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <0x0>; ti,x-max = /bits/ 16 <0x0fff>; diff --git a/arch/arm/boot/dts/omap3-devkit8000-lcd-common.dtsi b/arch/arm/boot/dts/omap3-devkit8000-lcd-common.dtsi index 3decc2d78a6c..a7f99ae0c1fe 100644 --- a/arch/arm/boot/dts/omap3-devkit8000-lcd-common.dtsi +++ b/arch/arm/boot/dts/omap3-devkit8000-lcd-common.dtsi @@ -54,7 +54,7 @@ interrupt-parent = <&gpio1>; interrupts = <27 0>; /* gpio_27 */ - pendown-gpio = <&gpio1 27 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio1 27 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <0x0>; ti,x-max = /bits/ 16 <0x0fff>; diff --git a/arch/arm/boot/dts/omap3-lilly-a83x.dtsi b/arch/arm/boot/dts/omap3-lilly-a83x.dtsi index 73d477898ec2..06e7cf96c663 100644 --- a/arch/arm/boot/dts/omap3-lilly-a83x.dtsi +++ b/arch/arm/boot/dts/omap3-lilly-a83x.dtsi @@ -311,7 +311,7 @@ interrupt-parent = <&gpio1>; interrupts = <8 0>; /* boot6 / gpio_8 */ spi-max-frequency = <1000000>; - pendown-gpio = <&gpio1 8 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio1 8 GPIO_ACTIVE_LOW>; vcc-supply = <®_vcc3>; pinctrl-names = "default"; pinctrl-0 = <&tsc2048_pins>; diff --git a/arch/arm/boot/dts/omap3-overo-common-lcd35.dtsi b/arch/arm/boot/dts/omap3-overo-common-lcd35.dtsi index 1d6e88f99eb3..c3570acc35fa 100644 --- a/arch/arm/boot/dts/omap3-overo-common-lcd35.dtsi +++ b/arch/arm/boot/dts/omap3-overo-common-lcd35.dtsi @@ -149,7 +149,7 @@ interrupt-parent = <&gpio4>; interrupts = <18 0>; /* gpio_114 */ - pendown-gpio = <&gpio4 18 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio4 18 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <0x0>; ti,x-max = /bits/ 16 <0x0fff>; diff --git a/arch/arm/boot/dts/omap3-overo-common-lcd43.dtsi b/arch/arm/boot/dts/omap3-overo-common-lcd43.dtsi index 7e30f9d45790..d95a0e130058 100644 --- a/arch/arm/boot/dts/omap3-overo-common-lcd43.dtsi +++ b/arch/arm/boot/dts/omap3-overo-common-lcd43.dtsi @@ -160,7 +160,7 @@ interrupt-parent = <&gpio4>; interrupts = <18 0>; /* gpio_114 */ - pendown-gpio = <&gpio4 18 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio4 18 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <0x0>; ti,x-max = /bits/ 16 <0x0fff>; diff --git a/arch/arm/boot/dts/omap3-pandora-common.dtsi b/arch/arm/boot/dts/omap3-pandora-common.dtsi index 37608af6c07f..ca6d777ebf84 100644 --- a/arch/arm/boot/dts/omap3-pandora-common.dtsi +++ b/arch/arm/boot/dts/omap3-pandora-common.dtsi @@ -651,7 +651,7 @@ pinctrl-0 = <&penirq_pins>; interrupt-parent = <&gpio3>; interrupts = <30 IRQ_TYPE_NONE>; /* GPIO_94 */ - pendown-gpio = <&gpio3 30 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio3 30 GPIO_ACTIVE_LOW>; vcc-supply = <&vaux4>; ti,x-min = /bits/ 16 <0>; diff --git a/arch/arm/boot/dts/omap5-cm-t54.dts b/arch/arm/boot/dts/omap5-cm-t54.dts index ca759b7b8a58..e62ea8b6d53f 100644 --- a/arch/arm/boot/dts/omap5-cm-t54.dts +++ b/arch/arm/boot/dts/omap5-cm-t54.dts @@ -354,7 +354,7 @@ interrupt-parent = <&gpio1>; interrupts = <15 0>; /* gpio1_wk15 */ - pendown-gpio = <&gpio1 15 GPIO_ACTIVE_HIGH>; + pendown-gpio = <&gpio1 15 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <0x0>; From a44b4230d2ba6a8f6245ee02667f4a39b2d568ea Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Fri, 19 May 2023 08:55:05 +0900 Subject: [PATCH 91/98] drm/exynos: vidi: fix a wrong error return [ Upstream commit 4a059559809fd1ddbf16f847c4d2237309c08edf ] Fix a wrong error return by dropping an error return. When vidi driver is remvoed, if ctx->raw_edid isn't same as fake_edid_info then only what we have to is to free ctx->raw_edid so that driver removing can work correctly - it's not an error case. Signed-off-by: Inki Dae Reviewed-by: Andi Shyti Signed-off-by: Sasha Levin --- drivers/gpu/drm/exynos/exynos_drm_vidi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_vidi.c b/drivers/gpu/drm/exynos/exynos_drm_vidi.c index e5662bdcbbde..e96436e11a36 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_vidi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_vidi.c @@ -468,8 +468,6 @@ static int vidi_remove(struct platform_device *pdev) if (ctx->raw_edid != (struct edid *)fake_edid_info) { kfree(ctx->raw_edid); ctx->raw_edid = NULL; - - return -EINVAL; } component_del(&pdev->dev, &vidi_component_ops); From f012d3037c15401dd72d8717806311dc4c568ea8 Mon Sep 17 00:00:00 2001 From: Min Li Date: Fri, 26 May 2023 21:01:31 +0800 Subject: [PATCH 92/98] drm/exynos: fix race condition UAF in exynos_g2d_exec_ioctl [ Upstream commit 48bfd02569f5db49cc033f259e66d57aa6efc9a3 ] If it is async, runqueue_node is freed in g2d_runqueue_worker on another worker thread. So in extreme cases, if g2d_runqueue_worker runs first, and then executes the following if statement, there will be use-after-free. Signed-off-by: Min Li Reviewed-by: Andi Shyti Signed-off-by: Inki Dae Signed-off-by: Sasha Levin --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 471fd6c8135f..27613abeed96 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -1335,7 +1335,7 @@ int exynos_g2d_exec_ioctl(struct drm_device *drm_dev, void *data, /* Let the runqueue know that there is work to do. */ queue_work(g2d->g2d_workq, &g2d->runqueue_work); - if (runqueue_node->async) + if (req->async) goto out; wait_for_completion(&runqueue_node->complete); From fdac0aa4a175069105ad79112ba99913b8be1167 Mon Sep 17 00:00:00 2001 From: Min Li Date: Sat, 3 Jun 2023 15:43:45 +0800 Subject: [PATCH 93/98] drm/radeon: fix race condition UAF in radeon_gem_set_domain_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 982b173a6c6d9472730c3116051977e05d17c8c5 ] Userspace can race to free the gobj(robj converted from), robj should not be accessed again after drm_gem_object_put, otherwith it will result in use-after-free. Reviewed-by: Christian König Signed-off-by: Min Li Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/radeon/radeon_gem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index a36a4f2c76b0..57218263ef3b 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -456,7 +456,6 @@ int radeon_gem_set_domain_ioctl(struct drm_device *dev, void *data, struct radeon_device *rdev = dev->dev_private; struct drm_radeon_gem_set_domain *args = data; struct drm_gem_object *gobj; - struct radeon_bo *robj; int r; /* for now if someone requests domain CPU - @@ -469,13 +468,12 @@ int radeon_gem_set_domain_ioctl(struct drm_device *dev, void *data, up_read(&rdev->exclusive_lock); return -ENOENT; } - robj = gem_to_radeon_bo(gobj); r = radeon_gem_set_domain(gobj, args->read_domains, args->write_domain); drm_gem_object_put(gobj); up_read(&rdev->exclusive_lock); - r = radeon_gem_handle_lockup(robj->rdev, r); + r = radeon_gem_handle_lockup(rdev, r); return r; } From 7949f83f7ecc593ff98f5920046d299f140db058 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Mon, 24 Apr 2023 23:44:11 +0300 Subject: [PATCH 94/98] vhost_net: revert upend_idx only on retriable error [ Upstream commit 1f5d2e3bab16369d5d4b4020a25db4ab1f4f082c ] Fix possible virtqueue used buffers leak and corresponding stuck in case of temporary -EIO from sendmsg() which is produced by tun driver while backend device is not up. In case of no-retriable error and zcopy do not revert upend_idx to pass packet data (that is update used_idx in corresponding vhost_zerocopy_signal_used()) as if packet data has been transferred successfully. v2: set vq->heads[ubuf->desc].len equal to VHOST_DMA_DONE_LEN in case of fake successful transmit. Signed-off-by: Andrey Smetanin Message-Id: <20230424204411.24888-1-asmetanin@yandex-team.ru> Signed-off-by: Michael S. Tsirkin Signed-off-by: Andrey Smetanin Acked-by: Jason Wang Signed-off-by: Sasha Levin --- drivers/vhost/net.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 32148f011200..00f10d340259 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -933,13 +933,18 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { + bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; + if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); - nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) - % UIO_MAXIOV; + if (retry) + nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) + % UIO_MAXIOV; + else + vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } - if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { + if (retry) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; From 907a069ec38fa799b698a3cd820ef4cd4ded3223 Mon Sep 17 00:00:00 2001 From: Dheeraj Kumar Srivastava Date: Sat, 17 Jun 2023 02:52:36 +0530 Subject: [PATCH 95/98] x86/apic: Fix kernel panic when booting with intremap=off and x2apic_phys [ Upstream commit 85d38d5810e285d5aec7fb5283107d1da70c12a9 ] When booting with "intremap=off" and "x2apic_phys" on the kernel command line, the physical x2APIC driver ends up being used even when x2APIC mode is disabled ("intremap=off" disables x2APIC mode). This happens because the first compound condition check in x2apic_phys_probe() is false due to x2apic_mode == 0 and so the following one returns true after default_acpi_madt_oem_check() having already selected the physical x2APIC driver. This results in the following panic: kernel BUG at arch/x86/kernel/apic/io_apic.c:2409! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.4.0-rc2-ver4.1rc2 #2 Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS 2.3.6 07/06/2021 RIP: 0010:setup_IO_APIC+0x9c/0xaf0 Call Trace: ? native_read_msr apic_intr_mode_init x86_late_time_init start_kernel x86_64_start_reservations x86_64_start_kernel secondary_startup_64_no_verify which is: setup_IO_APIC: apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); for_each_ioapic(ioapic) BUG_ON(mp_irqdomain_create(ioapic)); Return 0 to denote that x2APIC has not been enabled when probing the physical x2APIC driver. [ bp: Massage commit message heavily. ] Fixes: 9ebd680bd029 ("x86, apic: Use probe routines to simplify apic selection") Signed-off-by: Dheeraj Kumar Srivastava Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Kishon Vijay Abraham I Reviewed-by: Vasant Hegde Reviewed-by: Cyrill Gorcunov Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230616212236.1389-1-dheerajkumar.srivastava@amd.com Signed-off-by: Sasha Levin --- arch/x86/kernel/apic/x2apic_phys.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6bde05a86b4e..896bc41cb2ba 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -97,7 +97,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; From 2230b3f874d9ee3fea33115f2118fcfc290867cb Mon Sep 17 00:00:00 2001 From: Clark Wang Date: Mon, 29 May 2023 16:02:51 +0800 Subject: [PATCH 96/98] i2c: imx-lpi2c: fix type char overflow issue when calculating the clock cycle [ Upstream commit e69b9bc170c6d93ee375a5cbfd15f74c0fb59bdd ] Claim clkhi and clklo as integer type to avoid possible calculation errors caused by data overflow. Fixes: a55fa9d0e42e ("i2c: imx-lpi2c: add low power i2c bus driver") Signed-off-by: Clark Wang Signed-off-by: Carlos Song Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-imx-lpi2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx-lpi2c.c b/drivers/i2c/busses/i2c-imx-lpi2c.c index d45ec26d51cb..c688f11ae5c9 100644 --- a/drivers/i2c/busses/i2c-imx-lpi2c.c +++ b/drivers/i2c/busses/i2c-imx-lpi2c.c @@ -200,8 +200,8 @@ static void lpi2c_imx_stop(struct lpi2c_imx_struct *lpi2c_imx) /* CLKLO = I2C_CLK_RATIO * CLKHI, SETHOLD = CLKHI, DATAVD = CLKHI/2 */ static int lpi2c_imx_config(struct lpi2c_imx_struct *lpi2c_imx) { - u8 prescale, filt, sethold, clkhi, clklo, datavd; - unsigned int clk_rate, clk_cycle; + u8 prescale, filt, sethold, datavd; + unsigned int clk_rate, clk_cycle, clkhi, clklo; enum lpi2c_imx_pincfg pincfg; unsigned int temp; From 10fbd2e04e400f5eb0b85207cb441d37709e583c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 28 Jun 2023 07:56:26 +0200 Subject: [PATCH 97/98] act_mirred: remove unneded merge conflict markers In commit 169a41073993 ("act_mirred: use the backlog for nested calls to mirred ingress"), a merge conflict marker snuck in, so remove it. Reported-by: Bart Van Assche Fixes: 169a41073993 ("act_mirred: use the backlog for nested calls to mirred ingress") Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/net/forwarding/tc_actions.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 22a1e4c9553a..1e27031288c8 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -199,7 +199,6 @@ mirred_egress_to_ingress_tcp_test() log_test "mirred_egress_to_ingress_tcp ($tcflags)" } ->>>>>>> e921d05033293 (act_mirred: use the backlog for nested calls to mirred ingress) setup_prepare() { h1=${NETIFS[p1]} From 4af60700a60cc45ee4fb6d579cccf1b7bca20c34 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 28 Jun 2023 10:29:53 +0200 Subject: [PATCH 98/98] Linux 5.15.119 Link: https://lore.kernel.org/r/20230626180746.943455203@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Chris Paterson (CIP) Tested-by: Guenter Roeck Tested-by: Linux Kernel Functional Testing Tested-by: Ron Economos Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c4ef4eed1a12..b863208e31b2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 15 -SUBLEVEL = 118 +SUBLEVEL = 119 EXTRAVERSION = NAME = Trick or Treat