From 748b82c23e25310fec54e1eff2cb63936f391b24 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 1 Apr 2022 12:58:37 -0600 Subject: [PATCH 001/196] xfrm: Pass flowi_oif or l3mdev as oif to xfrm_dst_lookup The commit referenced in the Fixes tag no longer changes the flow oif to the l3mdev ifindex. A xfrm use case was expecting the flowi_oif to be the VRF if relevant and the change broke that test. Update xfrm_bundle_create to pass oif if set and any potential flowi_l3mdev if oif is not set. Fixes: 40867d74c374 ("net: Add l3mdev index to flow struct and avoid oif reset for port devices") Reported-by: kernel test robot Signed-off-by: David Ahern Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 19aa994f5d2c..00bd0ecff5a1 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2593,12 +2593,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { __u32 mark = 0; + int oif; if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); family = xfrm[i]->props.family; - dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, + oif = fl->flowi_oif ? : fl->flowi_l3mdev; + dst = xfrm_dst_lookup(xfrm[i], tos, oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) From c775cbf62ed4911e4f0f23880f01815753123690 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 25 Mar 2022 15:42:39 +0000 Subject: [PATCH 002/196] ASoC: atmel: Remove system clock tree configuration for at91sam9g20ek The MCLK of the WM8731 on the AT91SAM9G20-EK board is connected to the PCK0 output of the SoC, intended in the reference software to be supplied using PLLB and programmed to 12MHz. As originally written for use with a board file the audio driver was responsible for configuring the entire tree but in the conversion to the common clock framework the registration of the named pck0 and pllb clocks was removed so the driver has failed to instantiate ever since. Since the WM8731 driver has had support for managing a MCLK provided via the common clock framework for some time we can simply drop all the clock management code from the machine driver other than configuration of the sysclk rate, the CODEC driver still respects that configuration from the machine driver. Fixes: ff78a189b0ae55f ("ARM: at91: remove old at91-specific clock driver") Signed-off-by: Mark Brown Reviewed-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20220325154241.1600757-2-broonie@kernel.org --- sound/soc/atmel/sam9g20_wm8731.c | 61 -------------------------------- 1 file changed, 61 deletions(-) diff --git a/sound/soc/atmel/sam9g20_wm8731.c b/sound/soc/atmel/sam9g20_wm8731.c index 33e43013ff77..0d639a33ad96 100644 --- a/sound/soc/atmel/sam9g20_wm8731.c +++ b/sound/soc/atmel/sam9g20_wm8731.c @@ -46,35 +46,6 @@ */ #undef ENABLE_MIC_INPUT -static struct clk *mclk; - -static int at91sam9g20ek_set_bias_level(struct snd_soc_card *card, - struct snd_soc_dapm_context *dapm, - enum snd_soc_bias_level level) -{ - static int mclk_on; - int ret = 0; - - switch (level) { - case SND_SOC_BIAS_ON: - case SND_SOC_BIAS_PREPARE: - if (!mclk_on) - ret = clk_enable(mclk); - if (ret == 0) - mclk_on = 1; - break; - - case SND_SOC_BIAS_OFF: - case SND_SOC_BIAS_STANDBY: - if (mclk_on) - clk_disable(mclk); - mclk_on = 0; - break; - } - - return ret; -} - static const struct snd_soc_dapm_widget at91sam9g20ek_dapm_widgets[] = { SND_SOC_DAPM_MIC("Int Mic", NULL), SND_SOC_DAPM_SPK("Ext Spk", NULL), @@ -135,7 +106,6 @@ static struct snd_soc_card snd_soc_at91sam9g20ek = { .owner = THIS_MODULE, .dai_link = &at91sam9g20ek_dai, .num_links = 1, - .set_bias_level = at91sam9g20ek_set_bias_level, .dapm_widgets = at91sam9g20ek_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(at91sam9g20ek_dapm_widgets), @@ -148,7 +118,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *codec_np, *cpu_np; - struct clk *pllb; struct snd_soc_card *card = &snd_soc_at91sam9g20ek; int ret; @@ -162,31 +131,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return -EINVAL; } - /* - * Codec MCLK is supplied by PCK0 - set it up. - */ - mclk = clk_get(NULL, "pck0"); - if (IS_ERR(mclk)) { - dev_err(&pdev->dev, "Failed to get MCLK\n"); - ret = PTR_ERR(mclk); - goto err; - } - - pllb = clk_get(NULL, "pllb"); - if (IS_ERR(pllb)) { - dev_err(&pdev->dev, "Failed to get PLLB\n"); - ret = PTR_ERR(pllb); - goto err_mclk; - } - ret = clk_set_parent(mclk, pllb); - clk_put(pllb); - if (ret != 0) { - dev_err(&pdev->dev, "Failed to set MCLK parent\n"); - goto err_mclk; - } - - clk_set_rate(mclk, MCLK_RATE); - card->dev = &pdev->dev; /* Parse device node info */ @@ -230,9 +174,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return ret; -err_mclk: - clk_put(mclk); - mclk = NULL; err: atmel_ssc_put_audio(0); return ret; @@ -242,8 +183,6 @@ static int at91sam9g20ek_audio_remove(struct platform_device *pdev) { struct snd_soc_card *card = platform_get_drvdata(pdev); - clk_disable(mclk); - mclk = NULL; snd_soc_unregister_card(card); atmel_ssc_put_audio(0); From 9c363532413cda3e2c6dfa10e5cca7cd221877a0 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 31 Mar 2022 14:49:57 +0300 Subject: [PATCH 003/196] ASoC: topology: Correct error handling in soc_tplg_dapm_widget_create() Academic correction of error handling: In case the allocation of kc or kcontrol_type fails the correct label to jump is hdr_err since the template.sname has been also allocated at this point. Fixes: d29d41e28eea6 ("ASoC: topology: Add support for multiple kcontrol types to a widget") Signed-off-by: Peter Ujfalusi Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20220331114957.519-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 72e50df7052c..3bb90a819650 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1436,12 +1436,12 @@ static int soc_tplg_dapm_widget_create(struct soc_tplg *tplg, template.num_kcontrols = le32_to_cpu(w->num_kcontrols); kc = devm_kcalloc(tplg->dev, le32_to_cpu(w->num_kcontrols), sizeof(*kc), GFP_KERNEL); if (!kc) - goto err; + goto hdr_err; kcontrol_type = devm_kcalloc(tplg->dev, le32_to_cpu(w->num_kcontrols), sizeof(unsigned int), GFP_KERNEL); if (!kcontrol_type) - goto err; + goto hdr_err; for (i = 0; i < le32_to_cpu(w->num_kcontrols); i++) { control_hdr = (struct snd_soc_tplg_ctl_hdr *)tplg->pos; From acc72863e0f11cd0bedc888b663700229f9ba5ff Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Sun, 27 Mar 2022 16:13:00 +0800 Subject: [PATCH 004/196] codecs: rt5682s: fix an incorrect NULL check on list iterator The bug is here: if (!dai) { The list iterator value 'dai' will *always* be set and non-NULL by for_each_component_dais(), so it is incorrect to assume that the iterator value will be NULL if the list is empty or no element is found (In fact, it will be a bogus pointer to an invalid struct object containing the HEAD). Otherwise it will bypass the check 'if (!dai) {' (never call dev_err() and never return -ENODEV;) and lead to invalid memory access lately when calling 'rt5682s_set_bclk1_ratio(dai, factor);'. To fix the bug, just return rt5682s_set_bclk1_ratio(dai, factor); when found the 'dai', otherwise dev_err() and return -ENODEV; Cc: stable@vger.kernel.org Fixes: bdd229ab26be9 ("ASoC: rt5682s: Add driver for ALC5682I-VS codec") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220327081300.12962-1-xiam0nd.tong@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682s.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sound/soc/codecs/rt5682s.c b/sound/soc/codecs/rt5682s.c index 1cba8ec7cedb..b55f3ac3a267 100644 --- a/sound/soc/codecs/rt5682s.c +++ b/sound/soc/codecs/rt5682s.c @@ -2687,14 +2687,11 @@ static int rt5682s_bclk_set_rate(struct clk_hw *hw, unsigned long rate, for_each_component_dais(component, dai) if (dai->id == RT5682S_AIF1) - break; - if (!dai) { - dev_err(component->dev, "dai %d not found in component\n", - RT5682S_AIF1); - return -ENODEV; - } + return rt5682s_set_bclk1_ratio(dai, factor); - return rt5682s_set_bclk1_ratio(dai, factor); + dev_err(component->dev, "dai %d not found in component\n", + RT5682S_AIF1); + return -ENODEV; } static const struct clk_ops rt5682s_dai_clk_ops[RT5682S_DAI_NUM_CLKS] = { From 5708cc2f4b50c7bf27234eee77e1d9487533bbd3 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 31 Mar 2022 14:48:45 +0300 Subject: [PATCH 005/196] ASoC: SOF: topology: Fix memory leak of scontrol->name The scontrol->name is allocated with kstrdup, it must be freed before the scontrol is freed to avoid leaking memory. The constant leaking happens via sof_widget_unload() path on every module removal. Fixes: b5cee8feb1d48 ("ASoC: SOF: topology: Make control parsing IPC agnostic") Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Link: https://lore.kernel.org/r/20220331114845.32747-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/topology.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c index 9b11e9795a7a..75d78f9178a3 100644 --- a/sound/soc/sof/topology.c +++ b/sound/soc/sof/topology.c @@ -941,11 +941,13 @@ static int sof_control_load(struct snd_soc_component *scomp, int index, default: dev_warn(scomp->dev, "control type not supported %d:%d:%d\n", hdr->ops.get, hdr->ops.put, hdr->ops.info); + kfree(scontrol->name); kfree(scontrol); return 0; } if (ret < 0) { + kfree(scontrol->name); kfree(scontrol); return ret; } @@ -1380,6 +1382,7 @@ static int sof_widget_unload(struct snd_soc_component *scomp, } kfree(scontrol->ipc_control_data); list_del(&scontrol->list); + kfree(scontrol->name); kfree(scontrol); } From fb6d679fee95d272c0a94912c4e534146823ee89 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 31 Mar 2022 22:19:44 +0200 Subject: [PATCH 006/196] ASoC: soc-pcm: use GFP_KERNEL when the code is sleepable At the kzalloc() call in dpcm_be_connect(), there is no spin lock involved. It's merely protected by card->pcm_mutex, instead. The spinlock is applied at the later call with snd_soc_pcm_stream_lock_irq() only for the list manipulations. (See it's *_irq(), not *_irqsave(); that means the context being sleepable at that point.) So, we can use GFP_KERNEL safely there. This patch revert commit d8a9c6e1f676 ("ASoC: soc-pcm: use GFP_ATOMIC for dpcm structure") which is no longer needed since commit b7898396f4bb ("ASoC: soc-pcm: Fix and cleanup DPCM locking"). Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/e740f1930843060e025e3c0f17ec1393cfdafb26.1648757961.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- sound/soc/soc-pcm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 9a954680d492..11c9853e9e80 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1214,7 +1214,7 @@ static int dpcm_be_connect(struct snd_soc_pcm_runtime *fe, be_substream->pcm->nonatomic = 1; } - dpcm = kzalloc(sizeof(struct snd_soc_dpcm), GFP_ATOMIC); + dpcm = kzalloc(sizeof(struct snd_soc_dpcm), GFP_KERNEL); if (!dpcm) return -ENOMEM; From f730a46b931d894816af34a0ff8e4ad51565b39f Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 29 Mar 2022 09:21:34 +0800 Subject: [PATCH 007/196] ASoC: soc-dapm: fix two incorrect uses of list iterator These two bug are here: list_for_each_entry_safe_continue(w, n, list, power_list); list_for_each_entry_safe_continue(w, n, list, power_list); After the list_for_each_entry_safe_continue() exits, the list iterator will always be a bogus pointer which point to an invalid struct objdect containing HEAD member. The funciton poniter 'w->event' will be a invalid value which can lead to a control-flow hijack if the 'w' can be controlled. The original intention was to continue the outer list_for_each_entry_safe() loop with the same entry if w->event is NULL, but misunderstanding the meaning of list_for_each_entry_safe_continue(). So just add a 'continue;' to fix the bug. Cc: stable@vger.kernel.org Fixes: 163cac061c973 ("ASoC: Factor out DAPM sequence execution") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220329012134.9375-1-xiam0nd.tong@gmail.com Signed-off-by: Mark Brown --- sound/soc/soc-dapm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index b435b5c4cfb7..ca917a849c42 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1687,8 +1687,7 @@ static void dapm_seq_run(struct snd_soc_card *card, switch (w->id) { case snd_soc_dapm_pre: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, @@ -1700,8 +1699,7 @@ static void dapm_seq_run(struct snd_soc_card *card, case snd_soc_dapm_post: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, From c8618d65007ba68d7891130642d73e89372101e8 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Sun, 27 Mar 2022 16:10:02 +0800 Subject: [PATCH 008/196] ASoC: rt5682: fix an incorrect NULL check on list iterator The bug is here: if (!dai) { The list iterator value 'dai' will *always* be set and non-NULL by for_each_component_dais(), so it is incorrect to assume that the iterator value will be NULL if the list is empty or no element is found (In fact, it will be a bogus pointer to an invalid struct object containing the HEAD). Otherwise it will bypass the check 'if (!dai) {' (never call dev_err() and never return -ENODEV;) and lead to invalid memory access lately when calling 'rt5682_set_bclk1_ratio(dai, factor);'. To fix the bug, just return rt5682_set_bclk1_ratio(dai, factor); when found the 'dai', otherwise dev_err() and return -ENODEV; Cc: stable@vger.kernel.org Fixes: ebbfabc16d23d ("ASoC: rt5682: Add CCF usage for providing I2S clks") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220327081002.12684-1-xiam0nd.tong@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c index be68d573a490..c9ff9c89adf7 100644 --- a/sound/soc/codecs/rt5682.c +++ b/sound/soc/codecs/rt5682.c @@ -2822,14 +2822,11 @@ static int rt5682_bclk_set_rate(struct clk_hw *hw, unsigned long rate, for_each_component_dais(component, dai) if (dai->id == RT5682_AIF1) - break; - if (!dai) { - dev_err(rt5682->i2c_dev, "dai %d not found in component\n", - RT5682_AIF1); - return -ENODEV; - } + return rt5682_set_bclk1_ratio(dai, factor); - return rt5682_set_bclk1_ratio(dai, factor); + dev_err(rt5682->i2c_dev, "dai %d not found in component\n", + RT5682_AIF1); + return -ENODEV; } static const struct clk_ops rt5682_dai_clk_ops[RT5682_DAI_NUM_CLKS] = { From c598ccfbeb26cb9452f99e7beb92ef779dcb16b1 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 24 Mar 2022 16:18:38 +0800 Subject: [PATCH 009/196] ASoC: cs35l41: Add one more variable in the debug log otp_map[].size is a key variable to compute the value of otp_val and to update the bit_offset, it is helpful to debug if could put it in the debug log. Signed-off-by: Hui Wang Reviewed-by: Lucas Tanure Link: https://lore.kernel.org/r/20220324081839.62009-1-hui.wang@canonical.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l41-lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs35l41-lib.c b/sound/soc/codecs/cs35l41-lib.c index e5a56bcbb223..d0a480c40231 100644 --- a/sound/soc/codecs/cs35l41-lib.c +++ b/sound/soc/codecs/cs35l41-lib.c @@ -822,8 +822,8 @@ int cs35l41_otp_unpack(struct device *dev, struct regmap *regmap) word_offset = otp_map_match->word_offset; for (i = 0; i < otp_map_match->num_elements; i++) { - dev_dbg(dev, "bitoffset= %d, word_offset=%d, bit_sum mod 32=%d\n", - bit_offset, word_offset, bit_sum % 32); + dev_dbg(dev, "bitoffset= %d, word_offset=%d, bit_sum mod 32=%d otp_map[i].size = %d\n", + bit_offset, word_offset, bit_sum % 32, otp_map[i].size); if (bit_offset + otp_map[i].size - 1 >= 32) { otp_val = (otp_mem[word_offset] & GENMASK(31, bit_offset)) >> bit_offset; From 0b3d5d2e358ca6772fc3662fca27acb12a682fbf Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 24 Mar 2022 16:18:39 +0800 Subject: [PATCH 010/196] ASoC: cs35l41: Fix a shift-out-of-bounds warning found by UBSAN We enabled UBSAN in the ubuntu kernel, and the cs35l41 driver triggers a warning calltrace like below: cs35l41-hda i2c-CSC3551:00-cs35l41-hda.0: bitoffset= 8, word_offset=23, bit_sum mod 32=0, otp_map[i].size = 24 cs35l41-hda i2c-CSC3551:00-cs35l41-hda.0: bitoffset= 0, word_offset=24, bit_sum mod 32=24, otp_map[i].size = 0 ================================================================================ UBSAN: shift-out-of-bounds in linux-kernel-src/sound/soc/codecs/cs35l41-lib.c:836:8 shift exponent 64 is too large for 64-bit type 'long unsigned int' CPU: 10 PID: 595 Comm: systemd-udevd Not tainted 5.15.0-23-generic #23 Hardware name: LENOVO \x02MFG_IN_GO/\x02MFG_IN_GO, BIOS N3GET19W (1.00 ) 03/11/2022 Call Trace: show_stack+0x52/0x58 dump_stack_lvl+0x4a/0x5f dump_stack+0x10/0x12 ubsan_epilogue+0x9/0x45 __ubsan_handle_shift_out_of_bounds.cold+0x61/0xef ? regmap_unlock_mutex+0xe/0x10 cs35l41_otp_unpack.cold+0x1c6/0x2b2 [snd_soc_cs35l41_lib] cs35l41_hda_probe+0x24f/0x33a [snd_hda_scodec_cs35l41] cs35l41_hda_i2c_probe+0x65/0x90 [snd_hda_scodec_cs35l41_i2c] When both bitoffset and otp_map[i].size are 0, the line 836 will result in GENMASK(-1, 0), this triggers the shift-out-of-bounds calltrace. Here add a checking, if both bitoffset and otp_map[i].size are 0, do not run GENMASK() and directly set otp_val to 0, this will not bring any function change on the driver but could avoid the calltrace. Signed-off-by: Hui Wang Link: https://lore.kernel.org/r/20220324081839.62009-2-hui.wang@canonical.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l41-lib.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs35l41-lib.c b/sound/soc/codecs/cs35l41-lib.c index d0a480c40231..aa6823fbd1a4 100644 --- a/sound/soc/codecs/cs35l41-lib.c +++ b/sound/soc/codecs/cs35l41-lib.c @@ -831,12 +831,14 @@ int cs35l41_otp_unpack(struct device *dev, struct regmap *regmap) GENMASK(bit_offset + otp_map[i].size - 33, 0)) << (32 - bit_offset); bit_offset += otp_map[i].size - 32; - } else { + } else if (bit_offset + otp_map[i].size - 1 >= 0) { otp_val = (otp_mem[word_offset] & GENMASK(bit_offset + otp_map[i].size - 1, bit_offset) ) >> bit_offset; bit_offset += otp_map[i].size; - } + } else /* both bit_offset and otp_map[i].size are 0 */ + otp_val = 0; + bit_sum += otp_map[i].size; if (bit_offset == 32) { From 8ba08d3a367a70f707b7c5d53ad92b98b960ee88 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 4 Apr 2022 09:07:46 +0000 Subject: [PATCH 011/196] ASoC: rk817: Use devm_clk_get() in rk817_platform_probe We need to call clk_put() to undo clk_get() in the error path. Use devm_clk_get() to obtain a reference to the clock, It has the benefit that clk_put() is no longer required. Fixes: 0d6a04da9b25 ("ASoC: Add Rockchip rk817 audio CODEC support") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220404090753.17940-1-linmq006@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/rk817_codec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/rk817_codec.c b/sound/soc/codecs/rk817_codec.c index 8fffe378618d..cce6f4e7992f 100644 --- a/sound/soc/codecs/rk817_codec.c +++ b/sound/soc/codecs/rk817_codec.c @@ -489,7 +489,7 @@ static int rk817_platform_probe(struct platform_device *pdev) rk817_codec_parse_dt_property(&pdev->dev, rk817_codec_data); - rk817_codec_data->mclk = clk_get(pdev->dev.parent, "mclk"); + rk817_codec_data->mclk = devm_clk_get(pdev->dev.parent, "mclk"); if (IS_ERR(rk817_codec_data->mclk)) { dev_dbg(&pdev->dev, "Unable to get mclk\n"); ret = -ENXIO; From e927b05f3cc20de87f6b7d912a5bbe556931caca Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Sun, 3 Apr 2022 11:52:39 +0000 Subject: [PATCH 012/196] ASoC: msm8916-wcd-digital: Check failure for devm_snd_soc_register_component devm_snd_soc_register_component() may fails, we should check the error and do the corresponding error handling. Fixes: 150db8c5afa1 ("ASoC: codecs: Add msm8916-wcd digital codec") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220403115239.30140-1-linmq006@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/msm8916-wcd-digital.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c index 9ad7fc0baf07..20a07c92b2fc 100644 --- a/sound/soc/codecs/msm8916-wcd-digital.c +++ b/sound/soc/codecs/msm8916-wcd-digital.c @@ -1206,9 +1206,16 @@ static int msm8916_wcd_digital_probe(struct platform_device *pdev) dev_set_drvdata(dev, priv); - return devm_snd_soc_register_component(dev, &msm8916_wcd_digital, + ret = devm_snd_soc_register_component(dev, &msm8916_wcd_digital, msm8916_wcd_digital_dai, ARRAY_SIZE(msm8916_wcd_digital_dai)); + if (ret) + goto err_mclk; + + return 0; + +err_mclk: + clk_disable_unprepare(priv->mclk); err_clk: clk_disable_unprepare(priv->ahbclk); return ret; From d462f6ed2aeac30c0b440a91fb05d964956935f9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Mar 2022 21:21:55 +0100 Subject: [PATCH 013/196] ASoC: soc-core: add debugfs_prefix member to snd_soc_component_driver Allow the component debugfs_prefix to be set from snd_soc_component_driver. First use case is avoiding a duplicate debugfs entry error in case a device has multiple components which have the same name therefore. Note that we don't set component->debugfs_prefix if it's set already. That's needed because partially component->debugfs_prefix is set before calling snd_soc_component_initialize(). Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/d18bff6a-1df1-5f95-0cf8-10dbaa62d7be@gmail.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 4 ++++ sound/soc/soc-core.c | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index a52080407b98..766dc6f009c0 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -179,6 +179,10 @@ struct snd_soc_component_driver { struct snd_pcm_hw_params *params); bool use_dai_pcm_id; /* use DAI link PCM ID as PCM device number */ int be_pcm_base; /* base device ID for all BE PCMs */ + +#ifdef CONFIG_DEBUG_FS + const char *debugfs_prefix; +#endif }; struct snd_soc_component { diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index ce153ac2c3ab..8c7da82a62ca 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2587,6 +2587,11 @@ int snd_soc_component_initialize(struct snd_soc_component *component, component->dev = dev; component->driver = driver; +#ifdef CONFIG_DEBUG_FS + if (!component->debugfs_prefix) + component->debugfs_prefix = driver->debugfs_prefix; +#endif + return 0; } EXPORT_SYMBOL_GPL(snd_soc_component_initialize); From fc35880d198d9f2023bf231c120e1a69ad4db841 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Mar 2022 21:23:06 +0100 Subject: [PATCH 014/196] ASoC: meson: aiu: fix duplicate debugfs directory error On a S905W-based system I get the following error: debugfs: Directory 'c1105400.audio-controller' with parent 'P230-Q200' already present! Turned out that multiple components having the same name triggers this error in soc_init_component_debugfs(). With the patch the error is gone and that's the debugfs entries. /sys/kernel/debug/asoc/P230-Q200/acodec:c1105400.audio-controller /sys/kernel/debug/asoc/P230-Q200/hdmi:c1105400.audio-controller /sys/kernel/debug/asoc/P230-Q200/cpu:c1105400.audio-controller Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/38053baf-c33b-7fdf-7593-99b22153a9c0@gmail.com Signed-off-by: Mark Brown --- sound/soc/meson/aiu-acodec-ctrl.c | 3 +++ sound/soc/meson/aiu-codec-ctrl.c | 3 +++ sound/soc/meson/aiu.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/sound/soc/meson/aiu-acodec-ctrl.c b/sound/soc/meson/aiu-acodec-ctrl.c index 27a6d3259c50..22e181646bc3 100644 --- a/sound/soc/meson/aiu-acodec-ctrl.c +++ b/sound/soc/meson/aiu-acodec-ctrl.c @@ -193,6 +193,9 @@ static const struct snd_soc_component_driver aiu_acodec_ctrl_component = { .of_xlate_dai_name = aiu_acodec_of_xlate_dai_name, .endianness = 1, .non_legacy_dai_naming = 1, +#ifdef CONFIG_DEBUG_FS + .debugfs_prefix = "acodec", +#endif }; int aiu_acodec_ctrl_register_component(struct device *dev) diff --git a/sound/soc/meson/aiu-codec-ctrl.c b/sound/soc/meson/aiu-codec-ctrl.c index c3ea733fce91..59ee66fc2bcd 100644 --- a/sound/soc/meson/aiu-codec-ctrl.c +++ b/sound/soc/meson/aiu-codec-ctrl.c @@ -140,6 +140,9 @@ static const struct snd_soc_component_driver aiu_hdmi_ctrl_component = { .of_xlate_dai_name = aiu_hdmi_of_xlate_dai_name, .endianness = 1, .non_legacy_dai_naming = 1, +#ifdef CONFIG_DEBUG_FS + .debugfs_prefix = "hdmi", +#endif }; int aiu_hdmi_ctrl_register_component(struct device *dev) diff --git a/sound/soc/meson/aiu.c b/sound/soc/meson/aiu.c index d299a70db7e5..88e611e64d14 100644 --- a/sound/soc/meson/aiu.c +++ b/sound/soc/meson/aiu.c @@ -103,6 +103,9 @@ static const struct snd_soc_component_driver aiu_cpu_component = { .pointer = aiu_fifo_pointer, .probe = aiu_cpu_component_probe, .remove = aiu_cpu_component_remove, +#ifdef CONFIG_DEBUG_FS + .debugfs_prefix = "cpu", +#endif }; static struct snd_soc_dai_driver aiu_cpu_dai_drv[] = { From 51a630a7051f7f4f1cfdd64c20c7110f9907c230 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 4 Apr 2022 12:32:52 +0100 Subject: [PATCH 015/196] ASoC: simple-card-utils: Avoid NULL deref in asoc_simple_set_tdm() Don't dereference simple_dai before it has been checked for NULL. Signed-off-by: Richard Fitzgerald Fixes: 1e974e5b82b3 ("ASoC: audio_graph_card2: Add support for variable slot widths") Reported-by: kernel test robot Reported-by: Dan Carpenter Link: https://lore.kernel.org/r/20220404113252.1152659-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/generic/simple-card-utils.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c index 8e037835bc58..f2157944247f 100644 --- a/sound/soc/generic/simple-card-utils.c +++ b/sound/soc/generic/simple-card-utils.c @@ -364,13 +364,15 @@ static int asoc_simple_set_tdm(struct snd_soc_dai *dai, struct snd_pcm_hw_params *params) { int sample_bits = params_width(params); - int slot_width = simple_dai->slot_width; - int slot_count = simple_dai->slots; + int slot_width, slot_count; int i, ret; if (!simple_dai || !simple_dai->tdm_width_map) return 0; + slot_width = simple_dai->slot_width; + slot_count = simple_dai->slots; + if (slot_width == 0) slot_width = sample_bits; From d00887c106dac47b9af6ed70e8d5c45b69c4bd52 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 5 Apr 2022 17:57:31 +0200 Subject: [PATCH 016/196] ASoC: fsl_sai: fix 1:1 bclk:mclk ratio support Refactoring in commit a50b7926d015 ("ASoC: fsl_sai: implement 1:1 bclk:mclk ratio support") led to the bypass never happening as (ratio = 1) was caught in the existing if (ratio & 1) continue; check. The correct check sequence instead is: - skip all ratios lower than one and higher than 512 - skip all odd ratios except for 1:1 - skip 1:1 ratio if and only if !support_1_1_ratio And for all others, calculate the appropriate divider. Adjust the code to facilitate this. Fixes: a50b7926d015 ("ASoC: fsl_sai: implement 1:1 bclk:mclk ratio support") Signed-off-by: Ahmad Fatoum Acked-by: Shengjiu Wang Reviewed-by: Sascha Hauer Link: https://lore.kernel.org/r/20220405155731.745413-1-a.fatoum@pengutronix.de Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_sai.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index 4650a6931a94..ffc24afb5a7a 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -372,7 +372,7 @@ static int fsl_sai_set_bclk(struct snd_soc_dai *dai, bool tx, u32 freq) continue; if (ratio == 1 && !support_1_1_ratio) continue; - else if (ratio & 1) + if ((ratio & 1) && ratio > 1) continue; diff = abs((long)clk_rate - ratio * freq); From fcd1e39cca6e3a262f2badfcd5edd76c910ad3bc Mon Sep 17 00:00:00 2001 From: Ajye Huang Date: Thu, 24 Mar 2022 16:47:08 +0800 Subject: [PATCH 017/196] ASoC: Intel: sof_rt5682: Add support for max98360a speaker amp on SSP2 Follow Intel's design to replace max98360a amp SSP2 reather than SSP1 by judging DMI_OEM_STRING in sof_rt5682_quirk_table struct. And reusing max98357's topology since DAI setting could be leveraged. Signed-off-by: Ajye Huang Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20220324084708.2009375-1-ajye_huang@compal.corp-partner.google.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_rt5682.c | 13 +++++++++++++ sound/soc/sof/sof-pci-dev.c | 9 ++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/sof_rt5682.c b/sound/soc/intel/boards/sof_rt5682.c index ebec4d15edaa..7126fcb63d90 100644 --- a/sound/soc/intel/boards/sof_rt5682.c +++ b/sound/soc/intel/boards/sof_rt5682.c @@ -212,6 +212,19 @@ static const struct dmi_system_id sof_rt5682_quirk_table[] = { SOF_SSP_BT_OFFLOAD_PRESENT), }, + { + .callback = sof_rt5682_quirk_cb, + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "Google_Brya"), + DMI_MATCH(DMI_OEM_STRING, "AUDIO-MAX98360_ALC5682I_I2S_AMP_SSP2"), + }, + .driver_data = (void *)(SOF_RT5682_MCLK_EN | + SOF_RT5682_SSP_CODEC(0) | + SOF_SPEAKER_AMP_PRESENT | + SOF_MAX98360A_SPEAKER_AMP_PRESENT | + SOF_RT5682_SSP_AMP(2) | + SOF_RT5682_NUM_HDMIDEV(4)), + }, {} }; diff --git a/sound/soc/sof/sof-pci-dev.c b/sound/soc/sof/sof-pci-dev.c index 4c9596742844..12f5cff22448 100644 --- a/sound/soc/sof/sof-pci-dev.c +++ b/sound/soc/sof/sof-pci-dev.c @@ -83,7 +83,14 @@ static const struct dmi_system_id sof_tplg_table[] = { }, .driver_data = "sof-adl-max98357a-rt5682-2way.tplg", }, - + { + .callback = sof_tplg_cb, + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "Google_Brya"), + DMI_MATCH(DMI_OEM_STRING, "AUDIO-MAX98360_ALC5682I_I2S_AMP_SSP2"), + }, + .driver_data = "sof-adl-max98357a-rt5682.tplg", + }, {} }; From 770f3d992a3f7330f801dfeee98429b2885c9fdb Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 6 Apr 2022 14:20:05 -0500 Subject: [PATCH 018/196] ASoC: rt711/5682: check if bus is active before deferred jack detection This patch takes a defensive programming and paranoid approach in case the parent device (SoundWire) is pm_runtime resumed but the rt711 device is not. In that case, during the attachment and initialization, a jack detection workqueue can be scheduled. Since the pm_runtime suspend routines will not be invoked, the sequence to cancel all deferred work is not executed, and the jack detection could happen after the bus stops operating, leading to a timeout. This patch applies the same solution to rt5682, based on the similarities between codec drivers. The race condition with rt5682 was not detected experimentally though. BugLink: https://github.com/thesofproject/linux/issues/3459 Signed-off-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Reviewed-by: Bard Liao Link: https://lore.kernel.org/r/20220406192005.262996-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682.c | 9 +++++++++ sound/soc/codecs/rt711.c | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c index c9ff9c89adf7..2b6c6d6b9771 100644 --- a/sound/soc/codecs/rt5682.c +++ b/sound/soc/codecs/rt5682.c @@ -1100,6 +1100,15 @@ void rt5682_jack_detect_handler(struct work_struct *work) return; } + if (rt5682->is_sdw) { + if (pm_runtime_status_suspended(rt5682->slave->dev.parent)) { + dev_dbg(&rt5682->slave->dev, + "%s: parent device is pm_runtime_status_suspended, skipping jack detection\n", + __func__); + return; + } + } + dapm = snd_soc_component_get_dapm(rt5682->component); snd_soc_dapm_mutex_lock(dapm); diff --git a/sound/soc/codecs/rt711.c b/sound/soc/codecs/rt711.c index 6770825d037a..ea25fd58d43a 100644 --- a/sound/soc/codecs/rt711.c +++ b/sound/soc/codecs/rt711.c @@ -245,6 +245,13 @@ static void rt711_jack_detect_handler(struct work_struct *work) if (!rt711->component->card->instantiated) return; + if (pm_runtime_status_suspended(rt711->slave->dev.parent)) { + dev_dbg(&rt711->slave->dev, + "%s: parent device is pm_runtime_status_suspended, skipping jack detection\n", + __func__); + return; + } + reg = RT711_VERB_GET_PIN_SENSE | RT711_HP_OUT; ret = regmap_read(rt711->regmap, reg, &jack_status); if (ret < 0) From 20744617bdbafe2e7fb7bf5401f616e24bde4471 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 6 Apr 2022 14:16:06 -0500 Subject: [PATCH 019/196] ASoC: SOF: topology: cleanup dailinks on widget unload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We set the cpu_dai capture_ or playback_widget on widget_ready but never clear them, which leads to failures when unloading/reloading a topology in modprobe/rmmod tests BugLink: https://github.com/thesofproject/linux/issues/3535 Fixes: 311ce4fe7637 ("ASoC: SOF: Add support for loading topologies") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Péter Ujfalusi Reviewed-by: Bard Liao Link: https://lore.kernel.org/r/20220406191606.254576-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/topology.c | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c index 75d78f9178a3..5953d1050cc9 100644 --- a/sound/soc/sof/topology.c +++ b/sound/soc/sof/topology.c @@ -1070,6 +1070,46 @@ static int sof_connect_dai_widget(struct snd_soc_component *scomp, return 0; } +static void sof_disconnect_dai_widget(struct snd_soc_component *scomp, + struct snd_soc_dapm_widget *w) +{ + struct snd_soc_card *card = scomp->card; + struct snd_soc_pcm_runtime *rtd; + struct snd_soc_dai *cpu_dai; + int i; + + if (!w->sname) + return; + + list_for_each_entry(rtd, &card->rtd_list, list) { + /* does stream match DAI link ? */ + if (!rtd->dai_link->stream_name || + strcmp(w->sname, rtd->dai_link->stream_name)) + continue; + + switch (w->id) { + case snd_soc_dapm_dai_out: + for_each_rtd_cpu_dais(rtd, i, cpu_dai) { + if (cpu_dai->capture_widget == w) { + cpu_dai->capture_widget = NULL; + break; + } + } + break; + case snd_soc_dapm_dai_in: + for_each_rtd_cpu_dais(rtd, i, cpu_dai) { + if (cpu_dai->playback_widget == w) { + cpu_dai->playback_widget = NULL; + break; + } + } + break; + default: + break; + } + } +} + /* bind PCM ID to host component ID */ static int spcm_bind(struct snd_soc_component *scomp, struct snd_sof_pcm *spcm, int dir) @@ -1355,6 +1395,9 @@ static int sof_widget_unload(struct snd_soc_component *scomp, if (dai) list_del(&dai->list); + + sof_disconnect_dai_widget(scomp, widget); + break; default: break; From 9b91d0ece22b9ab37fc185511c7f992e51c93d6e Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Fri, 18 Mar 2022 10:16:16 +0800 Subject: [PATCH 020/196] ASoC: SOF: topology: Fix memory leak in sof_control_load() scontrol doesn't get freed when kstrdup returns NULL. Fix by free iscontrol in that case. scontrol = kzalloc(sizeof(*scontrol), GFP_KERNEL); if (!scontrol) return -ENOMEM; scontrol->name = kstrdup(hdr->name, GFP_KERNEL); if (!scontrol->name) return -ENOMEM; Signed-off-by: Yu Liao Link: https://lore.kernel.org/r/20220318021616.2599630-1-liaoyu15@huawei.com Signed-off-by: Mark Brown --- sound/soc/sof/topology.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c index 5953d1050cc9..3e5b319b44c7 100644 --- a/sound/soc/sof/topology.c +++ b/sound/soc/sof/topology.c @@ -904,8 +904,10 @@ static int sof_control_load(struct snd_soc_component *scomp, int index, return -ENOMEM; scontrol->name = kstrdup(hdr->name, GFP_KERNEL); - if (!scontrol->name) + if (!scontrol->name) { + kfree(scontrol); return -ENOMEM; + } scontrol->scomp = scomp; scontrol->access = kc->access; From db6dd1bee63d1d88fbddfe07af800af5948ac28e Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 7 Apr 2022 10:43:13 +0100 Subject: [PATCH 021/196] ASoC: codecs: wcd934x: do not switch off SIDO Buck when codec is in use SIDO(Single-Inductor Dual-Ouput) Buck powers up both analog and digital circuits along with internal memory, powering off this is the last thing that codec should do when going to very low power. Current code was powering off this Buck if there are no users of sysclk, which is not correct. Powering off this buck will result in no register access. This code path was never tested until recently after adding pm support in SoundWire controller. Fix this by removing the buck poweroff when the codec is active and also the code that is not used. Without this patch all the read/write transactions will never complete and results in SLIMBus Errors like: qcom,slim-ngd qcom,slim-ngd.1: Tx:MT:0x0, MC:0x60, LA:0xcf failed:-110 wcd934x-codec wcd934x-codec.1.auto: ASoC: error at soc_component_read_no_lock on wcd934x-codec.1.auto for register: [0x00000d05] -110 qcom,slim-ngd-ctrl 171c0000.slim: Error Interrupt received 0x82000000 Reported-by: Amit Pundir Fixes: a61f3b4f476e ("ASoC: wcd934x: add support to wcd9340/wcd9341 codec") Signed-off-by: Srinivas Kandagatla Tested-by: Amit Pundir Link: https://lore.kernel.org/r/20220407094313.2880-1-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wcd934x.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 1e75e93cf28f..6298ebe96e94 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -1274,29 +1274,7 @@ static int wcd934x_set_sido_input_src(struct wcd934x_codec *wcd, int sido_src) if (sido_src == wcd->sido_input_src) return 0; - if (sido_src == SIDO_SOURCE_INTERNAL) { - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_PRE_ENX_MASK, 0x0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, - WCD934X_ANA_RCO_BG_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN1_MASK, - WCD934X_ANA_BUCK_PRE_EN1_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN2_MASK, - WCD934X_ANA_BUCK_PRE_EN2_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, - WCD934X_ANA_BUCK_HI_ACCU_ENABLE); - usleep_range(100, 110); - } else if (sido_src == SIDO_SOURCE_RCO_BG) { + if (sido_src == SIDO_SOURCE_RCO_BG) { regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, WCD934X_ANA_RCO_BG_EN_MASK, WCD934X_ANA_RCO_BG_ENABLE); @@ -1382,8 +1360,6 @@ static int wcd934x_disable_ana_bias_and_syclk(struct wcd934x_codec *wcd) regmap_update_bits(wcd->regmap, WCD934X_CLK_SYS_MCLK_PRG, WCD934X_EXT_CLK_BUF_EN_MASK | WCD934X_MCLK_EN_MASK, 0x0); - wcd934x_set_sido_input_src(wcd, SIDO_SOURCE_INTERNAL); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, WCD934X_ANA_BIAS_EN_MASK, 0); regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, From 97326be14df7bacc6ba5c62c0556298c27ea0432 Mon Sep 17 00:00:00 2001 From: Chao Song Date: Wed, 6 Apr 2022 14:23:41 -0500 Subject: [PATCH 022/196] ASoC: Intel: soc-acpi: correct device endpoints for max98373 The left speaker of max98373 uses spk_r_endpoint, and right speaker uses spk_l_endpoint, this is obviously wrong. This patch corrects the endpoints for max98373 codec. Signed-off-by: Chao Song Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20220406192341.271465-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-tgl-match.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-tgl-match.c b/sound/soc/intel/common/soc-acpi-intel-tgl-match.c index 6edc9b7108cd..ef19150e7b2e 100644 --- a/sound/soc/intel/common/soc-acpi-intel-tgl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-tgl-match.c @@ -132,13 +132,13 @@ static const struct snd_soc_acpi_adr_device mx8373_1_adr[] = { { .adr = 0x000123019F837300ull, .num_endpoints = 1, - .endpoints = &spk_l_endpoint, + .endpoints = &spk_r_endpoint, .name_prefix = "Right" }, { .adr = 0x000127019F837300ull, .num_endpoints = 1, - .endpoints = &spk_r_endpoint, + .endpoints = &spk_l_endpoint, .name_prefix = "Left" } }; From 92ccbf17eeacf510cf1eed9c252d9332ca24f02d Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Tue, 5 Apr 2022 20:10:38 +0800 Subject: [PATCH 023/196] ASoC: wm8731: Disable the regulator when probing fails When the driver fails during probing, the driver should disable the regulator, not just handle it in wm8731_hw_init(). The following log reveals it: [ 17.812483] WARNING: CPU: 1 PID: 364 at drivers/regulator/core.c:2257 _regulator_put+0x3ec/0x4e0 [ 17.815958] RIP: 0010:_regulator_put+0x3ec/0x4e0 [ 17.824467] Call Trace: [ 17.824774] [ 17.825040] regulator_bulk_free+0x82/0xe0 [ 17.825514] devres_release_group+0x319/0x3d0 [ 17.825882] i2c_device_probe+0x766/0x940 [ 17.829198] i2c_register_driver+0xb5/0x130 Signed-off-by: Zheyu Ma Link: https://lore.kernel.org/r/20220405121038.4094051-1-zheyuma97@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm8731.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/sound/soc/codecs/wm8731.c b/sound/soc/codecs/wm8731.c index 5d4949c2ec9b..b14c6d104e6d 100644 --- a/sound/soc/codecs/wm8731.c +++ b/sound/soc/codecs/wm8731.c @@ -602,7 +602,7 @@ static int wm8731_hw_init(struct device *dev, struct wm8731_priv *wm8731) ret = wm8731_reset(wm8731->regmap); if (ret < 0) { dev_err(dev, "Failed to issue reset: %d\n", ret); - goto err_regulator_enable; + goto err; } /* Clear POWEROFF, keep everything else disabled */ @@ -619,10 +619,7 @@ static int wm8731_hw_init(struct device *dev, struct wm8731_priv *wm8731) regcache_mark_dirty(wm8731->regmap); -err_regulator_enable: - /* Regulators will be enabled by bias management */ - regulator_bulk_disable(ARRAY_SIZE(wm8731->supplies), wm8731->supplies); - +err: return ret; } @@ -760,21 +757,27 @@ static int wm8731_i2c_probe(struct i2c_client *i2c, ret = PTR_ERR(wm8731->regmap); dev_err(&i2c->dev, "Failed to allocate register map: %d\n", ret); - return ret; + goto err_regulator_enable; } ret = wm8731_hw_init(&i2c->dev, wm8731); if (ret != 0) - return ret; + goto err_regulator_enable; ret = devm_snd_soc_register_component(&i2c->dev, &soc_component_dev_wm8731, &wm8731_dai, 1); if (ret != 0) { dev_err(&i2c->dev, "Failed to register CODEC: %d\n", ret); - return ret; + goto err_regulator_enable; } return 0; + +err_regulator_enable: + /* Regulators will be enabled by bias management */ + regulator_bulk_disable(ARRAY_SIZE(wm8731->supplies), wm8731->supplies); + + return ret; } static const struct i2c_device_id wm8731_i2c_id[] = { From 890a4087a6c2045911b5002566d1528f710cd723 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 7 Apr 2022 20:49:56 +0200 Subject: [PATCH 024/196] ASoC: Intel: sof_es8336: simplify speaker gpio naming In preparation for the support of an additional gpio for headphone control, rename GPIOs to make explicit references to speakers and gpio0 or gpio1. No functionality change. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Mauro Carvalho Chehab Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/3008c576ca45d5cc99ad4a18d1d30de45a0aff80.1649357263.git.mchehab@kernel.org Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_es8336.c | 42 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index 5e0529aa4f1d..e4829a376b79 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -27,7 +27,7 @@ #define SOF_ES8336_SSP_CODEC(quirk) ((quirk) & GENMASK(3, 0)) #define SOF_ES8336_SSP_CODEC_MASK (GENMASK(3, 0)) -#define SOF_ES8336_TGL_GPIO_QUIRK BIT(4) +#define SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK BIT(4) #define SOF_ES8336_ENABLE_DMIC BIT(5) #define SOF_ES8336_JD_INVERTED BIT(6) @@ -39,7 +39,7 @@ MODULE_PARM_DESC(quirk, "Board-specific quirk override"); struct sof_es8336_private { struct device *codec_dev; - struct gpio_desc *gpio_pa; + struct gpio_desc *gpio_speakers; struct snd_soc_jack jack; struct list_head hdmi_pcm_list; bool speaker_en; @@ -51,19 +51,19 @@ struct sof_hdmi_pcm { int device; }; -static const struct acpi_gpio_params pa_enable_gpio = { 0, 0, true }; -static const struct acpi_gpio_mapping acpi_es8336_gpios[] = { - { "pa-enable-gpios", &pa_enable_gpio, 1 }, +static const struct acpi_gpio_params speakers_enable_gpio0 = { 0, 0, true }; +static const struct acpi_gpio_mapping acpi_speakers_enable_gpio0[] = { + { "speakers-enable-gpios", &speakers_enable_gpio0, 1 }, { } }; -static const struct acpi_gpio_params quirk_pa_enable_gpio = { 1, 0, true }; -static const struct acpi_gpio_mapping quirk_acpi_es8336_gpios[] = { - { "pa-enable-gpios", &quirk_pa_enable_gpio, 1 }, +static const struct acpi_gpio_params speakers_enable_gpio1 = { 1, 0, true }; +static const struct acpi_gpio_mapping acpi_speakers_enable_gpio1[] = { + { "speakers-enable-gpios", &speakers_enable_gpio1, 1 }, { } }; -static const struct acpi_gpio_mapping *gpio_mapping = acpi_es8336_gpios; +static const struct acpi_gpio_mapping *gpio_mapping = acpi_speakers_enable_gpio0; static void log_quirks(struct device *dev) { @@ -71,8 +71,8 @@ static void log_quirks(struct device *dev) dev_info(dev, "quirk SSP%ld\n", SOF_ES8336_SSP_CODEC(quirk)); if (quirk & SOF_ES8336_ENABLE_DMIC) dev_info(dev, "quirk DMIC enabled\n"); - if (quirk & SOF_ES8336_TGL_GPIO_QUIRK) - dev_info(dev, "quirk TGL GPIO enabled\n"); + if (quirk & SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) + dev_info(dev, "Speakers GPIO1 quirk enabled\n"); if (quirk & SOF_ES8336_JD_INVERTED) dev_info(dev, "quirk JD inverted enabled\n"); } @@ -88,7 +88,7 @@ static int sof_es8316_speaker_power_event(struct snd_soc_dapm_widget *w, else priv->speaker_en = true; - gpiod_set_value_cansleep(priv->gpio_pa, priv->speaker_en); + gpiod_set_value_cansleep(priv->gpio_speakers, priv->speaker_en); return 0; } @@ -233,8 +233,8 @@ static int sof_es8336_quirk_cb(const struct dmi_system_id *id) { quirk = (unsigned long)id->driver_data; - if (quirk & SOF_ES8336_TGL_GPIO_QUIRK) - gpio_mapping = quirk_acpi_es8336_gpios; + if (quirk & SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) + gpio_mapping = acpi_speakers_enable_gpio1; return 1; } @@ -257,7 +257,7 @@ static const struct dmi_system_id sof_es8336_quirk_table[] = { DMI_MATCH(DMI_SYS_VENDOR, "IP3 tech"), DMI_MATCH(DMI_BOARD_NAME, "WN1"), }, - .driver_data = (void *)(SOF_ES8336_TGL_GPIO_QUIRK) + .driver_data = (void *)(SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) }, {} }; @@ -585,10 +585,10 @@ static int sof_es8336_probe(struct platform_device *pdev) if (ret) dev_warn(codec_dev, "unable to add GPIO mapping table\n"); - priv->gpio_pa = gpiod_get_optional(codec_dev, "pa-enable", GPIOD_OUT_LOW); - if (IS_ERR(priv->gpio_pa)) { - ret = dev_err_probe(dev, PTR_ERR(priv->gpio_pa), - "could not get pa-enable GPIO\n"); + priv->gpio_speakers = gpiod_get_optional(codec_dev, "speakers-enable", GPIOD_OUT_LOW); + if (IS_ERR(priv->gpio_speakers)) { + ret = dev_err_probe(dev, PTR_ERR(priv->gpio_speakers), + "could not get speakers-enable GPIO\n"); goto err_put_codec; } @@ -604,7 +604,7 @@ static int sof_es8336_probe(struct platform_device *pdev) ret = devm_snd_soc_register_card(dev, card); if (ret) { - gpiod_put(priv->gpio_pa); + gpiod_put(priv->gpio_speakers); dev_err(dev, "snd_soc_register_card failed: %d\n", ret); goto err_put_codec; } @@ -622,7 +622,7 @@ static int sof_es8336_remove(struct platform_device *pdev) struct snd_soc_card *card = platform_get_drvdata(pdev); struct sof_es8336_private *priv = snd_soc_card_get_drvdata(card); - gpiod_put(priv->gpio_pa); + gpiod_put(priv->gpio_speakers); device_remove_software_node(priv->codec_dev); put_device(priv->codec_dev); From 6e1ff1459e0086312e61c2d1ff8b74395a082fcb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 7 Apr 2022 20:49:57 +0200 Subject: [PATCH 025/196] ASoC: Intel: sof_es8336: support a separate gpio to control headphone Some devices may use both gpio0 and gpio1 to independently switch the speaker and the headphone. Add support for that. Acked-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/535454c0c598a8454487fe29b164527370e2db81.1649357263.git.mchehab@kernel.org Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_es8336.c | 59 ++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index e4829a376b79..d15a58666cc6 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -30,6 +30,7 @@ #define SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK BIT(4) #define SOF_ES8336_ENABLE_DMIC BIT(5) #define SOF_ES8336_JD_INVERTED BIT(6) +#define SOF_ES8336_HEADPHONE_GPIO BIT(7) static unsigned long quirk; @@ -39,7 +40,7 @@ MODULE_PARM_DESC(quirk, "Board-specific quirk override"); struct sof_es8336_private { struct device *codec_dev; - struct gpio_desc *gpio_speakers; + struct gpio_desc *gpio_speakers, *gpio_headphone; struct snd_soc_jack jack; struct list_head hdmi_pcm_list; bool speaker_en; @@ -51,15 +52,27 @@ struct sof_hdmi_pcm { int device; }; -static const struct acpi_gpio_params speakers_enable_gpio0 = { 0, 0, true }; +static const struct acpi_gpio_params enable_gpio0 = { 0, 0, true }; +static const struct acpi_gpio_params enable_gpio1 = { 1, 0, true }; + static const struct acpi_gpio_mapping acpi_speakers_enable_gpio0[] = { - { "speakers-enable-gpios", &speakers_enable_gpio0, 1 }, + { "speakers-enable-gpios", &enable_gpio0, 1 }, { } }; -static const struct acpi_gpio_params speakers_enable_gpio1 = { 1, 0, true }; static const struct acpi_gpio_mapping acpi_speakers_enable_gpio1[] = { - { "speakers-enable-gpios", &speakers_enable_gpio1, 1 }, + { "speakers-enable-gpios", &enable_gpio1, 1 }, +}; + +static const struct acpi_gpio_mapping acpi_enable_both_gpios[] = { + { "speakers-enable-gpios", &enable_gpio0, 1 }, + { "headphone-enable-gpios", &enable_gpio1, 1 }, + { } +}; + +static const struct acpi_gpio_mapping acpi_enable_both_gpios_rev_order[] = { + { "speakers-enable-gpios", &enable_gpio1, 1 }, + { "headphone-enable-gpios", &enable_gpio0, 1 }, { } }; @@ -73,6 +86,8 @@ static void log_quirks(struct device *dev) dev_info(dev, "quirk DMIC enabled\n"); if (quirk & SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) dev_info(dev, "Speakers GPIO1 quirk enabled\n"); + if (quirk & SOF_ES8336_HEADPHONE_GPIO) + dev_info(dev, "quirk headphone GPIO enabled\n"); if (quirk & SOF_ES8336_JD_INVERTED) dev_info(dev, "quirk JD inverted enabled\n"); } @@ -83,13 +98,24 @@ static int sof_es8316_speaker_power_event(struct snd_soc_dapm_widget *w, struct snd_soc_card *card = w->dapm->card; struct sof_es8336_private *priv = snd_soc_card_get_drvdata(card); + if (priv->speaker_en == !SND_SOC_DAPM_EVENT_ON(event)) + return 0; + + priv->speaker_en = !SND_SOC_DAPM_EVENT_ON(event); + if (SND_SOC_DAPM_EVENT_ON(event)) - priv->speaker_en = false; - else - priv->speaker_en = true; + msleep(70); gpiod_set_value_cansleep(priv->gpio_speakers, priv->speaker_en); + if (!(quirk & SOF_ES8336_HEADPHONE_GPIO)) + return 0; + + if (SND_SOC_DAPM_EVENT_ON(event)) + msleep(70); + + gpiod_set_value_cansleep(priv->gpio_headphone, priv->speaker_en); + return 0; } @@ -114,7 +140,7 @@ static const struct snd_soc_dapm_route sof_es8316_audio_map[] = { /* * There is no separate speaker output instead the speakers are muxed to - * the HP outputs. The mux is controlled by the "Speaker Power" supply. + * the HP outputs. The mux is controlled Speaker and/or headphone switch. */ {"Speaker", NULL, "HPOL"}, {"Speaker", NULL, "HPOR"}, @@ -233,8 +259,14 @@ static int sof_es8336_quirk_cb(const struct dmi_system_id *id) { quirk = (unsigned long)id->driver_data; - if (quirk & SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) + if (quirk & SOF_ES8336_HEADPHONE_GPIO) { + if (quirk & SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) + gpio_mapping = acpi_enable_both_gpios; + else + gpio_mapping = acpi_enable_both_gpios_rev_order; + } else if (quirk & SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) { gpio_mapping = acpi_speakers_enable_gpio1; + } return 1; } @@ -592,6 +624,13 @@ static int sof_es8336_probe(struct platform_device *pdev) goto err_put_codec; } + priv->gpio_headphone = gpiod_get_optional(codec_dev, "headphone-enable", GPIOD_OUT_LOW); + if (IS_ERR(priv->gpio_headphone)) { + ret = dev_err_probe(dev, PTR_ERR(priv->gpio_headphone), + "could not get headphone-enable GPIO\n"); + goto err_put_codec; + } + INIT_LIST_HEAD(&priv->hdmi_pcm_list); snd_soc_card_set_drvdata(card, priv); From 7c7bb2a059b226ebadb14ce07460f6357023d56c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 7 Apr 2022 20:49:58 +0200 Subject: [PATCH 026/196] ASoC: Intel: sof_es8336: add a quirk for headset at mic1 port The headset/internal mic can either be routed as mic1/mic2 or vice-versa. By default, the driver assumes that the headset is mapped as mic2, but not all devices map this way. So, add a quirk to support changing it to mic1, using mic2 for the internal analog mic (if any). Signed-off-by: Mauro Carvalho Chehab Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/5d88fc29b79be7ab77dae391c8e5ee929fd36c27.1649357263.git.mchehab@kernel.org Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_es8336.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index d15a58666cc6..c71842be9d59 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -31,6 +31,7 @@ #define SOF_ES8336_ENABLE_DMIC BIT(5) #define SOF_ES8336_JD_INVERTED BIT(6) #define SOF_ES8336_HEADPHONE_GPIO BIT(7) +#define SOC_ES8336_HEADSET_MIC1 BIT(8) static unsigned long quirk; @@ -90,6 +91,8 @@ static void log_quirks(struct device *dev) dev_info(dev, "quirk headphone GPIO enabled\n"); if (quirk & SOF_ES8336_JD_INVERTED) dev_info(dev, "quirk JD inverted enabled\n"); + if (quirk & SOC_ES8336_HEADSET_MIC1) + dev_info(dev, "quirk headset at mic1 port enabled\n"); } static int sof_es8316_speaker_power_event(struct snd_soc_dapm_widget *w, @@ -147,11 +150,16 @@ static const struct snd_soc_dapm_route sof_es8316_audio_map[] = { {"Speaker", NULL, "Speaker Power"}, }; -static const struct snd_soc_dapm_route sof_es8316_intmic_in1_map[] = { +static const struct snd_soc_dapm_route sof_es8316_headset_mic2_map[] = { {"MIC1", NULL, "Internal Mic"}, {"MIC2", NULL, "Headset Mic"}, }; +static const struct snd_soc_dapm_route sof_es8316_headset_mic1_map[] = { + {"MIC2", NULL, "Internal Mic"}, + {"MIC1", NULL, "Headset Mic"}, +}; + static const struct snd_soc_dapm_route dmic_map[] = { /* digital mics */ {"DMic", NULL, "SoC DMIC"}, @@ -225,8 +233,13 @@ static int sof_es8316_init(struct snd_soc_pcm_runtime *runtime) card->dapm.idle_bias_off = true; - custom_map = sof_es8316_intmic_in1_map; - num_routes = ARRAY_SIZE(sof_es8316_intmic_in1_map); + if (quirk & SOC_ES8336_HEADSET_MIC1) { + custom_map = sof_es8316_headset_mic1_map; + num_routes = ARRAY_SIZE(sof_es8316_headset_mic1_map); + } else { + custom_map = sof_es8316_headset_mic2_map; + num_routes = ARRAY_SIZE(sof_es8316_headset_mic2_map); + } ret = snd_soc_dapm_add_routes(&card->dapm, custom_map, num_routes); if (ret) From c7cb4717f641db68e8117635bfcf62a9c27dc8d3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 7 Apr 2022 20:49:59 +0200 Subject: [PATCH 027/196] ASoC: Intel: sof_es8336: Add a quirk for Huawei Matebook D15 Based on experimental tests, Huawei Matebook D15 actually uses both gpio0 and gpio1: the first one controls the speaker, while the other one controls the headphone. Also, the headset is mapped as MIC1, instead of MIC2. So, add a quirk for it. Signed-off-by: Mauro Carvalho Chehab Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/d678aef9fc9a07aced611aa7cb8c9b800c649e5a.1649357263.git.mchehab@kernel.org Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_es8336.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index c71842be9d59..9d617831dd20 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -304,6 +304,15 @@ static const struct dmi_system_id sof_es8336_quirk_table[] = { }, .driver_data = (void *)(SOF_ES8336_SPEAKERS_EN_GPIO1_QUIRK) }, + { + .callback = sof_es8336_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HUAWEI"), + DMI_MATCH(DMI_BOARD_NAME, "BOHB-WAX9-PCB-B2"), + }, + .driver_data = (void *)(SOF_ES8336_HEADPHONE_GPIO | + SOC_ES8336_HEADSET_MIC1) + }, {} }; From ddfd534528146660de75ee84d6db10f10e778f95 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 Apr 2022 08:58:27 +0200 Subject: [PATCH 028/196] ASoC: codecs: Fix an error handling path in (rx|tx|va)_macro_probe() After a successful lpass_macro_pds_init() call, lpass_macro_pds_exit() must be called. Add the missing call in the error handling path of the probe function and use it. Fixes: 9e3d83c52844 ("ASoC: codecs: Add power domains support in digital macro codecs") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/5b5a015a9b1dc8011c6a4053fa49da1f2531e47c.1648969065.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-rx-macro.c | 14 ++++++++++---- sound/soc/codecs/lpass-tx-macro.c | 14 ++++++++++---- sound/soc/codecs/lpass-va-macro.c | 8 ++++++-- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/sound/soc/codecs/lpass-rx-macro.c b/sound/soc/codecs/lpass-rx-macro.c index 6884ae505e33..3143f9cd7277 100644 --- a/sound/soc/codecs/lpass-rx-macro.c +++ b/sound/soc/codecs/lpass-rx-macro.c @@ -3566,12 +3566,16 @@ static int rx_macro_probe(struct platform_device *pdev) return PTR_ERR(rx->pds); base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(base)) - return PTR_ERR(base); + if (IS_ERR(base)) { + ret = PTR_ERR(base); + goto err; + } rx->regmap = devm_regmap_init_mmio(dev, base, &rx_regmap_config); - if (IS_ERR(rx->regmap)) - return PTR_ERR(rx->regmap); + if (IS_ERR(rx->regmap)) { + ret = PTR_ERR(rx->regmap); + goto err; + } dev_set_drvdata(dev, rx); @@ -3632,6 +3636,8 @@ err_mclk: err_dcodec: clk_disable_unprepare(rx->macro); err: + lpass_macro_pds_exit(rx->pds); + return ret; } diff --git a/sound/soc/codecs/lpass-tx-macro.c b/sound/soc/codecs/lpass-tx-macro.c index 714a411d5337..55503ba480bb 100644 --- a/sound/soc/codecs/lpass-tx-macro.c +++ b/sound/soc/codecs/lpass-tx-macro.c @@ -1828,8 +1828,10 @@ static int tx_macro_probe(struct platform_device *pdev) return PTR_ERR(tx->pds); base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(base)) - return PTR_ERR(base); + if (IS_ERR(base)) { + ret = PTR_ERR(base); + goto err; + } /* Update defaults for lpass sc7280 */ if (of_device_is_compatible(np, "qcom,sc7280-lpass-tx-macro")) { @@ -1846,8 +1848,10 @@ static int tx_macro_probe(struct platform_device *pdev) } tx->regmap = devm_regmap_init_mmio(dev, base, &tx_regmap_config); - if (IS_ERR(tx->regmap)) - return PTR_ERR(tx->regmap); + if (IS_ERR(tx->regmap)) { + ret = PTR_ERR(tx->regmap); + goto err; + } dev_set_drvdata(dev, tx); @@ -1907,6 +1911,8 @@ err_mclk: err_dcodec: clk_disable_unprepare(tx->macro); err: + lpass_macro_pds_exit(tx->pds); + return ret; } diff --git a/sound/soc/codecs/lpass-va-macro.c b/sound/soc/codecs/lpass-va-macro.c index f3cb596058e0..d18b56e60433 100644 --- a/sound/soc/codecs/lpass-va-macro.c +++ b/sound/soc/codecs/lpass-va-macro.c @@ -1434,8 +1434,10 @@ static int va_macro_probe(struct platform_device *pdev) va->dmic_clk_div = VA_MACRO_CLK_DIV_2; } else { ret = va_macro_validate_dmic_sample_rate(sample_rate, va); - if (!ret) - return -EINVAL; + if (!ret) { + ret = -EINVAL; + goto err; + } } base = devm_platform_ioremap_resource(pdev, 0); @@ -1492,6 +1494,8 @@ err_mclk: err_dcodec: clk_disable_unprepare(va->macro); err: + lpass_macro_pds_exit(va->pds); + return ret; } From 12e45e89556d7a532120f976081e9e7582addd2b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 5 Apr 2022 14:53:39 -0700 Subject: [PATCH 029/196] dmaengine: idxd: fix device cleanup on disable There are certain parts of WQ that needs to be cleaned up even after WQ is disabled during the device disable. Those are the unchangeable parts for a WQ when the device is still enabled. Move the cleanup outside of WQ state check. Remove idxd_wq_disable_cleanup() inside idxd_wq_device_reset_cleanup() since only the unchangeable parts need to be cleared. Fixes: 0f225705cf65 ("dmaengine: idxd: fix wq settings post wq disable") Reported-by: Tony Zhu Tested-by: Tony Zhu Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164919561905.1455025.13542366389944678346.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3061fe857d69..5a0535a0f850 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -373,7 +373,6 @@ static void idxd_wq_device_reset_cleanup(struct idxd_wq *wq) { lockdep_assert_held(&wq->wq_lock); - idxd_wq_disable_cleanup(wq); wq->size = 0; wq->group = NULL; } @@ -701,9 +700,9 @@ static void idxd_device_wqs_clear_state(struct idxd_device *idxd) if (wq->state == IDXD_WQ_ENABLED) { idxd_wq_disable_cleanup(wq); - idxd_wq_device_reset_cleanup(wq); wq->state = IDXD_WQ_DISABLED; } + idxd_wq_device_reset_cleanup(wq); } } From fd0a4b39870d49ff15f6966470185409e261f20f Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Fri, 8 Apr 2022 19:30:49 -0700 Subject: [PATCH 030/196] Input: cypress-sf - register a callback to disable the regulators When the driver fails to probe, we will get the following splat: [ 19.311970] ------------[ cut here ]------------ [ 19.312566] WARNING: CPU: 3 PID: 375 at drivers/regulator/core.c:2257 _regulator_put+0x3ec/0x4e0 [ 19.317591] RIP: 0010:_regulator_put+0x3ec/0x4e0 [ 19.328831] Call Trace: [ 19.329112] [ 19.329369] regulator_bulk_free+0x82/0xe0 [ 19.329860] devres_release_group+0x319/0x3d0 [ 19.330357] i2c_device_probe+0x766/0x940 Fix this by adding a callback that will deal with the disabling when the driver fails to probe. Signed-off-by: Zheyu Ma Link: https://lore.kernel.org/r/20220409022629.3493557-1-zheyuma97@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/cypress-sf.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/input/keyboard/cypress-sf.c b/drivers/input/keyboard/cypress-sf.c index c28996028e80..9a23eed6a4f4 100644 --- a/drivers/input/keyboard/cypress-sf.c +++ b/drivers/input/keyboard/cypress-sf.c @@ -61,6 +61,14 @@ static irqreturn_t cypress_sf_irq_handler(int irq, void *devid) return IRQ_HANDLED; } +static void cypress_sf_disable_regulators(void *arg) +{ + struct cypress_sf_data *touchkey = arg; + + regulator_bulk_disable(ARRAY_SIZE(touchkey->regulators), + touchkey->regulators); +} + static int cypress_sf_probe(struct i2c_client *client) { struct cypress_sf_data *touchkey; @@ -121,6 +129,12 @@ static int cypress_sf_probe(struct i2c_client *client) return error; } + error = devm_add_action_or_reset(&client->dev, + cypress_sf_disable_regulators, + touchkey); + if (error) + return error; + touchkey->input_dev = devm_input_allocate_device(&client->dev); if (!touchkey->input_dev) { dev_err(&client->dev, "Failed to allocate input device\n"); From a3ae97f4c87d9570e7e9a3e3324c443757f6e29a Mon Sep 17 00:00:00 2001 From: Kevin Groeneveld Date: Sun, 10 Apr 2022 18:31:18 -0400 Subject: [PATCH 031/196] dmaengine: imx-sdma: fix init of uart scripts Commit b98ce2f4e32b ("dmaengine: imx-sdma: add uart rom script") broke uart rx on imx5 when using sdma firmware from older Freescale 2.6.35 kernel. In this case reading addr->uartXX_2_mcu_addr was going out of bounds of the firmware memory and corrupting the uart script addresses. Simply adding a bounds check before accessing addr->uartXX_2_mcu_addr does not work as the uartXX_2_mcu_addr members are now beyond the size of the older firmware and the uart addresses would never be populated in that case. There are other ways to fix this but overall the logic seems clearer to me to revert the uartXX_2_mcu_ram_addr structure entries back to uartXX_2_mcu_addr, change the newer entries to uartXX_2_mcu_rom_addr and update the logic accordingly. I have tested this patch on: 1. An i.MX53 system with sdma firmware from Freescale 2.6.35 kernel. Without this patch uart rx is broken in this scenario, with the patch uart rx is restored. 2. An i.MX6D system with no external sdma firmware. uart is okay with or without this patch. 3. An i.MX8MM system using current sdma-imx7d.bin firmware from linux-firmware. uart is okay with or without this patch and I confirmed the rom version of the uart script is being used which was the intention and reason for commit b98ce2f4e32b ("dmaengine: imx-sdma: add uart rom script") in the first place. Fixes: b98ce2f4e32b ("dmaengine: imx-sdma: add uart rom script") Cc: stable@vger.kernel.org Signed-off-by: Kevin Groeneveld Reviewed-by: Lucas Stach Reviewed-by: Fabio Estevam Acked-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220410223118.15086-1-kgroeneveld@lenbrook.com Signed-off-by: Vinod Koul --- drivers/dma/imx-sdma.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 70c0aa931ddf..b708d029b6e9 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -198,12 +198,12 @@ struct sdma_script_start_addrs { s32 per_2_firi_addr; s32 mcu_2_firi_addr; s32 uart_2_per_addr; - s32 uart_2_mcu_ram_addr; + s32 uart_2_mcu_addr; s32 per_2_app_addr; s32 mcu_2_app_addr; s32 per_2_per_addr; s32 uartsh_2_per_addr; - s32 uartsh_2_mcu_ram_addr; + s32 uartsh_2_mcu_addr; s32 per_2_shp_addr; s32 mcu_2_shp_addr; s32 ata_2_mcu_addr; @@ -232,8 +232,8 @@ struct sdma_script_start_addrs { s32 mcu_2_ecspi_addr; s32 mcu_2_sai_addr; s32 sai_2_mcu_addr; - s32 uart_2_mcu_addr; - s32 uartsh_2_mcu_addr; + s32 uart_2_mcu_rom_addr; + s32 uartsh_2_mcu_rom_addr; /* End of v3 array */ s32 mcu_2_zqspi_addr; /* End of v4 array */ @@ -1796,17 +1796,17 @@ static void sdma_add_scripts(struct sdma_engine *sdma, saddr_arr[i] = addr_arr[i]; /* - * get uart_2_mcu_addr/uartsh_2_mcu_addr rom script specially because - * they are now replaced by uart_2_mcu_ram_addr/uartsh_2_mcu_ram_addr - * to be compatible with legacy freescale/nxp sdma firmware, and they - * are located in the bottom part of sdma_script_start_addrs which are - * beyond the SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1. + * For compatibility with NXP internal legacy kernel before 4.19 which + * is based on uart ram script and mainline kernel based on uart rom + * script, both uart ram/rom scripts are present in newer sdma + * firmware. Use the rom versions if they are present (V3 or newer). */ - if (addr->uart_2_mcu_addr) - sdma->script_addrs->uart_2_mcu_addr = addr->uart_2_mcu_addr; - if (addr->uartsh_2_mcu_addr) - sdma->script_addrs->uartsh_2_mcu_addr = addr->uartsh_2_mcu_addr; - + if (sdma->script_number >= SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V3) { + if (addr->uart_2_mcu_rom_addr) + sdma->script_addrs->uart_2_mcu_addr = addr->uart_2_mcu_rom_addr; + if (addr->uartsh_2_mcu_rom_addr) + sdma->script_addrs->uartsh_2_mcu_addr = addr->uartsh_2_mcu_rom_addr; + } } static void sdma_load_firmware(const struct firmware *fw, void *context) From 206680c4e46b62fd8909385e0874a36952595b85 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Sun, 27 Mar 2022 14:11:54 +0800 Subject: [PATCH 032/196] dma: at_xdmac: fix a missing check on list iterator The bug is here: __func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue); The list iterator 'desc' will point to a bogus position containing HEAD if the list is empty or no element is found. To avoid dev_dbg() prints a invalid address, use a new variable 'iter' as the list iterator, while use the origin variable 'desc' as a dedicated pointer to point to the found element. Cc: stable@vger.kernel.org Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 1476156af74b..def564d1e8fa 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1453,7 +1453,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, { struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); struct at_xdmac *atxdmac = to_at_xdmac(atchan->chan.device); - struct at_xdmac_desc *desc, *_desc; + struct at_xdmac_desc *desc, *_desc, *iter; struct list_head *descs_list; enum dma_status ret; int residue, retry; @@ -1568,11 +1568,13 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, * microblock. */ descs_list = &desc->descs_list; - list_for_each_entry_safe(desc, _desc, descs_list, desc_node) { - dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg); - residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth; - if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda) + list_for_each_entry_safe(iter, _desc, descs_list, desc_node) { + dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg); + residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth; + if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) { + desc = iter; break; + } } residue += cur_ubc << dwidth; From 7104b9cb35a33ad803a1adbbfa50569b008faf15 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 8 Mar 2022 06:49:51 +0000 Subject: [PATCH 033/196] dmaengine: imx-sdma: Fix error checking in sdma_event_remap of_parse_phandle() returns NULL on errors, rather than error pointers. Using NULL check on grp_np to fix this. Fixes: d078cd1b4185 ("dmaengine: imx-sdma: Add imx6sx platform support") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220308064952.15743-1-linmq006@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/imx-sdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index b708d029b6e9..6196a7b3956b 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1885,7 +1885,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) u32 reg, val, shift, num_map, i; int ret = 0; - if (IS_ERR(np) || IS_ERR(gpr_np)) + if (IS_ERR(np) || !gpr_np) goto out; event_remap = of_find_property(np, propname, NULL); @@ -1933,7 +1933,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) } out: - if (!IS_ERR(gpr_np)) + if (gpr_np) of_node_put(gpr_np); return ret; From 545b2baac89b859180e51215468c05d85ea8465a Mon Sep 17 00:00:00 2001 From: zhangqilong Date: Sat, 19 Mar 2022 10:21:42 +0800 Subject: [PATCH 034/196] dmaengine: mediatek:Fix PM usage reference leak of mtk_uart_apdma_alloc_chan_resources pm_runtime_get_sync will increment pm usage counter even it failed. Forgetting to putting operation will result in reference leak here. We fix it: 1) Replacing it with pm_runtime_resume_and_get to keep usage counter balanced. 2) Add putting operation before returning error. Fixes:9135408c3ace4 ("dmaengine: mediatek: Add MediaTek UART APDMA support") Signed-off-by: Zhang Qilong Link: https://lore.kernel.org/r/20220319022142.142709-1-zhangqilong3@huawei.com Signed-off-by: Vinod Koul --- drivers/dma/mediatek/mtk-uart-apdma.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/dma/mediatek/mtk-uart-apdma.c b/drivers/dma/mediatek/mtk-uart-apdma.c index 375e7e647df6..a1517ef1f4a0 100644 --- a/drivers/dma/mediatek/mtk-uart-apdma.c +++ b/drivers/dma/mediatek/mtk-uart-apdma.c @@ -274,7 +274,7 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) unsigned int status; int ret; - ret = pm_runtime_get_sync(mtkd->ddev.dev); + ret = pm_runtime_resume_and_get(mtkd->ddev.dev); if (ret < 0) { pm_runtime_put_noidle(chan->device->dev); return ret; @@ -288,18 +288,21 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) ret = readx_poll_timeout(readl, c->base + VFF_EN, status, !status, 10, 100); if (ret) - return ret; + goto err_pm; ret = request_irq(c->irq, mtk_uart_apdma_irq_handler, IRQF_TRIGGER_NONE, KBUILD_MODNAME, chan); if (ret < 0) { dev_err(chan->device->dev, "Can't request dma IRQ\n"); - return -EINVAL; + ret = -EINVAL; + goto err_pm; } if (mtkd->support_33bits) mtk_uart_apdma_write(c, VFF_4G_SUPPORT, VFF_4G_SUPPORT_CLR_B); +err_pm: + pm_runtime_put_noidle(mtkd->ddev.dev); return ret; } From 8fc5133d6d4da65cad6b73152fc714ad3d7f91c1 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 25 Feb 2022 13:02:52 +0100 Subject: [PATCH 035/196] dmaengine: dw-edma: Fix unaligned 64bit access On some arch (ie aarch64 iMX8MM) unaligned PCIe accesses are not allowed and lead to a kernel Oops. [ 1911.668835] Unable to handle kernel paging request at virtual address ffff80001bc00a8c [ 1911.668841] Mem abort info: [ 1911.668844] ESR = 0x96000061 [ 1911.668847] EC = 0x25: DABT (current EL), IL = 32 bits [ 1911.668850] SET = 0, FnV = 0 [ 1911.668852] EA = 0, S1PTW = 0 [ 1911.668853] Data abort info: [ 1911.668855] ISV = 0, ISS = 0x00000061 [ 1911.668857] CM = 0, WnR = 1 [ 1911.668861] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000040ff4000 [ 1911.668864] [ffff80001bc00a8c] pgd=00000000bffff003, pud=00000000bfffe003, pmd=0068000018400705 [ 1911.668872] Internal error: Oops: 96000061 [#1] PREEMPT SMP ... The llp register present in the channel group registers is not aligned on 64bit. Fix unaligned 64bit access using two 32bit accesses Fixes: 04e0a39fc10f ("dmaengine: dw-edma: Add writeq() and readq() for 64 bits architectures") Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20220225120252.309404-1-herve.codina@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-v0-core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c index 329fc2e57b70..b5b8f8181e77 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-core.c +++ b/drivers/dma/dw-edma/dw-edma-v0-core.c @@ -415,8 +415,11 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first) (DW_EDMA_V0_CCS | DW_EDMA_V0_LLE)); /* Linked list */ #ifdef CONFIG_64BIT - SET_CH_64(dw, chan->dir, chan->id, llp.reg, - chunk->ll_region.paddr); + /* llp is not aligned on 64bit -> keep 32bit accesses */ + SET_CH_32(dw, chan->dir, chan->id, llp.lsb, + lower_32_bits(chunk->ll_region.paddr)); + SET_CH_32(dw, chan->dir, chan->id, llp.msb, + upper_32_bits(chunk->ll_region.paddr)); #else /* CONFIG_64BIT */ SET_CH_32(dw, chan->dir, chan->id, llp.lsb, lower_32_bits(chunk->ll_region.paddr)); From 5b933c7262c5b0ea11ea3c3b3ea81add04895954 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 12 Apr 2022 17:39:27 +0100 Subject: [PATCH 036/196] firmware: cs_dsp: Fix overrun of unterminated control name string For wmfw format v2 and later the coefficient name strings have a length field and are NOT null-terminated. Use kasprintf() to convert the unterminated string into a null-terminated string in an allocated buffer. The previous code handled this duplication incorrectly using kmemdup() and getting the length from a strlen() of the (unterminated) source string. This resulted in creating a string that continued up to the next byte in the firmware file that just happened to be 0x00. Signed-off-by: Richard Fitzgerald Fixes: f6bc909e7673 ("firmware: cs_dsp: add driver to support firmware loading on Cirrus Logic DSPs") Link: https://lore.kernel.org/r/20220412163927.1303470-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/firmware/cirrus/cs_dsp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c index e48108e694f8..7dad6f57d970 100644 --- a/drivers/firmware/cirrus/cs_dsp.c +++ b/drivers/firmware/cirrus/cs_dsp.c @@ -955,8 +955,7 @@ static int cs_dsp_create_control(struct cs_dsp *dsp, ctl->alg_region = *alg_region; if (subname && dsp->fw_ver >= 2) { ctl->subname_len = subname_len; - ctl->subname = kmemdup(subname, - strlen(subname) + 1, GFP_KERNEL); + ctl->subname = kasprintf(GFP_KERNEL, "%.*s", subname_len, subname); if (!ctl->subname) { ret = -ENOMEM; goto err_ctl; From ad5cd4f4ee4d5fcdb1bfb7a0c073072961e70783 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Mar 2022 10:50:43 -0800 Subject: [PATCH 037/196] ext4: fix fallocate to use file_modified to update permissions consistently Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220308185043.GA117678@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 32 +++++++++++++++++++++++++------- fs/ext4/inode.c | 7 ++++++- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3f87cca49f0c..1d79012c5a5b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3032,7 +3032,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); -extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0d98cf402282..e473fde6b64b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4500,9 +4500,9 @@ retry: return ret > 0 ? ret2 : ret; } -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) @@ -4574,6 +4574,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4690,7 +4694,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(inode, offset, len); + ret = ext4_punch_hole(file, offset, len); goto exit; } @@ -4699,12 +4703,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto exit; if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(inode, offset, len); + ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(inode, offset, len); + ret = ext4_insert_range(file, offset, len); goto exit; } @@ -4740,6 +4744,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out; + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; @@ -5241,8 +5249,9 @@ out: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; @@ -5294,6 +5303,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. @@ -5387,8 +5400,9 @@ out_mutex: * by len bytes. * Returns 0 on success, error otherwise. */ -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; @@ -5445,6 +5459,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 26218088f63b..955dd978dccf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3946,8 +3946,9 @@ int ext4_break_layouts(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; @@ -4009,6 +4010,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. From a2b0b205d125f27cddfb4f7280e39affdaf46686 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 21 Mar 2022 22:44:38 +0800 Subject: [PATCH 038/196] ext4: fix symlink file size not match to file content We got issue as follows: [home]# fsck.ext4 -fn ram0yb e2fsck 1.45.6 (20-Mar-2020) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Symlink /p3/d14/d1a/l3d (inode #3494) is invalid. Clear? no Entry 'l3d' in /p3/d14/d1a (3383) has an incorrect filetype (was 7, should be 0). Fix? no As the symlink file size does not match the file content. If the writeback of the symlink data block failed, ext4_finish_bio() handles the end of IO. However this function fails to mark the buffer with BH_write_io_error and so when unmount does journal checkpoint it cannot detect the writeback error and will cleanup the journal. Thus we've lost the correct data in the journal area. To solve this issue, mark the buffer as BH_write_io_error in ext4_finish_bio(). Cc: stable@kernel.org Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220321144438.201685-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1d370364230e..40b7d8485b44 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -134,8 +134,10 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_status) + if (bio->bi_status) { + set_buffer_write_io_error(bh); buffer_io_error(bh); + } } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { From b98535d091795a79336f520b0708457aacf55c67 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 22 Mar 2022 09:24:19 +0800 Subject: [PATCH 039/196] ext4: fix bug_on in start_this_handle during umount filesystem We got issue as follows: ------------[ cut here ]------------ kernel BUG at fs/jbd2/transaction.c:389! invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 9 PID: 131 Comm: kworker/9:1 Not tainted 5.17.0-862.14.0.6.x86_64-00001-g23f87daf7d74-dirty #197 Workqueue: events flush_stashed_error_work RIP: 0010:start_this_handle+0x41c/0x1160 RSP: 0018:ffff888106b47c20 EFLAGS: 00010202 RAX: ffffed10251b8400 RBX: ffff888128dc204c RCX: ffffffffb52972ac RDX: 0000000000000200 RSI: 0000000000000004 RDI: ffff888128dc2050 RBP: 0000000000000039 R08: 0000000000000001 R09: ffffed10251b840a R10: ffff888128dc204f R11: ffffed10251b8409 R12: ffff888116d78000 R13: 0000000000000000 R14: dffffc0000000000 R15: ffff888128dc2000 FS: 0000000000000000(0000) GS:ffff88839d680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001620068 CR3: 0000000376c0e000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: jbd2__journal_start+0x38a/0x790 jbd2_journal_start+0x19/0x20 flush_stashed_error_work+0x110/0x2b3 process_one_work+0x688/0x1080 worker_thread+0x8b/0xc50 kthread+0x26f/0x310 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace 0000000000000000 ]--- Above issue may happen as follows: umount read procfs error_work ext4_put_super flush_work(&sbi->s_error_work); ext4_mb_seq_groups_show ext4_mb_load_buddy_gfp ext4_mb_init_group ext4_mb_init_cache ext4_read_block_bitmap_nowait ext4_validate_block_bitmap ext4_error ext4_handle_error schedule_work(&EXT4_SB(sb)->s_error_work); ext4_unregister_sysfs(sb); jbd2_journal_destroy(sbi->s_journal); journal_kill_thread journal->j_flags |= JBD2_UNMOUNT; flush_stashed_error_work jbd2_journal_start start_this_handle BUG_ON(journal->j_flags & JBD2_UNMOUNT); To solve this issue, we call 'ext4_unregister_sysfs() before flushing s_error_work in ext4_put_super(). Signed-off-by: Ye Bin Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20220322012419.725457-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4120a743be52..f2a5e78f93a9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1199,6 +1199,18 @@ static void ext4_put_super(struct super_block *sb) int aborted = 0; int i, err; + /* + * Unregister sysfs before destroying jbd2 journal. + * Since we could still access attr_journal_task attribute via sysfs + * path which could have sbi->s_journal->j_task as NULL + * Unregister sysfs before flush sbi->s_error_work. + * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If + * read metadata verify failed then will queue error work. + * flush_stashed_error_work will call start_this_handle may trigger + * BUG_ON. + */ + ext4_unregister_sysfs(sb); + ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -1206,13 +1218,6 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); - /* - * Unregister sysfs before destroying jbd2 journal. - * Since we could still access attr_journal_task attribute via sysfs - * path which could have sbi->s_journal->j_task as NULL - */ - ext4_unregister_sysfs(sb); - if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); From c186f0887fe7061a35cebef024550ec33ef8fbd8 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 24 Mar 2022 14:48:16 +0800 Subject: [PATCH 040/196] ext4: fix use-after-free in ext4_search_dir We got issue as follows: EXT4-fs (loop0): mounted filesystem without journal. Opts: ,errors=continue ================================================================== BUG: KASAN: use-after-free in ext4_search_dir fs/ext4/namei.c:1394 [inline] BUG: KASAN: use-after-free in search_dirblock fs/ext4/namei.c:1199 [inline] BUG: KASAN: use-after-free in __ext4_find_entry+0xdca/0x1210 fs/ext4/namei.c:1553 Read of size 1 at addr ffff8881317c3005 by task syz-executor117/2331 CPU: 1 PID: 2331 Comm: syz-executor117 Not tainted 5.10.0+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:83 [inline] dump_stack+0x144/0x187 lib/dump_stack.c:124 print_address_description+0x7d/0x630 mm/kasan/report.c:387 __kasan_report+0x132/0x190 mm/kasan/report.c:547 kasan_report+0x47/0x60 mm/kasan/report.c:564 ext4_search_dir fs/ext4/namei.c:1394 [inline] search_dirblock fs/ext4/namei.c:1199 [inline] __ext4_find_entry+0xdca/0x1210 fs/ext4/namei.c:1553 ext4_lookup_entry fs/ext4/namei.c:1622 [inline] ext4_lookup+0xb8/0x3a0 fs/ext4/namei.c:1690 __lookup_hash+0xc5/0x190 fs/namei.c:1451 do_rmdir+0x19e/0x310 fs/namei.c:3760 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x445e59 Code: 4d c7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 1b c7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fff2277fac8 EFLAGS: 00000246 ORIG_RAX: 0000000000000054 RAX: ffffffffffffffda RBX: 0000000000400280 RCX: 0000000000445e59 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000200000c0 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000002 R10: 00007fff2277f990 R11: 0000000000000246 R12: 0000000000000000 R13: 431bde82d7b634db R14: 0000000000000000 R15: 0000000000000000 The buggy address belongs to the page: page:0000000048cd3304 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1317c3 flags: 0x200000000000000() raw: 0200000000000000 ffffea0004526588 ffffea0004528088 0000000000000000 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881317c2f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8881317c2f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff8881317c3000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8881317c3080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8881317c3100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== ext4_search_dir: ... de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { ... if ((char *) de + de->name_len <= dlimit && ext4_match(dir, fname, de)) { ... } ... de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -1; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } Assume: de=0xffff8881317c2fff dlimit=0x0xffff8881317c3000 If read 'de->name_len' which address is 0xffff8881317c3005, obviously is out of range, then will trigger use-after-free. To solve this issue, 'dlimit' must reserve 8 bytes, as we will read 'de->name_len' to judge if '(char *) de + de->name_len' out of range. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220324064816.1209985-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 4 ++++ fs/ext4/namei.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1d79012c5a5b..48dc2c3247ad 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2273,6 +2273,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) * Structure of a directory entry */ #define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e37da8d5cd0c..767b4bfe39c3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1466,10 +1466,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; - while ((char *) de < dlimit) { + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit && + if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ From 2da376228a2427501feb9d15815a45dbdbdd753e Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 31 Mar 2022 13:05:15 -0700 Subject: [PATCH 041/196] ext4: limit length to bitmap_maxbytes - blocksize in punch_hole Syzbot found an issue [1] in ext4_fallocate(). The C reproducer [2] calls fallocate(), passing size 0xffeffeff000ul, and offset 0x1000000ul, which, when added together exceed the bitmap_maxbytes for the inode. This triggers a BUG in ext4_ind_remove_space(). According to the comments in this function the 'end' parameter needs to be one block after the last block to be removed. In the case when the BUG is triggered it points to the last block. Modify the ext4_punch_hole() function and add constraint that caps the length to satisfy the one before laster block requirement. LINK: [1] https://syzkaller.appspot.com/bug?id=b80bd9cf348aac724a4f4dff251800106d721331 LINK: [2] https://syzkaller.appspot.com/text?tag=ReproC&x=14ba0238700000 Fixes: a4bb6b64e39a ("ext4: enable "punch hole" functionality") Reported-by: syzbot+7a806094edd5d07ba029@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20220331200515.153214-1-tadeusz.struk@linaro.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 955dd978dccf..d815502cc97c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3952,7 +3952,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; @@ -3995,6 +3996,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* From 7102ffe4c166ca0f5e35137e9f9de83768c2d27d Mon Sep 17 00:00:00 2001 From: "wangjianjian (C)" Date: Fri, 1 Apr 2022 20:07:35 +0800 Subject: [PATCH 042/196] ext4, doc: fix incorrect h_reserved size According to document and code, ext4_xattr_header's size is 32 bytes, so h_reserved size should be 3. Signed-off-by: Wang Jianjian Link: https://lore.kernel.org/r/92fcc3a6-7d77-8c09-4126-377fcb4c46a5@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- Documentation/filesystems/ext4/attributes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 54386a010a8d..871d2da7a0a9 100644 --- a/Documentation/filesystems/ext4/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst @@ -76,7 +76,7 @@ The beginning of an extended attribute block is in - Checksum of the extended attribute block. * - 0x14 - \_\_u32 - - h\_reserved[2] + - h\_reserved[3] - Zero. The checksum is calculated against the FS UUID, the 64-bit block number From 5bd8baab087dff657e05387aee802e70304cc813 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 13 Apr 2022 10:10:50 +0200 Subject: [PATCH 043/196] esp: limit skb_page_frag_refill use to a single page Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP transformation") tried to fix skb_page_frag_refill usage in ESP by capping allocsize to 32k, but that doesn't completely solve the issue, as skb_page_frag_refill may return a single page. If that happens, we will write out of bounds, despite the check introduced in the previous patch. This patch forces COW in cases where we would end up calling skb_page_frag_refill with a size larger than a page (first in esp_output_head with tailen, then in esp_output_tail with skb->data_len). Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/esp.h | 2 -- net/ipv4/esp4.c | 5 ++--- net/ipv6/esp6.c | 5 ++--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef..9c5637d41d95 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 70e6c87fbe3d..d747166bb291 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,7 +446,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -456,8 +455,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 55d604c9b3b3..f2120e92caf1 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -482,7 +482,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -491,8 +490,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { From 022074918042465668db9b0f768e2260b1e39c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 11 Apr 2022 15:49:09 +0200 Subject: [PATCH 044/196] drm/radeon: fix logic inversion in radeon_sync_resv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shared is the opposite of write/exclusive. Signed-off-by: Christian König Fixes: 0597ca7b43e4 ("drm/radeon: use new iterator in radeon_sync_resv") Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1970 Reviewed-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20220412093626.608767-1-christian.koenig@amd.com --- drivers/gpu/drm/radeon/radeon_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_sync.c b/drivers/gpu/drm/radeon/radeon_sync.c index b991ba1bcd51..f63efd8d5e52 100644 --- a/drivers/gpu/drm/radeon/radeon_sync.c +++ b/drivers/gpu/drm/radeon/radeon_sync.c @@ -96,7 +96,7 @@ int radeon_sync_resv(struct radeon_device *rdev, struct dma_fence *f; int r = 0; - dma_resv_for_each_fence(&cursor, resv, shared, f) { + dma_resv_for_each_fence(&cursor, resv, !shared, f) { fence = to_radeon_fence(f); if (fence && fence->rdev == rdev) radeon_sync_fence(sync, fence); From e5c23779f93d45e39a52758ca593bd7e62e9b4be Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 8 Apr 2022 15:33:30 -0500 Subject: [PATCH 045/196] arm_pmu: Validate single/group leader events In the case where there is only a cycle counter available (i.e. PMCR_EL0.N is 0) and an event other than CPU cycles is opened, the open should fail as the event can never possibly be scheduled. However, the event validation when an event is opened is skipped when the group leader is opened. Fix this by always validating the group leader events. Reported-by: Al Grant Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Rob Herring Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20220408203330.4014015-1-robh@kernel.org Cc: Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 9694370651fa..59d3980b8ca2 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -400,6 +400,9 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; + if (event == leader) + return 0; + for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; @@ -489,12 +492,7 @@ __hw_perf_event_init(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } - if (event->group_leader != event) { - if (validate_group(event) != 0) - return -EINVAL; - } - - return 0; + return validate_group(event); } static int armpmu_event_init(struct perf_event *event) From f8e6b7babfeb40987e946bc1427609a9976017fa Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Mon, 11 Apr 2022 15:44:04 +0200 Subject: [PATCH 046/196] dma-buf-map: remove renamed header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7938f4218168 ("dma-buf-map: Rename to iosys-map") already renamed this file, but it got brought back by a merge. Delete it for real this time. Fixes: 30424ebae8df ("Merge tag 'drm-intel-gt-next-2022-02-17' of git://anongit.freedesktop.org/drm/drm-intel into drm-intel-next") Cc: Rodrigo Vivi Cc: Lucas De Marchi Cc: dri-devel@lists.freedesktop.org Signed-off-by: Karol Herbst Reviewed-by: Michel Dänzer Reviewed-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20220411134404.524776-1-kherbst@redhat.com --- include/linux/dma-buf-map.h | 266 ------------------------------------ 1 file changed, 266 deletions(-) delete mode 100644 include/linux/dma-buf-map.h diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h deleted file mode 100644 index 19fa0b5ae5ec..000000000000 --- a/include/linux/dma-buf-map.h +++ /dev/null @@ -1,266 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Pointer to dma-buf-mapped memory, plus helpers. - */ - -#ifndef __DMA_BUF_MAP_H__ -#define __DMA_BUF_MAP_H__ - -#include -#include - -/** - * DOC: overview - * - * Calling dma-buf's vmap operation returns a pointer to the buffer's memory. - * Depending on the location of the buffer, users may have to access it with - * I/O operations or memory load/store operations. For example, copying to - * system memory could be done with memcpy(), copying to I/O memory would be - * done with memcpy_toio(). - * - * .. code-block:: c - * - * void *vaddr = ...; // pointer to system memory - * memcpy(vaddr, src, len); - * - * void *vaddr_iomem = ...; // pointer to I/O memory - * memcpy_toio(vaddr, _iomem, src, len); - * - * When using dma-buf's vmap operation, the returned pointer is encoded as - * :c:type:`struct dma_buf_map `. - * :c:type:`struct dma_buf_map ` stores the buffer's address in - * system or I/O memory and a flag that signals the required method of - * accessing the buffer. Use the returned instance and the helper functions - * to access the buffer's memory in the correct way. - * - * The type :c:type:`struct dma_buf_map ` and its helpers are - * actually independent from the dma-buf infrastructure. When sharing buffers - * among devices, drivers have to know the location of the memory to access - * the buffers in a safe way. :c:type:`struct dma_buf_map ` - * solves this problem for dma-buf and its users. If other drivers or - * sub-systems require similar functionality, the type could be generalized - * and moved to a more prominent header file. - * - * Open-coding access to :c:type:`struct dma_buf_map ` is - * considered bad style. Rather then accessing its fields directly, use one - * of the provided helper functions, or implement your own. For example, - * instances of :c:type:`struct dma_buf_map ` can be initialized - * statically with DMA_BUF_MAP_INIT_VADDR(), or at runtime with - * dma_buf_map_set_vaddr(). These helpers will set an address in system memory. - * - * .. code-block:: c - * - * struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(0xdeadbeaf); - * - * dma_buf_map_set_vaddr(&map, 0xdeadbeaf); - * - * To set an address in I/O memory, use dma_buf_map_set_vaddr_iomem(). - * - * .. code-block:: c - * - * dma_buf_map_set_vaddr_iomem(&map, 0xdeadbeaf); - * - * Instances of struct dma_buf_map do not have to be cleaned up, but - * can be cleared to NULL with dma_buf_map_clear(). Cleared mappings - * always refer to system memory. - * - * .. code-block:: c - * - * dma_buf_map_clear(&map); - * - * Test if a mapping is valid with either dma_buf_map_is_set() or - * dma_buf_map_is_null(). - * - * .. code-block:: c - * - * if (dma_buf_map_is_set(&map) != dma_buf_map_is_null(&map)) - * // always true - * - * Instances of :c:type:`struct dma_buf_map ` can be compared - * for equality with dma_buf_map_is_equal(). Mappings the point to different - * memory spaces, system or I/O, are never equal. That's even true if both - * spaces are located in the same address space, both mappings contain the - * same address value, or both mappings refer to NULL. - * - * .. code-block:: c - * - * struct dma_buf_map sys_map; // refers to system memory - * struct dma_buf_map io_map; // refers to I/O memory - * - * if (dma_buf_map_is_equal(&sys_map, &io_map)) - * // always false - * - * A set up instance of struct dma_buf_map can be used to access or manipulate - * the buffer memory. Depending on the location of the memory, the provided - * helpers will pick the correct operations. Data can be copied into the memory - * with dma_buf_map_memcpy_to(). The address can be manipulated with - * dma_buf_map_incr(). - * - * .. code-block:: c - * - * const void *src = ...; // source buffer - * size_t len = ...; // length of src - * - * dma_buf_map_memcpy_to(&map, src, len); - * dma_buf_map_incr(&map, len); // go to first byte after the memcpy - */ - -/** - * struct dma_buf_map - Pointer to vmap'ed dma-buf memory. - * @vaddr_iomem: The buffer's address if in I/O memory - * @vaddr: The buffer's address if in system memory - * @is_iomem: True if the dma-buf memory is located in I/O - * memory, or false otherwise. - */ -struct dma_buf_map { - union { - void __iomem *vaddr_iomem; - void *vaddr; - }; - bool is_iomem; -}; - -/** - * DMA_BUF_MAP_INIT_VADDR - Initializes struct dma_buf_map to an address in system memory - * @vaddr_: A system-memory address - */ -#define DMA_BUF_MAP_INIT_VADDR(vaddr_) \ - { \ - .vaddr = (vaddr_), \ - .is_iomem = false, \ - } - -/** - * dma_buf_map_set_vaddr - Sets a dma-buf mapping structure to an address in system memory - * @map: The dma-buf mapping structure - * @vaddr: A system-memory address - * - * Sets the address and clears the I/O-memory flag. - */ -static inline void dma_buf_map_set_vaddr(struct dma_buf_map *map, void *vaddr) -{ - map->vaddr = vaddr; - map->is_iomem = false; -} - -/** - * dma_buf_map_set_vaddr_iomem - Sets a dma-buf mapping structure to an address in I/O memory - * @map: The dma-buf mapping structure - * @vaddr_iomem: An I/O-memory address - * - * Sets the address and the I/O-memory flag. - */ -static inline void dma_buf_map_set_vaddr_iomem(struct dma_buf_map *map, - void __iomem *vaddr_iomem) -{ - map->vaddr_iomem = vaddr_iomem; - map->is_iomem = true; -} - -/** - * dma_buf_map_is_equal - Compares two dma-buf mapping structures for equality - * @lhs: The dma-buf mapping structure - * @rhs: A dma-buf mapping structure to compare with - * - * Two dma-buf mapping structures are equal if they both refer to the same type of memory - * and to the same address within that memory. - * - * Returns: - * True is both structures are equal, or false otherwise. - */ -static inline bool dma_buf_map_is_equal(const struct dma_buf_map *lhs, - const struct dma_buf_map *rhs) -{ - if (lhs->is_iomem != rhs->is_iomem) - return false; - else if (lhs->is_iomem) - return lhs->vaddr_iomem == rhs->vaddr_iomem; - else - return lhs->vaddr == rhs->vaddr; -} - -/** - * dma_buf_map_is_null - Tests for a dma-buf mapping to be NULL - * @map: The dma-buf mapping structure - * - * Depending on the state of struct dma_buf_map.is_iomem, tests if the - * mapping is NULL. - * - * Returns: - * True if the mapping is NULL, or false otherwise. - */ -static inline bool dma_buf_map_is_null(const struct dma_buf_map *map) -{ - if (map->is_iomem) - return !map->vaddr_iomem; - return !map->vaddr; -} - -/** - * dma_buf_map_is_set - Tests is the dma-buf mapping has been set - * @map: The dma-buf mapping structure - * - * Depending on the state of struct dma_buf_map.is_iomem, tests if the - * mapping has been set. - * - * Returns: - * True if the mapping is been set, or false otherwise. - */ -static inline bool dma_buf_map_is_set(const struct dma_buf_map *map) -{ - return !dma_buf_map_is_null(map); -} - -/** - * dma_buf_map_clear - Clears a dma-buf mapping structure - * @map: The dma-buf mapping structure - * - * Clears all fields to zero; including struct dma_buf_map.is_iomem. So - * mapping structures that were set to point to I/O memory are reset for - * system memory. Pointers are cleared to NULL. This is the default. - */ -static inline void dma_buf_map_clear(struct dma_buf_map *map) -{ - if (map->is_iomem) { - map->vaddr_iomem = NULL; - map->is_iomem = false; - } else { - map->vaddr = NULL; - } -} - -/** - * dma_buf_map_memcpy_to - Memcpy into dma-buf mapping - * @dst: The dma-buf mapping structure - * @src: The source buffer - * @len: The number of byte in src - * - * Copies data into a dma-buf mapping. The source buffer is in system - * memory. Depending on the buffer's location, the helper picks the correct - * method of accessing the memory. - */ -static inline void dma_buf_map_memcpy_to(struct dma_buf_map *dst, const void *src, size_t len) -{ - if (dst->is_iomem) - memcpy_toio(dst->vaddr_iomem, src, len); - else - memcpy(dst->vaddr, src, len); -} - -/** - * dma_buf_map_incr - Increments the address stored in a dma-buf mapping - * @map: The dma-buf mapping structure - * @incr: The number of bytes to increment - * - * Increments the address stored in a dma-buf mapping. Depending on the - * buffer's location, the correct value will be updated. - */ -static inline void dma_buf_map_incr(struct dma_buf_map *map, size_t incr) -{ - if (map->is_iomem) - map->vaddr_iomem += incr; - else - map->vaddr += incr; -} - -#endif /* __DMA_BUF_MAP_H__ */ From 907862e9aef75bf89e2b265efcc58870be06081e Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 1 Mar 2022 15:32:10 +0200 Subject: [PATCH 047/196] igc: Fix infinite loop in release_swfw_sync An infinite loop may occur if we fail to acquire the HW semaphore, which is needed for resource release. This will typically happen if the hardware is surprise-removed. At this stage there is nothing to do, except log an error and quit. Fixes: c0071c7aa5fe ("igc: Add HW initialization code") Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_i225.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_i225.c b/drivers/net/ethernet/intel/igc/igc_i225.c index 66ea566488d1..59d5c467ea6e 100644 --- a/drivers/net/ethernet/intel/igc/igc_i225.c +++ b/drivers/net/ethernet/intel/igc/igc_i225.c @@ -156,8 +156,15 @@ void igc_release_swfw_sync_i225(struct igc_hw *hw, u16 mask) { u32 swfw_sync; - while (igc_get_hw_semaphore_i225(hw)) - ; /* Empty */ + /* Releasing the resource requires first getting the HW semaphore. + * If we fail to get the semaphore, there is nothing we can do, + * except log an error and quit. We are not allowed to hang here + * indefinitely, as it may cause denial of service or system crash. + */ + if (igc_get_hw_semaphore_i225(hw)) { + hw_dbg("Failed to release SW_FW_SYNC.\n"); + return; + } swfw_sync = rd32(IGC_SW_FW_SYNC); swfw_sync &= ~mask; From c80a29f0fe9b6f5457e0788e27d1110577eba99b Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Wed, 9 Mar 2022 08:19:19 +0200 Subject: [PATCH 048/196] igc: Fix BUG: scheduling while atomic Replace usleep_range() method with udelay() method to allow atomic contexts in low-level MDIO access functions. The following issue can be seen by doing the following: $ modprobe -r bonding $ modprobe -v bonding max_bonds=1 mode=1 miimon=100 use_carrier=0 $ ip link set bond0 up $ ifenslave bond0 eth0 eth1 [ 982.357308] BUG: scheduling while atomic: kworker/u64:0/9/0x00000002 [ 982.364431] INFO: lockdep is turned off. [ 982.368824] Modules linked in: bonding sctp ip6_udp_tunnel udp_tunnel mlx4_ib ib_uverbs ib_core mlx4_en mlx4_core nfp tls sunrpc intel_rapl_msr iTCO_wdt iTCO_vendor_support mxm_wmi dcdbas intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel rapl intel_cstate intel_uncore pcspkr lpc_ich mei_me ipmi_ssif mei ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter xfs libcrc32c sr_mod cdrom sd_mod t10_pi sg mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ahci libahci crc32c_intel libata i2c_algo_bit tg3 megaraid_sas igc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: bonding] [ 982.437941] CPU: 25 PID: 9 Comm: kworker/u64:0 Kdump: loaded Tainted: G W --------- - - 4.18.0-348.el8.x86_64+debug #1 [ 982.451333] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.7.0 12/005/2017 [ 982.459791] Workqueue: bond0 bond_mii_monitor [bonding] [ 982.465622] Call Trace: [ 982.468355] dump_stack+0x8e/0xd0 [ 982.472056] __schedule_bug.cold.60+0x3a/0x60 [ 982.476919] __schedule+0x147b/0x1bc0 [ 982.481007] ? firmware_map_remove+0x16b/0x16b [ 982.485967] ? hrtimer_fixup_init+0x40/0x40 [ 982.490625] schedule+0xd9/0x250 [ 982.494227] schedule_hrtimeout_range_clock+0x10d/0x2c0 [ 982.500058] ? hrtimer_nanosleep_restart+0x130/0x130 [ 982.505598] ? hrtimer_init_sleeper_on_stack+0x90/0x90 [ 982.511332] ? usleep_range+0x88/0x130 [ 982.515514] ? recalibrate_cpu_khz+0x10/0x10 [ 982.520279] ? ktime_get+0xab/0x1c0 [ 982.524175] ? usleep_range+0x88/0x130 [ 982.528355] usleep_range+0xdd/0x130 [ 982.532344] ? console_conditional_schedule+0x30/0x30 [ 982.537987] ? igc_put_hw_semaphore+0x17/0x60 [igc] [ 982.543432] igc_read_phy_reg_gpy+0x111/0x2b0 [igc] [ 982.548887] igc_phy_has_link+0xfa/0x260 [igc] [ 982.553847] ? igc_get_phy_id+0x210/0x210 [igc] [ 982.558894] ? lock_acquire+0x34d/0x890 [ 982.563187] ? lock_downgrade+0x710/0x710 [ 982.567659] ? rcu_read_unlock+0x50/0x50 [ 982.572039] igc_check_for_copper_link+0x106/0x210 [igc] [ 982.577970] ? igc_config_fc_after_link_up+0x840/0x840 [igc] [ 982.584286] ? rcu_read_unlock+0x50/0x50 [ 982.588661] ? lock_release+0x591/0xb80 [ 982.592939] ? lock_release+0x591/0xb80 [ 982.597220] igc_has_link+0x113/0x330 [igc] [ 982.601887] ? lock_downgrade+0x710/0x710 [ 982.606362] igc_ethtool_get_link+0x6d/0x90 [igc] [ 982.611614] bond_check_dev_link+0x131/0x2c0 [bonding] [ 982.617350] ? bond_time_in_interval+0xd0/0xd0 [bonding] [ 982.623277] ? rcu_read_lock_held+0x62/0xc0 [ 982.627944] ? rcu_read_lock_sched_held+0xe0/0xe0 [ 982.633198] bond_mii_monitor+0x314/0x2500 [bonding] [ 982.638738] ? lock_contended+0x880/0x880 [ 982.643214] ? bond_miimon_link_change+0xa0/0xa0 [bonding] [ 982.649336] ? lock_acquire+0x34d/0x890 [ 982.653615] ? lock_downgrade+0x710/0x710 [ 982.658089] ? debug_object_deactivate+0x221/0x340 [ 982.663436] ? rcu_read_unlock+0x50/0x50 [ 982.667811] ? debug_print_object+0x2b0/0x2b0 [ 982.672672] ? __switch_to_asm+0x41/0x70 [ 982.677049] ? __switch_to_asm+0x35/0x70 [ 982.681426] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.686288] ? trace_hardirqs_on+0x20/0x195 [ 982.690956] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.695818] process_one_work+0x8f0/0x1770 [ 982.700390] ? pwq_dec_nr_in_flight+0x320/0x320 [ 982.705443] ? debug_show_held_locks+0x50/0x50 [ 982.710403] worker_thread+0x87/0xb40 [ 982.714489] ? process_one_work+0x1770/0x1770 [ 982.719349] kthread+0x344/0x410 [ 982.722950] ? kthread_insert_work_sanity_check+0xd0/0xd0 [ 982.728975] ret_from_fork+0x3a/0x50 Fixes: 5586838fe9ce ("igc: Add code for PHY support") Reported-by: Corinna Vinschen Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Corinna Vinschen Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_phy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index 40dbf4b43234..6961f65d36b9 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -581,7 +581,7 @@ static s32 igc_read_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 *data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; @@ -638,7 +638,7 @@ static s32 igc_write_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; From 822f52e7efdc88fccffb9fbf6250a4b7666a0b0f Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 12 Apr 2022 18:58:15 -0700 Subject: [PATCH 049/196] igc: Fix suspending when PTM is active Some mainboard/CPU combinations, in particular, Alder Lake-S with a W680 mainboard, have shown problems (system hangs usually, no kernel logs) with suspend/resume when PCIe PTM is enabled and active. In some cases, it could be reproduced when removing the igc module. The best we can do is to stop PTM dialogs from the downstream/device side before the interface is brought down. PCIe PTM will be re-enabled when the interface is being brought up. Fixes: a90ec8483732 ("igc: Add support for PTP getcrosststamp()") Signed-off-by: Vinicius Costa Gomes Tested-by: Naama Meir Acked-by: Paul Menzel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ptp.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 0d6e3215e98f..653e9f1e35b5 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -992,6 +992,17 @@ static void igc_ptp_time_restore(struct igc_adapter *adapter) igc_ptp_write_i225(adapter, &ts); } +static void igc_ptm_stop(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + u32 ctrl; + + ctrl = rd32(IGC_PTM_CTRL); + ctrl &= ~IGC_PTM_CTRL_EN; + + wr32(IGC_PTM_CTRL, ctrl); +} + /** * igc_ptp_suspend - Disable PTP work items and prepare for suspend * @adapter: Board private structure @@ -1009,8 +1020,10 @@ void igc_ptp_suspend(struct igc_adapter *adapter) adapter->ptp_tx_skb = NULL; clear_bit_unlock(__IGC_PTP_TX_IN_PROGRESS, &adapter->state); - if (pci_device_is_present(adapter->pdev)) + if (pci_device_is_present(adapter->pdev)) { igc_ptp_time_save(adapter); + igc_ptm_stop(adapter); + } } /** From 04ebaa1cfddae5f240cc7404f009133bb0389a47 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 5 Apr 2022 18:56:01 +0300 Subject: [PATCH 050/196] e1000e: Fix possible overflow in LTR decoding When we decode the latency and the max_latency, u16 value may not fit the required size and could lead to the wrong LTR representation. Scaling is represented as: scale 0 - 1 (2^(5*0)) = 2^0 scale 1 - 32 (2^(5 *1))= 2^5 scale 2 - 1024 (2^(5 *2)) =2^10 scale 3 - 32768 (2^(5 *3)) =2^15 scale 4 - 1048576 (2^(5 *4)) = 2^20 scale 5 - 33554432 (2^(5 *4)) = 2^25 scale 4 and scale 5 required 20 and 25 bits respectively. scale 6 reserved. Replace the u16 type with the u32 type and allow corrected LTR representation. Cc: stable@vger.kernel.org Fixes: 44a13a5d99c7 ("e1000e: Fix the max snoop/no-snoop latency for 10M") Reported-by: James Hutchinson Link: https://bugzilla.kernel.org/show_bug.cgi?id=215689 Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Tested-by: James Hutchinson Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index d60e2016d03c..e6c8e6d5234f 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1009,8 +1009,8 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) { u32 reg = link << (E1000_LTRV_REQ_SHIFT + E1000_LTRV_NOSNOOP_SHIFT) | link << E1000_LTRV_REQ_SHIFT | E1000_LTRV_SEND; - u16 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ - u16 lat_enc_d = 0; /* latency decoded */ + u32 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ + u32 lat_enc_d = 0; /* latency decoded */ u16 lat_enc = 0; /* latency encoded */ if (link) { From 45bd8951806eb5e857772c593de021b09057950d Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 13 Apr 2022 11:14:21 -0700 Subject: [PATCH 051/196] arm64: Improve HAVE_DYNAMIC_FTRACE_WITH_REGS selection for clang Will and Anders reported that using just 'CC=clang' with CONFIG_FTRACE=y and CONFIG_STACK_TRACER=y would result in an error while linking: aarch64-linux-gnu-ld: .init.data has both ordered [`__patchable_function_entries' in init/main.o] and unordered [`.meminit.data' in mm/sparse.o] sections aarch64-linux-gnu-ld: final link failed: bad value This error was exposed by commit f12b034afeb3 ("scripts/Makefile.clang: default to LLVM_IAS=1") in combination with binutils older than 2.36. When '-fpatchable-function-entry' was implemented in LLVM, two code paths were added for adding the section attributes, one for the integrated assembler and another for GNU as, due to binutils deficiencies at the time. If the integrated assembler was used, attributes that GNU ld < 2.36 could not handle were added, presumably with the assumption that use of the integrated assembler meant the whole LLVM stack was being used, namely ld.lld. Prior to the kernel change previously mentioned, that assumption was valid, as there were three commonly used combinations of tools for compiling, assembling, and linking respectively: $ make CC=clang (clang, GNU as, GNU ld) $ make LLVM=1 (clang, GNU as, ld.lld) $ make LLVM=1 LLVM_IAS=1 (clang, integrated assembler, ld.lld) After the default switch of the integrated assembler, the second and third commands become equivalent and the first command means "clang, integrated assembler, and GNU ld", which was not a combination that was considered when the aforementioned LLVM change was implemented. It is not possible to go back and fix LLVM, as this change was implemented in the 10.x series, which is no longer supported. To workaround this on the kernel side, split out the selection of HAVE_DYNAMIC_FTRACE_WITH_REGS to two separate configurations, one for GCC and one for clang. The GCC config inherits the '-fpatchable-function-entry' check. The Clang config does not it, as '-fpatchable-function-entry' is always available for LLVM 11.0.0 and newer, which is the supported range of versions for the kernel. The Clang config makes sure that the user is using GNU as or the integrated assembler with ld.lld or GNU ld 2.36 or newer, which will avoid the error above. Link: https://github.com/ClangBuiltLinux/linux/issues/1507 Link: https://github.com/ClangBuiltLinux/linux/issues/788 Link: https://lore.kernel.org/YlCA5PoIjF6nhwYj@dev-arch.thelio-3990X/ Link: https://sourceware.org/bugzilla/show_bug.cgi?id=26256 Link: https://github.com/llvm/llvm-project/commit/7fa5290d5bd5632d7a36a4ea9f46e81e04fb819e Link: https://github.com/llvm/llvm-project/commit/853a2649160c1c80b9bbd38a20b53ca8fab704e8 Reported-by: Anders Roxell Reported-by: Will Deacon Tested-by: Will Deacon Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20220413181420.3522187-1-nathan@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 57c4c995965f..1fd16faa7f31 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -175,8 +175,6 @@ config ARM64 select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE - select HAVE_DYNAMIC_FTRACE_WITH_REGS \ - if $(cc-option,-fpatchable-function-entry=2) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -228,6 +226,17 @@ config ARM64 help ARM 64-bit (AArch64) Linux support. +config CLANG_SUPPORTS_DYNAMIC_FTRACE_WITH_REGS + def_bool CC_IS_CLANG + # https://github.com/ClangBuiltLinux/linux/issues/1507 + depends on AS_IS_GNU || (AS_IS_LLVM && (LD_IS_LLD || LD_VERSION >= 23600)) + select HAVE_DYNAMIC_FTRACE_WITH_REGS + +config GCC_SUPPORTS_DYNAMIC_FTRACE_WITH_REGS + def_bool CC_IS_GCC + depends on $(cc-option,-fpatchable-function-entry=2) + select HAVE_DYNAMIC_FTRACE_WITH_REGS + config 64BIT def_bool y From 0ff74a23e08f909ce859039e860f53727dfed0dd Mon Sep 17 00:00:00 2001 From: Ken Kurematsu Date: Thu, 14 Apr 2022 02:37:18 +0000 Subject: [PATCH 052/196] arm64: fix typos in comments Fix a typo "cortex" Signed-off-by: Ken Kurematsu Link: https://lore.kernel.org/r/OSBPR01MB3288B15006E15C64D4D617F7DBEF9@OSBPR01MB3288.jpnprd01.prod.outlook.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1fd16faa7f31..20ea89d9ac2f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -687,7 +687,7 @@ config ARM64_ERRATUM_2051678 default y help This options adds the workaround for ARM Cortex-A510 erratum ARM64_ERRATUM_2051678. - Affected Coretex-A510 might not respect the ordering rules for + Affected Cortex-A510 might not respect the ordering rules for hardware update of the page table's dirty bit. The workaround is to not enable the feature on affected CPUs. From e2932d1f6f055b2af2114c7e64a26dc1b5593d0c Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Thu, 14 Apr 2022 15:58:13 +0530 Subject: [PATCH 053/196] EDAC/synopsys: Read the error count from the correct register Currently, the error count is read wrongly from the status register. Read the count from the proper error count register (ERRCNT). [ bp: Massage. ] Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller") Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov Acked-by: Michal Simek Cc: Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com --- drivers/edac/synopsys_edac.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index f05ff02c0656..40b1abeca856 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -164,6 +164,11 @@ #define ECC_STAT_CECNT_SHIFT 8 #define ECC_STAT_BITNUM_MASK 0x7F +/* ECC error count register definitions */ +#define ECC_ERRCNT_UECNT_MASK 0xFFFF0000 +#define ECC_ERRCNT_UECNT_SHIFT 16 +#define ECC_ERRCNT_CECNT_MASK 0xFFFF + /* DDR QOS Interrupt register definitions */ #define DDR_QOS_IRQ_STAT_OFST 0x20200 #define DDR_QOSUE_MASK 0x4 @@ -423,15 +428,16 @@ static int zynqmp_get_error_info(struct synps_edac_priv *priv) base = priv->baseaddr; p = &priv->stat; + regval = readl(base + ECC_ERRCNT_OFST); + p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK; + p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT; + if (!p->ce_cnt) + goto ue_err; + regval = readl(base + ECC_STAT_OFST); if (!regval) return 1; - p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT; - p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT; - if (!p->ce_cnt) - goto ue_err; - p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK); regval = readl(base + ECC_CEADDR0_OFST); From d1fc4c6feac18f893e55aeefa267a281e132c7b7 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 17 Mar 2022 18:57:27 +0100 Subject: [PATCH 054/196] ice: xsk: check if Rx ring was filled up to the end __ice_alloc_rx_bufs_zc() checks if a number of the descriptors to be allocated would cause the ring wrap. In that case, driver will issue two calls to xsk_buff_alloc_batch() - one that will fill the ring up to the end and the second one that will start with filling descriptors from the beginning of the ring. ice_fill_rx_descs() is a wrapper for taking care of what xsk_buff_alloc_batch() gave back to the driver. It works in a best effort approach, so for example when driver asks for 64 buffers, ice_fill_rx_descs() could assign only 32. Such case needs to be checked when ring is being filled up to the end, because in that situation ntu might not reached the end of the ring. Fix the ring wrap by checking if nb_buffs_extra has the expected value. If not, bump ntu and go directly to tail update. Fixes: 3876ff525de7 ("ice: xsk: Handle SW XDP ring wrap and bump tail more often") Signed-off-by: Magnus Karlsson Signed-off-by: Maciej Fijalkowski Tested-by: Shwetha Nagaraju Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 866ee4df9671..9dd38f667059 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -415,8 +415,8 @@ static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, */ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) { + u32 nb_buffs_extra = 0, nb_buffs = 0; union ice_32b_rx_flex_desc *rx_desc; - u32 nb_buffs_extra = 0, nb_buffs; u16 ntu = rx_ring->next_to_use; u16 total_count = count; struct xdp_buff **xdp; @@ -428,6 +428,10 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) nb_buffs_extra = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, rx_desc, rx_ring->count - ntu); + if (nb_buffs_extra != rx_ring->count - ntu) { + ntu += nb_buffs_extra; + goto exit; + } rx_desc = ICE_RX_DESC(rx_ring, 0); xdp = ice_xdp_buf(rx_ring, 0); ntu = 0; @@ -441,6 +445,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) if (ntu == rx_ring->count) ntu = 0; +exit: if (rx_ring->next_to_use != ntu) ice_release_rx_desc(rx_ring, ntu); From aacca7a83b9753c562395ef328352dfd8c003c59 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 1 Apr 2022 14:54:38 +0200 Subject: [PATCH 055/196] ice: allow creating VFs for !CONFIG_NET_SWITCHDEV Currently for !CONFIG_NET_SWITCHDEV kernel builds it is not possible to create VFs properly as call to ice_eswitch_configure() returns -EOPNOTSUPP for us. This is because CONFIG_ICE_SWITCHDEV depends on CONFIG_NET_SWITCHDEV. Change the ice_eswitch_configure() implementation for !CONFIG_ICE_SWITCHDEV to return 0 instead -EOPNOTSUPP and let ice_ena_vfs() finish its work properly. CC: Grzegorz Nitka Fixes: 1a1c40df2e80 ("ice: set and release switchdev environment") Signed-off-by: Maciej Fijalkowski Signed-off-by: Michal Swiatkowski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h index bd58d9d2e565..6a413331572b 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.h +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -52,7 +52,7 @@ static inline void ice_eswitch_update_repr(struct ice_vsi *vsi) { } static inline int ice_eswitch_configure(struct ice_pf *pf) { - return -EOPNOTSUPP; + return 0; } static inline int ice_eswitch_rebuild(struct ice_pf *pf) From d201665147ae788b7cca9fab58a1826f64152034 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 8 Apr 2022 09:56:10 +0200 Subject: [PATCH 056/196] ice: fix crash in switchdev mode Below steps end up with crash: - modprobe ice - devlink dev eswitch set $PF1_PCI mode switchdev - echo 64 > /sys/class/net/$PF1/device/sriov_numvfs - rmmod ice Calling ice_eswitch_port_start_xmit while the process of removing VFs is in progress ends up with NULL pointer dereference. That's because PR netdev is not released but some resources are already freed. Fix it by checking if ICE_VF_DIS bit is set. Call trace: [ 1379.595146] BUG: kernel NULL pointer dereference, address: 0000000000000040 [ 1379.595284] #PF: supervisor read access in kernel mode [ 1379.595410] #PF: error_code(0x0000) - not-present page [ 1379.595535] PGD 0 P4D 0 [ 1379.595657] Oops: 0000 [#1] PREEMPT SMP PTI [ 1379.595783] CPU: 4 PID: 974 Comm: NetworkManager Kdump: loaded Tainted: G OE 5.17.0-rc8_mrq_dev-queue+ #12 [ 1379.595926] Hardware name: Intel Corporation S1200SP/S1200SP, BIOS S1200SP.86B.03.01.0042.013020190050 01/30/2019 [ 1379.596063] RIP: 0010:ice_eswitch_port_start_xmit+0x46/0xd0 [ice] [ 1379.596292] Code: c7 c8 09 00 00 e8 9a c9 fc ff 84 c0 0f 85 82 00 00 00 4c 89 e7 e8 ca 70 fe ff 48 8b 7d 58 48 89 c3 48 85 ff 75 5e 48 8b 53 20 <8b> 42 40 85 c0 74 78 8d 48 01 f0 0f b1 4a 40 75 f2 0f b6 95 84 00 [ 1379.596456] RSP: 0018:ffffaba0c0d7bad0 EFLAGS: 00010246 [ 1379.596584] RAX: ffff969c14c71680 RBX: ffff969c14c71680 RCX: 000100107a0f0000 [ 1379.596715] RDX: 0000000000000000 RSI: ffff969b9d631000 RDI: 0000000000000000 [ 1379.596846] RBP: ffff969c07b46500 R08: ffff969becfca8ac R09: 0000000000000001 [ 1379.596977] R10: 0000000000000004 R11: ffffaba0c0d7bbec R12: ffff969b9d631000 [ 1379.597106] R13: ffffffffc08357a0 R14: ffff969c07b46500 R15: ffff969b9d631000 [ 1379.597237] FS: 00007f72c0e25c80(0000) GS:ffff969f13500000(0000) knlGS:0000000000000000 [ 1379.597414] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1379.597562] CR2: 0000000000000040 CR3: 000000012b316006 CR4: 00000000003706e0 [ 1379.597713] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1379.597863] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1379.598015] Call Trace: [ 1379.598153] [ 1379.598294] dev_hard_start_xmit+0xd9/0x220 [ 1379.598444] sch_direct_xmit+0x8a/0x340 [ 1379.598592] __dev_queue_xmit+0xa3c/0xd30 [ 1379.598739] ? packet_parse_headers+0xb4/0xf0 [ 1379.598890] packet_sendmsg+0xa15/0x1620 [ 1379.599038] ? __check_object_size+0x46/0x140 [ 1379.599186] sock_sendmsg+0x5e/0x60 [ 1379.599330] ____sys_sendmsg+0x22c/0x270 [ 1379.599474] ? import_iovec+0x17/0x20 [ 1379.599622] ? sendmsg_copy_msghdr+0x59/0x90 [ 1379.599771] ___sys_sendmsg+0x81/0xc0 [ 1379.599917] ? __pollwait+0xd0/0xd0 [ 1379.600061] ? preempt_count_add+0x68/0xa0 [ 1379.600210] ? _raw_write_lock_irq+0x1a/0x40 [ 1379.600369] ? ep_done_scan+0xc9/0x110 [ 1379.600494] ? _raw_spin_unlock_irqrestore+0x25/0x40 [ 1379.600622] ? preempt_count_add+0x68/0xa0 [ 1379.600747] ? _raw_spin_lock_irq+0x1a/0x40 [ 1379.600899] ? __fget_light+0x8f/0x110 [ 1379.601024] __sys_sendmsg+0x49/0x80 [ 1379.601148] ? release_ds_buffers+0x50/0xe0 [ 1379.601274] do_syscall_64+0x3b/0x90 [ 1379.601399] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1379.601525] RIP: 0033:0x7f72c1e2e35d Fixes: f5396b8a663f ("ice: switchdev slow path") Signed-off-by: Wojciech Drewek Reported-by: Marcin Szycik Reviewed-by: Michal Swiatkowski Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 9a84d746a6c4..6a463b242c7d 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -361,7 +361,8 @@ ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) np = netdev_priv(netdev); vsi = np->vsi; - if (ice_is_reset_in_progress(vsi->back->state)) + if (ice_is_reset_in_progress(vsi->back->state) || + test_bit(ICE_VF_DIS, vsi->back->state)) return NETDEV_TX_BUSY; repr = ice_netdev_to_repr(netdev); From 7c8881b77908a51814a050da408c89f1a25b7fb7 Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Sat, 2 Apr 2022 10:17:15 +0800 Subject: [PATCH 057/196] ice: Fix memory leak in ice_get_orom_civd_data() A memory chunk was allocated for orom_data in ice_get_orom_civd_data() by vzmalloc(). But when ice_read_flash_module() fails, the allocated memory is not freed, which will lead to a memory leak. We can fix it by freeing the orom_data when ce_read_flash_module() fails. Fixes: af18d8866c80 ("ice: reduce time to read Option ROM CIVD data") Signed-off-by: Jianglei Nie Tested-by: Gurucharan (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_nvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c index 4eb0599714f4..13cdb5ea594d 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.c +++ b/drivers/net/ethernet/intel/ice/ice_nvm.c @@ -641,6 +641,7 @@ ice_get_orom_civd_data(struct ice_hw *hw, enum ice_bank_select bank, status = ice_read_flash_module(hw, bank, ICE_SR_1ST_OROM_BANK_PTR, 0, orom_data, hw->flash.banks.orom_size); if (status) { + vfree(orom_data); ice_debug(hw, ICE_DBG_NVM, "Unable to read Option ROM data\n"); return status; } From 10b01ee92df52c8d7200afead4d5e5f55a5c58b1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Apr 2022 21:31:27 -0400 Subject: [PATCH 058/196] ext4: fix overhead calculation to account for the reserved gdt blocks The kernel calculation was underestimating the overhead by not taking into account the reserved gdt blocks. With this change, the overhead calculated by the kernel matches the overhead calculation in mke2fs. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f2a5e78f93a9..23a9b2c086ed 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4177,9 +4177,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + From 50f500b7f6335404b18bbffa93e3a905a08d061a Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 24 Feb 2022 11:03:41 +0900 Subject: [PATCH 059/196] ksmbd: remove filename in ksmbd_file If the filename is change by underlying rename the server, fp->filename and real filename can be different. This patch remove the uses of fp->filename in ksmbd and replace it with d_path(). Signed-off-by: Namjae Jeon Reviewed-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/misc.c | 40 +++++++++++++++++++++++++++++++--------- fs/ksmbd/misc.h | 3 ++- fs/ksmbd/oplock.c | 30 ------------------------------ fs/ksmbd/oplock.h | 2 -- fs/ksmbd/smb2pdu.c | 21 +++++++-------------- fs/ksmbd/vfs.c | 6 ++---- fs/ksmbd/vfs_cache.c | 1 - fs/ksmbd/vfs_cache.h | 1 - 8 files changed, 42 insertions(+), 62 deletions(-) diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 60e7ac62c917..1e2076a53bed 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -158,19 +158,41 @@ out: * Return : windows path string or error */ -char *convert_to_nt_pathname(char *filename) +char *convert_to_nt_pathname(struct ksmbd_share_config *share, + struct path *path) { - char *ab_pathname; + char *pathname, *ab_pathname, *nt_pathname; + int share_path_len = share->path_sz; - if (strlen(filename) == 0) - filename = "\\"; + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathname) + return ERR_PTR(-EACCES); - ab_pathname = kstrdup(filename, GFP_KERNEL); - if (!ab_pathname) - return NULL; + ab_pathname = d_path(path, pathname, PATH_MAX); + if (IS_ERR(ab_pathname)) { + nt_pathname = ERR_PTR(-EACCES); + goto free_pathname; + } - ksmbd_conv_path_to_windows(ab_pathname); - return ab_pathname; + if (strncmp(ab_pathname, share->path, share_path_len)) { + nt_pathname = ERR_PTR(-EACCES); + goto free_pathname; + } + + nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, GFP_KERNEL); + if (!nt_pathname) { + nt_pathname = ERR_PTR(-ENOMEM); + goto free_pathname; + } + if (ab_pathname[share_path_len] == '\0') + strcpy(nt_pathname, "/"); + strcat(nt_pathname, &ab_pathname[share_path_len]); + + ksmbd_conv_path_to_windows(nt_pathname); + +free_pathname: + kfree(pathname); + return nt_pathname; } int get_nlink(struct kstat *st) diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h index 253366bd0951..aae2a252945f 100644 --- a/fs/ksmbd/misc.h +++ b/fs/ksmbd/misc.h @@ -14,7 +14,8 @@ struct ksmbd_file; int match_pattern(const char *str, size_t len, const char *pattern); int ksmbd_validate_filename(char *filename); int parse_stream_name(char *filename, char **stream_name, int *s_type); -char *convert_to_nt_pathname(char *filename); +char *convert_to_nt_pathname(struct ksmbd_share_config *share, + struct path *path); int get_nlink(struct kstat *st); void ksmbd_conv_path_to_unix(char *path); void ksmbd_strip_last_slash(char *path); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 23871b18a429..8b5560574d4c 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1694,33 +1694,3 @@ out: read_unlock(&lease_list_lock); return ret_op; } - -int smb2_check_durable_oplock(struct ksmbd_file *fp, - struct lease_ctx_info *lctx, char *name) -{ - struct oplock_info *opinfo = opinfo_get(fp); - int ret = 0; - - if (opinfo && opinfo->is_lease) { - if (!lctx) { - pr_err("open does not include lease\n"); - ret = -EBADF; - goto out; - } - if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key, - SMB2_LEASE_KEY_SIZE)) { - pr_err("invalid lease key\n"); - ret = -EBADF; - goto out; - } - if (name && strcmp(fp->filename, name)) { - pr_err("invalid name reconnect %s\n", name); - ret = -EINVAL; - goto out; - } - } -out: - if (opinfo) - opinfo_put(opinfo); - return ret; -} diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h index 0cf7a2b5bbc0..09753448f779 100644 --- a/fs/ksmbd/oplock.h +++ b/fs/ksmbd/oplock.h @@ -124,6 +124,4 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn, int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci, struct lease_ctx_info *lctx); void destroy_lease_table(struct ksmbd_conn *conn); -int smb2_check_durable_oplock(struct ksmbd_file *fp, - struct lease_ctx_info *lctx, char *name); #endif /* __KSMBD_OPLOCK_H */ diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 3bf6c56c654c..e38fb68ded21 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2918,7 +2918,6 @@ int smb2_open(struct ksmbd_work *work) goto err_out; } - fp->filename = name; fp->cdoption = req->CreateDisposition; fp->daccess = daccess; fp->saccess = req->ShareAccess; @@ -3270,14 +3269,13 @@ err_out1: if (!rsp->hdr.Status) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; - if (!fp || !fp->filename) - kfree(name); if (fp) ksmbd_fd_put(work, fp); smb2_set_err_rsp(work); ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status); } + kfree(name); kfree(lc); return 0; @@ -3895,8 +3893,6 @@ int smb2_query_dir(struct ksmbd_work *work) ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr); } - ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename); - if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) { ksmbd_debug(SMB, "Restart directory scan\n"); generic_file_llseek(dir_fp->filp, 0, SEEK_SET); @@ -4390,9 +4386,9 @@ static int get_file_all_info(struct ksmbd_work *work, return -EACCES; } - filename = convert_to_nt_pathname(fp->filename); - if (!filename) - return -ENOMEM; + filename = convert_to_nt_pathname(work->tcon->share_conf, &fp->filp->f_path); + if (IS_ERR(filename)) + return PTR_ERR(filename); inode = file_inode(fp->filp); generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); @@ -5683,8 +5679,7 @@ static int set_file_allocation_info(struct ksmbd_work *work, size = i_size_read(inode); rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512); if (rc) { - pr_err("truncate failed! filename : %s, err %d\n", - fp->filename, rc); + pr_err("truncate failed!, err %d\n", rc); return rc; } if (size < alloc_blks * 512) @@ -5714,12 +5709,10 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, * truncated range. */ if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { - ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n", - fp->filename, newsize); + ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize); rc = ksmbd_vfs_truncate(work, fp, newsize); if (rc) { - ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n", - fp->filename, rc); + ksmbd_debug(SMB, "truncate failed!, err %d\n", rc); if (rc != -EAGAIN) rc = -EBADF; return rc; diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 9cebb6ba555b..dcdd07c6efff 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -398,8 +398,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, nbytes = kernel_read(filp, rbuf, count, pos); if (nbytes < 0) { - pr_err("smb read failed for (%s), err = %zd\n", - fp->filename, nbytes); + pr_err("smb read failed, err = %zd\n", nbytes); return nbytes; } @@ -875,8 +874,7 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work, err = vfs_truncate(&filp->f_path, size); if (err) - pr_err("truncate failed for filename : %s err %d\n", - fp->filename, err); + pr_err("truncate failed, err %d\n", err); return err; } diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 29c1db66bd0f..0974d2e972b9 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -328,7 +328,6 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) kfree(smb_lock); } - kfree(fp->filename); if (ksmbd_stream_fd(fp)) kfree(fp->stream.name); kmem_cache_free(filp_cache, fp); diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h index 36239ce31afd..fcb13413fa8d 100644 --- a/fs/ksmbd/vfs_cache.h +++ b/fs/ksmbd/vfs_cache.h @@ -62,7 +62,6 @@ struct ksmbd_inode { struct ksmbd_file { struct file *filp; - char *filename; u64 persistent_id; u64 volatile_id; From 8510a043d334ecdf83d4604782f288db6bf21d60 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 5 Apr 2022 12:04:43 +0900 Subject: [PATCH 060/196] ksmbd: increment reference count of parent fp Add missing increment reference count of parent fp in ksmbd_lookup_fd_inode(). Signed-off-by: Namjae Jeon Reviewed-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 ++ fs/ksmbd/vfs_cache.c | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index e38fb68ded21..62cc0f95ab87 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -5758,8 +5758,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, if (parent_fp) { if (parent_fp->daccess & FILE_DELETE_LE) { pr_err("parent dir is opened with delete access\n"); + ksmbd_fd_put(work, parent_fp); return -ESHARE; } + ksmbd_fd_put(work, parent_fp); } next: return smb2_rename(work, fp, user_ns, rename_info, diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 0974d2e972b9..c4d59d2735f0 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -496,6 +496,7 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode) list_for_each_entry(lfp, &ci->m_fp_list, node) { if (inode == file_inode(lfp->filp)) { atomic_dec(&ci->m_count); + lfp = ksmbd_fp_get(lfp); read_unlock(&ci->m_lock); return lfp; } From 02655a70b7cc0f534531ee65fa72692f4d31a944 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 13 Apr 2022 10:01:36 +0900 Subject: [PATCH 061/196] ksmbd: set fixed sector size to FS_SECTOR_SIZE_INFORMATION Currently ksmbd is using ->f_bsize from vfs_statfs() as sector size. If fat/exfat is a local share, ->f_bsize is a cluster size that is too large to be used as a sector size. Sector sizes larger than 4K cause problem occurs when mounting an iso file through windows client. The error message can be obtained using Mount-DiskImage command, the error is: "Mount-DiskImage : The sector size of the physical disk on which the virtual disk resides is not supported." This patch reports fixed 4KB sector size if ->s_blocksize is bigger than 4KB. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 62cc0f95ab87..16c803a9d996 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "glob.h" #include "smbfsctl.h" @@ -4995,15 +4996,17 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, case FS_SECTOR_SIZE_INFORMATION: { struct smb3_fs_ss_info *info; + unsigned int sector_size = + min_t(unsigned int, path.mnt->mnt_sb->s_blocksize, 4096); info = (struct smb3_fs_ss_info *)(rsp->Buffer); - info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize); + info->LogicalBytesPerSector = cpu_to_le32(sector_size); info->PhysicalBytesPerSectorForAtomicity = - cpu_to_le32(stfs.f_bsize); - info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize); + cpu_to_le32(sector_size); + info->PhysicalBytesPerSectorForPerf = cpu_to_le32(sector_size); info->FSEffPhysicalBytesPerSectorForAtomicity = - cpu_to_le32(stfs.f_bsize); + cpu_to_le32(sector_size); info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE | SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE); info->ByteOffsetForSectorAlignment = 0; From 85d825dbf4899a69407338bae462a59aa9a37326 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Apr 2022 21:57:49 -0400 Subject: [PATCH 062/196] ext4: force overhead calculation if the s_overhead_cluster makes no sense If the file system does not use bigalloc, calculating the overhead is cheap, so force the recalculation of the overhead so we don't have to trust the precalculated overhead in the superblock. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 23a9b2c086ed..d08820fdfdee 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5289,9 +5289,18 @@ no_journal: * Get the # of file system overhead blocks from the * superblock if present. */ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; From eb7054212eac8b451d727bf079eae3db8c88f9d3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Apr 2022 22:39:00 -0400 Subject: [PATCH 063/196] ext4: update the cached overhead value in the superblock If we (re-)calculate the file system overhead amount and it's different from the on-disk s_overhead_clusters value, update the on-disk version since this can take potentially quite a while on bigalloc file systems. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 1 + fs/ext4/ioctl.c | 16 ++++++++++++++++ fs/ext4/super.c | 2 ++ 3 files changed, 19 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 48dc2c3247ad..a743b1e3b89e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3068,6 +3068,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); +int ext4_update_overhead(struct super_block *sb); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 992229ca2d83..ba44fa1be70a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1652,3 +1652,19 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif + +static void set_overhead(struct ext4_super_block *es, const void *arg) +{ + es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); +} + +int ext4_update_overhead(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_rdonly(sb) || sbi->s_overhead == 0 || + sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)) + return 0; + + return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d08820fdfdee..1847b46af808 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5618,6 +5618,8 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Quota mode: %s.", descr, ext4_quota_mode(sb)); + /* Update the s_overhead_clusters if necessary */ + ext4_update_overhead(sb); return 0; free_sbi: From 357ad4d898286b94aaae0cb7e3f573459e5b98b9 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Thu, 14 Apr 2022 17:19:38 +0800 Subject: [PATCH 064/196] sound/oss/dmasound: fix 'dmasound_setup' defined but not used We observed: 'dmasound_setup' defined but not used error with COMPILER=gcc ARCH=m68k DEFCONFIG=allmodconfig build. Fix it by adding __maybe_unused to dmasound_setup. Error(s): sound/oss/dmasound/dmasound_core.c:1431:12: error: 'dmasound_setup' defined but not used [-Werror=unused-function] Fixes: 9dd7c46346ca ("sound/oss/dmasound: fix build when drivers are mixed =y/=m") Signed-off-by: Miles Chen Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20220414091940.2216-1-miles.chen@mediatek.com Signed-off-by: Takashi Iwai --- sound/oss/dmasound/dmasound_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c index 9c48f3a9e3d1..164335d3c200 100644 --- a/sound/oss/dmasound/dmasound_core.c +++ b/sound/oss/dmasound/dmasound_core.c @@ -1428,7 +1428,7 @@ void dmasound_deinit(void) unregister_sound_dsp(sq_unit); } -static int dmasound_setup(char *str) +static int __maybe_unused dmasound_setup(char *str) { int ints[6], size; From c74193787b2f683751a67603fb5f15c7584f355f Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 14 Apr 2022 18:05:16 +0300 Subject: [PATCH 065/196] ALSA: hda/hdmi: fix warning about PCM count when used with SOF With commit 13046370c4d1 ("ALSA: hda/hdmi: let new platforms assign the pcm slot dynamically"), old behaviour to consider the HDA pin number, when choosing PCM to assign, was dropped. Build on this change and limit the number of PCMs created to number of converters (= maximum number of concurrent display/receivers) when "mst_no_extra_pcms" and "dyn_pcm_no_legacy" quirks are both set. Fix the check in hdmi_find_pcm_slot() to ensure only spec->pcm_used entries are considered in the search. Elsewhere in the driver spec->pcm_used is already checked properly. Doing this avoids following warning at SOF driver probe for multiple machine drivers: [ 112.425297] sof_sdw sof_sdw: hda_dsp_hdmi_build_controls: no PCM in topology for HDMI converter 4 [ 112.425298] sof_sdw sof_sdw: hda_dsp_hdmi_build_controls: no PCM in topology for HDMI converter 5 [ 112.425299] sof_sdw sof_sdw: hda_dsp_hdmi_build_controls: no PCM in topology for HDMI converter 6 Fixes: 13046370c4d1 ("ALSA: hda/hdmi: let new platforms assign the pcm slot dynamically") BugLink: https://github.com/thesofproject/linux/issues/2573 Signed-off-by: Kai Vehmanen Link: https://lore.kernel.org/r/20220414150516.3638283-1-kai.vehmanen@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 3e086eebf88d..f9d67058d69d 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -1395,7 +1395,7 @@ static int hdmi_find_pcm_slot(struct hdmi_spec *spec, last_try: /* the last try; check the empty slots in pins */ - for (i = 0; i < spec->num_nids; i++) { + for (i = 0; i < spec->pcm_used; i++) { if (!test_bit(i, &spec->pcm_bitmap)) return i; } @@ -2325,7 +2325,9 @@ static int generic_hdmi_build_pcms(struct hda_codec *codec) * dev_num is the device entry number in a pin */ - if (codec->mst_no_extra_pcms) + if (spec->dyn_pcm_no_legacy && codec->mst_no_extra_pcms) + pcm_num = spec->num_cvts; + else if (codec->mst_no_extra_pcms) pcm_num = spec->num_nids; else pcm_num = spec->num_nids + spec->dev_num - 1; From 6624fb41f5126c7205e866e58d4aaae0453f0914 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 14 Apr 2022 19:01:29 +0300 Subject: [PATCH 066/196] ALSA: hda/hdmi: add HDMI codec VID for Raptorlake-P Add HDMI codec VID for Intel Raptorlake-P platform. Signed-off-by: Kai Vehmanen Link: https://lore.kernel.org/r/20220414160129.3641411-1-kai.vehmanen@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index f9d67058d69d..31fe41795571 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -4553,6 +4553,7 @@ HDA_CODEC_ENTRY(0x80862819, "DG2 HDMI", patch_i915_adlp_hdmi), HDA_CODEC_ENTRY(0x8086281a, "Jasperlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x8086281b, "Elkhartlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x8086281c, "Alderlake-P HDMI", patch_i915_adlp_hdmi), +HDA_CODEC_ENTRY(0x8086281f, "Raptorlake-P HDMI", patch_i915_adlp_hdmi), HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862882, "Valleyview2 HDMI", patch_i915_byt_hdmi), HDA_CODEC_ENTRY(0x80862883, "Braswell HDMI", patch_i915_byt_hdmi), From 6f06aa6b2fd741d2171ff99de3537141610fd933 Mon Sep 17 00:00:00 2001 From: Arun Ramadoss Date: Wed, 13 Apr 2022 12:44:09 +0530 Subject: [PATCH 067/196] net: phy: LAN937x: added PHY_POLL_CABLE_TEST flag Added the phy_poll_cable_test flag for the lan937x phy driver. Tested using command - ethtool --cable-test Fixes: 680baca546f2 ("net: phy: added the LAN937x phy support") Signed-off-by: Arun Ramadoss Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/microchip_t1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c index 3f79bbbe62d3..c2c0e361fd3d 100644 --- a/drivers/net/phy/microchip_t1.c +++ b/drivers/net/phy/microchip_t1.c @@ -743,6 +743,7 @@ static struct phy_driver microchip_t1_phy_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_LAN937X), .name = "Microchip LAN937x T1", + .flags = PHY_POLL_CABLE_TEST, .features = PHY_BASIC_T1_FEATURES, .config_init = lan87xx_config_init, .suspend = genphy_suspend, From ee3b0826b4764f6c13ad6db67495c5a1c38e9025 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 13 Apr 2022 11:16:25 +0100 Subject: [PATCH 068/196] rxrpc: Restore removed timer deletion A recent patch[1] from Eric Dumazet flipped the order in which the keepalive timer and the keepalive worker were cancelled in order to fix a syzbot reported issue[2]. Unfortunately, this enables the mirror image bug whereby the timer races with rxrpc_exit_net(), restarting the worker after it has been cancelled: CPU 1 CPU 2 =============== ===================== if (rxnet->live) rxnet->live = false; cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_queue_work(&rxnet->peer_keepalive_work); del_timer_sync(&rxnet->peer_keepalive_timer); Fix this by restoring the removed del_timer_sync() so that we try to remove the timer twice. If the timer runs again, it should see ->live == false and not restart the worker. Fixes: 1946014ca3b1 ("rxrpc: fix a race in rxrpc_exit_net()") Signed-off-by: David Howells cc: Eric Dumazet cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/20220404183439.3537837-1-eric.dumazet@gmail.com/ [1] Link: https://syzkaller.appspot.com/bug?extid=724378c4bb58f703b09a [2] Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/rxrpc/net_ns.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f15d6942da45..cc7e30733feb 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -113,7 +113,9 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; + del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); + /* Remove the timer again as the worker may have restarted it. */ del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); From 1a74e99323746353bba11562a2f2d0aa8102f402 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 14 Apr 2022 15:51:03 +0800 Subject: [PATCH 069/196] net/smc: Fix sock leak when release after smc_shutdown() Since commit e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback"), for a fallback connection, __smc_release() does not call sock_put() if its state is already SMC_CLOSED. When calling smc_shutdown() after falling back, its state is set to SMC_CLOSED but does not call sock_put(), so this patch calls it. Reported-and-tested-by: syzbot+6e29a053eb165bd50de5@syzkaller.appspotmail.com Fixes: e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback") Signed-off-by: Tony Lu Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 14ddc40149e8..fc7b6eb22143 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2674,8 +2674,10 @@ static int smc_shutdown(struct socket *sock, int how) if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sock_put(sk); + } goto out; } switch (how) { From 29e8e659f984be00d75ec5fef4e37c88def72712 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 14 Apr 2022 16:49:25 +0800 Subject: [PATCH 070/196] net/packet: fix packet_sock xmit return value checking packet_sock xmit could be dev_queue_xmit, which also returns negative errors. So only checking positive errors is not enough, or userspace sendmsg may return success while packet is not send out. Move the net_xmit_errno() assignment in the braces as checkpatch.pl said do not use assignment in if condition. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Flavio Leitner Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c39c09899fd0..002d2b9c69dd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2858,8 +2858,9 @@ tpacket_error: status = TP_STATUS_SEND_REQUEST; err = po->xmit(skb); - if (unlikely(err > 0)) { - err = net_xmit_errno(err); + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ @@ -3060,8 +3061,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->no_fcs = 1; err = po->xmit(skb); - if (err > 0 && (err = net_xmit_errno(err)) != 0) - goto out_unlock; + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto out_unlock; + } dev_put(dev); From f40c064e933d7787ca7411b699504d7a2664c1f5 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 14 Apr 2022 13:34:26 -0700 Subject: [PATCH 071/196] ip6_gre: Avoid updating tunnel->tun_hlen in __gre6_xmit() Do not update tunnel->tun_hlen in data plane code. Use a local variable instead, just like "tunnel_hlen" in net/ipv4/ip_gre.c:gre_fb_xmit(). Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 8753e9cec326..b43a46449130 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -743,6 +743,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; __be16 flags; + int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || @@ -760,9 +761,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, dsfield = key->tos; flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); - tunnel->tun_hlen = gre_calc_hlen(flags); + tun_hlen = gre_calc_hlen(flags); - gre_build_header(skb, tunnel->tun_hlen, + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) From ab198e1d0dd8dc4bc7575fb50758e2cbd51e14e1 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 14 Apr 2022 13:35:40 -0700 Subject: [PATCH 072/196] ip6_gre: Fix skb_under_panic in __gre6_xmit() Feng reported an skb_under_panic BUG triggered by running test_ip6gretap() in tools/testing/selftests/bpf/test_tunnel.sh: [ 82.492551] skbuff: skb_under_panic: text:ffffffffb268bb8e len:403 put:12 head:ffff9997c5480000 data:ffff9997c547fff8 tail:0x18b end:0x2c0 dev:ip6gretap11 <...> [ 82.607380] Call Trace: [ 82.609389] [ 82.611136] skb_push.cold.109+0x10/0x10 [ 82.614289] __gre6_xmit+0x41e/0x590 [ 82.617169] ip6gre_tunnel_xmit+0x344/0x3f0 [ 82.620526] dev_hard_start_xmit+0xf1/0x330 [ 82.623882] sch_direct_xmit+0xe4/0x250 [ 82.626961] __dev_queue_xmit+0x720/0xfe0 <...> [ 82.633431] packet_sendmsg+0x96a/0x1cb0 [ 82.636568] sock_sendmsg+0x30/0x40 <...> The following sequence of events caused the BUG: 1. During ip6gretap device initialization, tunnel->tun_hlen (e.g. 4) is calculated based on old flags (see ip6gre_calc_hlen()); 2. packet_snd() reserves header room for skb A, assuming tunnel->tun_hlen is 4; 3. Later (in clsact Qdisc), the eBPF program sets a new tunnel key for skb A using bpf_skb_set_tunnel_key() (see _ip6gretap_set_tunnel()); 4. __gre6_xmit() detects the new tunnel key, and recalculates "tun_hlen" (e.g. 12) based on new flags (e.g. TUNNEL_KEY and TUNNEL_SEQ); 5. gre_build_header() calls skb_push() with insufficient reserved header room, triggering the BUG. As sugguested by Cong, fix it by moving the call to skb_cow_head() after the recalculation of tun_hlen. Reproducer: OBJ=$LINUX/tools/testing/selftests/bpf/test_tunnel_kern.o ip netns add at_ns0 ip link add veth0 type veth peer name veth1 ip link set veth0 netns at_ns0 ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip link set dev veth1 up mtu 1500 ip addr add dev veth1 172.16.1.200/24 ip netns exec at_ns0 ip addr add ::11/96 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip addr add dev veth1 ::22/96 ip link set dev veth1 up ip netns exec at_ns0 \ ip link add dev ip6gretap00 type ip6gretap seq flowlabel 0xbcdef key 2 \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev ip6gretap00 10.1.1.100/24 ip netns exec at_ns0 ip addr add dev ip6gretap00 fc80::100/96 ip netns exec at_ns0 ip link set dev ip6gretap00 up ip link add dev ip6gretap11 type ip6gretap external ip addr add dev ip6gretap11 10.1.1.200/24 ip addr add dev ip6gretap11 fc80::200/24 ip link set dev ip6gretap11 up tc qdisc add dev ip6gretap11 clsact tc filter add dev ip6gretap11 egress bpf da obj $OBJ sec ip6gretap_set_tunnel tc filter add dev ip6gretap11 ingress bpf da obj $OBJ sec ip6gretap_get_tunnel ping6 -c 3 -w 10 -q ::11 Fixes: 6712abc168eb ("ip6_gre: add ip6 gre and gretap collect_md mode") Reported-by: Feng Zhou Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b43a46449130..976236736146 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -733,9 +733,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -763,6 +760,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tun_hlen = gre_calc_hlen(flags); + if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) + return -ENOMEM; + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), @@ -773,6 +773,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); From cefa91b2332d7009bc0be5d951d6cbbf349f90f8 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Fri, 15 Apr 2022 10:08:41 +0200 Subject: [PATCH 073/196] openvswitch: fix OOB access in reserve_sfa_size() Given a sufficiently large number of actions, while copying and reserving memory for a new action of a new flow, if next_offset is greater than MAX_ACTIONS_BUFSIZE, the function reserve_sfa_size() does not return -EMSGSIZE as expected, but it allocates MAX_ACTIONS_BUFSIZE bytes increasing actions_len by req_size. This can then lead to an OOB write access, especially when further actions need to be copied. Fix it by rearranging the flow action size check. KASAN splat below: ================================================================== BUG: KASAN: slab-out-of-bounds in reserve_sfa_size+0x1ba/0x380 [openvswitch] Write of size 65360 at addr ffff888147e4001c by task handler15/836 CPU: 1 PID: 836 Comm: handler15 Not tainted 5.18.0-rc1+ #27 ... Call Trace: dump_stack_lvl+0x45/0x5a print_report.cold+0x5e/0x5db ? __lock_text_start+0x8/0x8 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_report+0xb5/0x130 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_check_range+0xf5/0x1d0 memcpy+0x39/0x60 reserve_sfa_size+0x1ba/0x380 [openvswitch] __add_action+0x24/0x120 [openvswitch] ovs_nla_add_action+0xe/0x20 [openvswitch] ovs_ct_copy_action+0x29d/0x1130 [openvswitch] ? __kernel_text_address+0xe/0x30 ? unwind_get_return_address+0x56/0xa0 ? create_prof_cpu_mask+0x20/0x20 ? ovs_ct_verify+0xf0/0xf0 [openvswitch] ? prep_compound_page+0x198/0x2a0 ? __kasan_check_byte+0x10/0x40 ? kasan_unpoison+0x40/0x70 ? ksize+0x44/0x60 ? reserve_sfa_size+0x75/0x380 [openvswitch] __ovs_nla_copy_actions+0xc26/0x2070 [openvswitch] ? __zone_watermark_ok+0x420/0x420 ? validate_set.constprop.0+0xc90/0xc90 [openvswitch] ? __alloc_pages+0x1a9/0x3e0 ? __alloc_pages_slowpath.constprop.0+0x1da0/0x1da0 ? unwind_next_frame+0x991/0x1e40 ? __mod_node_page_state+0x99/0x120 ? __mod_lruvec_page_state+0x2e3/0x470 ? __kasan_kmalloc_large+0x90/0xe0 ovs_nla_copy_actions+0x1b4/0x2c0 [openvswitch] ovs_flow_cmd_new+0x3cd/0xb10 [openvswitch] ... Cc: stable@vger.kernel.org Fixes: f28cd2af22a0 ("openvswitch: fix flow actions reallocation") Signed-off-by: Paolo Valerio Acked-by: Eelco Chaudron Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 7176156d3844..4c09cf8a0ab2 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2465,7 +2465,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { OVS_NLERR(log, "Flow action size exceeds max %u", MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); From da367ac74aecb59b62a9538009d4aee8ce4bdfb3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 14 Apr 2022 09:03:12 -0700 Subject: [PATCH 074/196] net: restore alpha order to Ethernet devices in config The displayed list of Ethernet devices in make menuconfig has gotten out of order. This is mostly due to changes in vendor names etc, but also because of new Microsoft entry in wrong place. This restores so that the display is in order even if the names of the sub directories are not. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/ethernet/Kconfig | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index bd4cb9d7c35d..827993022386 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -35,15 +35,6 @@ source "drivers/net/ethernet/aquantia/Kconfig" source "drivers/net/ethernet/arc/Kconfig" source "drivers/net/ethernet/asix/Kconfig" source "drivers/net/ethernet/atheros/Kconfig" -source "drivers/net/ethernet/broadcom/Kconfig" -source "drivers/net/ethernet/brocade/Kconfig" -source "drivers/net/ethernet/cadence/Kconfig" -source "drivers/net/ethernet/calxeda/Kconfig" -source "drivers/net/ethernet/cavium/Kconfig" -source "drivers/net/ethernet/chelsio/Kconfig" -source "drivers/net/ethernet/cirrus/Kconfig" -source "drivers/net/ethernet/cisco/Kconfig" -source "drivers/net/ethernet/cortina/Kconfig" config CX_ECAT tristate "Beckhoff CX5020 EtherCAT master support" @@ -57,6 +48,14 @@ config CX_ECAT To compile this driver as a module, choose M here. The module will be called ec_bhf. +source "drivers/net/ethernet/broadcom/Kconfig" +source "drivers/net/ethernet/cadence/Kconfig" +source "drivers/net/ethernet/calxeda/Kconfig" +source "drivers/net/ethernet/cavium/Kconfig" +source "drivers/net/ethernet/chelsio/Kconfig" +source "drivers/net/ethernet/cirrus/Kconfig" +source "drivers/net/ethernet/cisco/Kconfig" +source "drivers/net/ethernet/cortina/Kconfig" source "drivers/net/ethernet/davicom/Kconfig" config DNET @@ -85,7 +84,6 @@ source "drivers/net/ethernet/huawei/Kconfig" source "drivers/net/ethernet/i825xx/Kconfig" source "drivers/net/ethernet/ibm/Kconfig" source "drivers/net/ethernet/intel/Kconfig" -source "drivers/net/ethernet/microsoft/Kconfig" source "drivers/net/ethernet/xscale/Kconfig" config JME @@ -128,8 +126,9 @@ source "drivers/net/ethernet/mediatek/Kconfig" source "drivers/net/ethernet/mellanox/Kconfig" source "drivers/net/ethernet/micrel/Kconfig" source "drivers/net/ethernet/microchip/Kconfig" -source "drivers/net/ethernet/moxa/Kconfig" source "drivers/net/ethernet/mscc/Kconfig" +source "drivers/net/ethernet/microsoft/Kconfig" +source "drivers/net/ethernet/moxa/Kconfig" source "drivers/net/ethernet/myricom/Kconfig" config FEALNX @@ -141,10 +140,10 @@ config FEALNX Say Y here to support the Myson MTD-800 family of PCI-based Ethernet cards. +source "drivers/net/ethernet/ni/Kconfig" source "drivers/net/ethernet/natsemi/Kconfig" source "drivers/net/ethernet/neterion/Kconfig" source "drivers/net/ethernet/netronome/Kconfig" -source "drivers/net/ethernet/ni/Kconfig" source "drivers/net/ethernet/8390/Kconfig" source "drivers/net/ethernet/nvidia/Kconfig" source "drivers/net/ethernet/nxp/Kconfig" @@ -164,6 +163,7 @@ source "drivers/net/ethernet/packetengines/Kconfig" source "drivers/net/ethernet/pasemi/Kconfig" source "drivers/net/ethernet/pensando/Kconfig" source "drivers/net/ethernet/qlogic/Kconfig" +source "drivers/net/ethernet/brocade/Kconfig" source "drivers/net/ethernet/qualcomm/Kconfig" source "drivers/net/ethernet/rdc/Kconfig" source "drivers/net/ethernet/realtek/Kconfig" @@ -171,10 +171,10 @@ source "drivers/net/ethernet/renesas/Kconfig" source "drivers/net/ethernet/rocker/Kconfig" source "drivers/net/ethernet/samsung/Kconfig" source "drivers/net/ethernet/seeq/Kconfig" -source "drivers/net/ethernet/sfc/Kconfig" source "drivers/net/ethernet/sgi/Kconfig" source "drivers/net/ethernet/silan/Kconfig" source "drivers/net/ethernet/sis/Kconfig" +source "drivers/net/ethernet/sfc/Kconfig" source "drivers/net/ethernet/smsc/Kconfig" source "drivers/net/ethernet/socionext/Kconfig" source "drivers/net/ethernet/stmicro/Kconfig" From 60496069d0ae2d056bc12714d953c142afdef696 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 13 Apr 2022 12:45:15 -0700 Subject: [PATCH 075/196] powerpc: Update MAINTAINERS for ibmvnic and VAS Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7341667e7313..ad76e7bbaf28 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9337,14 +9337,12 @@ F: drivers/pci/hotplug/rpaphp* IBM Power SRIOV Virtual NIC Device Driver M: Dany Madden -M: Sukadev Bhattiprolu R: Thomas Falcon L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/ibm/ibmvnic.* IBM Power Virtual Accelerator Switchboard -M: Sukadev Bhattiprolu L: linuxppc-dev@lists.ozlabs.org S: Supported F: arch/powerpc/include/asm/vas.h From 3db09e762dc79584a69c10d74a6b98f89a9979f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Apr 2022 10:35:41 -0700 Subject: [PATCH 076/196] net/sched: cls_u32: fix netns refcount changes in u32_change() We are now able to detect extra put_net() at the moment they happen, instead of much later in correct code paths. u32_init_knode() / tcf_exts_init() populates the ->exts.net pointer, but as mentioned in tcf_exts_init(), the refcount on netns has not been elevated yet. The refcount is taken only once tcf_exts_get_net() is called. So the two u32_destroy_key() calls from u32_change() are attempting to release an invalid reference on the netns. syzbot report: refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 0 PID: 21708 at lib/refcount.c:31 refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31 Modules linked in: CPU: 0 PID: 21708 Comm: syz-executor.5 Not tainted 5.18.0-rc2-next-20220412-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31 Code: 1d 14 b6 b2 09 31 ff 89 de e8 6d e9 89 fd 84 db 75 e0 e8 84 e5 89 fd 48 c7 c7 40 aa 26 8a c6 05 f4 b5 b2 09 01 e8 e5 81 2e 05 <0f> 0b eb c4 e8 68 e5 89 fd 0f b6 1d e3 b5 b2 09 31 ff 89 de e8 38 RSP: 0018:ffffc900051af1b0 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000040000 RSI: ffffffff8160a0c8 RDI: fffff52000a35e28 RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff81604a9e R11: 0000000000000000 R12: 1ffff92000a35e3b R13: 00000000ffffffef R14: ffff8880211a0194 R15: ffff8880577d0a00 FS: 00007f25d183e700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f19c859c028 CR3: 0000000051009000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] ref_tracker_free+0x535/0x6b0 lib/ref_tracker.c:118 netns_tracker_free include/net/net_namespace.h:327 [inline] put_net_track include/net/net_namespace.h:341 [inline] tcf_exts_put_net include/net/pkt_cls.h:255 [inline] u32_destroy_key.isra.0+0xa7/0x2b0 net/sched/cls_u32.c:394 u32_change+0xe01/0x3140 net/sched/cls_u32.c:909 tc_new_tfilter+0x98d/0x2200 net/sched/cls_api.c:2148 rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:6016 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2495 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:725 ____sys_sendmsg+0x6e2/0x800 net/socket.c:2413 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2496 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f25d0689049 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f25d183e168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f25d079c030 RCX: 00007f25d0689049 RDX: 0000000000000000 RSI: 0000000020000340 RDI: 0000000000000005 RBP: 00007f25d06e308d R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd0b752e3f R14: 00007f25d183e300 R15: 0000000000022000 Fixes: 35c55fc156d8 ("cls_u32: use tcf_exts_get_net() before call_rcu()") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Cong Wang Cc: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- net/sched/cls_u32.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index cf5649292ee0..fcba6c43ba50 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -386,14 +386,19 @@ static int u32_init(struct tcf_proto *tp) return 0; } -static int u32_destroy_key(struct tc_u_knode *n, bool free_pf) +static void __u32_destroy_key(struct tc_u_knode *n) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); tcf_exts_destroy(&n->exts); - tcf_exts_put_net(&n->exts); if (ht && --ht->refcnt == 0) kfree(ht); + kfree(n); +} + +static void u32_destroy_key(struct tc_u_knode *n, bool free_pf) +{ + tcf_exts_put_net(&n->exts); #ifdef CONFIG_CLS_U32_PERF if (free_pf) free_percpu(n->pf); @@ -402,8 +407,7 @@ static int u32_destroy_key(struct tc_u_knode *n, bool free_pf) if (free_pf) free_percpu(n->pcpu_success); #endif - kfree(n); - return 0; + __u32_destroy_key(n); } /* u32_delete_key_rcu should be called when free'ing a copied @@ -900,13 +904,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, extack); if (err) { - u32_destroy_key(new, false); + __u32_destroy_key(new); return err; } err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { - u32_destroy_key(new, false); + __u32_destroy_key(new); return err; } From ec5b0f605b105457f257f2870acad4a5d463984b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Apr 2022 10:35:42 -0700 Subject: [PATCH 077/196] net/sched: cls_u32: fix possible leak in u32_init_knode() While investigating a related syzbot report, I found that whenever call to tcf_exts_init() from u32_init_knode() is failing, we end up with an elevated refcount on ht->refcnt To avoid that, only increase the refcount after all possible errors have been evaluated. Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- net/sched/cls_u32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index fcba6c43ba50..4d27300c287c 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -815,10 +815,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); - /* bump reference count as long as we hold pointer to structure */ - if (ht) - ht->refcnt++; - #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update * so we must keep them in tact. When the node is later destroyed @@ -840,6 +836,10 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, return NULL; } + /* bump reference count as long as we hold pointer to structure */ + if (ht) + ht->refcnt++; + return new; } From 83daab06252ee5d0e1f4373ff28b79304945fc19 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 13 Apr 2022 11:43:19 -0600 Subject: [PATCH 078/196] l3mdev: l3mdev_master_upper_ifindex_by_index_rcu should be using netdev_master_upper_dev_get_rcu Next patch uses l3mdev_master_upper_ifindex_by_index_rcu which throws a splat with debug kernels: [13783.087570] ------------[ cut here ]------------ [13783.093974] RTNL: assertion failed at net/core/dev.c (6702) [13783.100761] WARNING: CPU: 3 PID: 51132 at net/core/dev.c:6702 netdev_master_upper_dev_get+0x16a/0x1a0 [13783.184226] CPU: 3 PID: 51132 Comm: kworker/3:3 Not tainted 5.17.0-custom-100090-g6f963aafb1cc #682 [13783.194788] Hardware name: Mellanox Technologies Ltd. MSN2010/SA002610, BIOS 5.6.5 08/24/2017 [13783.204755] Workqueue: mld mld_ifc_work [ipv6] [13783.210338] RIP: 0010:netdev_master_upper_dev_get+0x16a/0x1a0 [13783.217209] Code: 0f 85 e3 fe ff ff e8 65 ac ec fe ba 2e 1a 00 00 48 c7 c6 60 6f 38 83 48 c7 c7 c0 70 38 83 c6 05 5e b5 d7 01 01 e8 c6 29 52 00 <0f> 0b e9 b8 fe ff ff e8 5a 6c 35 ff e9 1c ff ff ff 48 89 ef e8 7d [13783.238659] RSP: 0018:ffffc9000b37f5a8 EFLAGS: 00010286 [13783.244995] RAX: 0000000000000000 RBX: ffff88812ee5c000 RCX: 0000000000000000 [13783.253379] RDX: ffff88811ce09d40 RSI: ffffffff812d0fcd RDI: fffff5200166fea7 [13783.261769] RBP: 0000000000000000 R08: 0000000000000001 R09: ffff8882375f4287 [13783.270138] R10: ffffed1046ebe850 R11: 0000000000000001 R12: dffffc0000000000 [13783.278510] R13: 0000000000000275 R14: ffffc9000b37f688 R15: ffff8881273b4af8 [13783.286870] FS: 0000000000000000(0000) GS:ffff888237400000(0000) knlGS:0000000000000000 [13783.296352] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [13783.303177] CR2: 00007ff25fc9b2e8 CR3: 0000000174d23000 CR4: 00000000001006e0 [13783.311546] Call Trace: [13783.314660] [13783.317553] l3mdev_master_upper_ifindex_by_index_rcu+0x43/0xe0 ... Change l3mdev_master_upper_ifindex_by_index_rcu to use netdev_master_upper_dev_get_rcu. Fixes: 6a6d6681ac1a ("l3mdev: add function to retreive upper master") Signed-off-by: Ido Schimmel Signed-off-by: David Ahern Cc: Alexis Bauvin Signed-off-by: Jakub Kicinski --- net/l3mdev/l3mdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 4eb8892fb2ff..ca10916340b0 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -147,7 +147,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) dev = dev_get_by_index_rcu(net, ifindex); while (dev && !netif_is_l3_master(dev)) - dev = netdev_master_upper_dev_get(dev); + dev = netdev_master_upper_dev_get_rcu(dev); return dev ? dev->ifindex : 0; } From db53cd3d88dc328dea2e968c9c8d3b4294a8a674 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 13 Apr 2022 11:43:20 -0600 Subject: [PATCH 079/196] net: Handle l3mdev in ip_tunnel_init_flow Ido reported that the commit referenced in the Fixes tag broke a gre use case with dummy devices. Add a check to ip_tunnel_init_flow to see if the oif is an l3mdev port and if so set the oif to 0 to avoid the oif comparison in fib_lookup_good_nhc. Fixes: 40867d74c374 ("net: Add l3mdev index to flow struct and avoid oif reset for port devices") Reported-by: Ido Schimmel Signed-off-by: David Ahern Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 2 +- include/net/ip_tunnels.h | 11 +++++++++-- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_tunnel.c | 9 +++++---- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index b73466470f75..fe663b0ab708 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,7 +423,7 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, parms.link, tun->fwmark, 0); + 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0219fe907b26..88dee57eac8a 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -243,11 +243,18 @@ static inline __be32 tunnel_id_to_key32(__be64 tun_id) static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif, + __be32 key, __u8 tos, + struct net *net, int oif, __u32 mark, __u32 tun_inner_hash) { memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; + + if (oif) { + fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index_rcu(net, oif); + /* Legacy VRF/l3mdev use case */ + fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; + } + fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_tos = tos; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 99db2e41ed10..365caebf51ab 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -605,8 +605,8 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) key = &info->key; ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), - key->tos & ~INET_ECN_MASK, 0, skb->mark, - skb_get_hash(skb)); + key->tos & ~INET_ECN_MASK, dev_net(dev), 0, + skb->mark, skb_get_hash(skb)); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5a473319d3a5..94017a8c3994 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -294,8 +294,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link, - tunnel->fwmark, 0); + RT_TOS(iph->tos), dev_net(dev), + tunnel->parms.link, tunnel->fwmark, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -570,7 +570,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), RT_TOS(tos), - 0, skb->mark, skb_get_hash(skb)); + dev_net(dev), 0, skb->mark, skb_get_hash(skb)); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; @@ -726,7 +726,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->parms.o_key, RT_TOS(tos), + dev_net(dev), tunnel->parms.link, tunnel->fwmark, skb_get_hash(skb)); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) From 9cb7c013420f98fa6fd12fc6a5dc055170c108db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Apr 2022 11:13:33 -0700 Subject: [PATCH 080/196] ipv6: make ip6_rt_gc_expire an atomic_t Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. [1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 3d83b64471d3..b4af4837d80b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -75,8 +75,8 @@ struct netns_ipv6 { struct list_head fib6_walkers; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES bool fib6_has_custom_rules; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 169e9df6d172..c4b6ce017d5e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3292,6 +3292,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3302,13 +3303,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6509,7 +6510,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: From d08ed852560eb71445547f3df7b05bf5c5c69cc4 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 13 Apr 2022 21:57:16 +0200 Subject: [PATCH 081/196] net: lan966x: Make sure to release ptp interrupt When the lan966x driver is removed make sure to remove also the ptp_irq IRQ. Fixes: e85a96e48e3309 ("net: lan966x: Add support for ptp interrupts") Signed-off-by: Horatiu Vultur Link: https://lore.kernel.org/r/20220413195716.3796467-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 958e55596b82..95830e3e2b1f 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -671,6 +671,9 @@ static void lan966x_cleanup_ports(struct lan966x *lan966x) disable_irq(lan966x->ana_irq); lan966x->ana_irq = -ENXIO; } + + if (lan966x->ptp_irq) + devm_free_irq(lan966x->dev, lan966x->ptp_irq, lan966x); } static int lan966x_probe_port(struct lan966x *lan966x, u32 p, From 323b190ba2debbcc03c01d2edaf1ec6b43e6ae43 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2022 21:14:00 -0600 Subject: [PATCH 082/196] io_uring: free iovec if file assignment fails We just return failure in this case, but we need to release the iovec first. If we're doing IO with more than FAST_IOV segments, then the iovec is allocated and must be freed. Reported-by: syzbot+96b43810dfe9c3bb95ed@syzkaller.appspotmail.com Fixes: 584b0180f0f4 ("io_uring: move read/write file prep state into actual opcode handler") Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4479013854d2..24409dd07239 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3832,8 +3832,10 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iovec = NULL; } ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) + if (unlikely(ret)) { + kfree(iovec); return ret; + } req->result = iov_iter_count(&s->iter); if (force_nonblock) { @@ -3958,8 +3960,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) iovec = NULL; } ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) + if (unlikely(ret)) { + kfree(iovec); return ret; + } req->result = iov_iter_count(&s->iter); if (force_nonblock) { From 49aefd131739df552f83c566d0665744c30b1d70 Mon Sep 17 00:00:00 2001 From: suresh kumar Date: Sat, 16 Apr 2022 16:44:10 +0530 Subject: [PATCH 083/196] bonding: do not discard lowest hash bit for non layer3+4 hashing Commit b5f862180d70 was introduced to discard lowest hash bit for layer3+4 hashing but it also removes last bit from non layer3+4 hashing Below script shows layer2+3 hashing will result in same slave to be used with above commit. $ cat hash.py #/usr/bin/python3.6 h_dests=[0xa0, 0xa1] h_source=0xe3 hproto=0x8 saddr=0x1e7aa8c0 daddr=0x17aa8c0 for h_dest in h_dests: hash = (h_dest ^ h_source ^ hproto ^ saddr ^ daddr) hash ^= hash >> 16 hash ^= hash >> 8 print(hash) print("with last bit removed") for h_dest in h_dests: hash = (h_dest ^ h_source ^ hproto ^ saddr ^ daddr) hash ^= hash >> 16 hash ^= hash >> 8 hash = hash >> 1 print(hash) Output: $ python3.6 hash.py 522133332 522133333 <-------------- will result in both slaves being used with last bit removed 261066666 261066666 <-------------- only single slave used Signed-off-by: suresh kumar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 15eddca7b4b6..38e152548126 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4027,14 +4027,19 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const v return true; } -static u32 bond_ip_hash(u32 hash, struct flow_keys *flow) +static u32 bond_ip_hash(u32 hash, struct flow_keys *flow, int xmit_policy) { hash ^= (__force u32)flow_get_u32_dst(flow) ^ (__force u32)flow_get_u32_src(flow); hash ^= (hash >> 16); hash ^= (hash >> 8); + /* discard lowest hash bit to deal with the common even ports pattern */ - return hash >> 1; + if (xmit_policy == BOND_XMIT_POLICY_LAYER34 || + xmit_policy == BOND_XMIT_POLICY_ENCAP34) + return hash >> 1; + + return hash; } /* Generate hash based on xmit policy. If @skb is given it is used to linearize @@ -4064,7 +4069,7 @@ static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const voi memcpy(&hash, &flow.ports.ports, sizeof(hash)); } - return bond_ip_hash(hash, &flow); + return bond_ip_hash(hash, &flow, bond->params.xmit_policy); } /** @@ -5259,7 +5264,7 @@ static u32 bond_sk_hash_l34(struct sock *sk) /* L4 */ memcpy(&hash, &flow.ports.ports, sizeof(hash)); /* L3 */ - return bond_ip_hash(hash, &flow); + return bond_ip_hash(hash, &flow, BOND_XMIT_POLICY_LAYER34); } static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond, From c0713540f6d55c53dca65baaead55a5a8b20552d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 17 Apr 2022 10:10:34 +0100 Subject: [PATCH 084/196] io_uring: fix leaks on IOPOLL and CQE_SKIP If all completed requests in io_do_iopoll() were marked with REQ_F_CQE_SKIP, we'll not only skip CQE posting but also io_free_batch_list() leaking memory and resources. Move @nr_events increment before REQ_F_CQE_SKIP check. We'll potentially return the value greater than the real one, but iopolling will deal with it and the userspace will re-iopoll if needed. In anyway, I don't think there are many use cases for REQ_F_CQE_SKIP + IOPOLL. Fixes: 83a13a4181b0e ("io_uring: tweak iopoll CQE_SKIP event counting") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5072fc8693fbfd595f89e5d4305bfcfd5d2f0a64.1650186611.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 24409dd07239..7625b29153b9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2797,11 +2797,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) /* order with io_complete_rw_iopoll(), e.g. ->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) break; + nr_events++; if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); - nr_events++; } if (unlikely(!nr_events)) From d73497081710c876c3c61444445512989e102152 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 5 Apr 2022 19:51:12 +0200 Subject: [PATCH 085/196] can: isotp: stop timeout monitoring when no first frame was sent The first attempt to fix a the 'impossible' WARN_ON_ONCE(1) in isotp_tx_timer_handler() focussed on the identical CAN IDs created by the syzbot reproducer and lead to upstream fix/commit 3ea566422cbd ("can: isotp: sanitize CAN ID checks in isotp_bind()"). But this did not catch the root cause of the wrong tx.state in the tx_timer handler. In the isotp 'first frame' case a timeout monitoring needs to be started before the 'first frame' is send. But when this sending failed the timeout monitoring for this specific frame has to be disabled too. Otherwise the tx_timer is fired with the 'warn me' tx.state of ISOTP_IDLE. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/all/20220405175112.2682-1-socketcan@hartkopp.net Reported-by: syzbot+2339c27f5c66c652843e@syzkaller.appspotmail.com Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index bafb0fb5f0e0..ff5d7870294e 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -906,6 +906,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; + s64 hrtimer_sec = 0; int off; int err; @@ -1004,7 +1005,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) isotp_create_fframe(cf, so, ae); /* start timeout for FC */ - hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + hrtimer_sec = 1; + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), + HRTIMER_MODE_REL_SOFT); } /* send the first or only CAN frame */ @@ -1017,6 +1020,11 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); + + /* no transmission -> no timeout monitoring */ + if (hrtimer_sec) + hrtimer_cancel(&so->txtimer); + goto err_out_drop; } From 81022a170462d38ea10612cb67e8e2c529d58abe Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Sun, 17 Apr 2022 13:03:31 -0700 Subject: [PATCH 086/196] Input: omap4-keypad - fix pm_runtime_get_sync() error checking If the device is already in a runtime PM enabled state pm_runtime_get_sync() will return 1, so a test for negative value should be used to check for errors. Fixes: f77621cc640a ("Input: omap-keypad - dynamically handle register offsets") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220412070131.19848-1-linmq006@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/omap4-keypad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/omap4-keypad.c b/drivers/input/keyboard/omap4-keypad.c index 43375b38ee59..8a7ce41b8c56 100644 --- a/drivers/input/keyboard/omap4-keypad.c +++ b/drivers/input/keyboard/omap4-keypad.c @@ -393,7 +393,7 @@ static int omap4_keypad_probe(struct platform_device *pdev) * revision register. */ error = pm_runtime_get_sync(dev); - if (error) { + if (error < 0) { dev_err(dev, "pm_runtime_get_sync() failed\n"); pm_runtime_put_noidle(dev); return error; From 470776c6b03491a3e82c644737a6da5466b8b3eb Mon Sep 17 00:00:00 2001 From: Shelby Heffron Date: Sun, 17 Apr 2022 13:05:08 -0700 Subject: [PATCH 087/196] Input: add Marine Navigation Keycodes Add keycodes that are used by marine navigation devices. Signed-off-by: Shelby Heffron Link: https://lore.kernel.org/r/20220414015356.1619310-1-Shelby.Heffron@garmin.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 7989d9483ea7..dff8e7f17074 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -662,6 +662,27 @@ /* Select an area of screen to be copied */ #define KEY_SELECTIVE_SCREENSHOT 0x27a +/* Move the focus to the next or previous user controllable element within a UI container */ +#define KEY_NEXT_ELEMENT 0x27b +#define KEY_PREVIOUS_ELEMENT 0x27c + +/* Toggle Autopilot engagement */ +#define KEY_AUTOPILOT_ENGAGE_TOGGLE 0x27d + +/* Shortcut Keys */ +#define KEY_MARK_WAYPOINT 0x27e +#define KEY_SOS 0x27f +#define KEY_NAV_CHART 0x280 +#define KEY_FISHING_CHART 0x281 +#define KEY_SINGLE_RANGE_RADAR 0x282 +#define KEY_DUAL_RANGE_RADAR 0x283 +#define KEY_RADAR_OVERLAY 0x284 +#define KEY_TRADITIONAL_SONAR 0x285 +#define KEY_CLEARVU_SONAR 0x286 +#define KEY_SIDEVU_SONAR 0x287 +#define KEY_NAV_INFO 0x288 +#define KEY_BRIGHTNESS_MENU 0x289 + /* * Some keyboards have keys which do not have a defined meaning, these keys * are intended to be programmed / bound to macros by the user. For most From cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c Mon Sep 17 00:00:00 2001 From: Manuel Ullmann Date: Mon, 18 Apr 2022 00:20:01 +0200 Subject: [PATCH 088/196] net: atlantic: invert deep par in pm functions, preventing null derefs This will reset deeply on freeze and thaw instead of suspend and resume and prevent null pointer dereferences of the uninitialized ring 0 buffer while thawing. The impact is an indefinitely hanging kernel. You can't switch consoles after this and the only possible user interaction is SysRq. BUG: kernel NULL pointer dereference RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic] aq_vec_init+0x85/0xe0 [atlantic] aq_nic_init+0xf7/0x1d0 [atlantic] atl_resume_common+0x4f/0x100 [atlantic] pci_pm_thaw+0x42/0xa0 resolves in aq_ring.o to ``` 0000000000000ae0 : { /* ... */ baf: 48 8b 43 08 mov 0x8(%rbx),%rax buff->flags = 0U; /* buff is NULL */ ``` The bug has been present since the introduction of the new pm code in 8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden until 8ce84271697a ("net: atlantic: changes for multi-TC support"), which refactored the aq_vec_{free,alloc} functions into aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The original functions just always reinitialized the buffers on S3/S4. If the interface is down before freezing, the bug does not occur. It does not matter, whether the initrd contains and loads the module before thawing. So the fix is to invert the boolean parameter deep in all pm function calls, which was clearly intended to be set like that. First report was on Github [1], which you have to guess from the resume logs in the posted dmesg snippet. Recently I posted one on Bugzilla [2], since I did not have an AQC device so far. #regzbot introduced: 8ce84271697a #regzbot from: koo5 #regzbot monitor: https://github.com/Aquantia/AQtion/issues/32 Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic") Link: https://github.com/Aquantia/AQtion/issues/32 [1] Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2] Cc: stable@vger.kernel.org Reported-by: koo5 Signed-off-by: Manuel Ullmann Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 797a95142d1f..3a529ee8c834 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -444,22 +444,22 @@ err_exit: static int aq_pm_freeze(struct device *dev) { - return aq_suspend_common(dev, false); + return aq_suspend_common(dev, true); } static int aq_pm_suspend_poweroff(struct device *dev) { - return aq_suspend_common(dev, true); + return aq_suspend_common(dev, false); } static int aq_pm_thaw(struct device *dev) { - return atl_resume_common(dev, false); + return atl_resume_common(dev, true); } static int aq_pm_resume_restore(struct device *dev) { - return atl_resume_common(dev, true); + return atl_resume_common(dev, false); } static const struct dev_pm_ops aq_pm_ops = { From 9339faac6d206544601b939321059f60ba96a18d Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Mon, 18 Apr 2022 18:26:26 +0800 Subject: [PATCH 089/196] cifs: Use kzalloc instead of kmalloc/memset Use kzalloc rather than duplicating its implementation, which makes code simple and easy to understand. Signed-off-by: Haowen Bai Signed-off-by: Steve French --- fs/cifs/transport.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index d9d1c353bafc..c667e6ddfe2f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -464,13 +464,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EIO; } - tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS); + tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS); if (!tr_hdr) return -ENOMEM; memset(&cur_rqst[0], 0, sizeof(cur_rqst)); memset(&iov, 0, sizeof(iov)); - memset(tr_hdr, 0, sizeof(*tr_hdr)); iov.iov_base = tr_hdr; iov.iov_len = sizeof(*tr_hdr); From b1c6ecfdd06907554518ec384ce8e99889d15193 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Thu, 14 Apr 2022 11:17:22 +0300 Subject: [PATCH 090/196] ARC: entry: fix syscall_trace_exit argument Function syscall_trace_exit expects pointer to pt_regs. However r0 is also used to keep syscall return value. Restore pointer to pt_regs before calling syscall_trace_exit. Cc: Signed-off-by: Sergey Matyukevich Signed-off-by: Vineet Gupta --- arch/arc/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index dd77a0c8f740..66ba549b520f 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -196,6 +196,7 @@ tracesys_exit: st r0, [sp, PT_r0] ; sys call return value in pt_regs ;POST Sys Call Ptrace Hook + mov r0, sp ; pt_regs needed bl @syscall_trace_exit b ret_from_exception ; NOT ret_from_system_call at is saves r0 which ; we'd done before calling post hook above From ecaa054fc4c65ad337ec57aef2c6b041e0ef8f91 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 18 Mar 2022 11:37:15 +0100 Subject: [PATCH 091/196] ARC: fix typos in comments Various spelling mistakes in comments. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Vineet Gupta --- arch/arc/kernel/disasm.c | 2 +- arch/arc/kernel/signal.c | 2 +- arch/arc/kernel/smp.c | 2 +- arch/arc/kernel/unaligned.c | 2 +- arch/arc/mm/cache.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arc/kernel/disasm.c b/arch/arc/kernel/disasm.c index 03f8b1be0c3a..897d5d70b39a 100644 --- a/arch/arc/kernel/disasm.c +++ b/arch/arc/kernel/disasm.c @@ -366,7 +366,7 @@ void __kprobes disasm_instr(unsigned long addr, struct disasm_state *state, case op_SP: /* LD_S|LDB_S b,[sp,u7], ST_S|STB_S b,[sp,u7] */ /* note: we are ignoring possibility of: * ADD_S, SUB_S, PUSH_S, POP_S as these should not - * cause unaliged exception anyway */ + * cause unaligned exception anyway */ state->write = BITS(state->words[0], 6, 6); state->zz = BITS(state->words[0], 5, 5); if (state->zz) diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index f748483628f2..3c1590c27fae 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -319,7 +319,7 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) regs->ret = (unsigned long)ksig->ka.sa.sa_handler; /* - * handler returns using sigreturn stub provided already by userpsace + * handler returns using sigreturn stub provided already by userspace * If not, nuke the process right away */ if(!(ksig->ka.sa.sa_flags & SA_RESTORER)) diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 78e6d069b1c1..383fefee2ae5 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(smp_atomic_ops_lock); struct plat_smp_ops __weak plat_smp_ops; -/* XXX: per cpu ? Only needed once in early seconday boot */ +/* XXX: per cpu ? Only needed once in early secondary boot */ struct task_struct *secondary_idle_tsk; /* Called from start_kernel */ diff --git a/arch/arc/kernel/unaligned.c b/arch/arc/kernel/unaligned.c index d63ebd81f1c6..99a9b92ed98d 100644 --- a/arch/arc/kernel/unaligned.c +++ b/arch/arc/kernel/unaligned.c @@ -237,7 +237,7 @@ int misaligned_fixup(unsigned long address, struct pt_regs *regs, if (state.fault) goto fault; - /* clear any remanants of delay slot */ + /* clear any remnants of delay slot */ if (delay_mode(regs)) { regs->ret = regs->bta & ~1U; regs->status32 &= ~STATUS_DE_MASK; diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 8aa1231865d1..5446967ea98d 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -401,7 +401,7 @@ static inline void __before_dc_op(const int op) { if (op == OP_FLUSH_N_INV) { /* Dcache provides 2 cmd: FLUSH or INV - * INV inturn has sub-modes: DISCARD or FLUSH-BEFORE + * INV in turn has sub-modes: DISCARD or FLUSH-BEFORE * flush-n-inv is achieved by INV cmd but with IM=1 * So toggle INV sub-mode depending on op request and default */ From 7f56b6d789dfbb4e72fa6912cd037fd916d4ee1c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 22 Mar 2022 20:49:05 +0100 Subject: [PATCH 092/196] ARC: Remove a redundant memset() disasm_instr() already call memset(0) on its 2nd argument, so there is no need to clear it explicitly before calling this function. Remove the redundant memset(). Signed-off-by: Christophe JAILLET Signed-off-by: Vineet Gupta --- arch/arc/kernel/disasm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arc/kernel/disasm.c b/arch/arc/kernel/disasm.c index 897d5d70b39a..1e1db51b6941 100644 --- a/arch/arc/kernel/disasm.c +++ b/arch/arc/kernel/disasm.c @@ -503,7 +503,6 @@ int __kprobes disasm_next_pc(unsigned long pc, struct pt_regs *regs, { struct disasm_state instr; - memset(&instr, 0, sizeof(struct disasm_state)); disasm_instr(pc, &instr, 0, regs, cregs); *next_pc = pc + instr.instr_len; From 3f943be0e76c72955ca4d1376b9577755be85f5c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 7 Apr 2022 16:33:22 +0200 Subject: [PATCH 093/196] ARC: dts: align SPI NOR node name with dtschema The node names should be generic and SPI NOR dtschema expects "flash". Signed-off-by: Krzysztof Kozlowski Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index dcaa44e408ac..f48ba03e9b5e 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -275,7 +275,7 @@ cs-gpios = <&creg_gpio 0 GPIO_ACTIVE_LOW>, <&creg_gpio 1 GPIO_ACTIVE_LOW>; - spi-flash@0 { + flash@0 { compatible = "sst26wf016b", "jedec,spi-nor"; reg = <0>; #address-cells = <1>; From d139d0f0bfdabe5762214a96f3d5c4b88f524b41 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 28 Mar 2022 10:15:58 +0200 Subject: [PATCH 094/196] arc: drop definitions of pgd_index() and pgd_offset{, _k}() entirely They were in and have been removed from there in 974b9b2c68f ("mm: consolidate pte_index() and pte_offset_*() definitions") in favor of the generic version. But that missed that the same definitons also existed in , where they were (inadvertently?) introduced in fe6cb7b043b6 ("ARC: mm: disintegrate pgtable.h into levels and flags"). Fixes: 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") Fixes: fe6cb7b043b6 ("ARC: mm: disintegrate pgtable.h into levels and flags") Signed-off-by: Rolf Eike Beer Signed-off-by: Vineet Gupta --- arch/arc/include/asm/pgtable-levels.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h index 7848348719b2..64ca25d199be 100644 --- a/arch/arc/include/asm/pgtable-levels.h +++ b/arch/arc/include/asm/pgtable-levels.h @@ -98,9 +98,6 @@ /* * 1st level paging: pgd */ -#define pgd_index(addr) ((addr) >> PGDIR_SHIFT) -#define pgd_offset(mm, addr) (((mm)->pgd) + pgd_index(addr)) -#define pgd_offset_k(addr) pgd_offset(&init_mm, addr) #define pgd_ERROR(e) \ pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) From ac411e41ec065daa867b5668b6e71ea1aff7b36a Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Tue, 22 Feb 2022 17:05:24 +0300 Subject: [PATCH 095/196] ARC: atomic: cleanup atomic-llsc definitions Remove redundant c_op macro argument. Only asm_op is needed to define atomic operations using llock/scond. Signed-off-by: Sergey Matyukevich Signed-off-by: Vineet Gupta --- arch/arc/include/asm/atomic-llsc.h | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arc/include/asm/atomic-llsc.h b/arch/arc/include/asm/atomic-llsc.h index 088d348781c1..1b0ffaeee16d 100644 --- a/arch/arc/include/asm/atomic-llsc.h +++ b/arch/arc/include/asm/atomic-llsc.h @@ -5,7 +5,7 @@ #define arch_atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) -#define ATOMIC_OP(op, c_op, asm_op) \ +#define ATOMIC_OP(op, asm_op) \ static inline void arch_atomic_##op(int i, atomic_t *v) \ { \ unsigned int val; \ @@ -21,7 +21,7 @@ static inline void arch_atomic_##op(int i, atomic_t *v) \ : "cc"); \ } \ -#define ATOMIC_OP_RETURN(op, c_op, asm_op) \ +#define ATOMIC_OP_RETURN(op, asm_op) \ static inline int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \ { \ unsigned int val; \ @@ -42,7 +42,7 @@ static inline int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \ #define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed #define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed -#define ATOMIC_FETCH_OP(op, c_op, asm_op) \ +#define ATOMIC_FETCH_OP(op, asm_op) \ static inline int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ { \ unsigned int val, orig; \ @@ -69,23 +69,23 @@ static inline int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ #define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed #define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed -#define ATOMIC_OPS(op, c_op, asm_op) \ - ATOMIC_OP(op, c_op, asm_op) \ - ATOMIC_OP_RETURN(op, c_op, asm_op) \ - ATOMIC_FETCH_OP(op, c_op, asm_op) +#define ATOMIC_OPS(op, asm_op) \ + ATOMIC_OP(op, asm_op) \ + ATOMIC_OP_RETURN(op, asm_op) \ + ATOMIC_FETCH_OP(op, asm_op) -ATOMIC_OPS(add, +=, add) -ATOMIC_OPS(sub, -=, sub) +ATOMIC_OPS(add, add) +ATOMIC_OPS(sub, sub) #undef ATOMIC_OPS -#define ATOMIC_OPS(op, c_op, asm_op) \ - ATOMIC_OP(op, c_op, asm_op) \ - ATOMIC_FETCH_OP(op, c_op, asm_op) +#define ATOMIC_OPS(op, asm_op) \ + ATOMIC_OP(op, asm_op) \ + ATOMIC_FETCH_OP(op, asm_op) -ATOMIC_OPS(and, &=, and) -ATOMIC_OPS(andnot, &= ~, bic) -ATOMIC_OPS(or, |=, or) -ATOMIC_OPS(xor, ^=, xor) +ATOMIC_OPS(and, and) +ATOMIC_OPS(andnot, bic) +ATOMIC_OPS(or, or) +ATOMIC_OPS(xor, xor) #define arch_atomic_andnot arch_atomic_andnot From c6ed4d84a2c49de7d6f490144cca7b4a4831fb6e Mon Sep 17 00:00:00 2001 From: Bang Li Date: Sat, 19 Mar 2022 10:03:16 +0800 Subject: [PATCH 096/196] ARC: remove redundant READ_ONCE() in cmpxchg loop This patch reverts commit 7082a29c22ac ("ARC: use ACCESS_ONCE in cmpxchg loop"). It is not necessary to use READ_ONCE() because cmpxchg contains barrier. We can get it from commit d57f727264f1 ("ARC: add compiler barrier to LLSC based cmpxchg"). Signed-off-by: Bang Li Signed-off-by: Vineet Gupta --- arch/arc/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 383fefee2ae5..d947473f1e6d 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -274,7 +274,7 @@ static void ipi_send_msg_one(int cpu, enum ipi_msg_type msg) * and read back old value */ do { - new = old = READ_ONCE(*ipi_data_ptr); + new = old = *ipi_data_ptr; new |= 1U << msg; } while (cmpxchg(ipi_data_ptr, old, new) != old); From faad6cebded8e0fd902b672f220449b93db479eb Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 11 Apr 2022 13:47:56 -0400 Subject: [PATCH 097/196] scsi: sr: Do not leak information in ioctl sr_ioctl.c uses this pattern: result = sr_do_ioctl(cd, &cgc); to-user = buffer[]; kfree(buffer); return result; Use of a buffer without checking leaks information. Check result and jump over the use of buffer if there is an error. result = sr_do_ioctl(cd, &cgc); if (result) goto err; to-user = buffer[]; err: kfree(buffer); return result; Additionally, initialize the buffer to zero. This problem can be seen in the 2.4.0 kernel. Link: https://lore.kernel.org/r/20220411174756.2418435-1-trix@redhat.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reviewed-by: Christoph Hellwig Signed-off-by: Tom Rix Signed-off-by: Martin K. Petersen --- drivers/scsi/sr_ioctl.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index ddd00efc4882..fbdb5124d7f7 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -41,7 +41,7 @@ static int sr_read_tochdr(struct cdrom_device_info *cdi, int result; unsigned char *buffer; - buffer = kmalloc(32, GFP_KERNEL); + buffer = kzalloc(32, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -55,10 +55,13 @@ static int sr_read_tochdr(struct cdrom_device_info *cdi, cgc.data_direction = DMA_FROM_DEVICE; result = sr_do_ioctl(cd, &cgc); + if (result) + goto err; tochdr->cdth_trk0 = buffer[2]; tochdr->cdth_trk1 = buffer[3]; +err: kfree(buffer); return result; } @@ -71,7 +74,7 @@ static int sr_read_tocentry(struct cdrom_device_info *cdi, int result; unsigned char *buffer; - buffer = kmalloc(32, GFP_KERNEL); + buffer = kzalloc(32, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -86,6 +89,8 @@ static int sr_read_tocentry(struct cdrom_device_info *cdi, cgc.data_direction = DMA_FROM_DEVICE; result = sr_do_ioctl(cd, &cgc); + if (result) + goto err; tocentry->cdte_ctrl = buffer[5] & 0xf; tocentry->cdte_adr = buffer[5] >> 4; @@ -98,6 +103,7 @@ static int sr_read_tocentry(struct cdrom_device_info *cdi, tocentry->cdte_addr.lba = (((((buffer[8] << 8) + buffer[9]) << 8) + buffer[10]) << 8) + buffer[11]; +err: kfree(buffer); return result; } @@ -384,7 +390,7 @@ int sr_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn) { Scsi_CD *cd = cdi->handle; struct packet_command cgc; - char *buffer = kmalloc(32, GFP_KERNEL); + char *buffer = kzalloc(32, GFP_KERNEL); int result; if (!buffer) @@ -400,10 +406,13 @@ int sr_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn) cgc.data_direction = DMA_FROM_DEVICE; cgc.timeout = IOCTL_TIMEOUT; result = sr_do_ioctl(cd, &cgc); + if (result) + goto err; memcpy(mcn->medium_catalog_number, buffer + 9, 13); mcn->medium_catalog_number[13] = 0; +err: kfree(buffer); return result; } From 00fd7cfad0548b6b7234c93370076f9b9c2e39f8 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 15 Apr 2022 23:44:18 -0700 Subject: [PATCH 098/196] ALSA: hda/i915: Fix one too many pci_dev_put() pci_get_class() will already unref the pci device passed as argument. So if it's unconditionally unref'ed, even if the loop is not stopped, there will be one too many unref for each device not matched. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5701 Fixes: c9db8a30d9f0 ("ALSA: hda/i915 - skip acomp init if no matching display") Signed-off-by: Lucas De Marchi Reviewed-by: Kai Vehmanen Link: https://lore.kernel.org/r/20220416064418.2364582-1-lucas.demarchi@intel.com Signed-off-by: Takashi Iwai --- sound/hda/hdac_i915.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c index 48b8ed752b69..3f35972e1cf7 100644 --- a/sound/hda/hdac_i915.c +++ b/sound/hda/hdac_i915.c @@ -127,11 +127,10 @@ static int i915_gfx_present(struct pci_dev *hdac_pci) display_dev = pci_get_class(class, display_dev); if (display_dev && display_dev->vendor == PCI_VENDOR_ID_INTEL && - connectivity_check(display_dev, hdac_pci)) + connectivity_check(display_dev, hdac_pci)) { + pci_dev_put(display_dev); match = true; - - pci_dev_put(display_dev); - + } } while (!match && display_dev); return match; From 4ddef9c4d70aae0c9029bdec7c3f7f1c1c51ff8c Mon Sep 17 00:00:00 2001 From: Maurizio Avogadro Date: Mon, 18 Apr 2022 15:16:12 +0200 Subject: [PATCH 099/196] ALSA: usb-audio: add mapping for MSI MAG X570S Torpedo MAX. The USB audio device 0db0:a073 based on the Realtek ALC4080 chipset exposes all playback volume controls as "PCM". This makes distinguishing the individual functions hard. The mapping already adopted for device 0db0:419c based on the same chipset fixes the issue, apply it for this device too. Signed-off-by: Maurizio Avogadro Cc: Link: https://lore.kernel.org/r/Yl1ykPaGgsFf3SnW@ryzen Signed-off-by: Takashi Iwai --- sound/usb/mixer_maps.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/mixer_maps.c b/sound/usb/mixer_maps.c index 64f5544d0a0a..7ef7a8abcc2b 100644 --- a/sound/usb/mixer_maps.c +++ b/sound/usb/mixer_maps.c @@ -599,6 +599,10 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = { .id = USB_ID(0x0db0, 0x419c), .map = msi_mpg_x570s_carbon_max_wifi_alc4080_map, }, + { /* MSI MAG X570S Torpedo Max */ + .id = USB_ID(0x0db0, 0xa073), + .map = msi_mpg_x570s_carbon_max_wifi_alc4080_map, + }, { /* MSI TRX40 */ .id = USB_ID(0x0db0, 0x543d), .map = trx40_mobo_map, From 9df1e3ff60241ce3fb26db75933970dd1b871213 Mon Sep 17 00:00:00 2001 From: Manasi Navare Date: Thu, 3 Mar 2022 15:32:22 -0800 Subject: [PATCH 100/196] drm/i915/display/vrr: Reset VRR capable property on a long hpd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With some VRR panels, user can turn VRR ON/OFF on the fly from the panel settings. When VRR is turned OFF ,sends a long HPD to the driver clearing the Ignore MSA bit in the DPCD. Currently the driver parses that onevery HPD but fails to reset the corresponding VRR Capable Connector property. Hence the userspace still sees this as VRR Capable panel which is incorrect. Fix this by explicitly resetting the connector property. v2: Reset vrr capable if status == connector_disconnected v3: Use i915 and use bool vrr_capable (Jani Nikula) v4: Move vrr_capable to after update modes call (Jani N) Remove the redundant comment (Jan N) v5: Fixes the regression on older platforms by resetting the VRR only if HAS_VRR v6: Remove the checks from driver, add in drm core before setting VRR prop (Ville) v7: Move VRR set/reset to set/unset_edid (Ville) Cc: Jani Nikula Cc: Ville Syrjälä Fixes: 9bc34b4d0f3c ("drm/i915/display/vrr: Reset VRR capable property on a long hpd") Signed-off-by: Manasi Navare Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220303233222.4698-1-manasi.d.navare@intel.com (cherry picked from commit d999ad1079f574be06a8f1701cd24a5dc0ada48c) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_dp.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index d667657e3606..f868db8be02a 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -4383,13 +4383,20 @@ intel_dp_update_420(struct intel_dp *intel_dp) static void intel_dp_set_edid(struct intel_dp *intel_dp) { + struct drm_i915_private *i915 = dp_to_i915(intel_dp); struct intel_connector *connector = intel_dp->attached_connector; struct edid *edid; + bool vrr_capable; intel_dp_unset_edid(intel_dp); edid = intel_dp_get_edid(intel_dp); connector->detect_edid = edid; + vrr_capable = intel_vrr_is_capable(&connector->base); + drm_dbg_kms(&i915->drm, "[CONNECTOR:%d:%s] VRR capable: %s\n", + connector->base.base.id, connector->base.name, str_yes_no(vrr_capable)); + drm_connector_set_vrr_capable_property(&connector->base, vrr_capable); + intel_dp_update_dfp(intel_dp, edid); intel_dp_update_420(intel_dp); @@ -4422,6 +4429,9 @@ intel_dp_unset_edid(struct intel_dp *intel_dp) intel_dp->dfp.ycbcr_444_to_420 = false; connector->base.ycbcr_420_allowed = false; + + drm_connector_set_vrr_capable_property(&connector->base, + false); } static int @@ -4572,14 +4582,9 @@ static int intel_dp_get_modes(struct drm_connector *connector) int num_modes = 0; edid = intel_connector->detect_edid; - if (edid) { + if (edid) num_modes = intel_connector_update_modes(connector, edid); - if (intel_vrr_is_capable(connector)) - drm_connector_set_vrr_capable_property(connector, - true); - } - /* Also add fixed mode, which may or may not be present in EDID */ if (intel_dp_is_edp(intel_attached_dp(intel_connector)) && intel_connector->panel.fixed_mode) { From 0763120b090418a5257402754e22a34227ae5f12 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Fri, 15 Apr 2022 12:33:20 +0200 Subject: [PATCH 101/196] net: dsa: hellcreek: Calculate checksums in tagger In case the checksum calculation is offloaded to the DSA master network interface, it will include the switch trailing tag. As soon as the switch strips that tag on egress, the calculated checksum is wrong. Therefore, add the checksum calculation to the tagger (if required) before adding the switch tag. This way, the hellcreek code works with all DSA master interfaces regardless of their declared feature set. Fixes: 01ef09caad66 ("net: dsa: Add tag handling for Hirschmann Hellcreek switches") Signed-off-by: Kurt Kanzenbach Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220415103320.90657-1-kurt@linutronix.de Signed-off-by: Paolo Abeni --- net/dsa/tag_hellcreek.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index f64b805303cd..eb204ad36eee 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -21,6 +21,14 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(dev); u8 *tag; + /* Calculate checksums (if required) before adding the trailer tag to + * avoid including it in calculations. That would lead to wrong + * checksums after the switch strips the tag. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb)) + return NULL; + /* Tag encoding */ tag = skb_put(skb, HELLCREEK_TAG_LEN); *tag = BIT(dp->index); From 4cf35a2b627a020fe1a6b6fc7a6a12394644e474 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Apr 2022 18:19:50 +0300 Subject: [PATCH 102/196] net: mscc: ocelot: fix broken IP multicast flooding When the user runs: bridge link set dev $br_port mcast_flood on this command should affect not only L2 multicast, but also IPv4 and IPv6 multicast. In the Ocelot switch, unknown multicast gets flooded according to different PGIDs according to its type, and PGID_MC only handles L2 multicast. Therefore, by leaving PGID_MCIPV4 and PGID_MCIPV6 at their default value of 0, unknown IP multicast traffic is never flooded. Fixes: 421741ea5672 ("net: mscc: ocelot: offload bridge port flags to device") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220415151950.219660-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mscc/ocelot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index e443bd8b2d09..ee9c607d62a7 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -2859,6 +2859,8 @@ static void ocelot_port_set_mcast_flood(struct ocelot *ocelot, int port, val = BIT(port); ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_MC); + ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_MCIPV4); + ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_MCIPV6); } static void ocelot_port_set_bcast_flood(struct ocelot *ocelot, int port, From 99c07327ae11e24886d552dddbe4537bfca2765d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 11:14:42 -0700 Subject: [PATCH 103/196] netlink: reset network and mac headers in netlink_dump() netlink_dump() is allocating an skb, reserves space in it but forgets to reset network header. This allows a BPF program, invoked later from sk_filter() to access uninitialized kernel memory from the reserved space. Theorically mac header reset could be omitted, because it is set to a special initial value. bpf_internal_load_pointer_neg_helper calls skb_mac_header() without checking skb_mac_header_was_set(). Relying on skb->len not being too big seems fragile. We also could add a sanity check in bpf_internal_load_pointer_neg_helper() to avoid surprises in the future. syzbot report was: BUG: KMSAN: uninit-value in ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was stored to memory at: ___bpf_prog_run+0x96c/0xb420 kernel/bpf/core.c:1558 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:737 [inline] slab_alloc_node mm/slub.c:3244 [inline] __kmalloc_node_track_caller+0xde3/0x14f0 mm/slub.c:4972 kmalloc_reserve net/core/skbuff.c:354 [inline] __alloc_skb+0x545/0xf90 net/core/skbuff.c:426 alloc_skb include/linux/skbuff.h:1158 [inline] netlink_dump+0x30f/0x16c0 net/netlink/af_netlink.c:2242 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae CPU: 0 PID: 3470 Comm: syz-executor751 Not tainted 5.17.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: db65a3aaf29e ("netlink: Trim skb to alloc size to avoid MSG_TRUNC") Fixes: 9063e21fb026 ("netlink: autosize skb lengthes") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220415181442.551228-1-eric.dumazet@gmail.com Signed-off-by: Paolo Abeni --- net/netlink/af_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 47a876ccd288..05a3795eac8e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2263,6 +2263,13 @@ static int netlink_dump(struct sock *sk) * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); + + /* Make sure malicious BPF programs can not read unitialized memory + * from skb->head -> skb->data + */ + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { From ff2695e52c9936febf65aa36a1769881da71bec5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 20 Apr 2022 00:04:24 +0800 Subject: [PATCH 104/196] bcache: put bch_bio_map() back to correct location in journal_write_unlocked() Commit a7c50c940477 ("block: pass a block_device and opf to bio_reset") moves bch_bio_map() inside journal_write_unlocked() next to the location where the modified bio_reset() was called. This change is wrong because calling bch_bio_map() immediately after bio_reset(), a BUG_ON(!bio->bi_iter.bi_size) inside bch_bio_map() will be triggered and panic the kernel. This patch puts bch_bio_map() back to its original correct location in journal_write_unlocked() and avoid the BUG_ON(). Fixes: a7c50c940477 ("block: pass a block_device and opf to bio_reset") Signed-off-by: Coly Li Cc: Christoph Hellwig Cc: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220419160425.4148-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7c2ca52ca3e4..df5347ea450b 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -771,12 +771,12 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio, ca->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); - bch_bio_map(bio, w->data); bio->bi_iter.bi_sector = PTR_OFFSET(k, i); bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = w; + bch_bio_map(bio, w->data); trace_bcache_journal_write(bio, w->data->keys); bio_list_add(&list, bio); From 9dca4168a37c9cfe182f077f0d2289292e9e3656 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 20 Apr 2022 00:04:25 +0800 Subject: [PATCH 105/196] bcache: fix wrong bdev parameter when calling bio_alloc_clone() in do_bio_hook() Commit abfc426d1b2f ("block: pass a block_device to bio_clone_fast") calls the modified bio_alloc_clone() in bcache code as: bio_init_clone(bio->bi_bdev, bio, orig_bio, GFP_NOIO); But the first parameter is wrong, where bio->bi_bdev should be orig_bio->bi_bdev. The wrong bi_bdev panics the kernel when submitting cache bio. This patch fixes the wrong bdev parameter usage and avoid the panic. Fixes: abfc426d1b2f ("block: pass a block_device to bio_clone_fast") Signed-off-by: Coly Li Cc: Christoph Hellwig Cc: Mike Snitzer Link: https://lore.kernel.org/r/20220419160425.4148-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fdd0194f84dd..320fcdfef48e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -685,7 +685,7 @@ static void do_bio_hook(struct search *s, { struct bio *bio = &s->bio.bio; - bio_init_clone(bio->bi_bdev, bio, orig_bio, GFP_NOIO); + bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO); /* * bi_end_io can be set separately somewhere else, e.g. the * variants in, From 0371870b96907bf560ecf7dc3fadc238fadf7845 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 19 Apr 2022 16:04:22 +0300 Subject: [PATCH 106/196] drm/msm: Revert "drm/msm: Stop using iommu_present()" This reverts commit e2a88eabb02410267519b838fb9b79f5206769be. The commit in question makes msm_use_mmu() check whether the DRM 'component master' device is translated by the IOMMU. At this moment it is the 'mdss' device. However on platforms using the MDP5 driver (e.g. MSM8916/APQ8016, MSM8996/APQ8096) it's the mdp5 device, which has the iommus property (and thus is "translated by the IOMMU"). This results in these devices being broken with the following lines in the dmesg. [drm] Initialized msm 1.9.0 20130625 for 1a00000.mdss on minor 0 msm 1a00000.mdss: [drm:adreno_request_fw] loaded qcom/a300_pm4.fw from new location msm 1a00000.mdss: [drm:adreno_request_fw] loaded qcom/a300_pfp.fw from new location msm 1a00000.mdss: [drm:get_pages] *ERROR* could not get pages: -28 msm 1a00000.mdss: could not allocate stolen bo msm 1a00000.mdss: [drm:get_pages] *ERROR* could not get pages: -28 msm 1a00000.mdss: [drm:msm_alloc_stolen_fb] *ERROR* failed to allocate buffer object msm 1a00000.mdss: [drm:msm_fbdev_create] *ERROR* failed to allocate fb Getting the mdp5 device pointer from this function is not that easy at this moment. Thus this patch is reverted till the MDSS rework [1] lands. It will make the mdp5/dpu1 device component master and the check will be legit. [1] https://patchwork.freedesktop.org/series/98525/ Fixes: e2a88eabb024 ("drm/msm: Stop using iommu_present()") Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220419130422.1033699-1-dmitry.baryshkov@linaro.org Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 2905b82a9de3..e88c4b46a56f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -274,7 +274,7 @@ bool msm_use_mmu(struct drm_device *dev) struct msm_drm_private *priv = dev->dev_private; /* a2xx comes with its own MMU */ - return priv->is_a2xx || device_iommu_mapped(dev->dev); + return priv->is_a2xx || iommu_present(&platform_bus_type); } static int msm_init_vram(struct drm_device *dev) From 40f5aa4c5eaebfeaca4566217cb9c468e28ed682 Mon Sep 17 00:00:00 2001 From: kuyo chang Date: Thu, 14 Apr 2022 17:02:20 +0800 Subject: [PATCH 107/196] sched/pelt: Fix attach_entity_load_avg() corner case The warning in cfs_rq_is_decayed() triggered: SCHED_WARN_ON(cfs_rq->avg.load_avg || cfs_rq->avg.util_avg || cfs_rq->avg.runnable_avg) There exists a corner case in attach_entity_load_avg() which will cause load_sum to be zero while load_avg will not be. Consider se_weight is 88761 as per the sched_prio_to_weight[] table. Further assume the get_pelt_divider() is 47742, this gives: se->avg.load_avg is 1. However, calculating load_sum: se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); se->avg.load_sum = 1*47742/88761 = 0. Then enqueue_load_avg() adds this to the cfs_rq totals: cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; Resulting in load_avg being 1 with load_sum is 0, which will trigger the WARN. Fixes: f207934fb79d ("sched/fair: Align PELT windows between cfs_rq and its se") Signed-off-by: kuyo chang [peterz: massage changelog] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20220414090229.342-1-kuyo.chang@mediatek.com --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bd299d67ab..a68482d66535 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3829,11 +3829,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; From 60490e7966659b26d74bf1fa4aa8693d9a94ca88 Mon Sep 17 00:00:00 2001 From: Zhipeng Xie Date: Wed, 9 Feb 2022 09:54:17 -0500 Subject: [PATCH 108/196] perf/core: Fix perf_mmap fail when CONFIG_PERF_USE_VMALLOC enabled This problem can be reproduced with CONFIG_PERF_USE_VMALLOC enabled on both x86_64 and aarch64 arch when using sysdig -B(using ebpf)[1]. sysdig -B works fine after rebuilding the kernel with CONFIG_PERF_USE_VMALLOC disabled. I tracked it down to the if condition event->rb->nr_pages != nr_pages in perf_mmap is true when CONFIG_PERF_USE_VMALLOC is enabled where event->rb->nr_pages = 1 and nr_pages = 2048 resulting perf_mmap to return -EINVAL. This is because when CONFIG_PERF_USE_VMALLOC is enabled, rb->nr_pages is always equal to 1. Arch with CONFIG_PERF_USE_VMALLOC enabled by default: arc/arm/csky/mips/sh/sparc/xtensa Arch with CONFIG_PERF_USE_VMALLOC disabled by default: x86_64/aarch64/... Fix this problem by using data_page_nr() [1] https://github.com/draios/sysdig Fixes: 906010b2134e ("perf_event: Provide vmalloc() based mmap() backing") Signed-off-by: Zhipeng Xie Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220209145417.6495-1-xiezhipeng1@huawei.com --- kernel/events/core.c | 2 +- kernel/events/internal.h | 5 +++++ kernel/events/ring_buffer.c | 5 ----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 23bb19716ad3..7858bafffa9d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6247,7 +6247,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 082832738c8f..5150d5f84c03 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 52868716ec35..fb35b926024c 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -859,11 +859,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { From 528c9f1daf20da14d3e7348dc4b1d7c55743ee64 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 15 Apr 2022 18:45:20 +0800 Subject: [PATCH 109/196] perf/x86/cstate: Add SAPPHIRERAPIDS_X CPU support From the perspective of Intel cstate residency counters, SAPPHIRERAPIDS_X is the same as ICELAKE_X. Share the code with it. And update the comments for SAPPHIRERAPIDS_X. Signed-off-by: Zhang Rui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lkml.kernel.org/r/20220415104520.2737004-1-rui.zhang@intel.com --- arch/x86/events/intel/cstate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 5d7762288a24..48e5db21142c 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -51,7 +51,7 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL + * TGL,TNT,RKL,ADL,RPL,SPR * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -62,7 +62,7 @@ * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL + * RPL,SPR * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 @@ -74,7 +74,7 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL + * TGL,TNT,RKL,ADL,RPL,SPR * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -675,6 +675,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), From f81f7861ee2aaa6f652f18e8f622547bdd379724 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 2 Apr 2022 20:13:55 -0700 Subject: [PATCH 110/196] cpuidle: riscv: support non-SMP config Add for cpuid_to_hartid_map etc. This is needed for both SMP and non-SMP builds, but not having it causes a build error for non-SMP: drivers/cpuidle/cpuidle-riscv-sbi.c: In function 'sbi_cpuidle_init_cpu': drivers/cpuidle/cpuidle-riscv-sbi.c:350:26: error: implicit declaration of function 'cpuid_to_hartid_map' [-Werror=implicit-function-declaration] Fixes: 6abf32f1d9c5 ("cpuidle: Add RISC-V SBI CPU idle driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- drivers/cpuidle/cpuidle-riscv-sbi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index b459eda2cd37..5c852e671992 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "dt_idle_states.h" From bb02330408a7bde33b5f46aa14fd5d7bfe6093b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Thu, 14 Apr 2022 08:11:17 -0700 Subject: [PATCH 111/196] drm/i915/display/psr: Unset enable_psr2_sel_fetch if other checks in intel_psr2_config_valid() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If any of the PSR2 checks after intel_psr2_sel_fetch_config_valid() fails, enable_psr2_sel_fetch will be kept enabled causing problems in the functions that only checks for it and not for has_psr2. So here moving the check that do not depend on enable_psr2_sel_fetch and for the remaning ones jumping to a section that unset enable_psr2_sel_fetch in case of failure to support PSR2. Fixes: 6e43e276b8c9 ("drm/i915: Initial implementation of PSR2 selective fetch") Cc: Jouni Högander Reviewed-by: Jouni Högander Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220414151118.21980-1-jose.souza@intel.com (cherry picked from commit 554ae8dce1268789e72767a67f0635cb743b3cea) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_psr.c | 38 +++++++++++++----------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index bff8c2d73cdf..6c9e6e7f0afd 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -887,6 +887,20 @@ static bool intel_psr2_config_valid(struct intel_dp *intel_dp, return false; } + /* Wa_16011303918:adl-p */ + if (crtc_state->vrr.enable && + IS_ADLP_DISPLAY_STEP(dev_priv, STEP_A0, STEP_B0)) { + drm_dbg_kms(&dev_priv->drm, + "PSR2 not enabled, not compatible with HW stepping + VRR\n"); + return false; + } + + if (!_compute_psr2_sdp_prior_scanline_indication(intel_dp, crtc_state)) { + drm_dbg_kms(&dev_priv->drm, + "PSR2 not enabled, PSR2 SDP indication do not fit in hblank\n"); + return false; + } + if (HAS_PSR2_SEL_FETCH(dev_priv)) { if (!intel_psr2_sel_fetch_config_valid(intel_dp, crtc_state) && !HAS_PSR_HW_TRACKING(dev_priv)) { @@ -900,12 +914,12 @@ static bool intel_psr2_config_valid(struct intel_dp *intel_dp, if (!crtc_state->enable_psr2_sel_fetch && IS_TGL_DISPLAY_STEP(dev_priv, STEP_A0, STEP_C0)) { drm_dbg_kms(&dev_priv->drm, "PSR2 HW tracking is not supported this Display stepping\n"); - return false; + goto unsupported; } if (!psr2_granularity_check(intel_dp, crtc_state)) { drm_dbg_kms(&dev_priv->drm, "PSR2 not enabled, SU granularity not compatible\n"); - return false; + goto unsupported; } if (!crtc_state->enable_psr2_sel_fetch && @@ -914,25 +928,15 @@ static bool intel_psr2_config_valid(struct intel_dp *intel_dp, "PSR2 not enabled, resolution %dx%d > max supported %dx%d\n", crtc_hdisplay, crtc_vdisplay, psr_max_h, psr_max_v); - return false; - } - - if (!_compute_psr2_sdp_prior_scanline_indication(intel_dp, crtc_state)) { - drm_dbg_kms(&dev_priv->drm, - "PSR2 not enabled, PSR2 SDP indication do not fit in hblank\n"); - return false; - } - - /* Wa_16011303918:adl-p */ - if (crtc_state->vrr.enable && - IS_ADLP_DISPLAY_STEP(dev_priv, STEP_A0, STEP_B0)) { - drm_dbg_kms(&dev_priv->drm, - "PSR2 not enabled, not compatible with HW stepping + VRR\n"); - return false; + goto unsupported; } tgl_dc3co_exitline_compute_config(intel_dp, crtc_state); return true; + +unsupported: + crtc_state->enable_psr2_sel_fetch = false; + return false; } void intel_psr_compute_config(struct intel_dp *intel_dp, From 3ab75a793e4939519d288ef1994db73b8e2d1d86 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 19 Apr 2022 18:32:57 -0700 Subject: [PATCH 112/196] RISC-V: KVM: Remove 's' & 'u' as valid ISA extension There are no ISA extension defined as 's' & 'u' in RISC-V specifications. The misa register defines 's' & 'u' bit as Supervisor/User privilege mode enabled. But it should not appear in the ISA extension in the device tree. Remove those from the allowed ISA extension for kvm. Fixes: a33c72faf2d7 ("RISC-V: KVM: Implement VCPU create, init and destroy functions") Signed-off-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 6785aef4cbd4..2e25a7b83a1b 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -43,9 +43,7 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { riscv_isa_extension_mask(d) | \ riscv_isa_extension_mask(f) | \ riscv_isa_extension_mask(i) | \ - riscv_isa_extension_mask(m) | \ - riscv_isa_extension_mask(s) | \ - riscv_isa_extension_mask(u)) + riscv_isa_extension_mask(m)) static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) { From f92055ae0acb035891e988ce345d6b81a0316423 Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Fri, 15 Apr 2022 18:25:12 +0200 Subject: [PATCH 113/196] drm/panel/raspberrypi-touchscreen: Avoid NULL deref if not initialised If a call to rpi_touchscreen_i2c_write from rpi_touchscreen_probe fails before mipi_dsi_device_register_full is called, then in trying to log the error message if uses ts->dsi->dev when it is still NULL. Use ts->i2c->dev instead, which is initialised earlier in probe. Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-2-stefan.wahren@i2se.com --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index 46029c5610c8..1f805eb8fdb5 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -229,7 +229,7 @@ static void rpi_touchscreen_i2c_write(struct rpi_touchscreen *ts, ret = i2c_smbus_write_byte_data(ts->i2c, reg, val); if (ret) - dev_err(&ts->dsi->dev, "I2C write failed: %d\n", ret); + dev_err(&ts->i2c->dev, "I2C write failed: %d\n", ret); } static int rpi_touchscreen_write(struct rpi_touchscreen *ts, u16 reg, u32 val) From 5f18c0782b99e26121efa93d20b76c19e17aa1dd Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Fri, 15 Apr 2022 18:25:13 +0200 Subject: [PATCH 114/196] drm/panel/raspberrypi-touchscreen: Initialise the bridge in prepare The panel has a prepare call which is before video starts, and an enable call which is after. The Toshiba bridge should be configured before video, so move the relevant power and initialisation calls to prepare. Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-3-stefan.wahren@i2se.com --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index 1f805eb8fdb5..145047e19394 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -265,7 +265,7 @@ static int rpi_touchscreen_noop(struct drm_panel *panel) return 0; } -static int rpi_touchscreen_enable(struct drm_panel *panel) +static int rpi_touchscreen_prepare(struct drm_panel *panel) { struct rpi_touchscreen *ts = panel_to_ts(panel); int i; @@ -295,6 +295,13 @@ static int rpi_touchscreen_enable(struct drm_panel *panel) rpi_touchscreen_write(ts, DSI_STARTDSI, 0x01); msleep(100); + return 0; +} + +static int rpi_touchscreen_enable(struct drm_panel *panel) +{ + struct rpi_touchscreen *ts = panel_to_ts(panel); + /* Turn on the backlight. */ rpi_touchscreen_i2c_write(ts, REG_PWM, 255); @@ -349,7 +356,7 @@ static int rpi_touchscreen_get_modes(struct drm_panel *panel, static const struct drm_panel_funcs rpi_touchscreen_funcs = { .disable = rpi_touchscreen_disable, .unprepare = rpi_touchscreen_noop, - .prepare = rpi_touchscreen_noop, + .prepare = rpi_touchscreen_prepare, .enable = rpi_touchscreen_enable, .get_modes = rpi_touchscreen_get_modes, }; From 4dee8eebcfc1a99d7550855ec40720503df4842b Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Mon, 11 Apr 2022 10:43:25 +0800 Subject: [PATCH 115/196] drm/vc4: Fix build error when CONFIG_DRM_VC4=y && CONFIG_RASPBERRYPI_FIRMWARE=m If CONFIG_DRM_VC4=y, CONFIG_RASPBERRYPI_FIRMWARE=m, CONFIG_COMPILE_TEST=n, bulding fails: drivers/gpu/drm/vc4/vc4_drv.o: In function `vc4_drm_bind': vc4_drv.c:(.text+0x320): undefined reference to `rpi_firmware_get' vc4_drv.c:(.text+0x320): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `rpi_firmware_get' vc4_drv.c:(.text+0x34c): undefined reference to `rpi_firmware_property' vc4_drv.c:(.text+0x34c): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `rpi_firmware_property' vc4_drv.c:(.text+0x354): undefined reference to `rpi_firmware_put' vc4_drv.c:(.text+0x354): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `rpi_firmware_put' Make DRM_VC4 depends on RASPBERRYPI_FIRMWARE || (COMPILE_TEST && !RASPBERRYPI_FIRMWARE) to fix this. Fixes: c406ad5e4a85 ("drm/vc4: Notify the firmware when DRM is in charge") Reported-by: Hulk Robot Signed-off-by: Zheng Bin Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220411024325.3968413-1-zhengbin13@huawei.com --- drivers/gpu/drm/vc4/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/vc4/Kconfig b/drivers/gpu/drm/vc4/Kconfig index de3424fed2fc..6cf2621786e6 100644 --- a/drivers/gpu/drm/vc4/Kconfig +++ b/drivers/gpu/drm/vc4/Kconfig @@ -2,6 +2,9 @@ config DRM_VC4 tristate "Broadcom VC4 Graphics" depends on ARCH_BCM || ARCH_BCM2835 || COMPILE_TEST + # Make sure not 'y' when RASPBERRYPI_FIRMWARE is 'm'. This can only + # happen when COMPILE_TEST=y, hence the added !RASPBERRYPI_FIRMWARE. + depends on RASPBERRYPI_FIRMWARE || (COMPILE_TEST && !RASPBERRYPI_FIRMWARE) depends on DRM depends on SND && SND_SOC depends on COMMON_CLK From 38d9a4ac65f204f264b33b966f0af4366f5518a8 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 19 Apr 2022 18:32:58 -0700 Subject: [PATCH 116/196] RISC-V: KVM: Restrict the extensions that can be disabled Currently, the config isa register allows us to disable all allowed single letter ISA extensions. It shouldn't be the case as vmm shouldn't be able to disable base extensions (imac). These extensions should always be enabled as long as they are enabled in the host ISA. Signed-off-by: Atish Patra Signed-off-by: Anup Patel Fixes: 92ad82002c39 ("RISC-V: KVM: Implement KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls") --- arch/riscv/kvm/vcpu.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 2e25a7b83a1b..aad430668bb4 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -38,12 +38,16 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -#define KVM_RISCV_ISA_ALLOWED (riscv_isa_extension_mask(a) | \ - riscv_isa_extension_mask(c) | \ - riscv_isa_extension_mask(d) | \ - riscv_isa_extension_mask(f) | \ - riscv_isa_extension_mask(i) | \ - riscv_isa_extension_mask(m)) +#define KVM_RISCV_ISA_DISABLE_ALLOWED (riscv_isa_extension_mask(d) | \ + riscv_isa_extension_mask(f)) + +#define KVM_RISCV_ISA_DISABLE_NOT_ALLOWED (riscv_isa_extension_mask(a) | \ + riscv_isa_extension_mask(c) | \ + riscv_isa_extension_mask(i) | \ + riscv_isa_extension_mask(m)) + +#define KVM_RISCV_ISA_ALLOWED (KVM_RISCV_ISA_DISABLE_ALLOWED | \ + KVM_RISCV_ISA_DISABLE_NOT_ALLOWED) static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) { @@ -217,7 +221,8 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, switch (reg_num) { case KVM_REG_RISCV_CONFIG_REG(isa): if (!vcpu->arch.ran_atleast_once) { - vcpu->arch.isa = reg_val; + /* Ignore the disable request for these extensions */ + vcpu->arch.isa = reg_val | KVM_RISCV_ISA_DISABLE_NOT_ALLOWED; vcpu->arch.isa &= riscv_isa_extension_base(NULL); vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED; kvm_riscv_vcpu_fp_reset(vcpu); From c6a4254c18c6a2195cdf01f58a362392fbe81e85 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 Apr 2022 16:00:00 +0200 Subject: [PATCH 117/196] doc/ip-sysctl: add bc_forwarding Let's describe this sysctl. Fixes: 5cbf777cfdf6 ("route: add support for directed broadcast forwarding") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b0024aa7b051..66828293d9cb 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -267,6 +267,13 @@ ipfrag_max_dist - INTEGER from different IP datagrams, which could result in data corruption. Default: 64 +bc_forwarding - INTEGER + bc_forwarding enables the feature described in rfc1812#section-5.3.5.2 + and rfc2644. It allows the router to forward directed broadcast. + To enable this feature, the 'all' entry and the input interface entry + should be set to 1. + Default: 0 + INET peer storage ================= From 234901de2bc6847eaa0aeb4aba62c31ffb8d3ad6 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 19 Apr 2022 16:42:26 +0800 Subject: [PATCH 118/196] net: stmmac: Use readl_poll_timeout_atomic() in atomic state The init_systime() may be invoked in atomic state. We have observed the following call trace when running "phc_ctl /dev/ptp0 set" on a Intel Agilex board. BUG: sleeping function called from invalid context at drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c:74 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 381, name: phc_ctl preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 Preemption disabled at: [] stmmac_set_time+0x34/0x8c CPU: 2 PID: 381 Comm: phc_ctl Not tainted 5.18.0-rc2-next-20220414-yocto-standard+ #567 Hardware name: SoCFPGA Agilex SoCDK (DT) Call trace: dump_backtrace.part.0+0xc4/0xd0 show_stack+0x24/0x40 dump_stack_lvl+0x7c/0xa0 dump_stack+0x18/0x34 __might_resched+0x154/0x1c0 __might_sleep+0x58/0x90 init_systime+0x78/0x120 stmmac_set_time+0x64/0x8c ptp_clock_settime+0x60/0x9c pc_clock_settime+0x6c/0xc0 __arm64_sys_clock_settime+0x88/0xf0 invoke_syscall+0x5c/0x130 el0_svc_common.constprop.0+0x4c/0x100 do_el0_svc+0x7c/0xa0 el0_svc+0x58/0xcc el0t_64_sync_handler+0xa4/0x130 el0t_64_sync+0x18c/0x190 So we should use readl_poll_timeout_atomic() here instead of readl_poll_timeout(). Also adjust the delay time to 10us to fix a "__bad_udelay" build error reported by "kernel test robot ". I have tested this on Intel Agilex and NXP S32G boards, there is no delay needed at all. So the 10us delay should be long enough for most cases. Fixes: ff8ed737860e ("net: stmmac: use readl_poll_timeout() function in init_systime()") Signed-off-by: Kevin Hao Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 22fea0f67245..92d32940aff0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -71,9 +71,9 @@ static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec) writel(value, ioaddr + PTP_TCR); /* wait for present system time initialize to complete */ - return readl_poll_timeout(ioaddr + PTP_TCR, value, + return readl_poll_timeout_atomic(ioaddr + PTP_TCR, value, !(value & PTP_TCR_TSINIT), - 10000, 100000); + 10, 100000); } static int config_addend(void __iomem *ioaddr, u32 addend) From d4860224e6a9bcaef24121827e97831001290328 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 13 Apr 2022 10:34:42 +0800 Subject: [PATCH 119/196] dmaengine: dw-edma: Fix inconsistent indenting Eliminate the follow smatch warning: drivers/dma/dw-edma/dw-edma-v0-core.c:419 dw_edma_v0_core_start() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220413023442.18856-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-v0-core.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c index b5b8f8181e77..33bc1e6c4cf2 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-core.c +++ b/drivers/dma/dw-edma/dw-edma-v0-core.c @@ -414,17 +414,18 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first) SET_CH_32(dw, chan->dir, chan->id, ch_control1, (DW_EDMA_V0_CCS | DW_EDMA_V0_LLE)); /* Linked list */ + #ifdef CONFIG_64BIT - /* llp is not aligned on 64bit -> keep 32bit accesses */ - SET_CH_32(dw, chan->dir, chan->id, llp.lsb, - lower_32_bits(chunk->ll_region.paddr)); - SET_CH_32(dw, chan->dir, chan->id, llp.msb, - upper_32_bits(chunk->ll_region.paddr)); + /* llp is not aligned on 64bit -> keep 32bit accesses */ + SET_CH_32(dw, chan->dir, chan->id, llp.lsb, + lower_32_bits(chunk->ll_region.paddr)); + SET_CH_32(dw, chan->dir, chan->id, llp.msb, + upper_32_bits(chunk->ll_region.paddr)); #else /* CONFIG_64BIT */ - SET_CH_32(dw, chan->dir, chan->id, llp.lsb, - lower_32_bits(chunk->ll_region.paddr)); - SET_CH_32(dw, chan->dir, chan->id, llp.msb, - upper_32_bits(chunk->ll_region.paddr)); + SET_CH_32(dw, chan->dir, chan->id, llp.lsb, + lower_32_bits(chunk->ll_region.paddr)); + SET_CH_32(dw, chan->dir, chan->id, llp.msb, + upper_32_bits(chunk->ll_region.paddr)); #endif /* CONFIG_64BIT */ } /* Doorbell */ From 5d9d16e5aa0cf023e600bf716239fd9caa2d4148 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 18 Apr 2022 14:31:10 -0700 Subject: [PATCH 120/196] dmaengine: idxd: match type for retries var in idxd_enqcmds() wq->enqcmds_retries is defined as unsigned int. However, retries on the stack is defined as int. Change retries to unsigned int to compare the same type. Fixes: 7930d8553575 ("dmaengine: idxd: add knob for enqcmds retries") Suggested-by: Thiago Macieira Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/165031747059.3658198.6035308204505664375.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/submit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index e289fd48711a..554b0602d2e9 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -150,7 +150,8 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, */ int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) { - int rc, retries = 0; + unsigned int retries = 0; + int rc; do { rc = enqcmds(portal, desc); From bc3452cdfc468a65965d0ac397c940acb787ea4d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 18 Apr 2022 14:33:21 -0700 Subject: [PATCH 121/196] dmaengine: idxd: fix retry value to be constant for duration of function call When retries is compared to wq->enqcmds_retries each loop of idxd_enqcmds(), wq->enqcmds_retries can potentially changed by user. Assign the value of retries to wq->enqcmds_retries during initialization so it is the original value set when entering the function. Fixes: 7930d8553575 ("dmaengine: idxd: add knob for enqcmds retries") Suggested-by: Dave Hansen Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/165031760154.3658664.1983547716619266558.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/submit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 554b0602d2e9..c01db23e3333 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -150,7 +150,7 @@ static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie, */ int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) { - unsigned int retries = 0; + unsigned int retries = wq->enqcmds_retries; int rc; do { @@ -158,7 +158,7 @@ int idxd_enqcmds(struct idxd_wq *wq, void __iomem *portal, const void *desc) if (rc == 0) break; cpu_relax(); - } while (retries++ < wq->enqcmds_retries); + } while (retries--); return rc; } From 66903461ffed0b66fc3e0200082d4e09365aacdc Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 11 Apr 2022 15:08:55 -0700 Subject: [PATCH 122/196] dmaengine: idxd: add RO check for wq max_batch_size write Block wq_max_batch_size_store() when the device is configured as read-only and not configurable. Fixes: e7184b159dd3 ("dmaengine: idxd: add support for configurable max wq batch size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971493551.2201159.1942042593642155209.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 7e19ab92b61a..6c41d429bd89 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -939,6 +939,9 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu u64 batch_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; From 505a2d1032ae656b0a8c736be110255503941cde Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 11 Apr 2022 15:08:01 -0700 Subject: [PATCH 123/196] dmaengine: idxd: add RO check for wq max_transfer_size write Block wq_max_transfer_size_store() when the device is configured as read-only and not configurable. Fixes: d7aad5550eca ("dmaengine: idxd: add support for configurable max wq xfer size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971488154.2200913.10706665404118545941.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 6c41d429bd89..dfd549685c46 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -905,6 +905,9 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr u64 xfer_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; From 1cd8e751d96c43ece3f6842ac2244a37d9332c3a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 11 Apr 2022 15:06:34 -0700 Subject: [PATCH 124/196] dmaengine: idxd: skip clearing device context when device is read-only If the device shows up as read-only configuration, skip the clearing of the state as the context must be preserved for device re-enable after being disabled. Fixes: 0dcfe41e9a4c ("dmanegine: idxd: cleanup all device related bits after disabling device") Reported-by: Tony Zhu Tested-by: Tony Zhu Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971479479.2200566.13980022473526292759.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5a0535a0f850..f652da6ab47d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -708,6 +708,9 @@ static void idxd_device_wqs_clear_state(struct idxd_device *idxd) void idxd_device_clear_state(struct idxd_device *idxd) { + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return; + idxd_groups_clear_state(idxd); idxd_engines_clear_state(idxd); idxd_device_wqs_clear_state(idxd); From c5d0fc54bede8765b7a91bdac342c7c7de8bc8bd Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Apr 2022 12:46:01 +0200 Subject: [PATCH 125/196] nfc: MAINTAINERS: add Bug entry Add a Bug section, indicating preferred mailing method for bug reports, to NFC Subsystem entry. Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index ad76e7bbaf28..8c58eda0be23 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13820,6 +13820,7 @@ M: Krzysztof Kozlowski L: linux-nfc@lists.01.org (subscribers-only) L: netdev@vger.kernel.org S: Maintained +B: mailto:linux-nfc@lists.01.org F: Documentation/devicetree/bindings/net/nfc/ F: drivers/nfc/ F: include/linux/platform_data/nfcmrvl.h From 7495a5bbf89f68c8880757c112fd0994f5dba309 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Thu, 14 Apr 2022 12:12:35 +0530 Subject: [PATCH 126/196] dt-bindings: dmaengine: qcom: gpi: Add minItems for interrupts Add the minItems for interrupts property as well. In the absence of this, we get warning if interrupts are less than 13 arch/arm64/boot/dts/qcom/qrb5165-rb5.dtb: dma-controller@800000: interrupts: [[0, 588, 4], [0, 589, 4], [0, 590, 4], [0, 591, 4], [0, 592, 4], [0, 593, 4], [0, 594, 4], [0, 595, 4], [0, 596, 4], [0, 597, 4]] is too short Signed-off-by: Vinod Koul Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220414064235.1182195-1-vkoul@kernel.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/qcom,gpi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml index e614fe3187bb..d09d79d7406a 100644 --- a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml +++ b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml @@ -29,6 +29,7 @@ properties: interrupts: description: Interrupt lines for each GPI instance + minItems: 1 maxItems: 13 "#dma-cells": From 0665886ad1392e6b5bae85d7a6ccbed48dca1522 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 20 Apr 2022 15:02:47 +0200 Subject: [PATCH 127/196] ALSA: usb-audio: Clear MIDI port active flag after draining When a rawmidi output stream is closed, it calls the drain at first, then does trigger-off only when the drain returns -ERESTARTSYS as a fallback. It implies that each driver should turn off the stream properly after the drain. Meanwhile, USB-audio MIDI interface didn't change the port->active flag after the drain. This may leave the output work picking up the port that is closed right now, which eventually leads to a use-after-free for the already released rawmidi object. This patch fixes the bug by properly clearing the port->active flag after the output drain. Reported-by: syzbot+70e777a39907d6d5fd0a@syzkaller.appspotmail.com Cc: Link: https://lore.kernel.org/r/00000000000011555605dceaff03@google.com Link: https://lore.kernel.org/r/20220420130247.22062-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index 2c01649c70f6..7c6ca2b433a5 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1194,6 +1194,7 @@ static void snd_usbmidi_output_drain(struct snd_rawmidi_substream *substream) } while (drain_urbs && timeout); finish_wait(&ep->drain_wait, &wait); } + port->active = 0; spin_unlock_irq(&ep->buffer_lock); } From 044011fdf162c5dd61c02841930c8f438a9adadb Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 19 Apr 2022 16:51:54 +0300 Subject: [PATCH 128/196] selftests: mlxsw: vxlan_flooding: Prevent flooding of unwanted packets The test verifies that packets are correctly flooded by the bridge and the VXLAN device by matching on the encapsulated packets at the other end. However, if packets other than those generated by the test also ingress the bridge (e.g., MLD packets), they will be flooded as well and interfere with the expected count. Make the test more robust by making sure that only the packets generated by the test can ingress the bridge. Drop all the rest using tc filters on the egress of 'br0' and 'h1'. In the software data path, the problem can be solved by matching on the inner destination MAC or dropping unwanted packets at the egress of the VXLAN device, but this is not currently supported by mlxsw. Fixes: 94d302deae25 ("selftests: mlxsw: Add a test for VxLAN flooding") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: David S. Miller --- .../drivers/net/mlxsw/vxlan_flooding.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh index fedcb7b35af9..af5ea50ed5c0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh @@ -172,6 +172,17 @@ flooding_filters_add() local lsb local i + # Prevent unwanted packets from entering the bridge and interfering + # with the test. + tc qdisc add dev br0 clsact + tc filter add dev br0 egress protocol all pref 1 handle 1 \ + matchall skip_hw action drop + tc qdisc add dev $h1 clsact + tc filter add dev $h1 egress protocol all pref 1 handle 1 \ + flower skip_hw dst_mac de:ad:be:ef:13:37 action pass + tc filter add dev $h1 egress protocol all pref 2 handle 2 \ + matchall skip_hw action drop + tc qdisc add dev $rp2 clsact for i in $(eval echo {1..$num_remotes}); do @@ -194,6 +205,12 @@ flooding_filters_del() done tc qdisc del dev $rp2 clsact + + tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall + tc filter del dev $h1 egress protocol all pref 1 handle 1 flower + tc qdisc del dev $h1 clsact + tc filter del dev br0 egress protocol all pref 1 handle 1 matchall + tc qdisc del dev br0 clsact } flooding_check_packets() From 5e6242151d7f17b056a82ca7b860c4ec8eaa7589 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 19 Apr 2022 16:51:55 +0300 Subject: [PATCH 129/196] selftests: mlxsw: vxlan_flooding_ipv6: Prevent flooding of unwanted packets The test verifies that packets are correctly flooded by the bridge and the VXLAN device by matching on the encapsulated packets at the other end. However, if packets other than those generated by the test also ingress the bridge (e.g., MLD packets), they will be flooded as well and interfere with the expected count. Make the test more robust by making sure that only the packets generated by the test can ingress the bridge. Drop all the rest using tc filters on the egress of 'br0' and 'h1'. In the software data path, the problem can be solved by matching on the inner destination MAC or dropping unwanted packets at the egress of the VXLAN device, but this is not currently supported by mlxsw. Fixes: d01724dd2a66 ("selftests: mlxsw: spectrum-2: Add a test for VxLAN flooding with IPv6") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: David S. Miller --- .../net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh index 429f7ee735cf..fd23c80eba31 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh @@ -159,6 +159,17 @@ flooding_remotes_add() local lsb local i + # Prevent unwanted packets from entering the bridge and interfering + # with the test. + tc qdisc add dev br0 clsact + tc filter add dev br0 egress protocol all pref 1 handle 1 \ + matchall skip_hw action drop + tc qdisc add dev $h1 clsact + tc filter add dev $h1 egress protocol all pref 1 handle 1 \ + flower skip_hw dst_mac de:ad:be:ef:13:37 action pass + tc filter add dev $h1 egress protocol all pref 2 handle 2 \ + matchall skip_hw action drop + for i in $(eval echo {1..$num_remotes}); do lsb=$((i + 1)) @@ -195,6 +206,12 @@ flooding_filters_del() done tc qdisc del dev $rp2 clsact + + tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall + tc filter del dev $h1 egress protocol all pref 1 handle 1 flower + tc qdisc del dev $h1 clsact + tc filter del dev br0 egress protocol all pref 1 handle 1 matchall + tc qdisc del dev br0 clsact } flooding_check_packets() From f31076a6b2391896088fa81755a892213f07788e Mon Sep 17 00:00:00 2001 From: Alaa Mohamed Date: Wed, 20 Apr 2022 01:43:28 +0200 Subject: [PATCH 130/196] xen: Convert kmap() to kmap_local_page() kmap() is being deprecated and these usages are all local to the thread so there is no reason kmap_local_page() can't be used. Replace kmap() calls with kmap_local_page(). Signed-off-by: Alaa Mohamed Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220419234328.10346-1-eng.alaamohamedsoliman.am@gmail.com Signed-off-by: Boris Ostrovsky --- drivers/xen/gntalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 4849f94372a4..55acb32842a3 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -178,9 +178,9 @@ static void __del_gref(struct gntalloc_gref *gref) unsigned long addr; if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { - uint8_t *tmp = kmap(gref->page); + uint8_t *tmp = kmap_local_page(gref->page); tmp[gref->notify.pgoff] = 0; - kunmap(gref->page); + kunmap_local(tmp); } if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { notify_remote_via_evtchn(gref->notify.event); From 298799a28264ce400d9ff95c51b7adcb123d866e Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Wed, 20 Apr 2022 00:03:28 -0400 Subject: [PATCH 131/196] drm/vmwgfx: Fix gem refcounting and memory evictions v2: Add the last part of the ref count fix which was spotted by Philipp Sieweck where the ref count of cpu writers is off due to ERESTARTSYS or EBUSY during bo waits. The initial GEM port broke refcounting on shareable (prime) surfaces and memory evictions. The prime surfaces broke because the parent surfaces weren't increasing the ref count on GEM surfaces, which meant that the memory backing textures could have been deleted while the texture was still accessible. The evictions broke due to a typo, the code was supposed to exit if the passed buffers were not vmw_buffer_object not if they were. They're tied because the evictions depend on having memory to actually evict. This fixes crashes with XA state tracker which is used for xrender acceleration on xf86-video-vmware, apps/tests which use a lot of memory (a good test being the piglit's streaming-texture-leak) and desktops. Signed-off-by: Zack Rusin Fixes: 8afa13a0583f ("drm/vmwgfx: Implement DRIVER_GEM") Reported-by: Philipp Sieweck Cc: # v5.17+ Reviewed-by: Maaz Mombasawala Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20220420040328.1007409-1-zack@kde.org --- drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 43 ++++++++++++------------- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 8 ++--- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 7 +++- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c index 31aecc46624b..04c8a378aeed 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c @@ -46,6 +46,21 @@ vmw_buffer_object(struct ttm_buffer_object *bo) return container_of(bo, struct vmw_buffer_object, base); } +/** + * bo_is_vmw - check if the buffer object is a &vmw_buffer_object + * @bo: ttm buffer object to be checked + * + * Uses destroy function associated with the object to determine if this is + * a &vmw_buffer_object. + * + * Returns: + * true if the object is of &vmw_buffer_object type, false if not. + */ +static bool bo_is_vmw(struct ttm_buffer_object *bo) +{ + return bo->destroy == &vmw_bo_bo_free || + bo->destroy == &vmw_gem_destroy; +} /** * vmw_bo_pin_in_placement - Validate a buffer to placement. @@ -615,8 +630,9 @@ int vmw_user_bo_synccpu_ioctl(struct drm_device *dev, void *data, ret = vmw_user_bo_synccpu_grab(vbo, arg->flags); vmw_bo_unreference(&vbo); - if (unlikely(ret != 0 && ret != -ERESTARTSYS && - ret != -EBUSY)) { + if (unlikely(ret != 0)) { + if (ret == -ERESTARTSYS || ret == -EBUSY) + return -EBUSY; DRM_ERROR("Failed synccpu grab on handle 0x%08x.\n", (unsigned int) arg->handle); return ret; @@ -798,7 +814,7 @@ int vmw_dumb_create(struct drm_file *file_priv, void vmw_bo_swap_notify(struct ttm_buffer_object *bo) { /* Is @bo embedded in a struct vmw_buffer_object? */ - if (vmw_bo_is_vmw_bo(bo)) + if (!bo_is_vmw(bo)) return; /* Kill any cached kernel maps before swapout */ @@ -822,7 +838,7 @@ void vmw_bo_move_notify(struct ttm_buffer_object *bo, struct vmw_buffer_object *vbo; /* Make sure @bo is embedded in a struct vmw_buffer_object? */ - if (vmw_bo_is_vmw_bo(bo)) + if (!bo_is_vmw(bo)) return; vbo = container_of(bo, struct vmw_buffer_object, base); @@ -843,22 +859,3 @@ void vmw_bo_move_notify(struct ttm_buffer_object *bo, if (mem->mem_type != VMW_PL_MOB && bo->resource->mem_type == VMW_PL_MOB) vmw_resource_unbind_list(vbo); } - -/** - * vmw_bo_is_vmw_bo - check if the buffer object is a &vmw_buffer_object - * @bo: buffer object to be checked - * - * Uses destroy function associated with the object to determine if this is - * a &vmw_buffer_object. - * - * Returns: - * true if the object is of &vmw_buffer_object type, false if not. - */ -bool vmw_bo_is_vmw_bo(struct ttm_buffer_object *bo) -{ - if (bo->destroy == &vmw_bo_bo_free || - bo->destroy == &vmw_gem_destroy) - return true; - - return false; -} diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 26eb5478394a..163c00793eb1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -998,13 +998,10 @@ static int vmw_driver_load(struct vmw_private *dev_priv, u32 pci_id) goto out_no_fman; } - drm_vma_offset_manager_init(&dev_priv->vma_manager, - DRM_FILE_PAGE_OFFSET_START, - DRM_FILE_PAGE_OFFSET_SIZE); ret = ttm_device_init(&dev_priv->bdev, &vmw_bo_driver, dev_priv->drm.dev, dev_priv->drm.anon_inode->i_mapping, - &dev_priv->vma_manager, + dev_priv->drm.vma_offset_manager, dev_priv->map_mode == vmw_dma_alloc_coherent, false); if (unlikely(ret != 0)) { @@ -1174,7 +1171,6 @@ static void vmw_driver_unload(struct drm_device *dev) vmw_devcaps_destroy(dev_priv); vmw_vram_manager_fini(dev_priv); ttm_device_fini(&dev_priv->bdev); - drm_vma_offset_manager_destroy(&dev_priv->vma_manager); vmw_release_device_late(dev_priv); vmw_fence_manager_takedown(dev_priv->fman); if (dev_priv->capabilities & SVGA_CAP_IRQMASK) @@ -1398,7 +1394,7 @@ vmw_get_unmapped_area(struct file *file, unsigned long uaddr, struct vmw_private *dev_priv = vmw_priv(file_priv->minor->dev); return drm_get_unmapped_area(file, uaddr, len, pgoff, flags, - &dev_priv->vma_manager); + dev_priv->drm.vma_offset_manager); } static int vmwgfx_pm_notifier(struct notifier_block *nb, unsigned long val, diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index 00e8e27e4884..ace7ca150b03 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -683,6 +683,9 @@ static void vmw_user_surface_base_release(struct ttm_base_object **p_base) container_of(base, struct vmw_user_surface, prime.base); struct vmw_resource *res = &user_srf->srf.res; + if (base->shareable && res && res->backup) + drm_gem_object_put(&res->backup->base.base); + *p_base = NULL; vmw_resource_unreference(&res); } @@ -857,6 +860,7 @@ int vmw_surface_define_ioctl(struct drm_device *dev, void *data, goto out_unlock; } vmw_bo_reference(res->backup); + drm_gem_object_get(&res->backup->base.base); } tmp = vmw_resource_reference(&srf->res); @@ -1513,7 +1517,6 @@ vmw_gb_surface_define_internal(struct drm_device *dev, &res->backup); if (ret == 0) vmw_bo_reference(res->backup); - } if (unlikely(ret != 0)) { @@ -1561,6 +1564,8 @@ vmw_gb_surface_define_internal(struct drm_device *dev, drm_vma_node_offset_addr(&res->backup->base.base.vma_node); rep->buffer_size = res->backup->base.base.size; rep->buffer_handle = backup_handle; + if (user_srf->prime.base.shareable) + drm_gem_object_get(&res->backup->base.base); } else { rep->buffer_map_handle = 0; rep->buffer_size = 0; From 41f10081a92a0ed280008218a8ec18ad8ba0fceb Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 20 Apr 2022 21:05:45 -0300 Subject: [PATCH 132/196] cifs: fix NULL ptr dereference in refresh_mounts() Either mount(2) or automount might not have server->origin_fullpath set yet while refresh_cache_worker() is attempting to refresh DFS referrals. Add missing NULL check and locking around it. This fixes bellow crash: [ 1070.276835] general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 1070.277676] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 1070.278219] CPU: 1 PID: 8506 Comm: kworker/u8:1 Not tainted 5.18.0-rc3 #10 [ 1070.278701] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 [ 1070.279495] Workqueue: cifs-dfscache refresh_cache_worker [cifs] [ 1070.280044] RIP: 0010:strcasecmp+0x34/0x150 [ 1070.280359] Code: 00 00 00 fc ff df 41 54 55 48 89 fd 53 48 83 ec 10 eb 03 4c 89 fe 48 89 ef 48 83 c5 01 48 89 f8 48 89 fa 48 c1 e8 03 83 e2 07 <42> 0f b6 04 28 38 d0 7f 08 84 c0 0f 85 bc 00 00 00 0f b6 45 ff 44 [ 1070.281729] RSP: 0018:ffffc90008367958 EFLAGS: 00010246 [ 1070.282114] RAX: 0000000000000000 RBX: dffffc0000000000 RCX: 0000000000000000 [ 1070.282691] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 1070.283273] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffffff873eda27 [ 1070.283857] R10: ffffc900083679a0 R11: 0000000000000001 R12: ffff88812624c000 [ 1070.284436] R13: dffffc0000000000 R14: ffff88810e6e9a88 R15: ffff888119bb9000 [ 1070.284990] FS: 0000000000000000(0000) GS:ffff888151200000(0000) knlGS:0000000000000000 [ 1070.285625] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1070.286100] CR2: 0000561a4d922418 CR3: 000000010aecc000 CR4: 0000000000350ee0 [ 1070.286683] Call Trace: [ 1070.286890] [ 1070.287070] refresh_cache_worker+0x895/0xd20 [cifs] [ 1070.287475] ? __refresh_tcon.isra.0+0xfb0/0xfb0 [cifs] [ 1070.287905] ? __lock_acquire+0xcd1/0x6960 [ 1070.288247] ? is_dynamic_key+0x1a0/0x1a0 [ 1070.288591] ? lockdep_hardirqs_on_prepare+0x410/0x410 [ 1070.289012] ? lock_downgrade+0x6f0/0x6f0 [ 1070.289318] process_one_work+0x7bd/0x12d0 [ 1070.289637] ? worker_thread+0x160/0xec0 [ 1070.289970] ? pwq_dec_nr_in_flight+0x230/0x230 [ 1070.290318] ? _raw_spin_lock_irq+0x5e/0x90 [ 1070.290619] worker_thread+0x5ac/0xec0 [ 1070.290891] ? process_one_work+0x12d0/0x12d0 [ 1070.291199] kthread+0x2a5/0x350 [ 1070.291430] ? kthread_complete_and_exit+0x20/0x20 [ 1070.291770] ret_from_fork+0x22/0x30 [ 1070.292050] [ 1070.292223] Modules linked in: bpfilter cifs cifs_arc4 cifs_md4 [ 1070.292765] ---[ end trace 0000000000000000 ]--- [ 1070.293108] RIP: 0010:strcasecmp+0x34/0x150 [ 1070.293471] Code: 00 00 00 fc ff df 41 54 55 48 89 fd 53 48 83 ec 10 eb 03 4c 89 fe 48 89 ef 48 83 c5 01 48 89 f8 48 89 fa 48 c1 e8 03 83 e2 07 <42> 0f b6 04 28 38 d0 7f 08 84 c0 0f 85 bc 00 00 00 0f b6 45 ff 44 [ 1070.297718] RSP: 0018:ffffc90008367958 EFLAGS: 00010246 [ 1070.298622] RAX: 0000000000000000 RBX: dffffc0000000000 RCX: 0000000000000000 [ 1070.299428] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 1070.300296] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffffff873eda27 [ 1070.301204] R10: ffffc900083679a0 R11: 0000000000000001 R12: ffff88812624c000 [ 1070.301932] R13: dffffc0000000000 R14: ffff88810e6e9a88 R15: ffff888119bb9000 [ 1070.302645] FS: 0000000000000000(0000) GS:ffff888151200000(0000) knlGS:0000000000000000 [ 1070.303462] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1070.304131] CR2: 0000561a4d922418 CR3: 000000010aecc000 CR4: 0000000000350ee0 [ 1070.305004] Kernel panic - not syncing: Fatal exception [ 1070.305711] Kernel Offset: disabled [ 1070.305971] ---[ end Kernel panic - not syncing: Fatal exception ]--- Signed-off-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/connect.c | 2 ++ fs/cifs/dfs_cache.c | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 902e8c6c0f9c..2c24d433061a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3675,9 +3675,11 @@ static void setup_server_referral_paths(struct mount_ctx *mnt_ctx) { struct TCP_Server_Info *server = mnt_ctx->server; + mutex_lock(&server->refpath_lock); server->origin_fullpath = mnt_ctx->origin_fullpath; server->leaf_fullpath = mnt_ctx->leaf_fullpath; server->current_fullpath = mnt_ctx->leaf_fullpath; + mutex_unlock(&server->refpath_lock); mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL; } diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 30e040da4f09..956f8e5cf3e7 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1422,12 +1422,14 @@ static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool struct TCP_Server_Info *server = tcon->ses->server; mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); - return 0; } @@ -1530,11 +1532,14 @@ static void refresh_mounts(struct cifs_ses **sessions) list_del_init(&tcon->ulist); mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); cifs_put_tcon(tcon); } } From cd70a3e8988a999c42d307d2616a5e7b6a33c7c8 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 20 Apr 2022 21:05:46 -0300 Subject: [PATCH 133/196] cifs: use correct lock type in cifs_reconnect() TCP_Server_Info::origin_fullpath and TCP_Server_Info::leaf_fullpath are protected by refpath_lock mutex and not cifs_tcp_ses_lock spinlock. Signed-off-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/connect.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2c24d433061a..42e14f408856 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -534,12 +534,19 @@ int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { /* If tcp session is not an dfs connection, then reconnect to last target server */ spin_lock(&cifs_tcp_ses_lock); - if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) { + if (!server->is_dfs_conn) { spin_unlock(&cifs_tcp_ses_lock); return __cifs_reconnect(server, mark_smb_session); } spin_unlock(&cifs_tcp_ses_lock); + mutex_lock(&server->refpath_lock); + if (!server->origin_fullpath || !server->leaf_fullpath) { + mutex_unlock(&server->refpath_lock); + return __cifs_reconnect(server, mark_smb_session); + } + mutex_unlock(&server->refpath_lock); + return reconnect_dfs_server(server); } #else From f5d0f921ea362636e4a2efb7c38d1ead373a8700 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 21 Apr 2022 11:15:36 +1000 Subject: [PATCH 134/196] cifs: destage any unwritten data to the server before calling copychunk_write because the copychunk_write might cover a region of the file that has not yet been sent to the server and thus fail. A simple way to reproduce this is: truncate -s 0 /mnt/testfile; strace -f -o x -ttT xfs_io -i -f -c 'pwrite 0k 128k' -c 'fcollapse 16k 24k' /mnt/testfile the issue is that the 'pwrite 0k 128k' becomes rearranged on the wire with the 'fcollapse 16k 24k' due to write-back caching. fcollapse is implemented in cifs.ko as a SMB2 IOCTL(COPYCHUNK_WRITE) call and it will fail serverside since the file is still 0b in size serverside until the writes have been destaged. To avoid this we must ensure that we destage any unwritten data to the server before calling COPYCHUNK_WRITE. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1997373 Reported-by: Xiaoli Feng Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a67df8eaf702..d6aaeff4a30a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1858,9 +1858,17 @@ smb2_copychunk_range(const unsigned int xid, int chunks_copied = 0; bool chunk_sizes_updated = false; ssize_t bytes_written, total_bytes_written = 0; + struct inode *inode; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); + /* + * We need to flush all unwritten data before we can send the + * copychunk ioctl to the server. + */ + inode = d_inode(trgtfile->dentry); + filemap_write_and_wait(inode->i_mapping); + if (pcchunk == NULL) return -ENOMEM; From d2b9be1f4af5cabed1ee5bb341f887f64b1c1669 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 21 Apr 2022 00:16:57 +1000 Subject: [PATCH 135/196] powerpc/time: Always set decrementer in timer_interrupt() This is a partial revert of commit 0faf20a1ad16 ("powerpc/64s/interrupt: Don't enable MSR[EE] in irq handlers unless perf is in use"). Prior to that commit, we always set the decrementer in timer_interrupt(), to clear the timer interrupt. Otherwise we could end up continuously taking timer interrupts. When high res timers are enabled there is no problem seen with leaving the decrementer untouched in timer_interrupt(), because it will be programmed via hrtimer_interrupt() -> tick_program_event() -> clockevents_program_event() -> decrementer_set_next_event(). However with CONFIG_HIGH_RES_TIMERS=n or booting with highres=off, we see a stall/lockup, because tick_nohz_handler() does not cause a reprogram of the decrementer, leading to endless timer interrupts. Example trace: [ 1.898617][ T7] Freeing initrd memory: 2624K^M [ 22.680919][ C1] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:^M [ 22.682281][ C1] rcu: 0-....: (25 ticks this GP) idle=073/0/0x1 softirq=10/16 fqs=1050 ^M [ 22.682851][ C1] (detected by 1, t=2102 jiffies, g=-1179, q=476)^M [ 22.683649][ C1] Sending NMI from CPU 1 to CPUs 0:^M [ 22.685252][ C0] NMI backtrace for cpu 0^M [ 22.685649][ C0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-rc2-00185-g0faf20a1ad16 #145^M [ 22.686393][ C0] NIP: c000000000016d64 LR: c000000000f6cca4 CTR: c00000000019c6e0^M [ 22.686774][ C0] REGS: c000000002833590 TRAP: 0500 Not tainted (5.16.0-rc2-00185-g0faf20a1ad16)^M [ 22.687222][ C0] MSR: 8000000000009033 CR: 24000222 XER: 00000000^M [ 22.688297][ C0] CFAR: c00000000000c854 IRQMASK: 0 ^M ... [ 22.692637][ C0] NIP [c000000000016d64] arch_local_irq_restore+0x174/0x250^M [ 22.694443][ C0] LR [c000000000f6cca4] __do_softirq+0xe4/0x3dc^M [ 22.695762][ C0] Call Trace:^M [ 22.696050][ C0] [c000000002833830] [c000000000f6cc80] __do_softirq+0xc0/0x3dc (unreliable)^M [ 22.697377][ C0] [c000000002833920] [c000000000151508] __irq_exit_rcu+0xd8/0x130^M [ 22.698739][ C0] [c000000002833950] [c000000000151730] irq_exit+0x20/0x40^M [ 22.699938][ C0] [c000000002833970] [c000000000027f40] timer_interrupt+0x270/0x460^M [ 22.701119][ C0] [c0000000028339d0] [c0000000000099a8] decrementer_common_virt+0x208/0x210^M Possibly this should be fixed in the lowres timing code, but that would be a generic change and could take some time and may not backport easily, so for now make the programming of the decrementer unconditional again in timer_interrupt() to avoid the stall/lockup. Fixes: 0faf20a1ad16 ("powerpc/64s/interrupt: Don't enable MSR[EE] in irq handlers unless perf is in use") Reported-by: Miguel Ojeda Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin Link: https://lore.kernel.org/r/20220420141657.771442-1-mpe@ellerman.id.au --- arch/powerpc/kernel/time.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index f5cbfe5efd25..f80cce0e3899 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -615,23 +615,22 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) return; } - /* Conditionally hard-enable interrupts. */ - if (should_hard_irq_enable()) { - /* - * Ensure a positive value is written to the decrementer, or - * else some CPUs will continue to take decrementer exceptions. - * When the PPC_WATCHDOG (decrementer based) is configured, - * keep this at most 31 bits, which is about 4 seconds on most - * systems, which gives the watchdog a chance of catching timer - * interrupt hard lockups. - */ - if (IS_ENABLED(CONFIG_PPC_WATCHDOG)) - set_dec(0x7fffffff); - else - set_dec(decrementer_max); + /* + * Ensure a positive value is written to the decrementer, or + * else some CPUs will continue to take decrementer exceptions. + * When the PPC_WATCHDOG (decrementer based) is configured, + * keep this at most 31 bits, which is about 4 seconds on most + * systems, which gives the watchdog a chance of catching timer + * interrupt hard lockups. + */ + if (IS_ENABLED(CONFIG_PPC_WATCHDOG)) + set_dec(0x7fffffff); + else + set_dec(decrementer_max); + /* Conditionally hard-enable interrupts. */ + if (should_hard_irq_enable()) do_hard_irq_enable(); - } #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) if (atomic_read(&ppc_n_lost_interrupts) != 0) From 26a62b750a4e6364b0393562f66759b1494c3a01 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 20 Apr 2022 15:08:40 +1000 Subject: [PATCH 136/196] KVM: PPC: Fix TCE handling for VFIO The LoPAPR spec defines a guest visible IOMMU with a variable page size. Currently QEMU advertises 4K, 64K, 2M, 16MB pages, a Linux VM picks the biggest (16MB). In the case of a passed though PCI device, there is a hardware IOMMU which does not support all pages sizes from the above - P8 cannot do 2MB and P9 cannot do 16MB. So for each emulated 16M IOMMU page we may create several smaller mappings ("TCEs") in the hardware IOMMU. The code wrongly uses the emulated TCE index instead of hardware TCE index in error handling. The problem is easier to see on POWER8 with multi-level TCE tables (when only the first level is preallocated) as hash mode uses real mode TCE hypercalls handlers. The kernel starts using indirect tables when VMs get bigger than 128GB (depends on the max page order). The very first real mode hcall is going to fail with H_TOO_HARD as in the real mode we cannot allocate memory for TCEs (we can in the virtual mode) but on the way out the code attempts to clear hardware TCEs using emulated TCE indexes which corrupts random kernel memory because it_offset==1<<59 is subtracted from those indexes and the resulting index is out of the TCE table bounds. This fixes kvmppc_clear_tce() to use the correct TCE indexes. While at it, this fixes TCE cache invalidation which uses emulated TCE indexes instead of the hardware ones. This went unnoticed as 64bit DMA is used these days and VMs map all RAM in one go and only then do DMA and this is when the TCE cache gets populated. Potentially this could slow down mapping, however normally 16MB emulated pages are backed by 64K hardware pages so it is one write to the "TCE Kill" per 256 updates which is not that bad considering the size of the cache (1024 TCEs or so). Fixes: ca1fc489cfa0 ("KVM: PPC: Book3S: Allow backing bigger guest IOMMU pages with smaller physical pages") Signed-off-by: Alexey Kardashevskiy Tested-by: David Gibson Reviewed-by: Frederic Barrat Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220420050840.328223-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 45 +++++++++++++++-------------- arch/powerpc/kvm/book3s_64_vio_hv.c | 44 ++++++++++++++-------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index d42b4b6d4a79..85cfa6328222 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -420,13 +420,19 @@ static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, tbl[idx % TCES_PER_PAGE] = tce; } -static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); - iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -485,6 +491,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -544,6 +552,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -590,10 +600,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill(stit->tbl, entry, 1); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry); goto unlock_exit; } } @@ -669,13 +678,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ if (get_user(tce, tces + i)) { ret = H_TOO_HARD; - goto invalidate_exit; + goto unlock_exit; } tce = be64_to_cpu(tce); if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -684,19 +693,15 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, entry, npages); - unlock_exit: srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -735,20 +740,16 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE(1); - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 870b7f0c7ea5..fdeda6a9cff4 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -247,13 +247,19 @@ static void iommu_tce_kill_rm(struct iommu_table *tbl, tbl->it_ops->tce_kill(tbl, entry, pages, true); } -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -316,6 +322,8 @@ static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -379,6 +387,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -420,10 +430,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill_rm(stit->tbl, entry, 1); - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); return ret; } } @@ -561,7 +569,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ua = 0; if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -570,19 +578,15 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_rm_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, entry, npages); - unlock_exit: if (!prereg) arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); @@ -620,20 +624,16 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } From 3d0b93d92a2790337aa9d18cb332d02356a24126 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 20 Apr 2022 21:50:07 +0800 Subject: [PATCH 137/196] drm/vc4: Use pm_runtime_resume_and_get to fix pm_runtime_get_sync() usage If the device is already in a runtime PM enabled state pm_runtime_get_sync() will return 1. Also, we need to call pm_runtime_put_noidle() when pm_runtime_get_sync() fails, so use pm_runtime_resume_and_get() instead. this function will handle this. Fixes: 4078f5757144 ("drm/vc4: Add DSI driver") Signed-off-by: Miaoqian Lin Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220420135008.2757-1-linmq006@gmail.com --- drivers/gpu/drm/vc4/vc4_dsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c index 752f921735c6..98308a17e4ed 100644 --- a/drivers/gpu/drm/vc4/vc4_dsi.c +++ b/drivers/gpu/drm/vc4/vc4_dsi.c @@ -846,7 +846,7 @@ static void vc4_dsi_encoder_enable(struct drm_encoder *encoder) unsigned long phy_clock; int ret; - ret = pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); if (ret) { DRM_ERROR("Failed to runtime PM enable on DSI%d\n", dsi->variant->port); return; From 169466d4e59ca204683998b7f45673ebf0eb2de6 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 20 Apr 2022 16:12:29 -0700 Subject: [PATCH 138/196] Revert "drm: of: Properly try all possible cases for bridge/panel detection" Commit '80253168dbfd ("drm: of: Lookup if child node has panel or bridge")' introduced the ability to describe a panel under a display controller without having to use a graph to connect the controller to its single child panel (or bridge). The implementation of this would find the first non-graph node and attempt to acquire the related panel or bridge. This prevents cases where any other child node, such as a aux bus for a DisplayPort controller, or an opp-table to find the referenced panel. Commit '67bae5f28c89 ("drm: of: Properly try all possible cases for bridge/panel detection")' attempted to solve this problem by not bypassing the graph reference lookup before attempting to find the panel or bridge. While this does solve the case where a proper graph reference is present, it does not allow the caller to distinguish between a yet-to-be-probed panel or bridge and the absence of a reference to a panel. One such case is a DisplayPort controller that on some boards have an explicitly described reference to a panel, but on others have a discoverable DisplayPort display attached (which doesn't need to be expressed in DeviceTree). This reverts commit '67bae5f28c89 ("drm: of: Properly try all possible cases for bridge/panel detection")', as a step towards reverting commit '80253168dbfd ("drm: of: Lookup if child node has panel or bridge")'. Signed-off-by: Bjorn Andersson Acked-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220420231230.58499-1-bjorn.andersson@linaro.org --- drivers/gpu/drm/drm_of.c | 101 +++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/drm_of.c b/drivers/gpu/drm/drm_of.c index f4df344509a8..026e4e29a0f3 100644 --- a/drivers/gpu/drm/drm_of.c +++ b/drivers/gpu/drm/drm_of.c @@ -214,29 +214,6 @@ int drm_of_encoder_active_endpoint(struct device_node *node, } EXPORT_SYMBOL_GPL(drm_of_encoder_active_endpoint); -static int find_panel_or_bridge(struct device_node *node, - struct drm_panel **panel, - struct drm_bridge **bridge) -{ - if (panel) { - *panel = of_drm_find_panel(node); - if (!IS_ERR(*panel)) - return 0; - - /* Clear the panel pointer in case of error. */ - *panel = NULL; - } - - /* No panel found yet, check for a bridge next. */ - if (bridge) { - *bridge = of_drm_find_bridge(node); - if (*bridge) - return 0; - } - - return -EPROBE_DEFER; -} - /** * drm_of_find_panel_or_bridge - return connected panel or bridge device * @np: device tree node containing encoder output ports @@ -259,44 +236,66 @@ int drm_of_find_panel_or_bridge(const struct device_node *np, struct drm_panel **panel, struct drm_bridge **bridge) { - struct device_node *node; - int ret; + int ret = -EPROBE_DEFER; + struct device_node *remote; if (!panel && !bridge) return -EINVAL; - if (panel) *panel = NULL; - if (bridge) - *bridge = NULL; - /* Check for a graph on the device node first. */ - if (of_graph_is_present(np)) { - node = of_graph_get_remote_node(np, port, endpoint); - if (node) { - ret = find_panel_or_bridge(node, panel, bridge); - of_node_put(node); - - if (!ret) - return 0; - } - } - - /* Otherwise check for any child node other than port/ports. */ - for_each_available_child_of_node(np, node) { - if (of_node_name_eq(node, "port") || - of_node_name_eq(node, "ports")) + /** + * Devices can also be child nodes when we also control that device + * through the upstream device (ie, MIPI-DCS for a MIPI-DSI device). + * + * Lookup for a child node of the given parent that isn't either port + * or ports. + */ + for_each_available_child_of_node(np, remote) { + if (of_node_name_eq(remote, "port") || + of_node_name_eq(remote, "ports")) continue; - ret = find_panel_or_bridge(node, panel, bridge); - of_node_put(node); - - /* Stop at the first found occurrence. */ - if (!ret) - return 0; + goto of_find_panel_or_bridge; } - return -EPROBE_DEFER; + /* + * of_graph_get_remote_node() produces a noisy error message if port + * node isn't found and the absence of the port is a legit case here, + * so at first we silently check whether graph presents in the + * device-tree node. + */ + if (!of_graph_is_present(np)) + return -ENODEV; + + remote = of_graph_get_remote_node(np, port, endpoint); + +of_find_panel_or_bridge: + if (!remote) + return -ENODEV; + + if (panel) { + *panel = of_drm_find_panel(remote); + if (!IS_ERR(*panel)) + ret = 0; + else + *panel = NULL; + } + + /* No panel found yet, check for a bridge next. */ + if (bridge) { + if (ret) { + *bridge = of_drm_find_bridge(remote); + if (*bridge) + ret = 0; + } else { + *bridge = NULL; + } + + } + + of_node_put(remote); + return ret; } EXPORT_SYMBOL_GPL(drm_of_find_panel_or_bridge); From b089c0a9b14c354a0c3a421e09af3208cb7c232c Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 20 Apr 2022 16:12:30 -0700 Subject: [PATCH 139/196] Revert "drm: of: Lookup if child node has panel or bridge" Commit '80253168dbfd ("drm: of: Lookup if child node has panel or bridge")' attempted to simplify the case of expressing a simple panel under a DSI controller, by assuming that the first non-graph child node was a panel or bridge. Unfortunately for non-trivial cases the first child node might not be a panel or bridge. Examples of this can be a aux-bus in the case of DisplayPort, or an opp-table represented before the panel node. In these cases the reverted commit prevents the caller from ever finding a reference to the panel. This reverts commit '80253168dbfd ("drm: of: Lookup if child node has panel or bridge")', in favor of using an explicit graph reference to the panel in the trivial case as well. Signed-off-by: Bjorn Andersson Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220420231230.58499-2-bjorn.andersson@linaro.org --- drivers/gpu/drm/drm_of.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/gpu/drm/drm_of.c b/drivers/gpu/drm/drm_of.c index 026e4e29a0f3..9a2cfab3a177 100644 --- a/drivers/gpu/drm/drm_of.c +++ b/drivers/gpu/drm/drm_of.c @@ -244,21 +244,6 @@ int drm_of_find_panel_or_bridge(const struct device_node *np, if (panel) *panel = NULL; - /** - * Devices can also be child nodes when we also control that device - * through the upstream device (ie, MIPI-DCS for a MIPI-DSI device). - * - * Lookup for a child node of the given parent that isn't either port - * or ports. - */ - for_each_available_child_of_node(np, remote) { - if (of_node_name_eq(remote, "port") || - of_node_name_eq(remote, "ports")) - continue; - - goto of_find_panel_or_bridge; - } - /* * of_graph_get_remote_node() produces a noisy error message if port * node isn't found and the absence of the port is a legit case here, @@ -269,8 +254,6 @@ int drm_of_find_panel_or_bridge(const struct device_node *np, return -ENODEV; remote = of_graph_get_remote_node(np, port, endpoint); - -of_find_panel_or_bridge: if (!remote) return -ENODEV; From b3fbe53610b5ed8f0370ec4c7e6c8a1f261ddf70 Mon Sep 17 00:00:00 2001 From: Andy Chi Date: Thu, 21 Apr 2022 14:36:04 +0800 Subject: [PATCH 140/196] ALSA: hda/realtek: Enable mute/micmute LEDs and limit mic boost on EliteBook 845/865 G9 On HP EliteBook 845 G9 and EliteBook 865 G9, the audio LEDs can be enabled by ALC285_FIXUP_HP_MUTE_LED. So use it accordingly. Signed-off-by: Andy Chi Fixes: 07bcab93946c ("ALSA: hda/realtek: Add support for HP Laptops") Link: https://lore.kernel.org/r/20220421063606.39772-1-andy.chi@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 62fbf3772b41..0cba2f19a772 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7006,6 +7006,7 @@ enum { ALC285_FIXUP_LEGION_Y9000X_AUTOMUTE, ALC287_FIXUP_LEGION_16ACHG6, ALC287_FIXUP_CS35L41_I2C_2, + ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED, ALC245_FIXUP_CS35L41_SPI_2, ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED, ALC245_FIXUP_CS35L41_SPI_4, @@ -8769,6 +8770,12 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = cs35l41_fixup_i2c_two, }, + [ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l41_fixup_i2c_two, + .chained = true, + .chain_id = ALC285_FIXUP_HP_MUTE_LED, + }, [ALC245_FIXUP_CS35L41_SPI_2] = { .type = HDA_FIXUP_FUNC, .v.func = cs35l41_fixup_spi_two, @@ -9025,9 +9032,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8981, "HP Elite Dragonfly G3", ALC245_FIXUP_CS35L41_SPI_4), SND_PCI_QUIRK(0x103c, 0x898e, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x898f, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), - SND_PCI_QUIRK(0x103c, 0x8991, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8991, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8992, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2), - SND_PCI_QUIRK(0x103c, 0x8994, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8994, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8995, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x89a4, "HP ProBook 440 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89a6, "HP ProBook 450 G9", ALC236_FIXUP_HP_GPIO_LED), From bc6de2878429e85c1f1afaa566f7b5abb2243eef Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Sun, 17 Apr 2022 20:55:19 +0800 Subject: [PATCH 141/196] drivers: net: hippi: Fix deadlock in rr_close() There is a deadlock in rr_close(), which is shown below: (Thread 1) | (Thread 2) | rr_open() rr_close() | add_timer() spin_lock_irqsave() //(1) | (wait a time) ... | rr_timer() del_timer_sync() | spin_lock_irqsave() //(2) (wait timer to stop) | ... We hold rrpriv->lock in position (1) of thread 1 and use del_timer_sync() to wait timer to stop, but timer handler also need rrpriv->lock in position (2) of thread 2. As a result, rr_close() will block forever. This patch extracts del_timer_sync() from the protection of spin_lock_irqsave(), which could let timer handler to obtain the needed lock. Signed-off-by: Duoming Zhou Link: https://lore.kernel.org/r/20220417125519.82618-1-duoming@zju.edu.cn Signed-off-by: Paolo Abeni --- drivers/net/hippi/rrunner.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c index 16105292b140..74e845fa2e07 100644 --- a/drivers/net/hippi/rrunner.c +++ b/drivers/net/hippi/rrunner.c @@ -1355,7 +1355,9 @@ static int rr_close(struct net_device *dev) rrpriv->fw_running = 0; + spin_unlock_irqrestore(&rrpriv->lock, flags); del_timer_sync(&rrpriv->timer); + spin_lock_irqsave(&rrpriv->lock, flags); writel(0, ®s->TxPi); writel(0, ®s->IpRxPi); From 94f4c4965e5513ba624488f4b601d6b385635aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 8 Apr 2022 16:22:55 +0200 Subject: [PATCH 142/196] drm/amdgpu: partial revert "remove ctx->lock" v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 461fa7b0ac565ef25c1da0ced31005dd437883a7. We are missing some inter dependencies here so re-introduce the lock until we have figured out what's missing. Just drop/retake it while adding dependencies. v2: still drop the lock while adding dependencies Signed-off-by: Christian König Tested-by: Mikhail Gavrilov (v1) Fixes: 461fa7b0ac56 ("drm/amdgpu: remove ctx->lock") Acked-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20220419110633.166236-1-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 +++++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 + 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 970b065e9a6b..d0d0ea565e3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs goto free_chunk; } + mutex_lock(&p->ctx->lock); + /* skip guilty context job */ if (atomic_read(&p->ctx->guilty) == 1) { ret = -ECANCELED; @@ -709,6 +711,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, dma_fence_put(parser->fence); if (parser->ctx) { + mutex_unlock(&parser->ctx->lock); amdgpu_ctx_put(parser->ctx); } if (parser->bo_list) @@ -1157,6 +1160,9 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, { int i, r; + /* TODO: Investigate why we still need the context lock */ + mutex_unlock(&p->ctx->lock); + for (i = 0; i < p->nchunks; ++i) { struct amdgpu_cs_chunk *chunk; @@ -1167,32 +1173,34 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES: r = amdgpu_cs_process_fence_dep(p, chunk); if (r) - return r; + goto out; break; case AMDGPU_CHUNK_ID_SYNCOBJ_IN: r = amdgpu_cs_process_syncobj_in_dep(p, chunk); if (r) - return r; + goto out; break; case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: r = amdgpu_cs_process_syncobj_out_dep(p, chunk); if (r) - return r; + goto out; break; case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT: r = amdgpu_cs_process_syncobj_timeline_in_dep(p, chunk); if (r) - return r; + goto out; break; case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL: r = amdgpu_cs_process_syncobj_timeline_out_dep(p, chunk); if (r) - return r; + goto out; break; } } - return 0; +out: + mutex_lock(&p->ctx->lock); + return r; } static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) @@ -1368,6 +1376,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) goto out; r = amdgpu_cs_submit(&parser, cs); + out: amdgpu_cs_parser_fini(&parser, r, reserved_buffers); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 5981c7d9bd48..8f0e6d93bb9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -237,6 +237,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, kref_init(&ctx->refcount); spin_lock_init(&ctx->ring_lock); + mutex_init(&ctx->lock); ctx->reset_counter = atomic_read(&adev->gpu_reset_counter); ctx->reset_counter_query = ctx->reset_counter; @@ -357,6 +358,7 @@ static void amdgpu_ctx_fini(struct kref *ref) drm_dev_exit(idx); } + mutex_destroy(&ctx->lock); kfree(ctx); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index d0cbfcea90f7..142f2f87d44c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -49,6 +49,7 @@ struct amdgpu_ctx { bool preamble_presented; int32_t init_priority; int32_t override_priority; + struct mutex lock; atomic_t guilty; unsigned long ras_counter_ce; unsigned long ras_counter_ue; From 0dcad700bb2776e3886fe0a645a4bf13b1e747cd Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Tue, 19 Apr 2022 17:18:27 +0530 Subject: [PATCH 143/196] powerpc/perf: Fix power9 event alternatives When scheduling a group of events, there are constraint checks done to make sure all events can go in a group. Example, one of the criteria is that events in a group cannot use the same PMC. But platform specific PMU supports alternative event for some of the event codes. During perf_event_open(), if any event group doesn't match constraint check criteria, further lookup is done to find alternative event. By current design, the array of alternatives events in PMU code is expected to be sorted by column 0. This is because in find_alternative() the return criteria is based on event code comparison. ie. "event < ev_alt[i][0])". This optimisation is there since find_alternative() can be called multiple times. In power9 PMU code, the alternative event array is not sorted properly and hence there is breakage in finding alternative events. To work with existing logic, fix the alternative event array to be sorted by column 0 for power9-pmu.c Results: With alternative events, multiplexing can be avoided. That is, for example, in power9 PM_LD_MISS_L1 (0x3e054) has alternative event, PM_LD_MISS_L1_ALT (0x400f0). This is an identical event which can be programmed in a different PMC. Before: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1057860 r3e054 (50.21%) 379 r300fc (49.79%) 0.944329741 seconds time elapsed Since both the events are using PMC3 in this case, they are multiplexed here. After: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1006948 r3e054 182 r300fc Fixes: 91e0bd1e6251 ("powerpc/perf: Add PM_LD_MISS_L1 and PM_BR_2PATH to power9 event list") Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220419114828.89843-1-atrajeev@linux.vnet.ibm.com --- arch/powerpc/perf/power9-pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index c9eb5232e68b..c393e837648e 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -133,11 +133,11 @@ int p9_dd22_bl_ev[] = { /* Table of alternatives, sorted by column 0 */ static const unsigned int power9_event_alternatives[][MAX_ALT] = { - { PM_INST_DISP, PM_INST_DISP_ALT }, - { PM_RUN_CYC_ALT, PM_RUN_CYC }, - { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, - { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) From c6cc9a852f123301d5271f1484df8e961b2b64f1 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Tue, 19 Apr 2022 17:18:28 +0530 Subject: [PATCH 144/196] powerpc/perf: Fix power10 event alternatives When scheduling a group of events, there are constraint checks done to make sure all events can go in a group. Example, one of the criteria is that events in a group cannot use the same PMC. But platform specific PMU supports alternative event for some of the event codes. During perf_event_open(), if any event group doesn't match constraint check criteria, further lookup is done to find alternative event. By current design, the array of alternatives events in PMU code is expected to be sorted by column 0. This is because in find_alternative() the return criteria is based on event code comparison. ie. "event < ev_alt[i][0])". This optimisation is there since find_alternative() can be called multiple times. In power10 PMU code, the alternative event array is not sorted properly and hence there is breakage in finding alternative event. To work with existing logic, fix the alternative event array to be sorted by column 0 for power10-pmu.c Results: In case where an alternative event is not chosen when we could, events will be multiplexed. ie, time sliced where it could actually run concurrently. Example, in power10 PM_INST_CMPL_ALT(0x00002) has alternative event, PM_INST_CMPL(0x500fa). Without the fix, if a group of events with PMC1 to PMC4 is used along with PM_INST_CMPL_ALT, it will be time sliced since all programmable PMC's are consumed already. But with the fix, when it picks alternative event on PMC5, all events will run concurrently. Before: # perf stat -e r00002,r100fc,r200fa,r300fc,r400fc Performance counter stats for 'system wide': 328668935 r00002 (79.94%) 56501024 r100fc (79.95%) 49564238 r200fa (79.95%) 376 r300fc (80.19%) 660 r400fc (79.97%) 4.039150522 seconds time elapsed With the fix, since alternative event is chosen to run on PMC6, events will be run concurrently. After: # perf stat -e r00002,r100fc,r200fa,r300fc,r400fc Performance counter stats for 'system wide': 23596607 r00002 4907738 r100fc 2283608 r200fa 135 r300fc 248 r400fc 1.664671390 seconds time elapsed Fixes: a64e697cef23 ("powerpc/perf: power10 Performance Monitoring support") Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220419114828.89843-2-atrajeev@linux.vnet.ibm.com --- arch/powerpc/perf/power10-pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c index d3398100a60f..c6d51e7093cf 100644 --- a/arch/powerpc/perf/power10-pmu.c +++ b/arch/powerpc/perf/power10-pmu.c @@ -91,8 +91,8 @@ extern u64 PERF_REG_EXTENDED_MASK; /* Table of alternatives, sorted by column 0 */ static const unsigned int power10_event_alternatives[][MAX_ALT] = { - { PM_CYC_ALT, PM_CYC }, { PM_INST_CMPL_ALT, PM_INST_CMPL }, + { PM_CYC_ALT, PM_CYC }, }; static int power10_get_alternatives(u64 event, unsigned int flags, u64 alt[]) From bb82c574691daf8f7fa9a160264d15c5804cb769 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Apr 2022 12:57:56 +1000 Subject: [PATCH 145/196] powerpc/perf: Fix 32bit compile The "read_bhrb" global symbol is only called under CONFIG_PPC64 of arch/powerpc/perf/core-book3s.c but it is compiled for both 32 and 64 bit anyway (and LLVM fails to link this on 32bit). This fixes it by moving bhrb.o to obj64 targets. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220421025756.571995-1-aik@ozlabs.ru --- arch/powerpc/perf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 2f46e31c7612..4f53d0b97539 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -3,11 +3,11 @@ obj-y += callchain.o callchain_$(BITS).o perf_regs.o obj-$(CONFIG_COMPAT) += callchain_32.o -obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o +obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o obj64-$(CONFIG_PPC_PERF_CTRS) += ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o \ isa207-common.o power8-pmu.o power9-pmu.o \ - generic-compat-pmu.o power10-pmu.o + generic-compat-pmu.o power10-pmu.o bhrb.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o From 0014edaedfd804dbf35b009808789325ca615716 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 20 Apr 2022 15:19:25 +0200 Subject: [PATCH 146/196] fs: unset MNT_WRITE_HOLD on failure After mnt_hold_writers() has been called we will always have set MNT_WRITE_HOLD and consequently we always need to pair mnt_hold_writers() with mnt_unhold_writers(). After the recent cleanup in [1] where Al switched from a do-while to a for loop the cleanup currently fails to unset MNT_WRITE_HOLD for the first mount that was changed. Fix this and make sure that the first mount will be cleaned up and add some comments to make it more obvious. Link: https://lore.kernel.org/lkml/0000000000007cc21d05dd0432b8@google.com Link: https://lore.kernel.org/lkml/00000000000080e10e05dd043247@google.com Link: https://lore.kernel.org/r/20220420131925.2464685-1-brauner@kernel.org Fixes: e257039f0fc7 ("mount_setattr(): clean the control flow and calling conventions") [1] Cc: Hillf Danton Cc: Christoph Hellwig Cc: Al Viro Reported-by: syzbot+10a16d1c43580983f6a2@syzkaller.appspotmail.com Reported-by: syzbot+306090cfa3294f0bbfb3@syzkaller.appspotmail.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- fs/namespace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index a0a36bfa3aa0..afe2b64b14f1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4058,10 +4058,22 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) if (err) { struct mount *p; - for (p = mnt; p != m; p = next_mnt(p, mnt)) { + /* + * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will + * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all + * mounts and needs to take care to include the first mount. + */ + for (p = mnt; p; p = next_mnt(p, mnt)) { /* If we had to hold writers unblock them. */ if (p->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(p); + + /* + * We're done once the first mount we changed got + * MNT_WRITE_HOLD unset. + */ + if (p == m) + break; } } return err; From 2d08935682ac5f6bfb70f7e6844ec27d4a245fa4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Apr 2022 00:43:41 +0000 Subject: [PATCH 147/196] KVM: x86: Don't re-acquire SRCU lock in complete_emulated_io() Don't re-acquire SRCU in complete_emulated_io() now that KVM acquires the lock in kvm_arch_vcpu_ioctl_run(). More importantly, don't overwrite vcpu->srcu_idx. If the index acquired by complete_emulated_io() differs from the one acquired by kvm_arch_vcpu_ioctl_run(), KVM will effectively leak a lock and hang if/when synchronize_srcu() is invoked for the relevant grace period. Fixes: 8d25b7beca7e ("KVM: x86: pull kvm->srcu read-side to kvm_arch_vcpu_ioctl_run") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220415004343.2203171-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 547ba00ef64f..867c0fd8d187 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10387,12 +10387,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { - int r; - - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - return r; + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); } static int complete_emulated_pio(struct kvm_vcpu *vcpu) From fdd6f6ac2e489b9b256cd05a880d13bfdbac7c2e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Apr 2022 00:43:42 +0000 Subject: [PATCH 148/196] KVM: RISC-V: Use kvm_vcpu.srcu_idx, drop RISC-V's unnecessary copy Use the generic kvm_vcpu's srcu_idx instead of using an indentical field in RISC-V's version of kvm_vcpu_arch. Generic KVM very intentionally does not touch vcpu->srcu_idx, i.e. there's zero chance of running afoul of common code. Signed-off-by: Sean Christopherson Message-Id: <20220415004343.2203171-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/riscv/include/asm/kvm_host.h | 3 --- arch/riscv/kvm/vcpu.c | 16 ++++++++-------- arch/riscv/kvm/vcpu_exit.c | 4 ++-- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 78da839657e5..cd4bbcecb0fb 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -193,9 +193,6 @@ struct kvm_vcpu_arch { /* Don't run the VCPU (blocked) */ bool pause; - - /* SRCU lock index for in-kernel run loop */ - int srcu_idx; }; static inline void kvm_arch_hardware_unsetup(void) {} diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index aad430668bb4..4a52fda6417b 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -727,13 +727,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Mark this VCPU ran at least once */ vcpu->arch.ran_atleast_once = true; - vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* Process MMIO value returned from user-space */ if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run); if (ret) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); return ret; } } @@ -742,13 +742,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_RISCV_SBI) { ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run); if (ret) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); return ret; } } if (run->immediate_exit) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); return -EINTR; } @@ -787,7 +787,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ vcpu->mode = IN_GUEST_MODE; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); smp_mb__after_srcu_read_unlock(); /* @@ -805,7 +805,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; local_irq_enable(); preempt_enable(); - vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); continue; } @@ -849,7 +849,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) preempt_enable(); - vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); ret = kvm_riscv_vcpu_exit(vcpu, run, &trap); } @@ -858,7 +858,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu_put(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); return ret; } diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index aa8af129e4bb..2d56faddb9d1 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -456,9 +456,9 @@ static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu) { if (!kvm_arch_vcpu_runnable(vcpu)) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->arch.srcu_idx); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_vcpu_halt(vcpu); - vcpu->arch.srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); } } From 2031f2876896d82aca7e82f84accd9181b9587fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Apr 2022 00:43:43 +0000 Subject: [PATCH 149/196] KVM: Add helpers to wrap vcpu->srcu_idx and yell if it's abused Add wrappers to acquire/release KVM's SRCU lock when stashing the index in vcpu->src_idx, along with rudimentary detection of illegal usage, e.g. re-acquiring SRCU and thus overwriting vcpu->src_idx. Because the SRCU index is (currently) either 0 or 1, illegal nesting bugs can go unnoticed for quite some time and only cause problems when the nested lock happens to get a different index. Wrap the WARNs in PROVE_RCU=y, and make them ONCE, otherwise KVM will likely yell so loudly that it will bring the kernel to its knees. Signed-off-by: Sean Christopherson Tested-by: Fabiano Rosas Message-Id: <20220415004343.2203171-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 9 +++++---- arch/powerpc/kvm/book3s_hv_nested.c | 16 +++++++-------- arch/powerpc/kvm/book3s_rtas.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/riscv/kvm/vcpu.c | 16 +++++++-------- arch/riscv/kvm/vcpu_exit.c | 4 ++-- arch/s390/kvm/interrupt.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 8 ++++---- arch/s390/kvm/vsie.c | 4 ++-- arch/x86/kvm/x86.c | 28 ++++++++++++-------------- include/linux/kvm_host.h | 24 +++++++++++++++++++++- 11 files changed, 71 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index e4ce2a35483f..42851c32ff3b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -168,9 +168,10 @@ int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, return -EINVAL; /* Read the entry from guest memory */ addr = base + (index * sizeof(rpte)); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + kvm_vcpu_srcu_read_lock(vcpu); ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte)); - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (ret) { if (pte_ret_p) *pte_ret_p = addr; @@ -246,9 +247,9 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, /* Read the table to find the root of the radix tree */ ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry)); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry)); - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (ret) return ret; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 9d373f8963ee..c943a051c6e7 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -306,10 +306,10 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* copy parameters in */ hv_ptr = kvmppc_get_gpr(vcpu, 4); regs_ptr = kvmppc_get_gpr(vcpu, 5); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs, hv_ptr, regs_ptr); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (err) return H_PARAMETER; @@ -410,10 +410,10 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) byteswap_hv_regs(&l2_hv); byteswap_pt_regs(&l2_regs); } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs, hv_ptr, regs_ptr); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (err) return H_AUTHORITY; @@ -600,16 +600,16 @@ long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu) goto not_found; /* Write what was loaded into our buffer back to the L1 guest */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc) goto not_found; } else { /* Load the data to be stored from the L1 guest into our buf */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc) goto not_found; diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 0f847f1e5ddd..6808bda0dbc1 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -229,9 +229,9 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) */ args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM; - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc) goto fail; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 875c30c12db0..533c4232e5ab 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -425,9 +425,9 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, return EMULATE_DONE; } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = kvm_read_guest(vcpu->kvm, pte.raddr, ptr, size); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc) return EMULATE_DO_MMIO; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 4a52fda6417b..7461f964d20a 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -727,13 +727,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Mark this VCPU ran at least once */ vcpu->arch.ran_atleast_once = true; - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); /* Process MMIO value returned from user-space */ if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run); if (ret) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return ret; } } @@ -742,13 +742,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_RISCV_SBI) { ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run); if (ret) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return ret; } } if (run->immediate_exit) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return -EINTR; } @@ -787,7 +787,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ vcpu->mode = IN_GUEST_MODE; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); smp_mb__after_srcu_read_unlock(); /* @@ -805,7 +805,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; local_irq_enable(); preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); continue; } @@ -849,7 +849,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); ret = kvm_riscv_vcpu_exit(vcpu, run, &trap); } @@ -858,7 +858,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu_put(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return ret; } diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 2d56faddb9d1..a72c15d4b42a 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -456,9 +456,9 @@ static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu) { if (!kvm_arch_vcpu_runnable(vcpu)) { - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_halt(vcpu); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); } } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 9b30beac904d..af96dc0549a4 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1334,11 +1334,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime); no_timer: - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_halt(vcpu); vcpu->valid_wakeup = false; __unset_cpu_idle(vcpu); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); hrtimer_cancel(&vcpu->arch.ckc_timer); return 0; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 156d1c25a3c1..da3dabda1a12 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4237,14 +4237,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * We try to hold kvm->srcu during most of vcpu_run (except when run- * ning the guest), so that memslots (and other stuff) are protected */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); do { rc = vcpu_pre_run(vcpu); if (rc) break; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between * guest_enter and guest_exit should be no uaccess. @@ -4281,12 +4281,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) __enable_cpu_timer_accounting(vcpu); guest_exit_irqoff(); local_irq_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = vcpu_post_run(vcpu, exit_reason); } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return rc; } diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index acda4b6fc851..dada78b92691 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1091,7 +1091,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) handle_last_fault(vcpu, vsie_page); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* save current guest state of bp isolation override */ guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST); @@ -1133,7 +1133,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (!guest_bp_isolation) clear_thread_flag(TIF_ISOLATE_BP_GUEST); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (rc == -EINTR) { VCPU_EVENT(vcpu, 3, "%s", "machine check"); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 867c0fd8d187..51eb27824452 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10097,7 +10097,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* Store vcpu->apicv_active before vcpu->mode. */ smp_store_release(&vcpu->mode, IN_GUEST_MODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* * 1) We should set ->mode before checking ->requests. Please see @@ -10128,7 +10128,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); r = 1; goto cancel_injection; } @@ -10254,7 +10254,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_enable(); preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); /* * Profile KVM exit RIPs: @@ -10284,7 +10284,7 @@ out: } /* Called within kvm->srcu read side. */ -static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) +static inline int vcpu_block(struct kvm_vcpu *vcpu) { bool hv_timer; @@ -10300,12 +10300,12 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (hv_timer) kvm_lapic_switch_to_sw_timer(vcpu); - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) kvm_vcpu_halt(vcpu); else kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); @@ -10347,7 +10347,6 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) static int vcpu_run(struct kvm_vcpu *vcpu) { int r; - struct kvm *kvm = vcpu->kvm; vcpu->arch.l1tf_flush_l1d = true; @@ -10355,7 +10354,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { - r = vcpu_block(kvm, vcpu); + r = vcpu_block(vcpu); } if (r <= 0) @@ -10374,9 +10373,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu) } if (__xfer_to_guest_mode_work_pending()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); r = xfer_to_guest_mode_handle_work(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (r) return r; } @@ -10479,7 +10478,6 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - struct kvm *kvm = vcpu->kvm; int r; vcpu_load(vcpu); @@ -10487,7 +10485,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { r = -EINTR; @@ -10499,9 +10497,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (kvm_apic_accept_events(vcpu) < 0) { r = 0; @@ -10562,7 +10560,7 @@ out: if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); kvm_sigset_deactivate(vcpu); vcpu_put(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3f9b22c4983a..2dab4b696682 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -315,7 +315,10 @@ struct kvm_vcpu { int cpu; int vcpu_id; /* id given by userspace at creation */ int vcpu_idx; /* index in kvm->vcpus array */ - int srcu_idx; + int ____srcu_idx; /* Don't use this directly. You've been warned. */ +#ifdef CONFIG_PROVE_RCU + int srcu_depth; +#endif int mode; u64 requests; unsigned long guest_debug; @@ -840,6 +843,25 @@ static inline void kvm_vm_bugged(struct kvm *kvm) unlikely(__ret); \ }) +static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PROVE_RCU + WARN_ONCE(vcpu->srcu_depth++, + "KVM: Illegal vCPU srcu_idx LOCK, depth=%d", vcpu->srcu_depth - 1); +#endif + vcpu->____srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +} + +static inline void kvm_vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu) +{ + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->____srcu_idx); + +#ifdef CONFIG_PROVE_RCU + WARN_ONCE(--vcpu->srcu_depth, + "KVM: Illegal vCPU srcu_idx UNLOCK, depth=%d", vcpu->srcu_depth); +#endif +} + static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); From 5c697c367a66307a5d943c3449421aff2aa3ca4a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Apr 2022 00:46:22 +0000 Subject: [PATCH 150/196] KVM: Initialize debugfs_dentry when a VM is created to avoid NULL deref Initialize debugfs_entry to its semi-magical -ENOENT value when the VM is created. KVM's teardown when VM creation fails is kludgy and calls kvm_uevent_notify_change() and kvm_destroy_vm_debugfs() even if KVM never attempted kvm_create_vm_debugfs(). Because debugfs_entry is zero initialized, the IS_ERR() checks pass and KVM derefs a NULL pointer. BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1068b1067 P4D 1068b1067 PUD 1068b0067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 871 Comm: repro Not tainted 5.18.0-rc1+ #825 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:__dentry_path+0x7b/0x130 Call Trace: dentry_path_raw+0x42/0x70 kvm_uevent_notify_change.part.0+0x10c/0x200 [kvm] kvm_put_kvm+0x63/0x2b0 [kvm] kvm_dev_ioctl+0x43a/0x920 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x31/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae Modules linked in: kvm_intel kvm irqbypass Fixes: a44a4cc1c969 ("KVM: Don't create VM debugfs files outside of the VM directory") Cc: stable@vger.kernel.org Cc: Marc Zyngier Cc: Oliver Upton Reported-by: syzbot+df6fbbd2ee39f21289ef@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Message-Id: <20220415004622.2207751-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dfb7dabdbc63..d292c4397579 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -955,12 +955,6 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; - /* - * Force subsequent debugfs file creations to fail if the VM directory - * is not created. - */ - kvm->debugfs_dentry = ERR_PTR(-ENOENT); - if (!debugfs_initialized()) return 0; @@ -1081,6 +1075,12 @@ static struct kvm *kvm_create_vm(unsigned long type) BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); + /* + * Force subsequent debugfs file creations to fail if the VM directory + * is not created (by kvm_create_vm_debugfs()). + */ + kvm->debugfs_dentry = ERR_PTR(-ENOENT); + if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) From 80f0497c221112fc25845a8b68f1c5b4a23b3567 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 20 Apr 2022 01:37:29 +0000 Subject: [PATCH 151/196] KVM: x86: Tag APICv DISABLE inhibit, not ABSENT, if APICv is disabled Set the DISABLE inhibit, not the ABSENT inhibit, if APICv is disabled via module param. A recent refactoring to add a wrapper for setting/clearing inhibits unintentionally changed the flag, probably due to a copy+paste goof. Fixes: 4f4c4a3ee53c ("KVM: x86: Trace all APICv inhibit changes and capture overall status") Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220420013732.3308816-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51eb27824452..d54d4a67b226 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9111,7 +9111,7 @@ static void kvm_apicv_init(struct kvm *kvm) if (!enable_apicv) set_or_clear_apicv_inhibit(inhibits, - APICV_INHIBIT_REASON_ABSENT, true); + APICV_INHIBIT_REASON_DISABLE, true); } static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) From 7c69661e225cc484fbf44a0b99b56714a5241ae3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 20 Apr 2022 01:37:30 +0000 Subject: [PATCH 152/196] KVM: nVMX: Defer APICv updates while L2 is active until L1 is active Defer APICv updates that occur while L2 is active until nested VM-Exit, i.e. until L1 regains control. vmx_refresh_apicv_exec_ctrl() assumes L1 is active and (a) stomps all over vmcs02 and (b) neglects to ever updated vmcs01. E.g. if vmcs12 doesn't enable the TPR shadow for L2 (and thus no APICv controls), L1 performs nested VM-Enter APICv inhibited, and APICv becomes unhibited while L2 is active, KVM will set various APICv controls in vmcs02 and trigger a failed VM-Entry. The kicker is that, unless running with nested_early_check=1, KVM blames L1 and chaos ensues. In all cases, ignoring vmcs02 and always deferring the inhibition change to vmcs01 is correct (or at least acceptable). The ABSENT and DISABLE inhibitions cannot truly change while L2 is active (see below). IRQ_BLOCKING can change, but it is firmly a best effort debug feature. Furthermore, only L2's APIC is accelerated/virtualized to the full extent possible, e.g. even if L1 passes through its APIC to L2, normal MMIO/MSR interception will apply to the virtual APIC managed by KVM. The exception is the SELF_IPI register when x2APIC is enabled, but that's an acceptable hole. Lastly, Hyper-V's Auto EOI can technically be toggled if L1 exposes the MSRs to L2, but for that to work in any sane capacity, L1 would need to pass through IRQs to L2 as well, and IRQs must be intercepted to enable virtual interrupt delivery. I.e. exposing Auto EOI to L2 and enabling VID for L2 are, for all intents and purposes, mutually exclusive. Lack of dynamic toggling is also why this scenario is all but impossible to encounter in KVM's current form. But a future patch will pend an APICv update request _during_ vCPU creation to plug a race where a vCPU that's being created doesn't get included in the "all vCPUs request" because it's not yet visible to other vCPUs. If userspaces restores L2 after VM creation (hello, KVM selftests), the first KVM_RUN will occur while L2 is active and thus service the APICv update request made during VM creation. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220420013732.3308816-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 +++++ arch/x86/kvm/vmx/vmx.c | 5 +++++ arch/x86/kvm/vmx/vmx.h | 1 + 3 files changed, 11 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f18744f7ff82..856c87563883 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4618,6 +4618,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); } + if (vmx->nested.update_vmcs01_apicv_status) { + vmx->nested.update_vmcs01_apicv_status = false; + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } + if ((vm_exit_reason != -1) && (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) vmx->nested.need_vmcs12_to_shadow_sync = true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 04d170c4b61e..d58b763df855 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4174,6 +4174,11 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (is_guest_mode(vcpu)) { + vmx->nested.update_vmcs01_apicv_status = true; + return; + } + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); if (cpu_has_secondary_exec_ctrls()) { if (kvm_vcpu_apicv_active(vcpu)) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9c6bfcd84008..b98c7e96697a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -183,6 +183,7 @@ struct nested_vmx { bool change_vmcs01_virtual_apic_mode; bool reload_vmcs01_apic_access_page; bool update_vmcs01_cpu_dirty_logging; + bool update_vmcs01_apicv_status; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to From 423ecfea77dda83823c71b0fad1c2ddb2af1e5fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 20 Apr 2022 01:37:31 +0000 Subject: [PATCH 153/196] KVM: x86: Pend KVM_REQ_APICV_UPDATE during vCPU creation to fix a race Make a KVM_REQ_APICV_UPDATE request when creating a vCPU with an in-kernel local APIC and APICv enabled at the module level. Consuming kvm_apicv_activated() and stuffing vcpu->arch.apicv_active directly can race with __kvm_set_or_clear_apicv_inhibit(), as vCPU creation happens before the vCPU is fully onlined, i.e. it won't get the request made to "all" vCPUs. If APICv is globally inhibited between setting apicv_active and onlining the vCPU, the vCPU will end up running with APICv enabled and trigger KVM's sanity check. Mark APICv as active during vCPU creation if APICv is enabled at the module level, both to be optimistic about it's final state, e.g. to avoid additional VMWRITEs on VMX, and because there are likely bugs lurking since KVM checks apicv_active in multiple vCPU creation paths. While keeping the current behavior of consuming kvm_apicv_activated() is arguably safer from a regression perspective, force apicv_active so that vCPU creation runs with deterministic state and so that if there are bugs, they are found sooner than later, i.e. not when some crazy race condition is hit. WARNING: CPU: 0 PID: 484 at arch/x86/kvm/x86.c:9877 vcpu_enter_guest+0x2ae3/0x3ee0 arch/x86/kvm/x86.c:9877 Modules linked in: CPU: 0 PID: 484 Comm: syz-executor361 Not tainted 5.16.13 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1~cloud0 04/01/2014 RIP: 0010:vcpu_enter_guest+0x2ae3/0x3ee0 arch/x86/kvm/x86.c:9877 Call Trace: vcpu_run arch/x86/kvm/x86.c:10039 [inline] kvm_arch_vcpu_ioctl_run+0x337/0x15e0 arch/x86/kvm/x86.c:10234 kvm_vcpu_ioctl+0x4d2/0xc80 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3727 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:874 [inline] __se_sys_ioctl fs/ioctl.c:860 [inline] __x64_sys_ioctl+0x16d/0x1d0 fs/ioctl.c:860 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The bug was hit by a syzkaller spamming VM creation with 2 vCPUs and a call to KVM_SET_GUEST_DEBUG. r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0) r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0) ioctl$KVM_CAP_SPLIT_IRQCHIP(r1, 0x4068aea3, &(0x7f0000000000)) (async) r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0) (async) r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x400000000000002) ioctl$KVM_SET_GUEST_DEBUG(r3, 0x4048ae9b, &(0x7f00000000c0)={0x5dda9c14aa95f5c5}) ioctl$KVM_RUN(r2, 0xae80, 0x0) Reported-by: Gaoning Pan Reported-by: Yongkang Jia Fixes: 8df14af42f00 ("kvm: x86: Add support for dynamic APICv activation") Cc: stable@vger.kernel.org Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220420013732.3308816-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d54d4a67b226..9c02217c1e47 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11189,8 +11189,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; - if (kvm_apicv_activated(vcpu->kvm)) + + /* + * Defer evaluating inhibits until the vCPU is first run, as + * this vCPU will not get notified of any changes until this + * vCPU is visible to other vCPUs (marked online and added to + * the set of vCPUs). Opportunistically mark APICv active as + * VMX in particularly is highly unlikely to have inhibits. + * Ignore the current per-VM APICv state so that vCPU creation + * is guaranteed to run with a deterministic value, the request + * will ensure the vCPU gets the correct state before VM-Entry. + */ + if (enable_apicv) { vcpu->arch.apicv_active = true; + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } } else static_branch_inc(&kvm_has_noapic_vcpu); From 0047fb33f811e00db5c87d028e5fcf0a26632b40 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 20 Apr 2022 01:37:32 +0000 Subject: [PATCH 154/196] KVM: x86: Skip KVM_GUESTDBG_BLOCKIRQ APICv update if APICv is disabled Skip the APICv inhibit update for KVM_GUESTDBG_BLOCKIRQ if APICv is disabled at the module level to avoid having to acquire the mutex and potentially process all vCPUs. The DISABLE inhibit will (barring bugs) never be lifted, so piling on more inhibits is unnecessary. Fixes: cae72dcc3b21 ("KVM: x86: inhibit APICv when KVM_GUESTDBG_BLOCKIRQ active") Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220420013732.3308816-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c02217c1e47..c89dc09a764f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10978,6 +10978,9 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long i; + if (!enable_apicv) + return; + down_write(&kvm->arch.apicv_update_lock); kvm_for_each_vcpu(i, vcpu, kvm) { From a413a625b43e5f085d4e1a8c4053494d593fb3c1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 10 Apr 2022 11:38:40 -0400 Subject: [PATCH 155/196] KVM: SPDX style and spelling fixes SPDX comments use use /* */ style comments in headers anad // style comments in .c files. Also fix two spelling mistakes. Signed-off-by: Tom Rix Message-Id: <20220410153840.55506-1-trix@redhat.com> Signed-off-by: Paolo Bonzini --- virt/kvm/dirty_ring.c | 2 +- virt/kvm/kvm_main.c | 4 ++-- virt/kvm/kvm_mm.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 222ecc81d7df..f4c2a6eb1666 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +// SPDX-License-Identifier: GPL-2.0-only /* * KVM dirty ring implementation * diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d292c4397579..2a23f24d13cf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -662,7 +662,7 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, kvm->mmu_notifier_range_end = end; } else { /* - * Fully tracking multiple concurrent ranges has dimishing + * Fully tracking multiple concurrent ranges has diminishing * returns. Keep things simple and just find the minimal range * which includes the current and new ranges. As there won't be * enough information to subtract a range after its invalidate @@ -1799,7 +1799,7 @@ static int kvm_set_memslot(struct kvm *kvm, /* * No need to refresh new->arch, changes after dropping slots_arch_lock - * will directly hit the final, active memsot. Architectures are + * will directly hit the final, active memslot. Architectures are * responsible for knowing that new->arch may be stale. */ kvm_commit_memory_region(kvm, old, new, change); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 34ca40823260..41da467d99c9 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __KVM_MM_H__ #define __KVM_MM_H__ 1 From 0361bdfddca20c8855ea3bdbbbc9c999912b10ff Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 18 Apr 2022 00:42:32 -0700 Subject: [PATCH 156/196] x86/kvm: Preserve BSP MSR_KVM_POLL_CONTROL across suspend/resume MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to host-side polling after suspend/resume. Non-bootstrap CPUs are restored correctly by the haltpoll driver because they are hot-unplugged during suspend and hot-plugged during resume; however, the BSP is not hotpluggable and remains in host-sde polling mode after the guest resume. The makes the guest pay for the cost of vmexits every time the guest enters idle. Fix it by recording BSP's haltpoll state and resuming it during guest resume. Cc: Marcelo Tosatti Signed-off-by: Wanpeng Li Message-Id: <1650267752-46796-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a22deb58f86d..8b1c45c9cda8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; +static int has_guest_poll = 0; /* * No need for any "IO delay" on KVM */ @@ -706,14 +707,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static int kvm_suspend(void) { + u64 val = 0; + kvm_guest_cpu_offline(false); +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + rdmsrl(MSR_KVM_POLL_CONTROL, val); + has_guest_poll = !(val & 1); +#endif return 0; } static void kvm_resume(void) { kvm_cpu_online(raw_smp_processor_id()); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +#endif } static struct syscore_ops kvm_syscore_ops = { From 75189d1de1b377e580ebd2d2c55914631eac9c64 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sat, 9 Apr 2022 09:52:26 +0800 Subject: [PATCH 157/196] KVM: x86/pmu: Update AMD PMC sample period to fix guest NMI-watchdog NMI-watchdog is one of the favorite features of kernel developers, but it does not work in AMD guest even with vPMU enabled and worse, the system misrepresents this capability via /proc. This is a PMC emulation error. KVM does not pass the latest valid value to perf_event in time when guest NMI-watchdog is running, thus the perf_event corresponding to the watchdog counter will enter the old state at some point after the first guest NMI injection, forcing the hardware register PMC0 to be constantly written to 0x800000000001. Meanwhile, the running counter should accurately reflect its new value based on the latest coordinated pmc->counter (from vPMC's point of view) rather than the value written directly by the guest. Fixes: 168d918f2643 ("KVM: x86: Adjust counter sample period after a wrmsr") Reported-by: Dongli Cao Signed-off-by: Like Xu Reviewed-by: Yanan Wang Tested-by: Yanan Wang Reviewed-by: Jim Mattson Message-Id: <20220409015226.38619-1-likexu@tencent.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.h | 9 +++++++++ arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 8 ++------ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9e66fba1d6a3..22992b049d38 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -138,6 +138,15 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) return sample_period; } +static inline void pmc_update_sample_period(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event || pmc->is_paused) + return; + + perf_event_period(pmc->perf_event, + get_sample_period(pmc, pmc->counter)); +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 24eb935b6f85..b14860863c39 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -257,6 +257,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { pmc->counter += data - pmc_read_counter(pmc); + pmc_update_sample_period(pmc); return 0; } /* MSR_EVNTSELn */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index bc3f8512bb64..b82b6709d7a8 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -431,15 +431,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event && !pmc->is_paused) - perf_event_period(pmc->perf_event, - get_sample_period(pmc, data)); + pmc_update_sample_period(pmc); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event && !pmc->is_paused) - perf_event_period(pmc->perf_event, - get_sample_period(pmc, data)); + pmc_update_sample_period(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) From 266a19a0bc4fbfab4d981a47640ca98972a01865 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 14 Apr 2022 12:30:31 +0200 Subject: [PATCH 158/196] KVM: selftests: Silence compiler warning in the kvm_page_table_test When compiling kvm_page_table_test.c, I get this compiler warning with gcc 11.2: kvm_page_table_test.c: In function 'pre_init_before_test': ../../../../tools/include/linux/kernel.h:44:24: warning: comparison of distinct pointer types lacks a cast 44 | (void) (&_max1 == &_max2); \ | ^~ kvm_page_table_test.c:281:21: note: in expansion of macro 'max' 281 | alignment = max(0x100000, alignment); | ^~~ Fix it by adjusting the type of the absolute value. Signed-off-by: Thomas Huth Reviewed-by: Claudio Imbrenda Message-Id: <20220414103031.565037-1-thuth@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index ba1fdc3dcf4a..2c4a7563a4f8 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) else guest_test_phys_mem = p->phys_offset; #ifdef __s390x__ - alignment = max(0x100000, alignment); + alignment = max(0x100000UL, alignment); #endif guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); From 4bbef7e8eb8c2c7dabf57d97decfd2b4f48aaf02 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 21 Apr 2022 03:14:05 +0000 Subject: [PATCH 159/196] KVM: SVM: Simplify and harden helper to flush SEV guest page(s) Rework sev_flush_guest_memory() to explicitly handle only a single page, and harden it to fall back to WBINVD if VM_PAGE_FLUSH fails. Per-page flushing is currently used only to flush the VMSA, and in its current form, the helper is completely broken with respect to flushing actual guest memory, i.e. won't work correctly for an arbitrary memory range. VM_PAGE_FLUSH takes a host virtual address, and is subject to normal page walks, i.e. will fault if the address is not present in the host page tables or does not have the correct permissions. Current AMD CPUs also do not honor SMAP overrides (undocumented in kernel versions of the APM), so passing in a userspace address is completely out of the question. In other words, KVM would need to manually walk the host page tables to get the pfn, ensure the pfn is stable, and then use the direct map to invoke VM_PAGE_FLUSH. And the latter might not even work, e.g. if userspace is particularly evil/clever and backs the guest with Secret Memory (which unmaps memory from the direct map). Signed-off-by: Sean Christopherson Fixes: add5e2f04541 ("KVM: SVM: Add support for the SEV-ES VMSA") Reported-by: Mingwei Zhang Cc: stable@vger.kernel.org Signed-off-by: Mingwei Zhang Message-Id: <20220421031407.2516575-2-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 54 ++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 537aaddc852f..b77b3913e2d9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2226,9 +2226,18 @@ int sev_cpu_init(struct svm_cpu_data *sd) * Pages used by hardware to hold guest encrypted state must be flushed before * returning them to the system. */ -static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va, - unsigned long len) +static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { + int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + + /* + * Note! The address must be a kernel address, as regular page walk + * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user + * address is non-deterministic and unsafe. This function deliberately + * takes a pointer to deter passing in a user address. + */ + unsigned long addr = (unsigned long)va; + /* * If hardware enforced cache coherency for encrypted mappings of the * same physical page is supported, nothing to do. @@ -2237,40 +2246,16 @@ static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va, return; /* - * If the VM Page Flush MSR is supported, use it to flush the page - * (using the page virtual address and the guest ASID). + * VM Page Flush takes a host virtual address and a guest ASID. Fall + * back to WBINVD if this faults so as not to make any problems worse + * by leaving stale encrypted data in the cache. */ - if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) { - struct kvm_sev_info *sev; - unsigned long va_start; - u64 start, stop; + if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + goto do_wbinvd; - /* Align start and stop to page boundaries. */ - va_start = (unsigned long)va; - start = (u64)va_start & PAGE_MASK; - stop = PAGE_ALIGN((u64)va_start + len); + return; - if (start < stop) { - sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; - - while (start < stop) { - wrmsrl(MSR_AMD64_VM_PAGE_FLUSH, - start | sev->asid); - - start += PAGE_SIZE; - } - - return; - } - - WARN(1, "Address overflow, using WBINVD\n"); - } - - /* - * Hardware should always have one of the above features, - * but if not, use WBINVD and issue a warning. - */ - WARN_ONCE(1, "Using WBINVD to flush guest memory\n"); +do_wbinvd: wbinvd_on_all_cpus(); } @@ -2284,7 +2269,8 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); if (vcpu->arch.guest_state_protected) - sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE); + sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); + __free_page(virt_to_page(svm->sev_es.vmsa)); if (svm->sev_es.ghcb_sa_free) From d45829b351ee6ec5f54dd55e6aca1f44fe239fe6 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 21 Apr 2022 03:14:06 +0000 Subject: [PATCH 160/196] KVM: SVM: Flush when freeing encrypted pages even on SME_COHERENT CPUs Use clflush_cache_range() to flush the confidential memory when SME_COHERENT is supported in AMD CPU. Cache flush is still needed since SME_COHERENT only support cache invalidation at CPU side. All confidential cache lines are still incoherent with DMA devices. Cc: stable@vger.kerel.org Fixes: add5e2f04541 ("KVM: SVM: Add support for the SEV-ES VMSA") Reviewed-by: Sean Christopherson Signed-off-by: Mingwei Zhang Message-Id: <20220421031407.2516575-3-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b77b3913e2d9..9a0375987029 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2239,11 +2239,14 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) unsigned long addr = (unsigned long)va; /* - * If hardware enforced cache coherency for encrypted mappings of the - * same physical page is supported, nothing to do. + * If CPU enforced cache coherency for encrypted mappings of the + * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache + * flush is still needed in order to work properly with DMA devices. */ - if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) + if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) { + clflush_cache_range(va, PAGE_SIZE); return; + } /* * VM Page Flush takes a host virtual address and a guest ASID. Fall From 23e3d7f7061f8682c751c46512718f47580ad8f0 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 17 Mar 2022 22:21:37 +0800 Subject: [PATCH 161/196] jbd2: fix a potential race while discarding reserved buffers after an abort we got issue as follows: [ 72.796117] EXT4-fs error (device sda): ext4_journal_check_start:83: comm fallocate: Detected aborted journal [ 72.826847] EXT4-fs (sda): Remounting filesystem read-only fallocate: fallocate failed: Read-only file system [ 74.791830] jbd2_journal_commit_transaction: jh=0xffff9cfefe725d90 bh=0x0000000000000000 end delay [ 74.793597] ------------[ cut here ]------------ [ 74.794203] kernel BUG at fs/jbd2/transaction.c:2063! [ 74.794886] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 74.795533] CPU: 4 PID: 2260 Comm: jbd2/sda-8 Not tainted 5.17.0-rc8-next-20220315-dirty #150 [ 74.798327] RIP: 0010:__jbd2_journal_unfile_buffer+0x3e/0x60 [ 74.801971] RSP: 0018:ffffa828c24a3cb8 EFLAGS: 00010202 [ 74.802694] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 74.803601] RDX: 0000000000000001 RSI: ffff9cfefe725d90 RDI: ffff9cfefe725d90 [ 74.804554] RBP: ffff9cfefe725d90 R08: 0000000000000000 R09: ffffa828c24a3b20 [ 74.805471] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9cfefe725d90 [ 74.806385] R13: ffff9cfefe725d98 R14: 0000000000000000 R15: ffff9cfe833a4d00 [ 74.807301] FS: 0000000000000000(0000) GS:ffff9d01afb00000(0000) knlGS:0000000000000000 [ 74.808338] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 74.809084] CR2: 00007f2b81bf4000 CR3: 0000000100056000 CR4: 00000000000006e0 [ 74.810047] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 74.810981] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 74.811897] Call Trace: [ 74.812241] [ 74.812566] __jbd2_journal_refile_buffer+0x12f/0x180 [ 74.813246] jbd2_journal_refile_buffer+0x4c/0xa0 [ 74.813869] jbd2_journal_commit_transaction.cold+0xa1/0x148 [ 74.817550] kjournald2+0xf8/0x3e0 [ 74.819056] kthread+0x153/0x1c0 [ 74.819963] ret_from_fork+0x22/0x30 Above issue may happen as follows: write truncate kjournald2 generic_perform_write ext4_write_begin ext4_walk_page_buffers do_journal_get_write_access ->add BJ_Reserved list ext4_journalled_write_end ext4_walk_page_buffers write_end_fn ext4_handle_dirty_metadata ***************JBD2 ABORT************** jbd2_journal_dirty_metadata -> return -EROFS, jh in reserved_list jbd2_journal_commit_transaction while (commit_transaction->t_reserved_list) jh = commit_transaction->t_reserved_list; truncate_pagecache_range do_invalidatepage ext4_journalled_invalidatepage jbd2_journal_invalidatepage journal_unmap_buffer __dispose_buffer __jbd2_journal_unfile_buffer jbd2_journal_put_journal_head ->put last ref_count __journal_remove_journal_head bh->b_private = NULL; jh->b_bh = NULL; jbd2_journal_refile_buffer(journal, jh); bh = jh2bh(jh); ->bh is NULL, later will trigger null-ptr-deref journal_free_journal_head(jh); After commit 96f1e0974575, we no longer hold the j_state_lock while iterating over the list of reserved handles in jbd2_journal_commit_transaction(). This potentially allows the journal_head to be freed by journal_unmap_buffer while the commit codepath is also trying to free the BJ_Reserved buffers. Keeping j_state_lock held while trying extends hold time of the lock minimally, and solves this issue. Fixes: 96f1e0974575("jbd2: avoid long hold times of j_state_lock while committing a transaction") Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220317142137.1821590-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5b9408e3b370..ac7f067b7bdd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -488,7 +488,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_wait_updates(journal); commit_transaction->t_state = T_SWITCH; - write_unlock(&journal->j_state_lock); J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); @@ -508,6 +507,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) * has reserved. This is consistent with the existing behaviour * that multiple jbd2_journal_get_write_access() calls to the same * buffer are perfectly permissible. + * We use journal->j_state_lock here to serialize processing of + * t_reserved_list with eviction of buffers from journal_unmap_buffer(). */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; @@ -527,6 +528,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_refile_buffer(journal, jh); } + write_unlock(&journal->j_state_lock); /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially From b07908ab26ceab51165c13714277c19252e62594 Mon Sep 17 00:00:00 2001 From: Gongjun Song Date: Thu, 21 Apr 2022 11:35:46 -0500 Subject: [PATCH 162/196] ALSA: hda: intel-dsp-config: Add RaptorLake PCI IDs Add RaptorLake-P PCI IDs Reviewed-by: Kai Vehmanen Signed-off-by: Gongjun Song Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20220421163546.319604-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai --- sound/hda/intel-dsp-config.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c index 8b0a16ba27d3..a8fe01764b25 100644 --- a/sound/hda/intel-dsp-config.c +++ b/sound/hda/intel-dsp-config.c @@ -424,6 +424,15 @@ static const struct config_entry config_table[] = { .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, .device = 0x54c8, }, + /* RaptorLake-P */ + { + .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, + .device = 0x51ca, + }, + { + .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, + .device = 0x51cb, + }, #endif }; From 86222af07abf1f5f07a5873cc399c29ab8a9b8b8 Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Thu, 21 Apr 2022 11:04:12 -0600 Subject: [PATCH 163/196] ALSA: hda/realtek: Add quirk for Clevo NP70PNP Fixes headset detection on Clevo NP70PNP. Signed-off-by: Tim Crawford Cc: Link: https://lore.kernel.org/r/20220421170412.3697-1-tcrawford@system76.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 0cba2f19a772..4c0c593f3c0a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9170,6 +9170,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x8562, "Clevo NH[57][0-9]RZ[Q]", ALC269_FIXUP_DMIC), SND_PCI_QUIRK(0x1558, 0x8668, "Clevo NP50B[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x866d, "Clevo NP5[05]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x867c, "Clevo NP7[01]PNP", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x867d, "Clevo NP7[01]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8680, "Clevo NJ50LU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8686, "Clevo NH50[CZ]U", ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME), From 683412ccf61294d727ead4a73d97397396e69a6b Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 21 Apr 2022 03:14:07 +0000 Subject: [PATCH 164/196] KVM: SEV: add cache flush to solve SEV cache incoherency issues Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned. Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time. KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running. Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs. Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson Reported-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Message-Id: <20220421031407.2516575-4-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/sev.c | 8 ++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/svm/svm.h | 2 ++ arch/x86/kvm/x86.c | 5 +++++ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 27 ++++++++++++++++++++++++--- 8 files changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 3c368b639c04..1a6d7e3f6c32 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -118,6 +118,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) +KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e0c0f0e1f754..4ff36610af6a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1484,6 +1484,7 @@ struct kvm_x86_ops { int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); + void (*guest_memory_reclaimed)(struct kvm *kvm); int (*get_msr_feature)(struct kvm_msr_entry *entry); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9a0375987029..0ad70c12c7c3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2262,6 +2262,14 @@ do_wbinvd: wbinvd_on_all_cpus(); } +void sev_guest_memory_reclaimed(struct kvm *kvm) +{ + if (!sev_guest(kvm)) + return; + + wbinvd_on_all_cpus(); +} + void sev_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd4c64b362d2..7e45d03cd018 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4620,6 +4620,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, .mem_enc_unregister_region = sev_mem_enc_unregister_region, + .guest_memory_reclaimed = sev_guest_memory_reclaimed, .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f77a7d2d39dd..f76deff71002 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -609,6 +609,8 @@ int sev_mem_enc_unregister_region(struct kvm *kvm, struct kvm_enc_region *range); int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); +void sev_guest_memory_reclaimed(struct kvm *kvm); + void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c89dc09a764f..a6ab19afc638 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9889,6 +9889,11 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); } +void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) +{ + static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); +} + static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2dab4b696682..34eed5f85ed6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2219,6 +2219,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, unsigned long start, unsigned long end); +void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); + #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); #else diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2a23f24d13cf..f30bb8c16f26 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -164,6 +164,10 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, { } +__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) +{ +} + bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) { /* @@ -357,6 +361,12 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); #endif +static void kvm_flush_shadow_all(struct kvm *kvm) +{ + kvm_arch_flush_shadow_all(kvm); + kvm_arch_guest_memory_reclaimed(kvm); +} + #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) @@ -485,12 +495,15 @@ typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, unsigned long end); +typedef void (*on_unlock_fn_t)(struct kvm *kvm); + struct kvm_hva_range { unsigned long start; unsigned long end; pte_t pte; hva_handler_t handler; on_lock_fn_t on_lock; + on_unlock_fn_t on_unlock; bool flush_on_ret; bool may_block; }; @@ -578,8 +591,11 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && ret) kvm_flush_remote_tlbs(kvm); - if (locked) + if (locked) { KVM_MMU_UNLOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_unlock)) + range->on_unlock(kvm); + } srcu_read_unlock(&kvm->srcu, idx); @@ -600,6 +616,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .pte = pte, .handler = handler, .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = false, }; @@ -619,6 +636,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .pte = __pte(0), .handler = handler, .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = false, }; @@ -687,6 +705,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .pte = __pte(0), .handler = kvm_unmap_gfn_range, .on_lock = kvm_inc_notifier_count, + .on_unlock = kvm_arch_guest_memory_reclaimed, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; @@ -741,6 +760,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .pte = __pte(0), .handler = (void *)kvm_null_fn, .on_lock = kvm_dec_notifier_count, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; @@ -813,7 +833,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, int idx; idx = srcu_read_lock(&kvm->srcu); - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); } @@ -1225,7 +1245,7 @@ static void kvm_destroy_vm(struct kvm *kvm) WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); kvm->mn_active_invalidate_count = 0; #else - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); @@ -1652,6 +1672,7 @@ static void kvm_invalidate_memslot(struct kvm *kvm, * - kvm_is_visible_gfn (mmu_check_root) */ kvm_arch_flush_shadow_memslot(kvm, old); + kvm_arch_guest_memory_reclaimed(kvm); /* Was released by kvm_swap_active_memslots, reacquire. */ mutex_lock(&kvm->slots_arch_lock); From f18b4aebe107d092e384b1ae680b1e1de7a0196d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Apr 2022 06:27:27 -0400 Subject: [PATCH 165/196] kvm: selftests: do not use bitfields larger than 32-bits for PTEs Red Hat's QE team reported test failure on access_tracking_perf_test: Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages guest physical test memory offset: 0x3fffbffff000 Populating memory : 0.684014577s Writing to populated memory : 0.006230175s Reading from populated memory : 0.004557805s ==== Test Assertion Failure ==== lib/kvm_util.c:1411: false pid=125806 tid=125809 errno=4 - Interrupted system call 1 0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411 2 (inlined by) addr_gpa2hva at kvm_util.c:1405 3 0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98 4 (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152 5 (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232 6 0x00007fefe9ff81ce: ?? ??:0 7 0x00007fefe9c64d82: ?? ??:0 No vm physical memory at 0xffbffff000 I can easily reproduce it with a Intel(R) Xeon(R) CPU E5-2630 with 46 bits PA. It turns out that the address translation for clearing idle page tracking returned a wrong result; addr_gva2gpa()'s last step, which is based on "pte[index[0]].pfn", did the calculation with 40 bits length and the high 12 bits got truncated. In above case the GPA address to be returned should be 0x3fffbffff000 for GVA 0xc0000000, but it got truncated into 0xffbffff000 and the subsequent gpa2hva lookup failed. The width of operations on bit fields greater than 32-bit is implementation defined, and differs between GCC (which uses the bitfield precision) and clang (which uses 64-bit arithmetic), so this is a potential minefield. Remove the bit fields and using manual masking instead. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036 Reported-by: Nana Liu Reviewed-by: Peter Xu Tested-by: Peter Xu Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 15 ++ .../selftests/kvm/lib/x86_64/processor.c | 190 +++++++----------- 2 files changed, 91 insertions(+), 114 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 37db341d4cc5..86e79af64dea 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -60,6 +60,21 @@ /* CPUID.0x8000_0001.EDX */ #define CPUID_GBPAGES (1ul << 26) +/* Page table bitfield declarations */ +#define PTE_PRESENT_MASK BIT_ULL(0) +#define PTE_WRITABLE_MASK BIT_ULL(1) +#define PTE_USER_MASK BIT_ULL(2) +#define PTE_ACCESSED_MASK BIT_ULL(5) +#define PTE_DIRTY_MASK BIT_ULL(6) +#define PTE_LARGE_MASK BIT_ULL(7) +#define PTE_GLOBAL_MASK BIT_ULL(8) +#define PTE_NX_MASK BIT_ULL(63) + +#define PAGE_SHIFT 12 + +#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) +#define PTE_GET_PFN(pte) (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) + /* General Registers in 64-Bit Mode */ struct gpr64_regs { u64 rax; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 9f000dfb5594..0dd442c26015 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -19,38 +19,6 @@ vm_vaddr_t exception_handlers; -/* Virtual translation table structure declarations */ -struct pageUpperEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t pfn:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageTableEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t dirty:1; - uint64_t reserved_07:1; - uint64_t global:1; - uint64_t ignored_11_09:3; - uint64_t pfn:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { @@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, return &page_table[index]; } -static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm, - uint64_t pt_pfn, - uint64_t vaddr, - uint64_t paddr, - int level, - enum x86_page_size page_size) +static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, + uint64_t pt_pfn, + uint64_t vaddr, + uint64_t paddr, + int level, + enum x86_page_size page_size) { - struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level); + uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level); - if (!pte->present) { - pte->writable = true; - pte->present = true; - pte->page_size = (level == page_size); - if (pte->page_size) - pte->pfn = paddr >> vm->page_shift; + if (!(*pte & PTE_PRESENT_MASK)) { + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; + if (level == page_size) + *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); else - pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift; + *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; } else { /* * Entry already present. Assert that the caller doesn't want @@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm, TEST_ASSERT(level != page_size, "Cannot create hugepage at level: %u, vaddr: 0x%lx\n", page_size, vaddr); - TEST_ASSERT(!pte->page_size, + TEST_ASSERT(!(*pte & PTE_LARGE_MASK), "Cannot create page table at level: %u, vaddr: 0x%lx\n", level, vaddr); } @@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, enum x86_page_size page_size) { const uint64_t pg_size = 1ull << ((page_size * 9) + 12); - struct pageUpperEntry *pml4e, *pdpe, *pde; - struct pageTableEntry *pte; + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); @@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, */ pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift, vaddr, paddr, 3, page_size); - if (pml4e->page_size) + if (*pml4e & PTE_LARGE_MASK) return; - pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size); - if (pdpe->page_size) + pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size); + if (*pdpe & PTE_LARGE_MASK) return; - pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size); - if (pde->page_size) + pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size); + if (*pde & PTE_LARGE_MASK) return; /* Fill in page table entry. */ - pte = virt_get_pte(vm, pde->pfn, vaddr, 0); - TEST_ASSERT(!pte->present, + pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0); + TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); - pte->pfn = paddr >> vm->page_shift; - pte->writable = true; - pte->present = 1; + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); } void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) @@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K); } -static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, +static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) { uint16_t index[4]; - struct pageUpperEntry *pml4e, *pdpe, *pde; - struct pageTableEntry *pte; + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; struct kvm_cpuid_entry2 *entry; struct kvm_sregs sregs; int max_phy_addr; @@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc index[3] = (vaddr >> 39) & 0x1ffu; pml4e = addr_gpa2hva(vm, vm->pgd); - TEST_ASSERT(pml4e[index[3]].present, + TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK, "Expected pml4e to be present for gva: 0x%08lx", vaddr); - TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) & - (rsvd_mask | (1ull << 7))) == 0, + TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0, "Unexpected reserved bits set."); - pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); - TEST_ASSERT(pdpe[index[2]].present, + pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); + TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK, "Expected pdpe to be present for gva: 0x%08lx", vaddr); - TEST_ASSERT(pdpe[index[2]].page_size == 0, + TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK), "Expected pdpe to map a pde not a 1-GByte page."); - TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0, + TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0, "Unexpected reserved bits set."); - pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); - TEST_ASSERT(pde[index[1]].present, + pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); + TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK, "Expected pde to be present for gva: 0x%08lx", vaddr); - TEST_ASSERT(pde[index[1]].page_size == 0, + TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK), "Expected pde to map a pte not a 2-MByte page."); - TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0, + TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0, "Unexpected reserved bits set."); - pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); - TEST_ASSERT(pte[index[0]].present, + pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); + TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK, "Expected pte to be present for gva: 0x%08lx", vaddr); return &pte[index[0]]; @@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) { - struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); + uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); return *(uint64_t *)pte; } @@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, uint64_t pte) { - struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid, - vaddr); + uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); *(uint64_t *)new_pte = pte; } void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - struct pageUpperEntry *pml4e, *pml4e_start; - struct pageUpperEntry *pdpe, *pdpe_start; - struct pageUpperEntry *pde, *pde_start; - struct pageTableEntry *pte, *pte_start; + uint64_t *pml4e, *pml4e_start; + uint64_t *pdpe, *pdpe_start; + uint64_t *pde, *pde_start; + uint64_t *pte, *pte_start; if (!vm->pgd_created) return; @@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; - if (!pml4e->present) + if (!(*pml4e & PTE_PRESENT_MASK)) continue; - fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u " + fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " " %u\n", indent, "", pml4e - pml4e_start, pml4e, - addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn, - pml4e->writable, pml4e->execute_disable); + addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), + !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); - pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size); + pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; - if (!pdpe->present) + if (!(*pdpe & PTE_PRESENT_MASK)) continue; - fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx " + fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " "%u %u\n", indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - (uint64_t) pdpe->pfn, pdpe->writable, - pdpe->execute_disable); + PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), + !!(*pdpe & PTE_NX_MASK)); - pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size); + pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; - if (!pde->present) + if (!(*pde & PTE_PRESENT_MASK)) continue; fprintf(stream, "%*spde 0x%-3zx %p " - "0x%-12lx 0x%-10lx %u %u\n", + "0x%-12lx 0x%-10llx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - (uint64_t) pde->pfn, pde->writable, - pde->execute_disable); + PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), + !!(*pde & PTE_NX_MASK)); - pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size); + pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; - if (!pte->present) + if (!(*pte & PTE_PRESENT_MASK)) continue; fprintf(stream, "%*spte 0x%-3zx %p " - "0x%-12lx 0x%-10lx %u %u " + "0x%-12lx 0x%-10llx %u %u " " %u 0x%-10lx\n", indent, "", pte - pte_start, pte, addr_hva2gpa(vm, pte), - (uint64_t) pte->pfn, - pte->writable, - pte->execute_disable, - pte->dirty, + PTE_GET_PFN(*pte), + !!(*pte & PTE_WRITABLE_MASK), + !!(*pte & PTE_NX_MASK), + !!(*pte & PTE_DIRTY_MASK), ((uint64_t) n1 << 27) | ((uint64_t) n2 << 18) | ((uint64_t) n3 << 9) @@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint16_t index[4]; - struct pageUpperEntry *pml4e, *pdpe, *pde; - struct pageTableEntry *pte; + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); @@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) if (!vm->pgd_created) goto unmapped_gva; pml4e = addr_gpa2hva(vm, vm->pgd); - if (!pml4e[index[3]].present) + if (!(pml4e[index[3]] & PTE_PRESENT_MASK)) goto unmapped_gva; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size); - if (!pdpe[index[2]].present) + pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); + if (!(pdpe[index[2]] & PTE_PRESENT_MASK)) goto unmapped_gva; - pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size); - if (!pde[index[1]].present) + pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); + if (!(pde[index[1]] & PTE_PRESENT_MASK)) goto unmapped_gva; - pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size); - if (!pte[index[0]].present) + pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); + if (!(pte[index[0]] & PTE_PRESENT_MASK)) goto unmapped_gva; - return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu); + return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu); unmapped_gva: TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); From e852be8b148e117e25be1c98cf72ee489b05919e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Apr 2022 06:27:27 -0400 Subject: [PATCH 166/196] kvm: selftests: introduce and use more page size-related constants Clean up code that was hardcoding masks for various fields, now that the masks are included in processor.h. For more cleanup, define PAGE_SIZE and PAGE_MASK just like in Linux. PAGE_SIZE in particular was defined by several tests. Suggested-by: Sean Christopherson Reviewed-by: Peter Xu Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/x86_64/processor.h | 2 ++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 12 ++++++------ tools/testing/selftests/kvm/x86_64/amx_test.c | 1 - .../selftests/kvm/x86_64/emulator_error_test.c | 1 - tools/testing/selftests/kvm/x86_64/smm_test.c | 2 -- .../selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 1 - tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 1 - tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c | 1 - 8 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 86e79af64dea..d0d51adec76e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -71,6 +71,8 @@ #define PTE_NX_MASK BIT_ULL(63) #define PAGE_SHIFT 12 +#define PAGE_SIZE (1ULL << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) #define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) #define PTE_GET_PFN(pte) (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 0dd442c26015..33ea5e9955d9 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -255,13 +255,13 @@ static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, struct kvm_cpuid_entry2 *entry; struct kvm_sregs sregs; int max_phy_addr; - /* Set the bottom 52 bits. */ - uint64_t rsvd_mask = 0x000fffffffffffff; + uint64_t rsvd_mask = 0; entry = kvm_get_supported_cpuid_index(0x80000008, 0); max_phy_addr = entry->eax & 0x000000ff; - /* Clear the bottom bits of the reserved mask. */ - rsvd_mask = (rsvd_mask >> max_phy_addr) << max_phy_addr; + /* Set the high bits in the reserved mask. */ + if (max_phy_addr < 52) + rsvd_mask = GENMASK_ULL(51, max_phy_addr); /* * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries @@ -271,7 +271,7 @@ static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, */ vcpu_sregs_get(vm, vcpuid, &sregs); if ((sregs.efer & EFER_NX) == 0) { - rsvd_mask |= (1ull << 63); + rsvd_mask |= PTE_NX_MASK; } TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " @@ -549,7 +549,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) if (!(pte[index[0]] & PTE_PRESENT_MASK)) goto unmapped_gva; - return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu); + return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & ~PAGE_MASK); unmapped_gva: TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 52a3ef6629e8..76f65c22796f 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -29,7 +29,6 @@ #define X86_FEATURE_XSAVE (1 << 26) #define X86_FEATURE_OSXSAVE (1 << 27) -#define PAGE_SIZE (1 << 12) #define NUM_TILES 8 #define TILE_SIZE 1024 #define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE) diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index f070ff0224fa..aeb3850f81bd 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -12,7 +12,6 @@ #include "vmx.h" #define VCPU_ID 1 -#define PAGE_SIZE 4096 #define MAXPHYADDR 36 #define MEM_REGION_GVA 0x0000123456789000 diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index a626d40fdb48..b4e0c860769e 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -21,8 +21,6 @@ #define VCPU_ID 1 -#define PAGE_SIZE 4096 - #define SMRAM_SIZE 65536 #define SMRAM_MEMSLOT ((1 << 16) | 1) #define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index e683d0ac3e45..19b35c607dc6 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -32,7 +32,6 @@ #define MSR_IA32_TSC_ADJUST 0x3b #endif -#define PAGE_SIZE 4096 #define VCPU_ID 5 #define TSC_ADJUST_VALUE (1ll << 32) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 865e17146815..bcd370827859 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -23,7 +23,6 @@ #define SHINFO_REGION_GVA 0xc0000000ULL #define SHINFO_REGION_GPA 0xc0000000ULL #define SHINFO_REGION_SLOT 10 -#define PAGE_SIZE 4096 #define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (2 * PAGE_SIZE)) #define DUMMY_REGION_SLOT 11 diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index adc94452b57c..b30fe9de1d4f 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -15,7 +15,6 @@ #define HCALL_REGION_GPA 0xc0000000ULL #define HCALL_REGION_SLOT 10 -#define PAGE_SIZE 4096 static struct kvm_vm *vm; From d5fdade9331f57335af97dbef61cf15b4930abc1 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 12 Apr 2022 09:03:35 +0530 Subject: [PATCH 167/196] RISC-V: mm: Fix set_satp_mode() for platform not having Sv57 When Sv57 is not available the satp.MODE test in set_satp_mode() will fail and lead to pgdir re-programming for Sv48. The pgdir re-programming will fail as well due to pre-existing pgdir entry used for Sv57 and as a result kernel fails to boot on RISC-V platform not having Sv57. To fix above issue, we should clear the pgdir memory in set_satp_mode() before re-programming. Fixes: 011f09d12052 ("riscv: mm: Set sv57 on defaultly") Reported-by: Mayuresh Chitale Signed-off-by: Anup Patel Reviewed-by: Atish Patra Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 9535bea8688c..b0793dc0c291 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -718,6 +718,7 @@ retry: if (!check_l4) { disable_pgtable_l5(); check_l4 = true; + memset(early_pg_dir, 0, PAGE_SIZE); goto retry; } disable_pgtable_l4(); From bf9bac40b7635e2ce43ba0051a64c3fd44312405 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 9 Apr 2022 15:53:17 -0700 Subject: [PATCH 168/196] RISC-V: cpuidle: fix Kconfig select for RISCV_SBI_CPUIDLE There can be lots of build errors when building cpuidle-riscv-sbi.o. They are all caused by a kconfig problem with this warning: WARNING: unmet direct dependencies detected for RISCV_SBI_CPUIDLE Depends on [n]: CPU_IDLE [=y] && RISCV [=y] && RISCV_SBI [=n] Selected by [y]: - SOC_VIRT [=y] && CPU_IDLE [=y] so make the 'select' of RISCV_SBI_CPUIDLE also depend on RISCV_SBI. Fixes: c5179ef1ca0c ("RISC-V: Enable RISC-V SBI CPU Idle driver for QEMU virt machine") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Reviewed-by: Anup Patel Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.socs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 34592d00dde8..f6ef358d8a2c 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -38,7 +38,7 @@ config SOC_VIRT select SIFIVE_PLIC select PM_GENERIC_DOMAINS if PM select PM_GENERIC_DOMAINS_OF if PM && OF - select RISCV_SBI_CPUIDLE if CPU_IDLE + select RISCV_SBI_CPUIDLE if CPU_IDLE && RISCV_SBI help This enables support for QEMU Virt Machine. From aafa9f958342db36c17ac2a7f1b841032c96feb4 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Thu, 21 Apr 2022 09:39:20 +0800 Subject: [PATCH 169/196] ata: pata_marvell: Check the 'bmdma_addr' beforing reading Before detecting the cable type on the dma bar, the driver should check whether the 'bmdma_addr' is zero, which means the adapter does not support DMA, otherwise we will get the following error: [ 5.146634] Bad IO access at port 0x1 (return inb(port)) [ 5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60 [ 5.150856] RIP: 0010:ioread8+0x4a/0x60 [ 5.160238] Call Trace: [ 5.160470] [ 5.160674] marvell_cable_detect+0x6e/0xc0 [pata_marvell] [ 5.161728] ata_eh_recover+0x3520/0x6cc0 [ 5.168075] ata_do_eh+0x49/0x3c0 Signed-off-by: Zheyu Ma Signed-off-by: Damien Le Moal --- drivers/ata/pata_marvell.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c index 0c5a51970fbf..014ccb0f45dc 100644 --- a/drivers/ata/pata_marvell.c +++ b/drivers/ata/pata_marvell.c @@ -77,6 +77,8 @@ static int marvell_cable_detect(struct ata_port *ap) switch(ap->port_no) { case 0: + if (!ap->ioaddr.bmdma_addr) + return ATA_CBL_PATA_UNK; if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1) return ATA_CBL_PATA40; return ATA_CBL_PATA80; From 405ce051236cc65b30bbfe490b28ce60ae6aed85 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 21 Apr 2022 16:35:33 -0700 Subject: [PATCH 170/196] mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb() There is a race condition between memory_failure_hugetlb() and hugetlb free/demotion, which causes setting PageHWPoison flag on the wrong page. The one simple result is that wrong processes can be killed, but another (more serious) one is that the actual error is left unhandled, so no one prevents later access to it, and that might lead to more serious results like consuming corrupted data. Think about the below race window: CPU 1 CPU 2 memory_failure_hugetlb struct page *head = compound_head(p); hugetlb page might be freed to buddy, or even changed to another compound page. get_hwpoison_page -- page is not what we want now... The current code first does prechecks roughly and then reconfirms after taking refcount, but it's found that it makes code overly complicated, so move the prechecks in a single hugetlb_lock range. A newly introduced function, try_memory_failure_hugetlb(), always takes hugetlb_lock (even for non-hugetlb pages). That can be improved, but memory_failure() is rare in principle, so should not be a big problem. Link: https://lkml.kernel.org/r/20220408135323.1559401-2-naoya.horiguchi@linux.dev Fixes: 761ad8d7c7b5 ("mm: hwpoison: introduce memory_failure_hugetlb()") Signed-off-by: Naoya Horiguchi Reported-by: Mike Kravetz Reviewed-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Yang Shi Cc: Dan Carpenter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++ include/linux/mm.h | 8 +++ mm/hugetlb.c | 10 +++ mm/memory-failure.c | 147 ++++++++++++++++++++++++++++------------ 4 files changed, 128 insertions(+), 43 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 53c1b6082a4c..ac2a1d758a80 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -169,6 +169,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); int get_hwpoison_huge_page(struct page *page, bool *hugetlb); +int get_huge_page_for_hwpoison(unsigned long pfn, int flags); void putback_active_hugepage(struct page *page); void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); @@ -378,6 +379,11 @@ static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return 0; } +static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + return 0; +} + static inline void putback_active_hugepage(struct page *page) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index e34edb775334..9f44254af8ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3197,6 +3197,14 @@ extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); +#ifdef CONFIG_MEMORY_FAILURE +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); +#else +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + return 0; +} +#endif #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8ca7cca3c1a..3fc721789743 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6785,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } +int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + int ret; + + spin_lock_irq(&hugetlb_lock); + ret = __get_huge_page_for_hwpoison(pfn, flags); + spin_unlock_irq(&hugetlb_lock); + return ret; +} + void putback_active_hugepage(struct page *page) { spin_lock_irq(&hugetlb_lock); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dcb6bb9cf731..2020944398c9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg) return 0; } -static int memory_failure_hugetlb(unsigned long pfn, int flags) +/* + * Called from hugetlb code with hugetlb_lock held. + * + * Return values: + * 0 - free hugepage + * 1 - in-use hugepage + * 2 - not a hugepage + * -EBUSY - the hugepage is busy (try to retry) + * -EHWPOISON - the hugepage is already hwpoisoned + */ +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) { - struct page *p = pfn_to_page(pfn); - struct page *head = compound_head(p); - int res; - unsigned long page_flags; + struct page *page = pfn_to_page(pfn); + struct page *head = compound_head(page); + int ret = 2; /* fallback to normal page handling */ + bool count_increased = false; + + if (!PageHeadHuge(head)) + goto out; + + if (flags & MF_COUNT_INCREASED) { + ret = 1; + count_increased = true; + } else if (HPageFreed(head) || HPageMigratable(head)) { + ret = get_page_unless_zero(head); + if (ret) + count_increased = true; + } else { + ret = -EBUSY; + goto out; + } if (TestSetPageHWPoison(head)) { - pr_err("Memory failure: %#lx: already hardware poisoned\n", - pfn); - res = -EHWPOISON; - if (flags & MF_ACTION_REQUIRED) + ret = -EHWPOISON; + goto out; + } + + return ret; +out: + if (count_increased) + put_page(head); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Taking refcount of hugetlb pages needs extra care about race conditions + * with basic operations like hugepage allocation/free/demotion. + * So some of prechecks for hwpoison (pinning, and testing/setting + * PageHWPoison) should be done in single hugetlb_lock range. + */ +static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +{ + int res; + struct page *p = pfn_to_page(pfn); + struct page *head; + unsigned long page_flags; + bool retry = true; + + *hugetlb = 1; +retry: + res = get_huge_page_for_hwpoison(pfn, flags); + if (res == 2) { /* fallback to normal page handling */ + *hugetlb = 0; + return 0; + } else if (res == -EHWPOISON) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); + if (flags & MF_ACTION_REQUIRED) { + head = compound_head(p); res = kill_accessing_process(current, page_to_pfn(head), flags); + } return res; + } else if (res == -EBUSY) { + if (retry) { + retry = false; + goto retry; + } + action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); + return res; + } + + head = compound_head(p); + lock_page(head); + + if (hwpoison_filter(p)) { + ClearPageHWPoison(head); + res = -EOPNOTSUPP; + goto out; } num_poisoned_pages_inc(); - if (!(flags & MF_COUNT_INCREASED)) { - res = get_hwpoison_page(p, flags); - if (!res) { - lock_page(head); - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - unlock_page(head); - return -EOPNOTSUPP; - } - unlock_page(head); - res = MF_FAILED; - if (__page_handle_poison(p)) { - page_ref_inc(p); - res = MF_RECOVERED; - } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 0 : -EBUSY; - } else if (res < 0) { - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - return -EBUSY; + /* + * Handling free hugepage. The possible race with hugepage allocation + * or demotion can be prevented by PageHWPoison flag. + */ + if (res == 0) { + unlock_page(head); + res = MF_FAILED; + if (__page_handle_poison(p)) { + page_ref_inc(p); + res = MF_RECOVERED; } + action_result(pfn, MF_MSG_FREE_HUGE, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } - lock_page(head); - /* * The page could have changed compound pages due to race window. * If this happens just bail out. @@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) page_flags = head->flags; - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - put_page(p); - res = -EOPNOTSUPP; - goto out; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1588,6 +1643,12 @@ out: unlock_page(head); return res; } +#else +static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +{ + return 0; +} +#endif static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) @@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; + int hugetlb = 0; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags) } try_again: - if (PageHuge(p)) { - res = memory_failure_hugetlb(pfn, flags); + res = try_memory_failure_hugetlb(pfn, flags, &hugetlb); + if (hugetlb) goto unlock_mutex; - } if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", From d173d5417fb67411e623d394aab986d847e47dad Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Thu, 21 Apr 2022 16:35:37 -0700 Subject: [PATCH 171/196] mm/memory-failure.c: skip huge_zero_page in memory_failure() Kernel panic when injecting memory_failure for the global huge_zero_page, when CONFIG_DEBUG_VM is enabled, as follows. Injecting memory failure for pfn 0x109ff9 at process virtual address 0x20ff9000 page:00000000fb053fc3 refcount:2 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x109e00 head:00000000fb053fc3 order:9 compound_mapcount:0 compound_pincount:0 flags: 0x17fffc000010001(locked|head|node=0|zone=2|lastcpupid=0x1ffff) raw: 017fffc000010001 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000002ffffffff 0000000000000000 page dumped because: VM_BUG_ON_PAGE(is_huge_zero_page(head)) ------------[ cut here ]------------ kernel BUG at mm/huge_memory.c:2499! invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 6 PID: 553 Comm: split_bug Not tainted 5.18.0-rc1+ #11 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 3288b3c 04/01/2014 RIP: 0010:split_huge_page_to_list+0x66a/0x880 Code: 84 9b fb ff ff 48 8b 7c 24 08 31 f6 e8 9f 5d 2a 00 b8 b8 02 00 00 e9 e8 fb ff ff 48 c7 c6 e8 47 3c 82 4c b RSP: 0018:ffffc90000dcbdf8 EFLAGS: 00010246 RAX: 000000000000003c RBX: 0000000000000001 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff823e4c4f RDI: 00000000ffffffff RBP: ffff88843fffdb40 R08: 0000000000000000 R09: 00000000fffeffff R10: ffffc90000dcbc48 R11: ffffffff82d68448 R12: ffffea0004278000 R13: ffffffff823c6203 R14: 0000000000109ff9 R15: ffffea000427fe40 FS: 00007fc375a26740(0000) GS:ffff88842fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc3757c9290 CR3: 0000000102174006 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: try_to_split_thp_page+0x3a/0x130 memory_failure+0x128/0x800 madvise_inject_error.cold+0x8b/0xa1 __x64_sys_madvise+0x54/0x60 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fc3754f8bf9 Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 8 RSP: 002b:00007ffeda93a1d8 EFLAGS: 00000217 ORIG_RAX: 000000000000001c RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc3754f8bf9 RDX: 0000000000000064 RSI: 0000000000003000 RDI: 0000000020ff9000 RBP: 00007ffeda93a200 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000ffffffff R11: 0000000000000217 R12: 0000000000400490 R13: 00007ffeda93a2e0 R14: 0000000000000000 R15: 0000000000000000 This makes huge_zero_page bail out explicitly before split in memory_failure(), thus the panic above won't happen again. Link: https://lkml.kernel.org/r/497d3835612610e370c74e697ea3c721d1d55b9c.1649775850.git.xuyu@linux.alibaba.com Fixes: 6a46079cf57a ("HWPOISON: The high level memory error handler in the VM v7") Signed-off-by: Xu Yu Reported-by: Abaci Suggested-by: Naoya Horiguchi Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: Anshuman Khandual Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2020944398c9..27760c19bad7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1860,6 +1860,19 @@ try_again: } if (PageTransHuge(hpage)) { + /* + * Bail out before SetPageHasHWPoisoned() if hpage is + * huge_zero_page, although PG_has_hwpoisoned is not + * checked in set_huge_zero_page(). + * + * TODO: Handle memory failure of huge_zero_page thoroughly. + */ + if (is_huge_zero_page(hpage)) { + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); + res = -EBUSY; + goto unlock_mutex; + } + /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. From 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 21 Apr 2022 16:35:40 -0700 Subject: [PATCH 172/196] memcg: sync flush only if periodic flush is delayed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Daniel Dao has reported [1] a regression on workloads that may trigger a lot of refaults (anon and file). The underlying issue is that flushing rstat is expensive. Although rstat flush are batched with (nr_cpus * MEMCG_BATCH) stat updates, it seems like there are workloads which genuinely do stat updates larger than batch value within short amount of time. Since the rstat flush can happen in the performance critical codepaths like page faults, such workload can suffer greatly. This patch fixes this regression by making the rstat flushing conditional in the performance critical codepaths. More specifically, the kernel relies on the async periodic rstat flusher to flush the stats and only if the periodic flusher is delayed by more than twice the amount of its normal time window then the kernel allows rstat flushing from the performance critical codepaths. Now the question: what are the side-effects of this change? The worst that can happen is the refault codepath will see 4sec old lruvec stats and may cause false (or missed) activations of the refaulted page which may under-or-overestimate the workingset size. Though that is not very concerning as the kernel can already miss or do false activations. There are two more codepaths whose flushing behavior is not changed by this patch and we may need to come to them in future. One is the writeback stats used by dirty throttling and second is the deactivation heuristic in the reclaim. For now keeping an eye on them and if there is report of regression due to these codepaths, we will reevaluate then. Link: https://lore.kernel.org/all/CA+wXwBSyO87ZX5PVwdHm-=dBjZYECGmfnydUicUyrQqndgX2MQ@mail.gmail.com [1] Link: https://lkml.kernel.org/r/20220304184040.1304781-1-shakeelb@google.com Fixes: 1f828223b799 ("memcg: flush lruvec stats in the refault") Signed-off-by: Shakeel Butt Reported-by: Daniel Dao Tested-by: Ivan Babrou Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Koutný Cc: Frank Hofmann Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 12 +++++++++++- mm/workingset.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a68dce3873fc..89b14729d59f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1012,6 +1012,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); +void mem_cgroup_flush_stats_delayed(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1455,6 +1456,10 @@ static inline void mem_cgroup_flush_stats(void) { } +static inline void mem_cgroup_flush_stats_delayed(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 725f76723220..598fece89e2b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) /* * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can @@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void) if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; + flush_next_time = jiffies_64 + 2*FLUSH_TIME; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); spin_unlock_irqrestore(&stats_flush_lock, flag); @@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void) __mem_cgroup_flush_stats(); } +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + static void flush_memcg_stats_dwork(struct work_struct *w) { __mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } /** diff --git a/mm/workingset.c b/mm/workingset.c index 8a3828acc0bf..592569a8974c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow) mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_delayed(); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if From 0e88904cb700a9654c9f0d9ca4967e761e7c9ee8 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 21 Apr 2022 16:35:43 -0700 Subject: [PATCH 173/196] userfaultfd: mark uffd_wp regardless of VM_WRITE flag When a PTE is set by UFFD operations such as UFFDIO_COPY, the PTE is currently only marked as write-protected if the VMA has VM_WRITE flag set. This seems incorrect or at least would be unexpected by the users. Consider the following sequence of operations that are being performed on a certain page: mprotect(PROT_READ) UFFDIO_COPY(UFFDIO_COPY_MODE_WP) mprotect(PROT_READ|PROT_WRITE) At this point the user would expect to still get UFFD notification when the page is accessed for write, but the user would not get one, since the PTE was not marked as UFFD_WP during UFFDIO_COPY. Fix it by always marking PTEs as UFFD_WP regardless on the write-permission in the VMA flags. Link: https://lkml.kernel.org/r/20220217211602.2769-1-namit@vmware.com Fixes: 292924b26024 ("userfaultfd: wp: apply _PAGE_UFFD_WP bit") Signed-off-by: Nadav Amit Acked-by: Peter Xu Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0cb8e5ef1713..e9bb6db002aa 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - if (writable) { - if (wp_copy) - _dst_pte = pte_mkuffd_wp(_dst_pte); - else - _dst_pte = pte_mkwrite(_dst_pte); - } + + /* + * Always mark a PTE as write-protected when needed, regardless of + * VM_WRITE, which the user might change. + */ + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else if (writable) + _dst_pte = pte_mkwrite(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); From 5f24d5a579d1eace79d505b148808a850b417d4c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Apr 2022 16:35:46 -0700 Subject: [PATCH 174/196] mm, hugetlb: allow for "high" userspace addresses This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 99c7477cee5c..dd3a088db11d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a80356e9dc69..1ad1f4bfa025 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -136,6 +136,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 3aa839f81e63..313b57d55a63 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * From 9c85a9bae267f6b5e5e374d0d023bbbe9db096d3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 21 Apr 2022 16:35:49 -0700 Subject: [PATCH 175/196] selftest/vm: verify mmap addr in mremap_test Avoid calling mmap with requested addresses that are less than the system's mmap_min_addr. When run as root, mmap returns EACCES when trying to map addresses < mmap_min_addr. This is not one of the error codes for the condition to retry the mmap in the test. Rather than arbitrarily retrying on EACCES, don't attempt an mmap until addr > vm.mmap_min_addr. Add a munmap call after an alignment check as the mappings are retained after the retry and can reach the vm.max_map_count sysctl. Link: https://lkml.kernel.org/r/20220420215721.4868-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mremap_test.c | 41 +++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index 7c0b0617b9f8..9a518fee6306 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -63,6 +64,35 @@ enum { .expect_failure = should_fail \ } +/* Returns mmap_min_addr sysctl tunable from procfs */ +static unsigned long long get_mmap_min_addr(void) +{ + FILE *fp; + int n_matched; + static unsigned long long addr; + + if (addr) + return addr; + + fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); + if (fp == NULL) { + ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + exit(KSFT_SKIP); + } + + n_matched = fscanf(fp, "%llu", &addr); + if (n_matched != 1) { + ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + fclose(fp); + exit(KSFT_SKIP); + } + + fclose(fp); + return addr; +} + /* * Returns the start address of the mapping on success, else returns * NULL on failure. @@ -71,8 +101,15 @@ static void *get_source_mapping(struct config c) { unsigned long long addr = 0ULL; void *src_addr = NULL; + unsigned long long mmap_min_addr; + + mmap_min_addr = get_mmap_min_addr(); + retry: addr += c.src_alignment; + if (addr < mmap_min_addr) + goto retry; + src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, -1, 0); @@ -90,8 +127,10 @@ retry: * alignment in the tests. */ if (((unsigned long long) src_addr & (c.src_alignment - 1)) || - !((unsigned long long) src_addr & c.src_alignment)) + !((unsigned long long) src_addr & c.src_alignment)) { + munmap(src_addr, c.region_size); goto retry; + } if (!src_addr) goto error; From 18d609daa546c919fd36b62a7b510c18de4b4af8 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 21 Apr 2022 16:35:52 -0700 Subject: [PATCH 176/196] selftest/vm: verify remap destination address in mremap_test Because mremap does not have a MAP_FIXED_NOREPLACE flag, it can destroy existing mappings. This causes a segfault when regions such as text are remapped and the permissions are changed. Verify the requested mremap destination address does not overlap any existing mappings by using mmap's MAP_FIXED_NOREPLACE flag. Keep incrementing the destination address until a valid mapping is found or fail the current test once the max address is reached. Link: https://lkml.kernel.org/r/20220420215721.4868-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mremap_test.c | 42 ++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index 9a518fee6306..58775dab3cc6 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -64,6 +65,30 @@ enum { .expect_failure = should_fail \ } +/* + * Returns false if the requested remap region overlaps with an + * existing mapping (e.g text, stack) else returns true. + */ +static bool is_remap_region_valid(void *addr, unsigned long long size) +{ + void *remap_addr = NULL; + bool ret = true; + + /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ + remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + + if (remap_addr == MAP_FAILED) { + if (errno == EEXIST) + ret = false; + } else { + munmap(remap_addr, size); + } + + return ret; +} + /* Returns mmap_min_addr sysctl tunable from procfs */ static unsigned long long get_mmap_min_addr(void) { @@ -111,8 +136,8 @@ retry: goto retry; src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, - MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, - -1, 0); + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); if (src_addr == MAP_FAILED) { if (errno == EPERM || errno == EEXIST) goto retry; @@ -179,9 +204,20 @@ static long long remap_region(struct config c, unsigned int threshold_mb, if (!((unsigned long long) addr & c.dest_alignment)) addr = (void *) ((unsigned long long) addr | c.dest_alignment); + /* Don't destroy existing mappings unless expected to overlap */ + while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { + /* Check for unsigned overflow */ + if (addr + c.dest_alignment < addr) { + ksft_print_msg("Couldn't find a valid region to remap to\n"); + ret = -1; + goto out; + } + addr += c.dest_alignment; + } + clock_gettime(CLOCK_MONOTONIC, &t_start); dest_addr = mremap(src_addr, c.region_size, c.region_size, - MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); + MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); clock_gettime(CLOCK_MONOTONIC, &t_end); if (dest_addr == MAP_FAILED) { From e5508fc52c76fe42d8bb091fbd7796eeb64b52c4 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 21 Apr 2022 16:35:55 -0700 Subject: [PATCH 177/196] selftest/vm: support xfail in mremap_test Use ksft_test_result_xfail for the tests which are expected to fail. Link: https://lkml.kernel.org/r/20220420215721.4868-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mremap_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index 58775dab3cc6..db0270127aeb 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -268,7 +268,7 @@ static void run_mremap_test_case(struct test test_case, int *failures, if (remap_time < 0) { if (test_case.expect_failure) - ksft_test_result_pass("%s\n\tExpected mremap failure\n", + ksft_test_result_xfail("%s\n\tExpected mremap failure\n", test_case.name); else { ksft_test_result_fail("%s\n", test_case.name); From 80df2fb95df26c849c6cc137344013cc048a083f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 21 Apr 2022 16:35:58 -0700 Subject: [PATCH 178/196] selftest/vm: add skip support to mremap_test Allow the mremap test to be skipped due to errors such as failing to parse the mmap_min_addr sysctl. Link: https://lkml.kernel.org/r/20220420215721.4868-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/run_vmtests.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 3b265f140c25..352ba00cf26b 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -291,11 +291,16 @@ echo "-------------------" echo "running mremap_test" echo "-------------------" ./mremap_test -if [ $? -ne 0 ]; then +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else echo "[FAIL]" exitcode=1 -else - echo "[PASS]" fi echo "-----------------" From e4a38402c36e42df28eb1a5394be87e6571fb48a Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Thu, 21 Apr 2022 16:36:01 -0700 Subject: [PATCH 179/196] oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can be targeted by the oom reaper. This mapping is used to store the futex robust list head; the kernel does not keep a copy of the robust list and instead references a userspace address to maintain the robustness during a process death. A race can occur between exit_mm and the oom reaper that allows the oom reaper to free the memory of the futex robust list before the exit path has handled the futex death: CPU1 CPU2 -------------------------------------------------------------------- page_fault do_exit "signal" wake_oom_reaper oom_reaper oom_reap_task_mm (invalidates mm) exit_mm exit_mm_release futex_exit_release futex_cleanup exit_robust_list get_user (EFAULT- can't access memory) If the get_user EFAULT's, the kernel will be unable to recover the waiters on the robust_list, leaving userspace mutexes hung indefinitely. Delay the OOM reaper, allowing more time for the exit path to perform the futex cleanup. Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer Based on a patch by Michal Hocko. Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1] Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Joel Savitz Signed-off-by: Nico Pache Co-developed-by: Joel Savitz Suggested-by: Thomas Gleixner Acked-by: Thomas Gleixner Acked-by: Michal Hocko Cc: Rafael Aquini Cc: Waiman Long Cc: Herton R. Krzesinski Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: David Rientjes Cc: Andrea Arcangeli Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Joel Savitz Cc: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 56 +++++++++++++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d5e3c00b74e1..a8911b1f35aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1443,6 +1443,7 @@ struct task_struct { int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; + struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7ec38194f8e1..49d7df39b02d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -632,7 +632,7 @@ done: */ set_bit(MMF_OOM_SKIP, &mm->flags); - /* Drop a reference taken by wake_oom_reaper */ + /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); } @@ -644,12 +644,12 @@ static int oom_reaper(void *unused) struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); - spin_lock(&oom_reaper_lock); + spin_lock_irq(&oom_reaper_lock); if (oom_reaper_list != NULL) { tsk = oom_reaper_list; oom_reaper_list = tsk->oom_reaper_list; } - spin_unlock(&oom_reaper_lock); + spin_unlock_irq(&oom_reaper_lock); if (tsk) oom_reap_task(tsk); @@ -658,20 +658,46 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct timer_list *timer) +{ + struct task_struct *tsk = container_of(timer, struct task_struct, + oom_reaper_timer); + struct mm_struct *mm = tsk->signal->oom_mm; + unsigned long flags; + + /* The victim managed to terminate on its own - see exit_mmap */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + put_task_struct(tsk); + return; + } + + spin_lock_irqsave(&oom_reaper_lock, flags); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; + spin_unlock_irqrestore(&oom_reaper_lock, flags); + trace_wake_reaper(tsk->pid); + wake_up(&oom_reaper_wait); +} + +/* + * Give the OOM victim time to exit naturally before invoking the oom_reaping. + * The timers timeout is arbitrary... the longer it is, the longer the worst + * case scenario for the OOM can take. If it is too small, the oom_reaper can + * get in the way and release resources needed by the process exit path. + * e.g. The futex robust list can sit in Anon|Private memory that gets reaped + * before the exit path is able to wake the futex waiters. + */ +#define OOM_REAPER_DELAY (2*HZ) +static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); - - spin_lock(&oom_reaper_lock); - tsk->oom_reaper_list = oom_reaper_list; - oom_reaper_list = tsk; - spin_unlock(&oom_reaper_lock); - trace_wake_reaper(tsk->pid); - wake_up(&oom_reaper_wait); + timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); + tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; + add_timer(&tsk->oom_reaper_timer); } static int __init oom_init(void) @@ -681,7 +707,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static inline void wake_oom_reaper(struct task_struct *tsk) +static inline void queue_oom_reaper(struct task_struct *tsk) { } #endif /* CONFIG_MMU */ @@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(victim); + queue_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); @@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_lock(victim); if (task_will_free_mem(victim)) { mark_oom_victim(victim); - wake_oom_reaper(victim); + queue_oom_reaper(victim); task_unlock(victim); put_task_struct(victim); return; @@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc) */ if (task_will_free_mem(current)) { mark_oom_victim(current); - wake_oom_reaper(current); + queue_oom_reaper(current); return true; } From 415fccf85920925d23e5358dfb6a64bcf8a6fdd4 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Thu, 21 Apr 2022 16:36:04 -0700 Subject: [PATCH 180/196] MAINTAINERS: add Vincenzo Frascino to KASAN reviewers Add my email address to KASAN reviewers list to make sure that I am Cc'ed in all the KASAN changes that may affect arm64 MTE. Link: https://lkml.kernel.org/r/20220419170640.21404-1-vincenzo.frascino@arm.com Signed-off-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 40fa1955ca3f..19053767bed2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10549,6 +10549,7 @@ M: Andrey Ryabinin R: Alexander Potapenko R: Andrey Konovalov R: Dmitry Vyukov +R: Vincenzo Frascino L: kasan-dev@googlegroups.com S: Maintained F: Documentation/dev-tools/kasan.rst From ecc04463d1a36f88baa750d45dfb02c364e1fdb1 Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Thu, 21 Apr 2022 16:36:07 -0700 Subject: [PATCH 181/196] kcov: don't generate a warning on vm_insert_page()'s failure vm_insert_page()'s failure is not an unexpected condition, so don't do WARN_ONCE() in such a case. Instead, print a kernel message and just return an error code. This flaw has been reported under an OOM condition by sysbot [1]. The message is mainly for the benefit of the test log, in this case the fuzzer's log so that humans inspecting the log can figure out what was going on. KCOV is a testing tool, so I think being a little more chatty when KCOV unexpectedly is about to fail will save someone debugging time. We don't want the WARN, because it's not a kernel bug that syzbot should report, and failure can happen if the fuzzer tries hard enough (as above). Link: https://lkml.kernel.org/r/Ylkr2xrVbhQYwNLf@elver.google.com [1] Link: https://lkml.kernel.org/r/20220401182512.249282-1-nogikh@google.com Fixes: b3d7fe86fbd0 ("kcov: properly handle subsequent mmap calls"), Signed-off-by: Aleksandr Nogikh Acked-by: Marco Elver Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Taras Madan Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 475524bd900a..b3732b210593 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -475,8 +475,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) vma->vm_flags |= VM_DONTEXPAND; for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); - if (vm_insert_page(vma, vma->vm_start + off, page)) - WARN_ONCE(1, "vm_insert_page() failed"); + res = vm_insert_page(vma, vma->vm_start + off, page); + if (res) { + pr_warn_once("kcov: vm_insert_page() failed\n"); + return res; + } } return 0; exit: From 319561669a59d8e9206ab311ae5433ef92fd79d1 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 21 Apr 2022 16:36:10 -0700 Subject: [PATCH 182/196] mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases it is possible for mmu_interval_notifier_remove() to race with mn_tree_inv_end() allowing it to return while the notifier data structure is still in use. Consider the following sequence: CPU0 - mn_tree_inv_end() CPU1 - mmu_interval_notifier_remove() ----------------------------------- ------------------------------------ spin_lock(subscriptions->lock); seq = subscriptions->invalidate_seq; spin_lock(subscriptions->lock); spin_unlock(subscriptions->lock); subscriptions->invalidate_seq++; wait_event(invalidate_seq != seq); return; interval_tree_remove(interval_sub); kfree(interval_sub); spin_unlock(subscriptions->lock); wake_up_all(); As the wait_event() condition is true it will return immediately. This can lead to use-after-free type errors if the caller frees the data structure containing the interval notifier subscription while it is still on a deferred list. Fix this by taking the appropriate lock when reading invalidate_seq to ensure proper synchronisation. I observed this whilst running stress testing during some development. You do have to be pretty unlucky, but it leads to the usual problems of use-after-free (memory corruption, kernel crash, difficult to diagnose WARN_ON, etc). Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier") Signed-off-by: Alistair Popple Signed-off-by: Jason Gunthorpe Cc: Christian König Cc: John Hubbard Cc: Ralph Campbell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 459d195d2ff6..f45ff1b7626a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); From 23bc8f69f0eceecbb87c3801d2e48827d2dca92b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 22 Apr 2022 14:00:33 +0800 Subject: [PATCH 183/196] arm64: mm: fix p?d_leaf() The pmd_leaf() is used to test a leaf mapped PMD, however, it misses the PROT_NONE mapped PMD on arm64. Fix it. A real world issue [1] caused by this was reported by Qian Cai. Also fix pud_leaf(). Link: https://patchwork.kernel.org/comment/24798260/ [1] Fixes: 8aa82df3c123 ("arm64: mm: add p?d_leaf() definitions") Reported-by: Qian Cai Signed-off-by: Muchun Song Link: https://lore.kernel.org/r/20220422060033.48711-1-songmuchun@bytedance.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 94e147e5456c..dff2b483ea50 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -535,7 +535,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) -#define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) #define pmd_bad(pmd) (!pmd_table(pmd)) #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) @@ -625,7 +625,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) -#define pud_leaf(pud) pud_sect(pud) +#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) From 3b8000ae185cb068adbda5f966a3835053c85fd4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 22 Apr 2022 16:01:05 +1000 Subject: [PATCH 184/196] mm/vmalloc: huge vmalloc backing pages should be split rather than compound Huge vmalloc higher-order backing pages were allocated with __GFP_COMP in order to allow the sub-pages to be refcounted by callers such as "remap_vmalloc_page [sic]" (remap_vmalloc_range). However a similar problem exists for other struct page fields callers use, for example fb_deferred_io_fault() takes a vmalloc'ed page and not only refcounts it but uses ->lru, ->mapping, ->index. This is not compatible with compound sub-pages, and can cause bad page state issues like BUG: Bad page state in process swapper/0 pfn:00743 page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x743 flags: 0x7ffff000000000(node=0|zone=0|lastcpupid=0x7ffff) raw: 007ffff000000000 c00c00000001d0c8 c00c00000001d0c8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: corrupted mapping in tail page Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.18.0-rc3-00082-gfc6fff4a7ce1-dirty #2810 Call Trace: dump_stack_lvl+0x74/0xa8 (unreliable) bad_page+0x12c/0x170 free_tail_pages_check+0xe8/0x190 free_pcp_prepare+0x31c/0x4e0 free_unref_page+0x40/0x1b0 __vunmap+0x1d8/0x420 ... The correct approach is to use split high-order pages for the huge vmalloc backing. These allow callers to treat them in exactly the same way as individually-allocated order-0 pages. Link: https://lore.kernel.org/all/14444103-d51b-0fb3-ee63-c3f182f0b546@molgen.mpg.de/ Signed-off-by: Nicholas Piggin Cc: Paul Menzel Cc: Song Liu Cc: Rick Edgecombe Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 07da85ae825b..cadfbb5155ea 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2653,15 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages) vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { - unsigned int page_order = vm_area_page_order(area); - int i, step = 1U << page_order; + int i; - for (i = 0; i < area->nr_pages; i += step) { + for (i = 0; i < area->nr_pages; i++) { struct page *page = area->pages[i]; BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -step); - __free_pages(page, page_order); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(page, 0); cond_resched(); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2914,12 +2917,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else - /* - * Compound pages required for remap_vmalloc_page if - * high-order pages. - */ - gfp |= __GFP_COMP; + } /* High-order pages or fallback path if "bulk" fails. */ @@ -2933,6 +2931,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid, page = alloc_pages_node(nid, gfp, order); if (unlikely(!page)) break; + /* + * Higher order allocations must be able to be treated as + * indepdenent small pages by callers (as they can with + * small-page vmallocs). Some drivers do their own refcounting + * on vmalloc_to_page() pages, some use page->mapping, + * page->lru, etc. + */ + if (order) + split_page(page, order); /* * Careful, we allocate and map page-order pages, but @@ -2992,11 +2999,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { - int i, step = 1U << page_order; + int i; - for (i = 0; i < area->nr_pages; i += step) - mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, - step); + for (i = 0; i < area->nr_pages; i++) + mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); } /* From b9663a6ff8289a095d56d9a3a3f9c185a7b7b0d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 22 Apr 2022 13:20:21 -0400 Subject: [PATCH 185/196] tools: Add kmem_cache_alloc_lru() Turn kmem_cache_alloc() into a wrapper around kmem_cache_alloc_lru(). Fixes: 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Liam R. Howlett Reported-by: Li Wang --- tools/include/linux/slab.h | 8 +++++++- tools/testing/radix-tree/linux.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/include/linux/slab.h b/tools/include/linux/slab.h index f41d8a0eb1a4..0616409513eb 100644 --- a/tools/include/linux/slab.h +++ b/tools/include/linux/slab.h @@ -28,7 +28,13 @@ static inline void *kzalloc(size_t size, gfp_t gfp) return kmalloc(size, gfp | __GFP_ZERO); } -void *kmem_cache_alloc(struct kmem_cache *cachep, int flags); +struct list_lru; + +void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *, int flags); +static inline void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) +{ + return kmem_cache_alloc_lru(cachep, NULL, flags); +} void kmem_cache_free(struct kmem_cache *cachep, void *objp); struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index 81539f543954..d5c1bcba86fe 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -25,7 +25,8 @@ struct kmem_cache { void (*ctor)(void *); }; -void *kmem_cache_alloc(struct kmem_cache *cachep, int gfp) +void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, + int gfp) { void *p; From 63b1898fffcd8bd81905b95104ecc52b45a97e21 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 22 Apr 2022 13:23:12 -0400 Subject: [PATCH 186/196] XArray: Disallow sibling entries of nodes There is a race between xas_split() and xas_load() which can result in the wrong page being returned, and thus data corruption. Fortunately, it's hard to hit (syzbot took three months to find it) and often guarded with VM_BUG_ON(). The anatomy of this race is: thread A thread B order-9 page is stored at index 0x200 lookup of page at index 0x274 page split starts load of sibling entry at offset 9 stores nodes at offsets 8-15 load of entry at offset 8 The entry at offset 8 turns out to be a node, and so we descend into it, and load the page at index 0x234 instead of 0x274. This is hard to fix on the split side; we could replace the entire node that contains the order-9 page instead of replacing the eight entries. Fixing it on the lookup side is easier; just disallow sibling entries that point to nodes. This cannot ever be a useful thing as the descent would not know the correct offset to use within the new node. The test suite continues to pass, but I have not added a new test for this bug. Reported-by: syzbot+cf4cf13056f85dec2c40@syzkaller.appspotmail.com Tested-by: syzbot+cf4cf13056f85dec2c40@syzkaller.appspotmail.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Signed-off-by: Matthew Wilcox (Oracle) --- lib/xarray.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/xarray.c b/lib/xarray.c index 4acc88ea7c21..54e646e8e6ee 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -207,6 +207,8 @@ static void *xas_descend(struct xa_state *xas, struct xa_node *node) if (xa_is_sibling(entry)) { offset = xa_to_sibling(entry); entry = xa_entry(xas->xa, node, offset); + if (node->shift && xa_is_node(entry)) + entry = XA_RETRY_ENTRY; } xas->xa_offset = offset; From 533bec143a4c32f7b2014a159d0f5376226e5b4d Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 20 Apr 2022 01:49:13 +0000 Subject: [PATCH 187/196] arm/xen: Fix some refcount leaks The of_find_compatible_node() function returns a node pointer with refcount incremented, We should use of_node_put() on it when done Add the missing of_node_put() to release the refcount. Fixes: 9b08aaa3199a ("ARM: XEN: Move xen_early_init() before efi_init()") Fixes: b2371587fe0c ("arm/xen: Read extended regions from DT and init Xen resource") Signed-off-by: Miaoqian Lin Reviewed-by: Stefano Stabellini Signed-off-by: Stefano Stabellini --- arch/arm/xen/enlighten.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index ec5b082f3de6..07eb69f9e7df 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -337,12 +337,15 @@ int __init arch_xen_unpopulated_init(struct resource **res) if (!nr_reg) { pr_err("No extended regions are found\n"); + of_node_put(np); return -EINVAL; } regs = kcalloc(nr_reg, sizeof(*regs), GFP_KERNEL); - if (!regs) + if (!regs) { + of_node_put(np); return -ENOMEM; + } /* * Create resource from extended regions provided by the hypervisor to be @@ -403,8 +406,8 @@ int __init arch_xen_unpopulated_init(struct resource **res) *res = &xen_resource; err: + of_node_put(np); kfree(regs); - return rc; } #endif @@ -424,8 +427,10 @@ static void __init xen_dt_guest_init(void) if (of_address_to_resource(xen_node, GRANT_TABLE_INDEX, &res)) { pr_err("Xen grant table region is not found\n"); + of_node_put(xen_node); return; } + of_node_put(xen_node); xen_grant_frames = res.start; } From 06fb4ecfeac7e00d6704fa5ed19299f2fefb3cc9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 22 Apr 2022 08:14:52 -0500 Subject: [PATCH 188/196] gpio: Request interrupts after IRQ is initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") attempted to fix a race condition that lead to a NULL pointer, but in the process caused a regression for _AEI/_EVT declared GPIOs. This manifests in messages showing deferred probing while trying to allocate IRQs like so: amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x0000 to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x002C to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x003D to IRQ, err -517 [ .. more of the same .. ] The code for walking _AEI doesn't handle deferred probing and so this leads to non-functional GPIO interrupts. Fix this issue by moving the call to `acpi_gpiochip_request_interrupts` to occur after gc->irc.initialized is set. Fixes: 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") Link: https://lore.kernel.org/linux-gpio/BL1PR12MB51577A77F000A008AA694675E2EF9@BL1PR12MB5157.namprd12.prod.outlook.com/ Link: https://bugzilla.suse.com/show_bug.cgi?id=1198697 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215850 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1979 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1976 Reported-by: Mario Limonciello Signed-off-by: Mario Limonciello Reviewed-by: Shreeya Patel Tested-By: Samuel Čavoj Tested-By: lukeluk498@gmail.com Link: Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-and-tested-by: Takashi Iwai Cc: Shreeya Patel Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- drivers/gpio/gpiolib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 085348e08986..b7694171655c 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1601,8 +1601,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gpiochip_set_irq_hooks(gc); - acpi_gpiochip_request_interrupts(gc); - /* * Using barrier() here to prevent compiler from reordering * gc->irq.initialized before initialization of above @@ -1612,6 +1610,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gc->irq.initialized = true; + acpi_gpiochip_request_interrupts(gc); + return 0; } From d22588d73b18fd12fd971e2dab7fa5ddf978e496 Mon Sep 17 00:00:00 2001 From: Guilherme Amadio Date: Sat, 16 Apr 2022 09:45:55 +0200 Subject: [PATCH 189/196] perf clang: Fix header include for LLVM >= 14 The header TargetRegistry.h has moved in LLVM/clang 14. Committer notes: The problem as noticed when building in ubuntu:22.04: 90 98.61 ubuntu:22.04 : FAIL gcc version 11.2.0 (Ubuntu 11.2.0-19ubuntu1) util/c++/clang.cpp:23:10: fatal error: llvm/Support/TargetRegistry.h: No such file or directory 23 | #include "llvm/Support/TargetRegistry.h" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. Fixed after applying this patch. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Guilherme Amadio Tested-by: Arnaldo Carvalho de Melo Link: https://twitter.com/GuilhermeAmadio/status/1514970524232921088 Link: http://lore.kernel.org/lkml/Ylp0M/VYgHOxtcnF@gentoo.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/c++/clang.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/util/c++/clang.cpp b/tools/perf/util/c++/clang.cpp index df7b18fb6b6e..1aad7d6d34aa 100644 --- a/tools/perf/util/c++/clang.cpp +++ b/tools/perf/util/c++/clang.cpp @@ -20,7 +20,11 @@ #include "llvm/Option/Option.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" +#if CLANG_VERSION_MAJOR >= 14 +#include "llvm/MC/TargetRegistry.h" +#else #include "llvm/Support/TargetRegistry.h" +#endif #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" From c6d8df01064333dcf140eda996abdb60a60e24b3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Sun, 17 Apr 2022 19:48:37 +0800 Subject: [PATCH 190/196] perf script: Always allow field 'data_src' for auxtrace If use command 'perf script -F,+data_src' to dump memory samples with Arm SPE trace data, it reports error: # perf script -F,+data_src Samples for 'dummy:u' event do not have DATA_SRC attribute set. Cannot print 'data_src' field. This is because the 'dummy:u' event is absent DATA_SRC bit in its sample type, so if a file contains AUX area tracing data then always allow field 'data_src' to be selected as an option for perf script. Fixes: e55ed3423c1bb29f ("perf arm-spe: Synthesize memory event") Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Alexander Shishkin Cc: German Gomez Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220417114837.839896-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index a2f117936188..cf5eab5431b4 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -461,7 +461,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) return -EINVAL; if (PRINT_FIELD(DATA_SRC) && - evsel__check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC", PERF_OUTPUT_DATA_SRC)) + evsel__do_check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC", PERF_OUTPUT_DATA_SRC, allow_user_set)) return -EINVAL; if (PRINT_FIELD(WEIGHT) && From ccb17caecfbd542f49a2a79ae088136ba8bfb794 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 14 Apr 2022 20:32:01 +0800 Subject: [PATCH 191/196] perf report: Set PERF_SAMPLE_DATA_SRC bit for Arm SPE event Since commit bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") "perf mem report" and "perf report --mem-mode" don't report result if the PERF_SAMPLE_DATA_SRC bit is missed in sample type. The commit ffab487052054162 ("perf: arm-spe: Fix perf report --mem-mode") partially fixes the issue. It adds PERF_SAMPLE_DATA_SRC bit for Arm SPE event, this allows the perf data file generated by kernel v5.18-rc1 or later version can be reported properly. On the other hand, perf tool still fails to be backward compatibility for a data file recorded by an older version's perf which contains Arm SPE trace data. This patch is a workaround in reporting phase, when detects ARM SPE PMU event and without PERF_SAMPLE_DATA_SRC bit, it will force to set the bit in the sample type and give a warning info. Fixes: bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") Reviewed-by: James Clark Signed-off-by: Leo Yan Tested-by: German Gomez Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20220414123201.842754-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1ad75c7ba074..afe4a5539ecc 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -353,6 +353,7 @@ static int report__setup_sample_type(struct report *rep) struct perf_session *session = rep->session; u64 sample_type = evlist__combined_sample_type(session->evlist); bool is_pipe = perf_data__is_pipe(session->data); + struct evsel *evsel; if (session->itrace_synth_opts->callchain || session->itrace_synth_opts->add_callchain || @@ -407,6 +408,19 @@ static int report__setup_sample_type(struct report *rep) } if (sort__mode == SORT_MODE__MEMORY) { + /* + * FIXUP: prior to kernel 5.18, Arm SPE missed to set + * PERF_SAMPLE_DATA_SRC bit in sample type. For backward + * compatibility, set the bit if it's an old perf data file. + */ + evlist__for_each_entry(session->evlist, evsel) { + if (strstr(evsel->name, "arm_spe") && + !(sample_type & PERF_SAMPLE_DATA_SRC)) { + evsel->core.attr.sample_type |= PERF_SAMPLE_DATA_SRC; + sample_type |= PERF_SAMPLE_DATA_SRC; + } + } + if (!is_pipe && !(sample_type & PERF_SAMPLE_DATA_SRC)) { ui__error("Selected --mem-mode but no mem data. " "Did you call perf record without -d?\n"); From 5bb017d4b97a0f135f43ef77091b7edcce4dcee6 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 20 Apr 2022 08:29:21 +0200 Subject: [PATCH 192/196] perf test: Fix error message for test case 71 on s390, where it is not supported Test case 71 'Convert perf time to TSC' is not supported on s390. Subtest 71.1 is skipped with the correct message, but subtest 71.2 is not skipped and fails. The root cause is function evlist__open() called from test__perf_time_to_tsc(). evlist__open() returns -ENOENT because the event cycles:u is not supported by the selected PMU, for example platform s390 on z/VM or an x86_64 virtual machine. The PMU driver returns -ENOENT in this case. This error is leads to the failure. Fix this by returning TEST_SKIP on -ENOENT. Output before: 71: Convert perf time to TSC: 71.1: TSC support: Skip (This architecture does not support) 71.2: Perf time to TSC: FAILED! Output after: 71: Convert perf time to TSC: 71.1: TSC support: Skip (This architecture does not support) 71.2: Perf time to TSC: Skip (perf_read_tsc_conversion is not supported) This also happens on an x86_64 virtual machine: # uname -m x86_64 $ ./perf test -F 71 71: Convert perf time to TSC : 71.1: TSC support : Ok 71.2: Perf time to TSC : FAILED! $ Committer testing: Continues to work on x86_64: $ perf test 71 71: Convert perf time to TSC : 71.1: TSC support : Ok 71.2: Perf time to TSC : Ok $ Fixes: 290fa68bdc458863 ("perf test tsc: Fix error message when not supported") Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Chengdong Li Cc: chengdongli@tencent.com Cc: Heiko Carstens Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20220420062921.1211825-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-time-to-tsc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index cc6df49a65a1..4ad0dfbc8b21 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -123,6 +123,10 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su evsel->core.attr.enable_on_exec = 0; } + if (evlist__open(evlist) == -ENOENT) { + err = TEST_SKIP; + goto out_err; + } CHECK__(evlist__open(evlist)); CHECK__(evlist__mmap(evlist, UINT_MAX)); From 9423edfc5188c436f6df4356d0737bae09d35b82 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Apr 2022 20:25:17 -0700 Subject: [PATCH 193/196] sparc: cacheflush_32.h needs struct page Add a struct page forward declaration to cacheflush_32.h. Fixes this build warning: CC drivers/crypto/xilinx/zynqmp-sha.o In file included from arch/sparc/include/asm/cacheflush.h:11, from include/linux/cacheflush.h:5, from drivers/crypto/xilinx/zynqmp-sha.c:6: arch/sparc/include/asm/cacheflush_32.h:38:37: warning: 'struct page' declared inside parameter list will not be visible outside of this definition or declaration 38 | void sparc_flush_page_to_ram(struct page *page); Exposed by commit 0e03b8fd2936 ("crypto: xilinx - Turn SHA into a tristate and allow COMPILE_TEST") but not Fixes: that commit because the underlying problem is older. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Herbert Xu Cc: David S. Miller Cc: Sam Ravnborg Cc: sparclinux@vger.kernel.org Acked-by: Sam Ravnborg Acked-by: Herbert Xu Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/cacheflush_32.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index 41c6d734a474..adb6991d0455 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -35,6 +35,7 @@ #define flush_page_for_dma(addr) \ sparc32_cachetlb_ops->page_for_dma(addr) +struct page; void sparc_flush_page_to_ram(struct page *page); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 From f2edd118d02dd11449b126f786f09749ca152ba5 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 15 Apr 2022 09:44:11 -0700 Subject: [PATCH 194/196] page_alloc: use vmalloc_huge for large system hash Use vmalloc_huge() in alloc_large_system_hash() so that large system hash (>= PMD_SIZE) could benefit from huge pages. Note that vmalloc_huge only allocates huge pages for systems with HAVE_ARCH_HUGE_VMALLOC. Signed-off-by: Song Liu Reviewed-by: Christoph Hellwig Reviewed-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 33ca8cab21e6..0e42038382c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8919,7 +8919,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags); + table = vmalloc_huge(size, gfp_flags); virt = true; if (table) huge = is_vm_area_hugepages(table); From 9becb688913023124464c5463b4389b3b293f0e7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 22 Apr 2022 11:41:38 -0700 Subject: [PATCH 195/196] kvmalloc: use vmalloc_huge for vmalloc allocations Since commit 559089e0a93d ("vmalloc: replace VM_NO_HUGE_VMAP with VM_ALLOW_HUGE_VMAP"), the use of hugepage mappings for vmalloc is an opt-in strategy, because it caused a number of problems that weren't noticed until x86 enabled it too. One of the issues was fixed by Nick Piggin in commit 3b8000ae185c ("mm/vmalloc: huge vmalloc backing pages should be split rather than compound"), but I'm still worried about page protection issues, and VM_FLUSH_RESET_PERMS in particular. However, like the hash table allocation case (commit f2edd118d02d: "page_alloc: use vmalloc_huge for large system hash"), the use of kvmalloc() should be safe from any such games, since the returned pointer might be a SLUB allocation, and as such no user should reasonably be using it in any odd ways. We also know that the allocations are fairly large, since it falls back to the vmalloc case only when a kmalloc() fails. So using a hugepage mapping seems both safe and relevant. This patch does show a weakness in the opt-in strategy: since the opt-in flag is in the 'vm_flags', not the usual gfp_t allocation flags, very few of the usual interfaces actually expose it. That's not much of an issue in this case that already used one of the fairly specialized low-level vmalloc interfaces for the allocation, but for a lot of other vmalloc() users that might want to opt in, it's going to be very inconvenient. We'll either have to fix any compatibility problems, or expose it in the gfp flags (__GFP_COMP would have made a lot of sense) to allow normal vmalloc() users to use hugepage mappings. That said, the cases that really matter were probably already taken care of by the hash tabel allocation. Link: https://lore.kernel.org/all/20220415164413.2727220-1-song@kernel.org/ Link: https://lore.kernel.org/all/CAHk-=whao=iosX1s5Z4SF-ZGa-ebAukJoAdUJFk5SPwnofV+Vg@mail.gmail.com/ Cc: Nicholas Piggin Cc: Paul Menzel Cc: Song Liu Cc: Rick Edgecombe Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/util.c b/mm/util.c index 54e5e761a9a9..3492a9e81aa3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -592,8 +592,15 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) return NULL; } - return __vmalloc_node(size, 1, flags, node, - __builtin_return_address(0)); + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); From af2d861d4cd2a4da5137f795ee3509e6f944a25b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Apr 2022 14:51:22 -0700 Subject: [PATCH 196/196] Linux 5.18-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fa5112a0ec1b..c3ec1ea42379 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 18 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Superb Owl # *DOCUMENTATION*