From e8e1ce8454c9cc8ad2e4422bef346428e52455e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Apr 2023 09:43:53 +0000 Subject: [PATCH 1/5] net: add debugging checks in skb_attempt_defer_free() Make sure skbs that are stored in softnet_data.defer_list do not have a dst attached. Also make sure the the skb was orphaned. Link: https://lore.kernel.org/netdev/CANn89iJuEVe72bPmEftyEJHLzzN=QNR2yueFjTxYXCEpS5S8HQ@mail.gmail.com/T/ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0d998806b377..bd815a00d2af 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6881,6 +6881,9 @@ nodefer: __kfree_skb(skb); return; } + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(skb->destructor); + sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) From 931e93bdf8ca71cef1f8759c43bc2c5385392b8b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Apr 2023 09:43:54 +0000 Subject: [PATCH 2/5] net: do not provide hard irq safety for sd->defer_lock kfree_skb() can be called from hard irq handlers, but skb_attempt_defer_free() is meant to be used from process or BH contexts, and skb_defer_free_flush() is meant to be called from BH contexts. Not having to mask hard irq can save some cycles. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/skbuff.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 1551aabac343..d15568f5a44f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6632,11 +6632,11 @@ static void skb_defer_free_flush(struct softnet_data *sd) if (!READ_ONCE(sd->defer_list)) return; - spin_lock_irq(&sd->defer_lock); + spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; - spin_unlock_irq(&sd->defer_lock); + spin_unlock(&sd->defer_lock); while (skb != NULL) { next = skb->next; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bd815a00d2af..304a966164d8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6870,7 +6870,6 @@ void skb_attempt_defer_free(struct sk_buff *skb) { int cpu = skb->alloc_cpu; struct softnet_data *sd; - unsigned long flags; unsigned int defer_max; bool kick; @@ -6889,7 +6888,7 @@ nodefer: __kfree_skb(skb); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; - spin_lock_irqsave(&sd->defer_lock, flags); + spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. */ kick = sd->defer_count == (defer_max >> 1); /* Paired with the READ_ONCE() few lines above */ @@ -6898,7 +6897,7 @@ nodefer: __kfree_skb(skb); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ WRITE_ONCE(sd->defer_list, skb); - spin_unlock_irqrestore(&sd->defer_lock, flags); + spin_unlock_bh(&sd->defer_lock); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). From e6f50edfef046cf8ad541b4d5972bf38fdcdec39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Apr 2023 09:43:55 +0000 Subject: [PATCH 3/5] net: move skb_defer_free_flush() up We plan using skb_defer_free_flush() from napi_threaded_poll() in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index d15568f5a44f..81ded215731b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6598,6 +6598,27 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock(&sd->defer_lock); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock(&sd->defer_lock); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; @@ -6624,27 +6645,6 @@ static int napi_threaded_poll(void *data) return 0; } -static void skb_defer_free_flush(struct softnet_data *sd) -{ - struct sk_buff *skb, *next; - - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; - - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock(&sd->defer_lock); - - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; - } -} - static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); From a1aaee7f8f79d1b0595e24f8c3caed24630d6cb6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Apr 2023 09:43:56 +0000 Subject: [PATCH 4/5] net: make napi_threaded_poll() aware of sd->defer_list If we call skb_defer_free_flush() from napi_threaded_poll(), we can avoid to raise IPI from skb_attempt_defer_free() when the list becomes too big. This allows napi_threaded_poll() to rely less on softirqs, and lowers latency caused by a too big list. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 81ded215731b..7d9ec23f97c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6622,6 +6622,7 @@ static void skb_defer_free_flush(struct softnet_data *sd) static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + struct softnet_data *sd; void *have; while (!napi_thread_wait(napi)) { @@ -6629,11 +6630,13 @@ static int napi_threaded_poll(void *data) bool repoll = false; local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); + skb_defer_free_flush(sd); local_bh_enable(); if (!repoll) From 87eff2ec57b6d68d294013d8dd21e839a1175e3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Apr 2023 09:43:57 +0000 Subject: [PATCH 5/5] net: optimize napi_threaded_poll() vs RPS/RFS We use napi_threaded_poll() in order to reduce our softirq dependency. We can add a followup of 821eba962d95 ("net: optimize napi_schedule_rps()") to further remove the need of firing NET_RX_SOFTIRQ whenever RPS/RFS are used. Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a6a3e9457d6c..08fbd4622ccf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3194,7 +3194,10 @@ struct softnet_data { #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif + bool in_net_rx_action; + bool in_napi_threaded_poll; + #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit __rcu *flow_limit; #endif diff --git a/net/core/dev.c b/net/core/dev.c index 7d9ec23f97c6..735096d42c1d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4603,10 +4603,10 @@ static void napi_schedule_rps(struct softnet_data *sd) sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; - /* If not called from net_rx_action() + /* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ. */ - if (!mysd->in_net_rx_action) + if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) __raise_softirq_irqoff(NET_RX_SOFTIRQ); return; } @@ -6631,11 +6631,19 @@ static int napi_threaded_poll(void *data) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); + sd->in_napi_threaded_poll = false; + barrier(); + + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } skb_defer_free_flush(sd); local_bh_enable();