From cfe43f478b79ba45573ca22d52d0d8823be068fa Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 12 Nov 2021 18:52:01 +0000 Subject: [PATCH 001/155] preempt/dynamic: Introduce preemption model accessors CONFIG_PREEMPT{_NONE, _VOLUNTARY} designate either: o The build-time preemption model when !PREEMPT_DYNAMIC o The default boot-time preemption model when PREEMPT_DYNAMIC IOW, using those on PREEMPT_DYNAMIC kernels is meaningless - the actual model could have been set to something else by the "preempt=foo" cmdline parameter. Same problem applies to CONFIG_PREEMPTION. Introduce a set of helpers to determine the actual preemption model used by the live kernel. Suggested-by: Marco Elver Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20211112185203.280040-3-valentin.schneider@arm.com --- include/linux/sched.h | 41 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 53 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index d5e3c00b74e1..67f06f72c50e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2117,6 +2117,47 @@ static inline void cond_resched_rcu(void) #endif } +#ifdef CONFIG_PREEMPT_DYNAMIC + +extern bool preempt_model_none(void); +extern bool preempt_model_voluntary(void); +extern bool preempt_model_full(void); + +#else + +static inline bool preempt_model_none(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_NONE); +} +static inline bool preempt_model_voluntary(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); +} +static inline bool preempt_model_full(void) +{ + return IS_ENABLED(CONFIG_PREEMPT); +} + +#endif + +static inline bool preempt_model_rt(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_RT); +} + +/* + * Does the preemption model allow non-cooperative preemption? + * + * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with + * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the + * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the + * PREEMPT_NONE model. + */ +static inline bool preempt_model_preemptible(void) +{ + return preempt_model_full() || preempt_model_rt(); +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d575b4914925..068c088e9584 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8409,6 +8409,18 @@ static void __init preempt_dynamic_init(void) } } +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) + +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); + #else /* !CONFIG_PREEMPT_DYNAMIC */ static inline void preempt_dynamic_init(void) { } From 8ed00760203d8018bee042fbfe8e076579be2c2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jan 2022 09:52:44 -0800 Subject: [PATCH 002/155] srcu: Tighten cleanup_srcu_struct() GP checks Currently, cleanup_srcu_struct() checks for a grace period in progress, but it does not check for a grace period that has not yet started but which might start at any time. Such a situation could result in a use-after-free bug, so this commit adds a check for a grace period that is needed but not yet started to cleanup_srcu_struct(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6833d8887181..d30e4db04506 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -382,9 +382,11 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), + rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(ssp->sda); From 95ebe80d99de3cb849c522a1f768e5e8befa0b7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Jan 2022 13:16:18 -0800 Subject: [PATCH 003/155] srcu: Fix s/is/if/ typo in srcu_node comment This commit fixed a typo in the srcu_node structure's ->srcu_have_cbs comment. While in the area, redo a couple of comments to take advantage of 100-character line lengths. Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index cb1f4351e8ba..4025840ba9a3 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -47,11 +47,9 @@ struct srcu_data { */ struct srcu_node { spinlock_t __private lock; - unsigned long srcu_have_cbs[4]; /* GP seq for children */ - /* having CBs, but only */ - /* is > ->srcu_gq_seq. */ - unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ - /* have CBs for given GP? */ + unsigned long srcu_have_cbs[4]; /* GP seq for children having CBs, but only */ + /* if greater than ->srcu_gq_seq. */ + unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs have CBs for given GP? */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ From 7b9e9b5856e188c1b3ff51185f3600ee79b4ab41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Jan 2022 13:39:33 -0800 Subject: [PATCH 004/155] srcu: Make srcu_funnel_gp_start() cache ->mynode in snp_leaf Currently, the srcu_funnel_gp_start() walks its local variable snp up the tree and reloads sdp->mynode whenever it is necessary to check whether it is still at the leaf srcu_node level. This works, but is a bit more obtuse than absolutely necessary. In addition, upcoming commits will dynamically size srcu_struct structures, in which case sdp->mynode will no longer necessarily be a constant, and this commit helps prepare for that dynamic sizing. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d30e4db04506..7d13e35e5d27 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -632,20 +632,21 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); - struct srcu_node *snp = sdp->mynode; + struct srcu_node *snp; + struct srcu_node *snp_leaf = sdp->mynode; unsigned long snp_seq; /* Each pass through the loop does one level of the srcu_node tree. */ - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; - if (snp == sdp->mynode && snp_seq == s) + if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == sdp->mynode && snp_seq != s) { + if (snp == snp_leaf && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); @@ -656,7 +657,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, return; } snp->srcu_have_cbs[idx] = s; - if (snp == sdp->mynode) + if (snp == snp_leaf) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); From 994f706872e6ce080506bd795ecf783d5b617de6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jan 2022 09:46:57 -0800 Subject: [PATCH 005/155] srcu: Make Tree SRCU able to operate without snp_node array This commit makes Tree SRCU able to operate without an snp_node array, that is, when the srcu_data structures' ->mynode pointers are NULL. This can result in high contention on the srcu_struct structure's ->lock, but only when there are lots of call_srcu(), synchronize_srcu(), and synchronize_srcu_expedited() calls. Note that when there is no snp_node array, all SRCU callbacks use CPU 0's callback queue. This is optimal in the common case of low update-side load because it removes the need to search each CPU for the single callback that made the grace period happen. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 14 ++- kernel/rcu/srcutree.c | 203 +++++++++++++++++++++------------------ 2 files changed, 124 insertions(+), 93 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 4025840ba9a3..8d1da136a93a 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -63,8 +63,9 @@ struct srcu_struct { struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ + int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t __private lock; /* Protect counters */ + spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ @@ -83,6 +84,17 @@ struct srcu_struct { struct lockdep_map dep_map; }; +/* Values for size state variable (->srcu_size_state). */ +#define SRCU_SIZE_SMALL 0 +#define SRCU_SIZE_ALLOC 1 +#define SRCU_SIZE_WAIT_BARRIER 2 +#define SRCU_SIZE_WAIT_CALL 3 +#define SRCU_SIZE_WAIT_CBS1 4 +#define SRCU_SIZE_WAIT_CBS2 5 +#define SRCU_SIZE_WAIT_CBS3 6 +#define SRCU_SIZE_WAIT_CBS4 7 +#define SRCU_SIZE_BIG 8 + /* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 7d13e35e5d27..e23696edd43b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -152,16 +152,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); } /* * Initialize non-compile-time initialized fields, including the - * associated srcu_node and srcu_data structures. The is_static - * parameter is passed through to init_srcu_struct_nodes(), and - * also tells us that ->sda has already been wired up to srcu_data. + * associated srcu_node and srcu_data structures. The is_static parameter + * tells us that ->sda has already been wired up to srcu_data. */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { + ssp->srcu_size_state = SRCU_SIZE_SMALL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -175,6 +176,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) return -ENOMEM; init_srcu_struct_nodes(ssp); + ssp->srcu_size_state = SRCU_SIZE_BIG; ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ @@ -391,6 +393,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } free_percpu(ssp->sda); ssp->sda = NULL; + ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -439,6 +442,10 @@ static void srcu_gp_start(struct srcu_struct *ssp) struct srcu_data *sdp = this_cpu_ptr(ssp->sda); int state; + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ @@ -539,38 +546,40 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); - cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; - if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; - snp->srcu_have_cbs[idx] = gpseq; - rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; - snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); - if (cbs) - srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); - - /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check) && last_lvl) - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed + 100)) - sdp->srcu_gp_seq_needed = gpseq; - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed_exp + 100)) - sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); - } + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) { + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + } else { + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + srcu_for_each_node_breadth_first(ssp, snp) { + spin_lock_irq_rcu_node(snp); + cbs = false; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + if (last_lvl) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); + } } + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irqrestore_rcu_node(sdp, flags); + } + /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&ssp->srcu_cb_mutex); @@ -599,18 +608,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp { unsigned long flags; - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) - return; - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + if (snp) + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + return; + spin_lock_irqsave_rcu_node(snp, flags); + if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + spin_unlock_irqrestore_rcu_node(snp, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); - return; } - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } spin_lock_irqsave_rcu_node(ssp, flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); @@ -633,36 +643,37 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); struct srcu_node *snp; - struct srcu_node *snp_leaf = sdp->mynode; + struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode); unsigned long snp_seq; - /* Each pass through the loop does one level of the srcu_node tree. */ - for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) - return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { - snp_seq = snp->srcu_have_cbs[idx]; - if (snp == snp_leaf && snp_seq == s) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == snp_leaf && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + if (snp_leaf) + /* Each pass through the loop does one level of the srcu_node tree. */ + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave_rcu_node(snp, flags); + if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + snp_seq = snp->srcu_have_cbs[idx]; + if (snp == snp_leaf && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == snp_leaf && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm + ? SRCU_INTERVAL + : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(ssp, snp, s); return; } - if (!do_norm) - srcu_funnel_exp_start(ssp, snp, s); - return; + snp->srcu_have_cbs[idx] = s; + if (snp == snp_leaf) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(snp, flags); } - snp->srcu_have_cbs[idx] = s; - if (snp == snp_leaf) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } /* Top of tree, must ensure the grace period will be started. */ spin_lock_irqsave_rcu_node(ssp, flags); @@ -820,7 +831,10 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_CALL) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); @@ -840,7 +854,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_funnel_exp_start(ssp, smp_load_acquire(&sdp->mynode), s); srcu_read_unlock(ssp, idx); return s; } @@ -1100,6 +1114,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp) complete(&ssp->srcu_barrier_completion); } +/* + * Enqueue an srcu_barrier() callback on the specified srcu_data + * structure's ->cblist. but only if that ->cblist already has at least one + * callback enqueued. Note that if a CPU already has callbacks enqueue, + * it must have already registered the need for a future grace period, + * so all we need do is enqueue a callback that will use the same grace + * period as the last callback already in the queue. + */ +static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) +{ + spin_lock_irq_rcu_node(sdp); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); + } + spin_unlock_irq_rcu_node(sdp); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @ssp: srcu_struct on which to wait for in-flight callbacks. @@ -1107,7 +1143,6 @@ static void srcu_barrier_cb(struct rcu_head *rhp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; - struct srcu_data *sdp; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1123,27 +1158,11 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); - /* - * Each pass through this loop enqueues a callback, but only - * on CPUs already having callbacks enqueued. Note that if - * a CPU already has callbacks enqueue, it must have already - * registered the need for a future grace period, so all we - * need do is enqueue a callback that will use the same - * grace period as the last callback already in the queue. - */ - for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); - sdp->srcu_barrier_head.func = srcu_barrier_cb; - debug_rcu_head_queue(&sdp->srcu_barrier_head); - if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head)) { - debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); - } - spin_unlock_irq_rcu_node(sdp); - } + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + else + for_each_possible_cpu(cpu) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) From 2ec303113d978931ef368886c4c6bc854493e8bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jan 2022 16:13:52 -0800 Subject: [PATCH 006/155] srcu: Dynamically allocate srcu_node array This commit shrinks the srcu_struct structure by converting its ->node field from a fixed-size compile-time array to a pointer to a dynamically allocated array. In kernels built with large values of NR_CPUS that boot on systems with smaller numbers of CPUs, this can save significant memory. [ paulmck: Apply kernel test robot feedback. ] Reported-by: A cast of thousands Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c | 65 ++++++++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8d1da136a93a..8501b6b45941 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -60,7 +60,7 @@ struct srcu_node { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e23696edd43b..e98cc218e42b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "rcu.h" @@ -75,12 +76,42 @@ do { \ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* - * Initialize SRCU combining tree. Note that statically allocated + * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp) +static void init_srcu_struct_data(struct srcu_struct *ssp) +{ + int cpu; + struct srcu_data *sdp; + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->mynode = NULL; + sdp->cpu = cpu; + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); + sdp->ssp = ssp; + } +} + +/* + * Allocated and initialize SRCU combining tree. Returns @true if + * allocation succeeded and @false otherwise. + */ +static bool init_srcu_struct_nodes(struct srcu_struct *ssp) { int cpu; int i; @@ -92,6 +123,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), GFP_ATOMIC); + if (!ssp->node) + return false; /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; @@ -129,30 +163,20 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); - rcu_segcblist_init(&sdp->srcu_cblist); - sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) snp->grplo = cpu; snp->grphi = cpu; } - sdp->cpu = cpu; - INIT_WORK(&sdp->work, srcu_invoke_callbacks); - timer_setup(&sdp->delay_work, srcu_delay_timer, 0); - sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + return true; } /* @@ -163,6 +187,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -175,8 +200,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) return -ENOMEM; - init_srcu_struct_nodes(ssp); - ssp->srcu_size_state = SRCU_SIZE_BIG; + init_srcu_struct_data(ssp); + if (!init_srcu_struct_nodes(ssp)) { + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + ssp->srcu_size_state = SRCU_SIZE_BIG; + } ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ @@ -393,6 +426,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } free_percpu(ssp->sda); ssp->sda = NULL; + kfree(ssp->node); + ssp->node = NULL; ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); From e2f638365dd6283b7df1cb5e82f5b2746359f062 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jan 2022 15:41:32 -0800 Subject: [PATCH 007/155] srcu: Add size-state transitioning code This is just dead code at the moment, and will be used once the state-transition code is activated. Because srcu_barrier() must be aware of transition before call_srcu(), the state machine waits for an SRCU grace period before callbacks are queued to the non-CPU-0 queues. This requres that portions of srcu_barrier() be enclosed in an SRCU read-side critical section. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e98cc218e42b..00f9aed6f7b9 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -562,6 +562,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) unsigned long mask; struct srcu_data *sdp; struct srcu_node *snp; + int ss_state; /* Prevent more than one additional grace period. */ mutex_lock(&ssp->srcu_cb_mutex); @@ -629,6 +630,15 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { spin_unlock_irq_rcu_node(ssp); } + + /* Transition to big if needed. */ + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { + if (ss_state == SRCU_SIZE_ALLOC) + init_srcu_struct_nodes(ssp); + else + smp_store_release(&ssp->srcu_size_state, ss_state + 1); + } } /* @@ -1178,6 +1188,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; + int idx; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1193,11 +1204,13 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); + idx = srcu_read_lock(ssp); if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); else for_each_possible_cpu(cpu) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); + srcu_read_unlock(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) From 3bedebcf63c2ad7396f1be138bbef91a402f33cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jan 2022 17:05:51 -0800 Subject: [PATCH 008/155] srcu: Make rcutorture dump the SRCU size state This commit adds the numeric and string version of ->srcu_size_state to the Tree-SRCU-specific portion of the rcutorture output. [ paulmck: Apply feedback from kernel test robot and Dan Carpenter. ] [ quic_neeraju: Apply feedback from Jiapeng Chong. ] Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 00f9aed6f7b9..0874e3be6a5a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1407,15 +1407,33 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +static const char * const srcu_size_state_name[] = { + "SRCU_SIZE_SMALL", + "SRCU_SIZE_ALLOC", + "SRCU_SIZE_WAIT_BARRIER", + "SRCU_SIZE_WAIT_CALL", + "SRCU_SIZE_WAIT_CBS1", + "SRCU_SIZE_WAIT_CBS2", + "SRCU_SIZE_WAIT_CBS3", + "SRCU_SIZE_WAIT_CBS4", + "SRCU_SIZE_BIG", + "SRCU_SIZE_???", +}; + void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; + int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); + if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) + ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; + pr_alert("%s%s Tree SRCU g%ld state %d (%s) per-CPU(idx=%d):", + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + srcu_size_state_name[ss_state_idx], idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; From aeb9b39b8f4aac1233302c53a1fd99a73fd2c262 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Jan 2022 16:01:26 -0800 Subject: [PATCH 009/155] srcu: Compute snp_seq earlier in srcu_funnel_gp_start() Currently, srcu_funnel_gp_start() tests snp->srcu_have_cbs[idx] and then separately assigns it to the snp_seq local variable. This commit does the assignment earlier to simplify the code a bit. While in the area, this commit also takes advantage of the 100-character line limit to put the call to srcu_schedule_cbs_sdp() on a single line. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0874e3be6a5a..fec608f69962 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -697,15 +697,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { - snp_seq = snp->srcu_have_cbs[idx]; + snp_seq = snp->srcu_have_cbs[idx]; + if (ULONG_CMP_GE(snp_seq, s)) { if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == snp_leaf && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); return; } if (!do_norm) From cbdc98e93efa7bbf6f2fcd68c73df82c37b5fa65 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Jan 2022 17:03:06 -0800 Subject: [PATCH 010/155] srcu: Use invalid initial value for srcu_node GP sequence numbers Currently, tree SRCU relies on the srcu_node structures being initialized at the same time that the srcu_struct itself is initialized, and thus use the initial grace-period sequence number as the initial value for the srcu_node structure's ->srcu_have_cbs[] and ->srcu_gp_seq_needed_exp fields. Although this has a high probability of also working when the srcu_node array is allocated and initialized at some random later time, it would be better to avoid leaving such things to chance. This commit therefore initializes these fields with 0x2, which is a recognizable invalid value. It then adds the required checks for this invalid value in order to avoid confusion on long-running kernels (especially those on 32-bit systems) that allocate and initialize srcu_node arrays late in life. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fec608f69962..155c430c6a73 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -107,6 +107,18 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) } } +/* Invalid seq state, used during snp node initialization */ +#define SRCU_SNP_INIT_SEQ 0x2 + +/* + * Check whether sequence number corresponding to snp node, + * is invalid. + */ +static inline bool srcu_invl_snp_seq(unsigned long s) +{ + return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; +} + /* * Allocated and initialize SRCU combining tree. Returns @true if * allocation succeeded and @false otherwise. @@ -139,10 +151,10 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp) WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { - snp->srcu_have_cbs[i] = 0; + snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; snp->srcu_data_have_cbs[i] = 0; } - snp->srcu_gp_seq_needed_exp = 0; + snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; if (snp == &ssp->node[0]) { @@ -386,8 +398,7 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), - READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) return 0; return SRCU_INTERVAL; } @@ -561,6 +572,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) int idx; unsigned long mask; struct srcu_data *sdp; + unsigned long sgsne; struct srcu_node *snp; int ss_state; @@ -594,7 +606,8 @@ static void srcu_gp_end(struct srcu_struct *ssp) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + sgsne = snp->srcu_gp_seq_needed_exp; + if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; @@ -652,14 +665,17 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp unsigned long s) { unsigned long flags; + unsigned long sgsne; if (snp) for (; snp != NULL; snp = snp->srcu_parent) { + sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + sgsne = snp->srcu_gp_seq_needed_exp; + if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { spin_unlock_irqrestore_rcu_node(snp, flags); return; } @@ -687,6 +703,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); + unsigned long sgsne; struct srcu_node *snp; struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode); unsigned long snp_seq; @@ -698,7 +715,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; - if (ULONG_CMP_GE(snp_seq, s)) { + if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore_rcu_node(snp, flags); @@ -713,7 +730,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, snp->srcu_have_cbs[idx] = s; if (snp == snp_leaf) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + sgsne = snp->srcu_gp_seq_needed_exp; + if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } From 0b56f953908a751716f2c8f907942674b60d8db5 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 22 Feb 2022 11:39:01 +0530 Subject: [PATCH 011/155] srcu: Ensure snp nodes tree is fully initialized before traversal For configurations where snp node tree is not initialized at init time (added in subsequent commits), srcu_funnel_gp_start() and srcu_funnel_exp_start() can potential traverse and observe the snp nodes' transient (uninitialized) states. This can potentially happen, when init_srcu_struct_nodes() initialization of sdp->mynode races with srcu_funnel_gp_start() and srcu_funnel_exp_start() Consider the case below where srcu_funnel_gp_start() observes sdp->mynode to be not NULL and uses an uninitialized sdp->grpmask P1 P2 init_srcu_struct_nodes() void srcu_funnel_gp_start(...) { for_each_possible_cpu(cpu) { ... sdp->mynode = &snp_first[...]; for (snp = sdp->mynode;...) struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode) ... if (snp_leaf) { for (snp = snp_leaf; ...) ... if (snp == snp_leaf) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } } Similarly, init_srcu_struct_nodes() and srcu_funnel_exp_start() can race, where srcu_funnel_exp_start() could observe state of snp lock before spin_lock_init(). P1 P2 init_srcu_struct_nodes() void srcu_funnel_exp_start(...) { srcu_for_each_node_breadth_first(ssp, snp) { for (; ...) { spin_lock_...(snp, ) spin_lock_init(&ACCESS_PRIVATE(snp, lock)); ... } for_each_possible_cpu(cpu) { ... sdp->mynode = &snp_first[...]; To avoid these issues, ensure that snp node tree initialization is complete i.e. after SRCU_SIZE_WAIT_BARRIER srcu_size_state is reached, before traversing the tree. Given that srcu_funnel_gp_start() and srcu_funnel_exp_start() are called within SRCU read side critical sections, this check is safe, in the sense that all callbacks are enqueued on CPU0 srcu_cblist until SRCU_SIZE_WAIT_CALL is entered, and these read side critical sections (containing srcu_funnel_gp_start() and srcu_funnel_exp_start()) need to complete, before SRCU_SIZE_WAIT_CALL is reached. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 155c430c6a73..2e7ed67646db 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -705,9 +705,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); unsigned long sgsne; struct srcu_node *snp; - struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode); + struct srcu_node *snp_leaf; unsigned long snp_seq; + /* Ensure that snp node tree is fully initialized before traversing it */ + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + snp_leaf = NULL; + else + snp_leaf = sdp->mynode; + if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { @@ -889,10 +895,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, bool needgp = false; unsigned long s; struct srcu_data *sdp; + struct srcu_node *sdp_mynode; + int ss_state; check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_CALL) + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_CALL) sdp = per_cpu_ptr(ssp->sda, 0); else sdp = raw_cpu_ptr(ssp->sda); @@ -912,10 +921,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, needexp = true; } spin_unlock_irqrestore_rcu_node(sdp, flags); + + /* Ensure that snp node tree is fully initialized before traversing it */ + if (ss_state < SRCU_SIZE_WAIT_BARRIER) + sdp_mynode = NULL; + else + sdp_mynode = sdp->mynode; + if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, smp_load_acquire(&sdp->mynode), s); + srcu_funnel_exp_start(ssp, sdp_mynode, s); srcu_read_unlock(ssp, idx); return s; } From c69a00a12e26cf4faffdcdb340cb2d059b61d57e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Jan 2022 15:41:10 -0800 Subject: [PATCH 012/155] srcu: Add boot-time control over srcu_node array allocation This commit adds an srcu_tree.convert_to_big kernel parameter that either refuses to convert at all (0), converts immediately at init_srcu_struct() time (1), or lets rcutorture convert it (2). An addition contention-based dynamic conversion choice will be added, along with documentation. [ paulmck: Apply callback-scanning feedback from Neeraj Upadhyay. ] Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 13 +++++ kernel/rcu/srcutree.c | 48 ++++++++++++------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..1f1fcac7777d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5608,6 +5608,19 @@ off: Disable mitigation and remove performance impact to RDRAND and RDSEED + srcutree.convert_to_big [KNL] + Specifies under what conditions an SRCU tree + srcu_struct structure will be converted to big + form, that is, with an rcu_node tree: + + 0: Never. + 1: At init_srcu_struct() time. + 2: When rcutorture decides to. + + Either way, the srcu_node tree will be sized based + on the actual runtime number of CPUs (nr_cpu_ids) + instead of the compile-time CONFIG_NR_CPUS. + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2e7ed67646db..9dd048989027 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -39,6 +39,15 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* + * Control conversion to SRCU_SIZE_BIG: + * 0: Don't convert at all (default). + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + */ +static int convert_to_big; +module_param(convert_to_big, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -123,7 +132,7 @@ static inline bool srcu_invl_snp_seq(unsigned long s) * Allocated and initialize SRCU combining tree. Returns @true if * allocation succeeded and @false otherwise. */ -static bool init_srcu_struct_nodes(struct srcu_struct *ssp) +static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) { int cpu; int i; @@ -135,7 +144,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); - ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), GFP_ATOMIC); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); if (!ssp->node) return false; @@ -213,17 +222,19 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) return -ENOMEM; init_srcu_struct_data(ssp); - if (!init_srcu_struct_nodes(ssp)) { - if (!is_static) { - free_percpu(ssp->sda); - ssp->sda = NULL; - return -ENOMEM; - } - } else { - ssp->srcu_size_state = SRCU_SIZE_BIG; - } ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + } + } smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -594,7 +605,8 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) { + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_BARRIER) { srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); } else { idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); @@ -603,13 +615,16 @@ static void srcu_gp_end(struct srcu_struct *ssp) cbs = false; last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; + cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); sgsne = snp->srcu_gp_seq_needed_exp; if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; + if (ss_state < SRCU_SIZE_BIG) + mask = ~0; + else + mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); if (cbs) @@ -645,10 +660,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) } /* Transition to big if needed. */ - ss_state = smp_load_acquire(&ssp->srcu_size_state); if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { if (ss_state == SRCU_SIZE_ALLOC) - init_srcu_struct_nodes(ssp); + init_srcu_struct_nodes(ssp, GFP_KERNEL); else smp_store_release(&ssp->srcu_size_state, ss_state + 1); } @@ -1494,6 +1508,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) s1 += c1; } pr_cont(" T(%ld,%ld)\n", s0, s1); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2) + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); From db8f1471c61336477e2bf74dcb00e67d650e6dea Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 26 Jan 2022 10:03:54 -0500 Subject: [PATCH 013/155] srcu: Use export for srcu_struct defined by DEFINE_STATIC_SRCU() If an srcu_struct structure defined by tree SRCU's DEFINE_STATIC_SRCU() is used by a module, sparse will give the following diagnostic: sparse: symbol '__srcu_struct_nodes_srcu' was not declared. Should it be static? The problem is that a within-module DEFINE_STATIC_SRCU() must define a non-static srcu_struct because it is exported by referencing it in a special '__section("___srcu_struct_ptrs")'. This reference is needed so that module load and unloading can invoke init_srcu_struct() and cleanup_srcu_struct(), respectively. Unfortunately, sparse is unaware of '__section("___srcu_struct_ptrs")', resulting in the above false-positive diagnostic. To avoid this false positive, this commit therefore creates a prototype of the srcu_struct with an "extern" keyword. Signed-off-by: Alexander Aring Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8501b6b45941..44e998643f48 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -131,6 +131,7 @@ struct srcu_struct { #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ is_static struct srcu_struct name; \ + extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else From 4a230f8046454df18139ed1232f1a1e8a6dd36c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 11:43:11 -0800 Subject: [PATCH 014/155] srcu: Avoid NULL dereference in srcu_torture_stats_print() You really shouldn't invoke srcu_torture_stats_print() after invoking cleanup_srcu_struct(), but there is really no reason to get a compiler-obfuscated per-CPU-variable NULL pointer dereference as the diagnostic. This commit therefore checks for NULL ->sda and makes a more polite console-message complaint in that case. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 56 ++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9dd048989027..b7138dbe1a2d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1477,37 +1477,43 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) idx = ssp->srcu_idx & 0x1; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; - pr_alert("%s%s Tree SRCU g%ld state %d (%s) per-CPU(idx=%d):", + pr_alert("%s%s Tree SRCU g%ld state %d (%s)", tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, - srcu_size_state_name[ss_state_idx], idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *sdp; + srcu_size_state_name[ss_state_idx]); + if (!ssp->sda) { + // Called after cleanup_srcu_struct(), perhaps. + pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); + } else { + pr_cont(" per-CPU(idx=%d):", idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *sdp; - sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(sdp->srcu_unlock_count[!idx]); - u1 = data_race(sdp->srcu_unlock_count[idx]); + sdp = per_cpu_ptr(ssp->sda, cpu); + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); - l0 = data_race(sdp->srcu_lock_count[!idx]); - l1 = data_race(sdp->srcu_lock_count[idx]); + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %c)", - cpu, c0, c1, - "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); - s0 += c0; - s1 += c1; + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); } - pr_cont(" T(%ld,%ld)\n", s0, s1); if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2) WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC); } From 46470cf85d2b61abd37c6f66c4dacc1bc510d10f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 13:20:49 -0800 Subject: [PATCH 015/155] srcu: Prevent cleanup_srcu_struct() from freeing non-dynamic ->sda When an srcu_struct structure is created (but not in a kernel module) by DEFINE_SRCU() and friends, the per-CPU srcu_data structure is statically allocated. In all other cases, that structure is obtained from alloc_percpu(), in which case cleanup_srcu_struct() must invoke free_percpu() on the resulting ->sda pointer in the srcu_struct pointer. Which it does. Except that it also invokes free_percpu() on the ->sda pointer referencing the statically allocated per-CPU srcu_data structures. Which free_percpu() is surprisingly OK with. This commit nevertheless stops cleanup_srcu_struct() from freeing statically allocated per-CPU srcu_data structures. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 1 + kernel/rcu/srcutree.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 44e998643f48..44bd204498a1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -73,6 +73,7 @@ struct srcu_struct { unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ struct completion srcu_barrier_completion; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b7138dbe1a2d..7209fd95dde9 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -217,6 +217,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); + ssp->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) @@ -226,7 +227,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!is_static) { + if (!ssp->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; return -ENOMEM; @@ -446,8 +447,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(ssp->sda); - ssp->sda = NULL; + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } kfree(ssp->node); ssp->node = NULL; ssp->srcu_size_state = SRCU_SIZE_SMALL; From ee5e2448bceb9400aa27207f0c0220f9dedd85eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 13:47:42 -0800 Subject: [PATCH 016/155] srcu: Explain srcu_funnel_gp_start() call to list_add() is safe This commit adds a comment explaining why an unprotected call to list_add() from srcu_funnel_gp_start() can be safe. TL;DR: It is only called during very early boot when we don't have no steeking concurrency! Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 7209fd95dde9..64993a172cff 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -776,6 +776,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); + + // And how can that list_add() in the "else" clause + // possibly be safe for concurrent execution? Well, + // it isn't. And it does not have to be. After all, it + // can only be executed during early boot when there is only + // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, srcu_get_delay(ssp)); From 99659f64b14e55cfa48980f5396f83820bafd028 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 14:56:39 -0800 Subject: [PATCH 017/155] srcu: Create concurrency-safe helper for initiating size transition Once there are contention-initiated size transitions, it will be possible for rcutorture to initiate a transition at the same time as a contention-initiated transition. This commit therefore creates a concurrency-safe helper function named srcu_transition_to_big() to safely initiate size transitions. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 64993a172cff..c9460374d437 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -272,6 +272,25 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* + * Initiate an idempotent transition to SRCU_SIZE_BIG. + */ +static void srcu_transition_to_big(struct srcu_struct *ssp) +{ + unsigned long flags; + + /* Double-checked locking on ->srcu_size-state. */ + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + return; + spin_lock_irqsave_rcu_node(ssp, flags); + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp, flags); + return; + } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + spin_unlock_irqrestore_rcu_node(ssp, flags); +} + /* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be @@ -1523,8 +1542,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) } pr_cont(" T(%ld,%ld)\n", s0, s1); } - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2) - WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC); + if (convert_to_big == 2) + srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); From 9f2e91d94c91558e3764fe4e01c5e6281a90f239 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 20:32:05 -0800 Subject: [PATCH 018/155] srcu: Add contention-triggered addition of srcu_node tree This commit instruments the acquisitions of the srcu_struct structure's ->lock, enabling the initiation of a transition from SRCU_SIZE_SMALL to SRCU_SIZE_BIG when sufficient contention is experienced. The instrumentation counts the number of trylock failures within the confines of a single jiffy. If that number exceeds the value specified by the srcutree.small_contention_lim kernel boot parameter (which defaults to 100), and if the value specified by the srcutree.convert_to_big kernel boot parameter has the 0x10 bit set (defaults to 0), then a transition will be automatically initiated. By default, there will never be any transitions, so that none of the srcu_struct structures ever gains an srcu_node array. The useful values for srcutree.convert_to_big are: 0x00: Never convert. 0x01: Always convert at init_srcu_struct() time. 0x02: Convert when rcutorture prints its first round of statistics. 0x03: Decide conversion approach at boot given system size. 0x10: Convert if contention is encountered. 0x12: Convert if contention is encountered or when rcutorture prints its first round of statistics, whichever comes first. The value 0x11 acts the same as 0x01 because the conversion happens before there is any chance of contention. [ paulmck: Apply "static" feedback from kernel test robot. ] Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 9 ++ include/linux/srcutree.h | 2 + kernel/rcu/srcutree.c | 107 ++++++++++++++---- 3 files changed, 94 insertions(+), 24 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1f1fcac7777d..177e688768c0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5616,6 +5616,7 @@ 0: Never. 1: At init_srcu_struct() time. 2: When rcutorture decides to. + 0x1X: Above plus if high contention. Either way, the srcu_node tree will be sized based on the actual runtime number of CPUs (nr_cpu_ids) @@ -5638,6 +5639,14 @@ expediting. Set to zero to disable automatic expediting. + srcutree.small_contention_lim [KNL] + Specifies the number of update-side contention + events per jiffy will be tolerated before + initiating a conversion of an srcu_struct + structure to big form. Note that the value of + srcutree.convert_to_big must have the 0x10 bit + set for contention-based conversions to occur. + ssbd= [ARM64,HW] Speculative Store Bypass Disable control diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 44bd204498a1..1b9ff4ed37e4 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -72,6 +72,8 @@ struct srcu_struct { unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ + unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ + unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c9460374d437..0bc6a0a3edee 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -41,13 +41,29 @@ module_param(counter_wrap_check, ulong, 0444); /* * Control conversion to SRCU_SIZE_BIG: - * 0: Don't convert at all (default). - * 1: Convert at init_srcu_struct() time. - * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 0: Don't convert at all (default). + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 3: Decide at boot time based on system shape. + * 0x1x: Convert when excessive contention encountered. */ -static int convert_to_big; +#define SRCU_SIZING_NONE 0 +#define SRCU_SIZING_INIT 1 +#define SRCU_SIZING_TORTURE 2 +#define SRCU_SIZING_AUTO 3 +#define SRCU_SIZING_CONTEND 0x10 +#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) +#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) +#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) +#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) +#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) +static int convert_to_big = SRCU_SIZING_NONE; module_param(convert_to_big, int, 0444); +/* Contention events per jiffy to initiate transition to big. */ +static int small_contention_lim __read_mostly = 100; +module_param(small_contention_lim, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -58,31 +74,40 @@ static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irq_rcu_node(p) \ +#define spin_unlock_irq_rcu_node(p) \ spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ +#define spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* * Initialize SRCU per-CPU data. Note that statically allocated @@ -225,7 +250,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) { + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { if (!ssp->sda_is_static) { free_percpu(ssp->sda); @@ -272,6 +297,15 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* + * Initiate a transition to SRCU_SIZE_BIG with lock held. + */ +static void __srcu_transition_to_big(struct srcu_struct *ssp) +{ + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); +} + /* * Initiate an idempotent transition to SRCU_SIZE_BIG. */ @@ -287,10 +321,35 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) spin_unlock_irqrestore_rcu_node(ssp, flags); return; } - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + __srcu_transition_to_big(ssp); spin_unlock_irqrestore_rcu_node(ssp, flags); } +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + unsigned long j; + + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + return; + j = jiffies; + if (ssp->srcu_size_jiffies != j) { + ssp->srcu_size_jiffies = j; + ssp->srcu_n_lock_retries = 0; + } + if (++ssp->srcu_n_lock_retries <= small_contention_lim) + return; + __srcu_transition_to_big(ssp); +} + /* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be @@ -718,7 +777,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); @@ -779,7 +838,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -1542,7 +1601,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) } pr_cont(" T(%ld,%ld)\n", s0, s1); } - if (convert_to_big == 2) + if (SRCU_SIZING_IS_TORTURE()) srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); From beb84099f1cf51e005e5df77d05b1644e490409e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:04 +0100 Subject: [PATCH 019/155] rcu: Remove rcu_is_nocb_cpu() The rcu_is_nocb_cpu() function is no longer used, so this commmit removes it. Reported-by: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 -- kernel/rcu/tree_nocb.h | 8 -------- 2 files changed, 10 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..4c53a2ccf711 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -538,10 +538,8 @@ extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU -bool rcu_is_nocb_cpu(int cpu); void rcu_bind_current_to_nocb(void); #else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 636d0546a4e9..02e1d05a11fc 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -215,14 +215,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - static bool __wake_nocb_gp(struct rcu_data *rdp_gp, struct rcu_data *rdp, bool force, unsigned long flags) From 8d2aaa9b7c290e766a41f29c71ec72192851d538 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 14 Feb 2022 14:23:39 +0100 Subject: [PATCH 020/155] rcu/nocb: Move rcu_nocb_is_setup to rcu_state This commit moves the RCU nocb initialization witness within rcu_state to consolidate RCU's global state. Reported-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_nocb.h | 13 +++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..f6a3d54585c9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -364,6 +364,7 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + int nocb_is_setup; /* nocb is setup from boot */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 02e1d05a11fc..3c00240833d6 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ - -static bool rcu_nocb_is_setup; - static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); @@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str) cpumask_setall(rcu_nocb_mask); } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; return 1; } __setup("rcu_nocbs", rcu_nocb_setup); @@ -1172,10 +1169,10 @@ void __init rcu_init_nohz(void) return; } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; } - if (!rcu_nocb_is_setup) + if (!rcu_state.nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1233,7 +1230,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct task_struct *t; struct sched_param sp; - if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup) return; /* If there already is an rcuo kthread, then nothing to do. */ @@ -1279,7 +1276,7 @@ static void __init rcu_spawn_nocb_kthreads(void) { int cpu; - if (rcu_nocb_is_setup) { + if (rcu_state.nocb_is_setup) { for_each_online_cpu(cpu) rcu_spawn_cpu_nocb_kthread(cpu); } From 2eed973adc6e749439730e53e6220b122398d319 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:06 +0100 Subject: [PATCH 021/155] rcu: Assume rcu_init() is called before smp The rcu_init() function is called way before SMP is initialized and therefore only the boot CPU should be online at this stage. Simplify the boot per-cpu initialization accordingly. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..e6a9e5744e45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4782,7 +4782,7 @@ static void __init kfree_rcu_batch_init(void) void __init rcu_init(void) { - int cpu; + int cpu = smp_processor_id(); rcu_early_boot_tests(); @@ -4802,11 +4802,10 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) { - rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); - rcutree_online_cpu(cpu); - } + WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. + rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); From 3352911fa9b47a90165e5c6fed440048c55146d1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:07 +0100 Subject: [PATCH 022/155] rcu: Initialize boost kthread only for boot node prior SMP initialization The rcu_spawn_gp_kthread() function is called as an early initcall, which means that SMP initialization hasn't happened yet and only the boot CPU is online. Therefore, create only the boost kthread for the leaf node of the boot CPU. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 16 ---------------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e6a9e5744e45..70b33c55d39a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4480,6 +4480,7 @@ static int __init rcu_spawn_gp_kthread(void) struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); @@ -4498,7 +4499,9 @@ static int __init rcu_spawn_gp_kthread(void) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); - rcu_spawn_boost_kthreads(); + /* This is a pre-SMP initcall, we expect a single CPU */ + WARN_ON(num_online_cpus() > 1); + rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); return 0; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f6a3d54585c9..b2a0f2613ab9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -422,7 +422,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); -static void __init rcu_spawn_boost_kthreads(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8360d86db1c0..b139635f33bd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1226,18 +1226,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -/* - * Spawn boost kthreads -- called as soon as the scheduler is running. - */ -static void __init rcu_spawn_boost_kthreads(void) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) - if (rcu_rnp_online_cpus(rnp)) - rcu_spawn_one_boost_kthread(rnp); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1263,10 +1251,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static void __init rcu_spawn_boost_kthreads(void) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* From 87c5adf06bfbf14c9d13e59d5d174ff5f2aafc0e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Feb 2022 16:42:08 +0100 Subject: [PATCH 023/155] rcu/nocb: Initialize nocb kthreads only for boot CPU prior SMP initialization The rcu_spawn_gp_kthread() function is called as an early initcall, which means that SMP initialization hasn't happened yet and only the boot CPU is online. Therefore, create only the NOCB kthreads related to the boot CPU. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++++- kernel/rcu/tree.h | 1 - kernel/rcu/tree_nocb.h | 20 -------------------- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 70b33c55d39a..9f7441a78f90 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4498,9 +4498,13 @@ static int __init rcu_spawn_gp_kthread(void) smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - rcu_spawn_nocb_kthreads(); /* This is a pre-SMP initcall, we expect a single CPU */ WARN_ON(num_online_cpus() > 1); + /* + * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu() + * due to rcu_scheduler_fully_active. + */ + rcu_spawn_cpu_nocb_kthread(smp_processor_id()); rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); return 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b2a0f2613ab9..25dc4166f218 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -439,7 +439,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); -static void __init rcu_spawn_nocb_kthreads(void); static void show_rcu_nocb_state(struct rcu_data *rdp); static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3c00240833d6..46694e13398a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1266,22 +1266,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - if (rcu_state.nocb_is_setup) { - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); - } -} - /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_gp_stride = -1; module_param(rcu_nocb_gp_stride, int, 0444); @@ -1538,10 +1522,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) { } -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - static void show_rcu_nocb_state(struct rcu_data *rdp) { } From f75fd4b9221d93177c50dcfde671b2e907f53e86 Mon Sep 17 00:00:00 2001 From: Padmanabha Srinivasaiah Date: Thu, 17 Feb 2022 16:25:19 +0100 Subject: [PATCH 024/155] rcu-tasks: Fix race in schedule and flush work While booting secondary CPUs, cpus_read_[lock/unlock] is not keeping online cpumask stable. The transient online mask results in below calltrace. [ 0.324121] CPU1: Booted secondary processor 0x0000000001 [0x410fd083] [ 0.346652] Detected PIPT I-cache on CPU2 [ 0.347212] CPU2: Booted secondary processor 0x0000000002 [0x410fd083] [ 0.377255] Detected PIPT I-cache on CPU3 [ 0.377823] CPU3: Booted secondary processor 0x0000000003 [0x410fd083] [ 0.379040] ------------[ cut here ]------------ [ 0.383662] WARNING: CPU: 0 PID: 10 at kernel/workqueue.c:3084 __flush_work+0x12c/0x138 [ 0.384850] Modules linked in: [ 0.385403] CPU: 0 PID: 10 Comm: rcu_tasks_rude_ Not tainted 5.17.0-rc3-v8+ #13 [ 0.386473] Hardware name: Raspberry Pi 4 Model B Rev 1.4 (DT) [ 0.387289] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 0.388308] pc : __flush_work+0x12c/0x138 [ 0.388970] lr : __flush_work+0x80/0x138 [ 0.389620] sp : ffffffc00aaf3c60 [ 0.390139] x29: ffffffc00aaf3d20 x28: ffffffc009c16af0 x27: ffffff80f761df48 [ 0.391316] x26: 0000000000000004 x25: 0000000000000003 x24: 0000000000000100 [ 0.392493] x23: ffffffffffffffff x22: ffffffc009c16b10 x21: ffffffc009c16b28 [ 0.393668] x20: ffffffc009e53861 x19: ffffff80f77fbf40 x18: 00000000d744fcc9 [ 0.394842] x17: 000000000000000b x16: 00000000000001c2 x15: ffffffc009e57550 [ 0.396016] x14: 0000000000000000 x13: ffffffffffffffff x12: 0000000100000000 [ 0.397190] x11: 0000000000000462 x10: ffffff8040258008 x9 : 0000000100000000 [ 0.398364] x8 : 0000000000000000 x7 : ffffffc0093c8bf4 x6 : 0000000000000000 [ 0.399538] x5 : 0000000000000000 x4 : ffffffc00a976e40 x3 : ffffffc00810444c [ 0.400711] x2 : 0000000000000004 x1 : 0000000000000000 x0 : 0000000000000000 [ 0.401886] Call trace: [ 0.402309] __flush_work+0x12c/0x138 [ 0.402941] schedule_on_each_cpu+0x228/0x278 [ 0.403693] rcu_tasks_rude_wait_gp+0x130/0x144 [ 0.404502] rcu_tasks_kthread+0x220/0x254 [ 0.405264] kthread+0x174/0x1ac [ 0.405837] ret_from_fork+0x10/0x20 [ 0.406456] irq event stamp: 102 [ 0.406966] hardirqs last enabled at (101): [] _raw_spin_unlock_irq+0x78/0xb4 [ 0.408304] hardirqs last disabled at (102): [] el1_dbg+0x24/0x5c [ 0.409410] softirqs last enabled at (54): [] local_bh_enable+0xc/0x2c [ 0.410645] softirqs last disabled at (50): [] local_bh_disable+0xc/0x2c [ 0.411890] ---[ end trace 0000000000000000 ]--- [ 0.413000] smp: Brought up 1 node, 4 CPUs [ 0.413762] SMP: Total of 4 processors activated. [ 0.414566] CPU features: detected: 32-bit EL0 Support [ 0.415414] CPU features: detected: 32-bit EL1 Support [ 0.416278] CPU features: detected: CRC32 instructions [ 0.447021] Callback from call_rcu_tasks_rude() invoked. [ 0.506693] Callback from call_rcu_tasks() invoked. This commit therefore fixes this issue by applying a single-CPU optimization to the RCU Tasks Rude grace-period process. The key point here is that the purpose of this RCU flavor is to force a schedule on each online CPU since some past event. But the rcu_tasks_rude_wait_gp() function runs in the context of the RCU Tasks Rude's grace-period kthread, so there must already have been a context switch on the current CPU since the call to either synchronize_rcu_tasks_rude() or call_rcu_tasks_rude(). So if there is only a single CPU online, RCU Tasks Rude's grace-period kthread does not need to anything at all. It turns out that the rcu_tasks_rude_wait_gp() function's call to schedule_on_each_cpu() causes problems during early boot. During that time, there is only one online CPU, namely the boot CPU. Therefore, applying this single-CPU optimization fixes early-boot instances of this problem. Link: https://lore.kernel.org/lkml/20220210184319.25009-1-treasure4paddy@gmail.com/T/ Suggested-by: Paul E. McKenney Signed-off-by: Padmanabha Srinivasaiah Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 99cf3a13954c..b43320b149d2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -950,6 +950,9 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + if (num_online_cpus() <= 1) + return; // Fastpath for only one CPU. + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } From f25390033fa2445cdc4d6cf8243a9b85d942845f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Feb 2022 16:01:12 -0800 Subject: [PATCH 025/155] rcu-tasks: Print pre-stall-warning informational messages RCU-tasks stall-warning messages are printed after the grace period is ten minutes old. Unfortunately, most of us will have rebooted the system in response to an apparently-hung command long before the ten minutes is up, and will thus see what looks to be a silent hang. This commit therefore adds pr_info() messages that are printed earlier. These should avoid being classified as errors, but should give impatient users a hint. These are controlled by new rcupdate.rcu_task_stall_info and rcupdate.rcu_task_stall_info_mult kernel-boot parameters. The former defines the initial delay in jiffies (defaulting to 10 seconds) and the latter defines the multiplier (defaulting to 3). Thus, by default, the first message will appear 10 seconds into the RCU-tasks grace period, the second 40 seconds in, and the third 160 seconds in. There would be a fourth at 640 seconds in, but the stall warning message appears 600 seconds in, and once a stall warning is printed for a given grace period, no further informational messages are printed. Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 30 ++++++++++++-- kernel/rcu/tasks.h | 40 ++++++++++++++++--- 2 files changed, 62 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..babc701d4864 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4955,10 +4955,34 @@ number avoids disturbing real-time workloads, but lengthens grace periods. + rcupdate.rcu_task_stall_info= [KNL] + Set initial timeout in jiffies for RCU task stall + informational messages, which give some indication + of the problem for those not patient enough to + wait for ten minutes. Informational messages are + only printed prior to the stall-warning message + for a given grace period. Disable with a value + less than or equal to zero. Defaults to ten + seconds. A change in value does not take effect + until the beginning of the next grace period. + + rcupdate.rcu_task_stall_info_mult= [KNL] + Multiplier for time interval between successive + RCU task stall informational messages for a given + RCU tasks grace period. This value is clamped + to one through ten, inclusive. It defaults to + the value three, so that the first informational + message is printed 10 seconds into the grace + period, the second at 40 seconds, the third at + 160 seconds, and then the stall warning at 600 + seconds would prevent a fourth at 640 seconds. + rcupdate.rcu_task_stall_timeout= [KNL] - Set timeout in jiffies for RCU task stall warning - messages. Disable with a value less than or equal - to zero. + Set timeout in jiffies for RCU task stall + warning messages. Disable with a value less + than or equal to zero. Defaults to ten minutes. + A change in value does not take effect until + the beginning of the next grace period. rcupdate.rcu_self_test= [KNL] Run the RCU early boot self tests diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b43320b149d2..76799c81d4be 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -143,6 +143,11 @@ module_param(rcu_task_ipi_delay, int, 0644); #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +#define RCU_TASK_STALL_INFO (HZ * 10) +static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO; +module_param(rcu_task_stall_info, int, 0644); +static int rcu_task_stall_info_mult __read_mostly = 3; +module_param(rcu_task_stall_info_mult, int, 0444); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); @@ -548,8 +553,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) static void __init rcu_tasks_bootup_oddness(void) { #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + int rtsimc; + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + rtsimc = clamp(rcu_task_stall_info_mult, 1, 10); + if (rtsimc != rcu_task_stall_info_mult) { + pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc); + rcu_task_stall_info_mult = rtsimc; + } #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RCU pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); @@ -592,10 +604,15 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t); /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); + struct task_struct *g; int fract; + LIST_HEAD(holdouts); + unsigned long j; + unsigned long lastinfo; + unsigned long lastreport; + bool reported = false; + int rtsi; + struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); rtp->pregp_func(); @@ -621,6 +638,8 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * is empty, we are done. */ lastreport = jiffies; + lastinfo = lastreport; + rtsi = READ_ONCE(rcu_task_stall_info); // Start off with initial wait and slowly back off to 1 HZ wait. fract = rtp->init_fract; @@ -630,7 +649,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) bool needreport; int rtst; - /* Slowly back off waiting for holdouts */ + // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); schedule_timeout_idle(fract); @@ -639,12 +658,23 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) + if (needreport) { lastreport = jiffies; + reported = true; + } firstreport = true; WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); + + // Print pre-stall informational messages if needed. + j = jiffies; + if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { + lastinfo = j; + rtsi = rtsi * rcu_task_stall_info_mult; + pr_info("%s: %s grace period %lu is %lu jiffies old.\n", + __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); + } } set_tasks_gp_state(rtp, RTGS_POST_GP); From 88db792bbe9b140680c74e9f2f801ac00f54e05e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Mar 2022 12:07:25 +0100 Subject: [PATCH 026/155] rcu-tasks: Use rcuwait for the rcu_tasks_kthread() The waitqueue used by rcu_tasks_kthread() has always only one waiter. With a guaranteed only one waiter, this can be replaced with rcuwait which is smaller and simpler. With rcuwait based wake counterpart, the irqwork function (call_rcu_tasks_iw_wakeup()) can be invoked hardirq context because it is only a wake up and no sleeping locks are involved (unlike the wait_queue_head). As a side effect, this is also one piece of the puzzle to pass the RCU selftest at early boot on PREEMPT_RT. Replace wait_queue_head with rcuwait and let the irqwork run in hardirq context on PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 76799c81d4be..4b91cb214ca7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -46,7 +46,7 @@ struct rcu_tasks_percpu { /** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_wq: Wait queue allowing new callback to get kthread's attention. + * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. @@ -77,7 +77,7 @@ struct rcu_tasks_percpu { * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct wait_queue_head cbs_wq; + struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; @@ -113,11 +113,11 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ - .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ + .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \ }; \ static struct rcu_tasks rt_name = \ { \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ .gp_func = gp, \ .call_func = call, \ @@ -266,7 +266,7 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); rtp = rtpcp->rtpp; - wake_up(&rtp->cbs_wq); + rcuwait_wake_up(&rtp->cbs_wait); } // Enqueue a callback for the specified flavor of Tasks RCU. @@ -514,7 +514,9 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* If there were none, wait a bit and start over. */ - wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); if (needgpcb & 0x2) { // Wait for one grace period. From 5d90070816534882b9158f14154b7e2cdef1194a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Mar 2022 10:41:44 -0800 Subject: [PATCH 027/155] rcu-tasks: Make Tasks RCU account for userspace execution The main Tasks RCU quiescent state is voluntary context switch. However, userspace execution is also a valid quiescent state, and is a valuable one for userspace applications that spin repeatedly executing light-weight non-sleeping system calls. Currently, such an application can delay a Tasks RCU grace period for many tens of seconds. This commit therefore enlists the aid of the scheduler-clock interrupt to provide a Tasks RCU quiescent state when it interrupted a task executing in userspace. [ paulmck: Apply feedback from kernel test robot. ] Cc: Martin KaFai Lau Cc: Neil Spring Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 + kernel/rcu/tree.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e7c39c200e2b..1a32036c918c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -196,6 +196,7 @@ void synchronize_rcu_tasks_rude(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +#define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..8dbfb63f0391 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2624,6 +2624,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + if (user) + rcu_tasks_classic_qs(current, false); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); From 777570d9ef820e470736fa9e02b8e3e48891c050 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 8 Mar 2022 09:54:13 -0800 Subject: [PATCH 028/155] rcu-tasks: Use schedule_hrtimeout_range() to wait for grace periods The synchronous RCU-tasks grace-period-wait primitives invoke schedule_timeout_idle() to give readers a chance to exit their read-side critical sections. Unfortunately, this fails during early boot on PREEMPT_RT because PREEMPT_RT relies solely on ksoftirqd to run timer handlers. Because ksoftirqd cannot operate until its kthreads are spawned, there is a brief period of time following scheduler initialization where PREEMPT_RT cannot run the timer handlers that schedule_timeout_idle() relies on, resulting in a hang. To avoid this boot-time hang, this commit replaces schedule_timeout_idle() with schedule_hrtimeout(), so that the timer expires in hardirq context. This is ensures that the timer fires even on PREEMPT_RT throughout the irqs-enabled portions of boot as well as during runtime. The timer is set to expire between fract and fract + HZ / 2 jiffies in order to align with any other timers that might expire during that time, thus reducing the number of wakeups. Note that RCU-tasks grace periods are infrequent, so the use of hrtimer should be fine. In contrast, in common-case code, user of hrtimer could result in performance issues. Cc: Martin KaFai Lau Cc: Andrii Nakryiko Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4b91cb214ca7..71fe340ab82a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -647,13 +647,16 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) fract = rtp->init_fract; while (!list_empty(&holdouts)) { + ktime_t exp; bool firstreport; bool needreport; int rtst; // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(fract); + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); if (fract < HZ) fract++; From bddf7122f7e321d5a677a695e8597064d987482c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Mar 2022 08:08:14 -0700 Subject: [PATCH 029/155] rcu-tasks: Restore use of timers for non-RT kernels The use of hrtimers for RCU-tasks grace-period delays works well in general, but can result in excessive grace-period delays for some corner-case workloads. This commit therefore reverts to the use of timers for non-RT kernels to mitigate those grace-period delays. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 71fe340ab82a..405614039515 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -654,9 +654,13 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - exp = jiffies_to_nsecs(fract); - __set_current_state(TASK_IDLE); - schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + schedule_timeout_idle(fract); + } else { + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + } if (fract < HZ) fract++; From 10b3742f939c51d53619a31a5c03055c5e0952b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Mar 2022 16:00:05 -0700 Subject: [PATCH 030/155] rcu-tasks: Make show_rcu_tasks_generic_gp_kthread() check all CPUs Currently, the show_rcu_tasks_generic_gp_kthread() function only looks at CPU 0's callback lists. Although this is not fatal, it can confuse debugging efforts in cases where any of the Tasks RCU flavors are in per-CPU queueing mode. This commit therefore causes this function to scan all CPUs' callback queues. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 405614039515..3aad0dfbfaf4 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -582,7 +582,17 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + int cpu; + bool havecbs = false; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + havecbs = true; + break; + } + } pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), @@ -590,7 +600,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], + ".C"[havecbs], s); } #endif // #ifndef CONFIG_TINY_RCU From 07d95c34e8125a7bf833a94bc3c9d51992d92c45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Apr 2022 12:30:18 -0700 Subject: [PATCH 031/155] rcu-tasks: Handle sparse cpu_possible_mask If the rcupdate.rcu_task_enqueue_lim kernel boot parameter is set to something greater than 1 and less than nr_cpu_ids, the code attempts to use a subset of the CPU's RCU Tasks callback lists. This works, but only if the cpu_possible_mask is contiguous. If there are "holes" in this mask, the callback-enqueue code might attempt to access a non-existent per-CPU ->rtcpu variable for a non-existent CPU. For example, if only CPUs 0, 4, 8, 12, 16 and so on are in cpu_possible_mask, specifying rcupdate.rcu_task_enqueue_lim=4 would cause the code to attempt to use callback queues for non-existent CPUs 1, 2, and 3. Because such systems have existed in the past and might still exist, the code needs to gracefully handle this situation. This commit therefore checks to see whether the desired CPU is present in cpu_possible_mask, and, if not, searches for the next CPU. This means that the systems administrator of a system with a sparse cpu_possible_mask will need to account for this sparsity when specifying the value of the rcupdate.rcu_task_enqueue_lim kernel boot parameter. For example, setting this parameter to the value 4 will use only CPUs 0 and 4, which CPU 4 getting three times the callback load of CPU 0. This commit assumes that bit (nr_cpu_ids - 1) is always set in cpu_possible_mask. Link: https://lore.kernel.org/lkml/CANn89iKaNEwyNZ=L_PQnkH0LP_XjLYrr_dpyRKNNoDJaWKdrmg@mail.gmail.com/ Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 3aad0dfbfaf4..fd70d86eb7cd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -273,7 +273,9 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { + int chosen_cpu; unsigned long flags; + int ideal_cpu; unsigned long j; bool needadjust = false; bool needwake; @@ -283,8 +285,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->func = func; local_irq_save(flags); rcu_read_lock(); - rtpcp = per_cpu_ptr(rtp->rtpcpu, - smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); + chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. j = jiffies; From ab2756ea6b74987849b44ad0e33c3cfec159033b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Apr 2022 09:21:50 -0700 Subject: [PATCH 032/155] rcu-tasks: Handle sparse cpu_possible_mask in rcu_tasks_invoke_cbs() If the cpu_possible_mask is sparse (for example, if bits are set only for CPUs 0, 4, 8, ...), then rcu_tasks_invoke_cbs() will access per-CPU data for a CPU not in cpu_possible_mask. It makes these accesses while doing a workqueue-based binary search for non-empty callback lists. Although this search must pass through CPUs not represented in cpu_possible_mask, it has no need to check the callback list for such CPUs. This commit therefore changes the rcu_tasks_invoke_cbs() function's binary search so as to only check callback lists for CPUs present in cpu_possible_mask. Reported-by: Eric Dumazet Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fd70d86eb7cd..3925e32159b5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -468,7 +468,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu } } - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); From b6f3c6a2b1fe2d754acb7bf64a20e64a8f2c8a1b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Feb 2022 17:53:22 -0800 Subject: [PATCH 033/155] torture: Add rcu_normal and rcu_expedited runs to torture.sh Currently, the rcupdate.rcu_normal and rcupdate.rcu_expedited kernel boot parameters are not regularly tested. The potential addition of polled expedited grace-period APIs increases the amount of code that is affected by these kernel boot parameters. This commit therefore adds a "--do-rt" argument to torture.sh to exercise these kernel-boot options. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/torture.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index bfe09e2829c8..e657a6e06417 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -54,6 +54,7 @@ do_kvfree=yes do_kasan=yes do_kcsan=no do_clocksourcewd=yes +do_rt=yes # doyesno - Helper function for yes/no arguments function doyesno () { @@ -82,6 +83,7 @@ usage () { echo " --do-rcuscale / --do-no-rcuscale" echo " --do-rcutorture / --do-no-rcutorture" echo " --do-refscale / --do-no-refscale" + echo " --do-rt / --do-no-rt" echo " --do-scftorture / --do-no-scftorture" echo " --duration [ | h | d ]" echo " --kcsan-kmake-arg kernel-make-arguments" @@ -118,6 +120,7 @@ do do_scftorture=yes do_rcuscale=yes do_refscale=yes + do_rt=yes do_kvfree=yes do_kasan=yes do_kcsan=yes @@ -148,6 +151,7 @@ do do_scftorture=no do_rcuscale=no do_refscale=no + do_rt=no do_kvfree=no do_kasan=no do_kcsan=no @@ -162,6 +166,9 @@ do --do-refscale|--do-no-refscale) do_refscale=`doyesno "$1" --do-refscale` ;; + --do-rt|--do-no-rt) + do_rt=`doyesno "$1" --do-rt` + ;; --do-scftorture|--do-no-scftorture) do_scftorture=`doyesno "$1" --do-scftorture` ;; @@ -354,6 +361,17 @@ then torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make fi +if test "$do_rt" = "yes" +then + # With all post-boot grace periods forced to normal. + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 rcupdate.rcu_normal=1" + torture_set "rcurttorture" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration "$duration_rcutorture" --configs "TREE03" --trust-make + + # With all post-boot grace periods forced to expedited. + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 rcupdate.rcu_expedited=1" + torture_set "rcurttorture-exp" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration "$duration_rcutorture" --configs "TREE03" --trust-make +fi + if test "$do_refscale" = yes then primlist="`grep '\.name[ ]*=' kernel/rcu/refscale.c | sed -e 's/^[^"]*"//' -e 's/".*$//'`" From 99d6a2acb8955f12489bfba04f2db22bc0b57726 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Feb 2022 12:45:18 -0800 Subject: [PATCH 034/155] rcutorture: Suppress debugging grace period delays during flooding Tree RCU supports grace-period delays using the rcutree.gp_cleanup_delay, rcutree.gp_init_delay, and rcutree.gp_preinit_delay kernel boot parameters. These delays are strictly for debugging purposes, and have proven quite effective at exposing bugs involving race with CPU-hotplug operations. However, these delays can result in false positives when used in conjunction with callback flooding, for example, those generated by the rcutorture.fwd_progress kernel boot parameter. This commit therefore suppresses grace-period delays while callback flooding is in progress. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++++ kernel/rcu/rcutorture.c | 4 ++++ kernel/rcu/tree.c | 32 +++++++++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..7a221393fcdb 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -523,6 +523,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } +static inline void rcu_gp_slow_register(atomic_t *rgssp) { } +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); @@ -535,6 +537,8 @@ void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq; +void rcu_gp_slow_register(atomic_t *rgssp); +void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 55d049c39608..f37b7a01dcd0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2916,10 +2916,12 @@ rcu_torture_cleanup(void) pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } + rcu_gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); + rcu_gp_slow_unregister(NULL); return; } @@ -3016,6 +3018,7 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); + rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3320,6 +3323,7 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); + rcu_gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..db67dae8ed88 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1705,11 +1705,37 @@ static void note_gp_changes(struct rcu_data *rdp) rcu_gp_kthread_wake(); } +static atomic_t *rcu_gp_slow_suppress; + +/* Register a counter to suppress debugging grace-period delays. */ +void rcu_gp_slow_register(atomic_t *rgssp) +{ + WARN_ON_ONCE(rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, rgssp); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_register); + +/* Unregister a counter, with NULL for not caring which. */ +void rcu_gp_slow_unregister(atomic_t *rgssp) +{ + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, NULL); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister); + +static bool rcu_gp_slow_is_suppressed(void) +{ + atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress); + + return rgssp && atomic_read(rgssp); +} + static void rcu_gp_slow(int delay) { - if (delay > 0 && - !(rcu_seq_ctr(rcu_state.gp_seq) % - (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + if (!rcu_gp_slow_is_suppressed() && delay > 0 && + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_idle(delay); } From 8106bddbab5f0ba180e6d693c7c1fc6926d57caa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Feb 2022 17:40:49 -0800 Subject: [PATCH 035/155] scftorture: Fix distribution of short handler delays The scftorture test module's scf_handler() function is supposed to provide three different distributions of short delays (including "no delay") and one distribution of long delays, if specified by the scftorture.longwait module parameter. However, the second of the two non-zero-wait short delays is disabled due to the first such delay's "goto out" not being enclosed in the "then" clause with the "udelay()". This commit therefore adjusts the code to provide the intended set of delays. Fixes: e9d338a0b179 ("scftorture: Add smp_call_function() torture test") Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index dcb0410950e4..5d113aa59e77 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in) } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { - if (!(r & 0xffc0)) + if (!(r & 0xffc0)) { udelay(r & 0x3f); - goto out; + goto out; + } } if (r & 0xfff) goto out; From 39b3cab92d3754e18b1f9b5de8158642145b2405 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 7 Mar 2022 14:46:55 -0800 Subject: [PATCH 036/155] rcutorture: Avoid corner-case #DE with nsynctypes check The rcutorture module is used to run torture tests that validate RCU. rcutorture takes a variety of module parameters that configure the functionality of the test. Amongst these parameters are the types of synchronization mechanisms that the rcu_torture_writer and rcu_torture_fakewriter tasks may use, and the torture_type of the run which determines what read and sync operations are used by the various writer and reader tasks that run throughout the test. When the module is configured to only use sync types for which the specified torture_type does not implement the necessary operations, we can end up in a state where nsynctypes is 0. This is not an erroneous state, but it currently crashes the kernel with a #DE due to nsynctypes being used with a modulo operator in rcu_torture_fakewriter(). Here is an example of such a #DE: $ insmod ./rcutorture.ko gp_cond=1 gp_cond_exp=0 gp_exp=0 gp_poll_exp=0 gp_normal=0 gp_poll=0 gp_poll_exp=0 verbose=9999 torture_type=trivial ... [ 8536.525096] divide error: 0000 [#1] PREEMPT SMP PTI [ 8536.525101] CPU: 30 PID: 392138 Comm: rcu_torture_fak Kdump: loaded Tainted: G S 5.17.0-rc1-00179-gc8c42c80febd #24 [ 8536.525105] Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A23 12/08/2020 [ 8536.525106] RIP: 0010:rcu_torture_fakewriter+0xf1/0x2d0 [rcutorture] [ 8536.525121] Code: 00 31 d2 8d 0c f5 00 00 00 00 48 63 c9 48 f7 f1 48 85 d2 0f 84 79 ff ff ff 48 89 e7 e8 78 78 01 00 48 63 0d 29 ca 00 00 31 d2 <48> f7 f1 8b 04 95 00 05 4e a0 83 f8 06 0f 84 ad 00 00 00 7f 1f 83 [ 8536.525124] RSP: 0018:ffffc9000777fef0 EFLAGS: 00010246 [ 8536.525127] RAX: 00000000223d006e RBX: cccccccccccccccd RCX: 0000000000000000 [ 8536.525130] RDX: 0000000000000000 RSI: ffffffff824315b9 RDI: ffffc9000777fef0 [ 8536.525132] RBP: ffffc9000487bb30 R08: 0000000000000002 R09: 000000000002a580 [ 8536.525134] R10: ffffffff82c5f920 R11: 0000000000000000 R12: ffff8881a2c35d00 [ 8536.525136] R13: ffff8881540c8d00 R14: ffffffffa04d39d0 R15: 0000000000000000 [ 8536.525137] FS: 0000000000000000(0000) GS:ffff88903ff80000(0000) knlGS:0000000000000000 [ 8536.525140] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8536.525142] CR2: 00007f839f022000 CR3: 0000000002c0a006 CR4: 00000000007706e0 [ 8536.525144] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8536.525145] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8536.525147] PKRU: 55555554 [ 8536.525148] Call Trace: [ 8536.525150] [ 8536.525153] kthread+0xe8/0x110 [ 8536.525161] ? kthread_complete_and_exit+0x20/0x20 [ 8536.525167] ret_from_fork+0x22/0x30 [ 8536.525174] The solution is to gracefully handle the case of nsynctypes being 0 in rcu_torture_fakewriter() by not performing any work. This is already being done in rcu_torture_writer(), though there is a missing return on that path which will be fixed in a subsequent patch. Signed-off-by: David Vernet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f37b7a01dcd0..d5105fb6c980 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1322,6 +1322,17 @@ rcu_torture_fakewriter(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); + if (WARN_ONCE(nsynctypes == 0, + "%s: No update-side primitives.\n", __func__)) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ + torture_kthread_stopping("rcu_torture_fakewriter"); + return 0; + } + do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && From 80dcee695143255261f30c7cc2a041ba413717a4 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 7 Mar 2022 14:46:57 -0800 Subject: [PATCH 037/155] rcutorture: Add missing return and use __func__ in warning The rcutorture module has an rcu_torture_writer task that repeatedly performs writes, synchronizations, and deletes. There is a corner-case check in rcu_torture_writer() wherein if nsynctypes is 0, a warning is issued and the task waits to be stopped via a call to torture_kthread_stopping() rather than performing any work. There should be a return statement following this call to torture_kthread_stopping(), as the intention with issuing the call to torture_kthread_stopping() in the first place is to avoid the rcu_torture_writer task from performing any work. Some of the work may even be dangerous to perform, such as potentially causing a #DE due to nsynctypes being used in a modulo operator when querying for sync updates to issue. This patch adds the missing return call. As a bonus, it also fixes a checkpatch warning that was emitted due to the WARN_ONCE() call using the name of the function rather than __func__. Signed-off-by: David Vernet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d5105fb6c980..f1292d9e86b5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1178,7 +1178,7 @@ rcu_torture_writer(void *arg) " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, - "rcu_torture_writer: No update-side primitives.\n")) { + "%s: No update-side primitives.\n", __func__)) { /* * No updates primitives, so don't try updating. * The resulting test won't be testing much, hence the @@ -1186,6 +1186,7 @@ rcu_torture_writer(void *arg) */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); + return 0; } do { From bd6c375b92c3f367e184d164e12952e4b9d9fb4f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 15 Mar 2022 16:33:38 +0100 Subject: [PATCH 038/155] rcutorture: Call preempt_schedule() through static call/key The rcutorture test suite sometimess triggers a random scheduler preemption call while simulating a read delay. Unfortunately, its direct call to preempt_schedule() bypasses the static call/key filter used by CONFIG_PREEMPT_DYNAMIC. This breaks the no-preempt assumption when the dynamic preemption mode is "none". For example, rcu_blocking_is_gp() is fooled and abbreviates grace periods when the CPU runs in no-preempt UP mode. Fix this by making torture_preempt_schedule() call __preempt_schedule(), which uses the static call/key. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index 63fa4196e51c..7038104463e4 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -118,7 +118,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); _torture_stop_kthread("Stopping " #n " task", &(tp)) #ifdef CONFIG_PREEMPTION -#define torture_preempt_schedule() preempt_schedule() +#define torture_preempt_schedule() __preempt_schedule() #else #define torture_preempt_schedule() do { } while (0) #endif From 46e861be589881e0905b9ade3d8439883858721c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 09:30:10 -0700 Subject: [PATCH 039/155] rcu: Make TASKS_RUDE_RCU select IRQ_WORK The TASKS_RUDE_RCU does not select IRQ_WORK, which can result in build failures for kernels that do not otherwise select IRQ_WORK. This commit therefore causes the TASKS_RUDE_RCU Kconfig option to select IRQ_WORK. Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..f559870fbf8b 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -86,6 +86,7 @@ config TASKS_RCU config TASKS_RUDE_RCU def_bool 0 + select IRQ_WORK help This option enables a task-based RCU implementation that uses only context switch (including preemption) and user-mode From ab3ecd0bce32705e722f356a504694f4fd51d4c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Feb 2022 17:19:31 -0800 Subject: [PATCH 040/155] torture: Reposition so that $? collects ssh code in torture.sh An "echo" slipped in between an "ssh" and the "ret=$?" that was intended to collect its exit code, which prevents torture.sh from detecting "ssh" failure. This commit therefore reassociates the two. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 8c4c1e4792d0..03d7dede5f9b 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -139,13 +139,13 @@ chmod +x $T/bin/kvm-remote-*.sh for i in $systems do ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`" - echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log" ret=$? if test "$ret" -ne 0 then echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log" exit 4 fi + echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log" done # Download and expand the tarball on all systems. From b20842baf89955a18ade95e6de6d94be757d6e5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Feb 2022 17:22:32 -0800 Subject: [PATCH 041/155] torture: Use "-o Batchmode=yes" to disable ssh password requests The torture.sh script normally runs unattended, so there is not much point in the "ssh" command asking for a password. This commit therefore adds the "-o Batchmode=yes" argument to each "ssh" command to cause it to fail rather than ask for a password. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 03d7dede5f9b..0ff59bd8b640 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -138,7 +138,7 @@ chmod +x $T/bin/kvm-remote-*.sh # Check first to avoid the need for cleanup for system-name typos for i in $systems do - ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`" + ncpus="`ssh -o BatchMode=yes $i getconf _NPROCESSORS_ONLN 2> /dev/null`" ret=$? if test "$ret" -ne 0 then @@ -153,14 +153,14 @@ echo Build-products tarball: `du -h $T/binres.tgz` | tee -a "$oldrun/remote-log" for i in $systems do echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log" - cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -" + cat $T/binres.tgz | ssh -o BatchMode=yes $i "cd /tmp; tar -xzf -" ret=$? tries=0 while test "$ret" -ne 0 do echo Unable to download $T/binres.tgz to system $i, waiting and then retrying. $tries prior retries. | tee -a "$oldrun/remote-log" sleep 60 - cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -" + cat $T/binres.tgz | ssh -o BatchMode=yes $i "cd /tmp; tar -xzf -" ret=$? if test "$ret" -ne 0 then @@ -185,7 +185,7 @@ checkremotefile () { while : do - ssh $1 "test -f \"$2\"" + ssh -o BatchMode=yes $1 "test -f \"$2\"" ret=$? if test "$ret" -eq 255 then @@ -228,7 +228,7 @@ startbatches () { then continue # System still running last test, skip. fi - ssh "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2 + ssh -o BatchMode=yes "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2 ret=$? if test "$ret" -ne 0 then @@ -267,7 +267,7 @@ do sleep 30 done echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log" - ( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) + ( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) done ( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log" From 98bb264bdbbc0fe9d6b0340057fcc4e8e7043760 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Feb 2022 17:52:13 -0800 Subject: [PATCH 042/155] torture: Permit running of experimental torture types This commit weakens the checks of the kvm.sh script's --torture parameter and the kvm-recheck.sh script's parsing so that experimental torture tests may be created without updating these two scripts. The changes required are to the appropriate Makefile and Kconfig file, plus a directory whose name begins with "X" must be added to the rcutorture/configs file. This new directory's name can then be passed in via the kvm.sh script's --torture parameter. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 7 ++++++- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 0a5419982ab3..0789c5606d2a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -33,7 +33,12 @@ do TORTURE_SUITE="`cat $i/../torture_suite`" configfile=`echo $i | sed -e 's,^.*/,,'` rm -f $i/console.log.*.diags - kvm-recheck-${TORTURE_SUITE}.sh $i + case "${TORTURE_SUITE}" in + X*) + ;; + *) + kvm-recheck-${TORTURE_SUITE}.sh $i + esac if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -ne 0 && test "`cat $i/qemu-retval`" -ne 137 then echo QEMU error, output: diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 55b2c1533282..af58b86a503a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -86,7 +86,7 @@ usage () { echo " --remote" echo " --results absolute-pathname" echo " --shutdown-grace seconds" - echo " --torture lock|rcu|rcuscale|refscale|scf" + echo " --torture lock|rcu|rcuscale|refscale|scf|X*" echo " --trust-make" exit 1 } @@ -231,7 +231,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\|X.*\)$' '^--' TORTURE_SUITE=$2 TORTURE_MOD="`echo $TORTURE_SUITE | sed -e 's/^\(lock\|rcu\|scf\)$/\1torture/'`" shift From 8e82c28ea2b4c6096c7673c59a285c658c9f389f Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 22 Feb 2022 13:07:16 +0100 Subject: [PATCH 043/155] torture: Make thread detection more robust by using lspcu For consecutive numbers the lscpu command collapses the output and just shows the range with start and end. The processors are numbered that way on POWER8. $ sudo ppc64_cpu --smt=8 $ lscpu | grep '^NUMA node' NUMA node(s): 2 NUMA node0 CPU(s): 0-79 NUMA node8 CPU(s): 80-159 This causes the heuristic to detect the number threads per core, looking for the number after the first comma, to fail, and QEMU aborts because of invalid arguments. $ lscpu | grep '^NUMA node0' | sed -e 's/^[^,-]*(,|\-)\([0-9]*\),.*$/\1/' NUMA node0 CPU(s): 0-79 But the lscpu command shows the number of threads per core: $ sudo ppc64_cpu --smt=8 $ lscpu | grep 'Thread(s) per core' Thread(s) per core: 8 $ sudo ppc64_cpu --smt=off $ lscpu | grep 'Thread(s) per core' Thread(s) per core: 1 This commit therefore directly uses that value and replaces use of grep with "sed -n" and its "p" command. Signed-off-by: Paul Menzel Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index c35ba24f994c..66d0414d8e4b 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -301,7 +301,7 @@ specify_qemu_cpus () { echo $2 -smp $3 ;; qemu-system-ppc64) - nt="`lscpu | grep '^NUMA node0' | sed -e 's/^[^,]*,\([0-9]*\),.*$/\1/'`" + nt="`lscpu | sed -n 's/^Thread(s) per core:\s*//p'`" echo $2 -smp cores=`expr \( $3 + $nt - 1 \) / $nt`,threads=$nt ;; esac From 9c2970fbb425cca0256ecf0f96490e4f253fda24 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:13 +0100 Subject: [PATCH 044/155] tools/nolibc: use pselect6 on RISCV This arch doesn't provide the old-style select() syscall, we have to use pselect6(). Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index c1c285fe494a..ad23712f9cb5 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -1256,7 +1256,10 @@ struct sys_stat_struct { * - the arguments are cast to long and assigned into the target * registers which are then simply passed as registers to the asm code, * so that we don't have to experience issues with register constraints. + * + * On riscv, select() is not implemented so we have to use pselect6(). */ +#define __ARCH_WANT_SYS_PSELECT6 #define my_syscall0(num) \ ({ \ From 930c4acc064edacd2c9d5bec1a66acacb2fb2589 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:14 +0100 Subject: [PATCH 045/155] tools/nolibc: guard the main file against multiple inclusion Including nolibc.h multiple times results in build errors due to multiple definitions. Let's add a guard against multiple inclusions. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index ad23712f9cb5..4660637d9b17 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -80,6 +80,8 @@ * https://w3challs.com/syscalls/ * */ +#ifndef _NOLIBC_H +#define _NOLIBC_H #include #include @@ -2582,3 +2584,5 @@ dev_t makedev(unsigned int major, unsigned int minor) { return ((major & 0xfff) << 8) | (minor & 0xff); } + +#endif /* _NOLIBC_H */ From d22959aa93528c6cf4583560696856cf6bba6b72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Feb 2022 09:10:04 -0800 Subject: [PATCH 046/155] rcu: Clarify fill-the-gap comment in rcu_segcblist_advance() Reported-by: Frederic Weisbecker Reported-by: Neeraj Upadhyay Reported-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 81145c3ece25..c54ea2b6a36b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. + * Callbacks moved, so there might be an empty RCU_WAIT_TAIL + * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the + * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap + * created by the now-ready-to-invoke segments. */ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) From 90d2efe7bdbde5371b6122174af0718843f805c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Feb 2022 09:54:56 -0800 Subject: [PATCH 047/155] rcu: Fix rcu_preempt_deferred_qs_irqrestore() strict QS reporting Suppose we have a kernel built with both CONFIG_RCU_STRICT_GRACE_PERIOD=y and CONFIG_PREEMPT=y. Suppose further that an RCU reader from which RCU core needs a quiescent state ends in rcu_preempt_deferred_qs_irqrestore(). This function will then invoke rcu_report_qs_rdp() in order to immediately report that quiescent state. Unfortunately, it will not have cleared that reader's CPU's rcu_data structure's ->cpu_no_qs.b.norm field. As a result, rcu_report_qs_rdp() will take an early exit because it will believe that this CPU has not yet encountered a quiescent state, and there will be no reporting of the current quiescent state. This commit therefore causes rcu_preempt_deferred_qs_irqrestore() to clear the ->cpu_no_qs.b.norm field before invoking rcu_report_qs_rdp(). Kudos to Boqun Feng and Neeraj Upadhyay for helping with analysis of this issue! Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8360d86db1c0..176639c6215f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { From c708b08c65a0dfae127b9ee33b0fb73535a5e066 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Feb 2022 17:29:37 -0800 Subject: [PATCH 048/155] rcu: Check for jiffies going backwards A report of a 12-jiffy normal RCU CPU stall warning raises interesting questions about the nature of time on the offending system. This commit instruments rcu_sched_clock_irq(), which is RCU's hook into the scheduling-clock interrupt, checking for the jiffies counter going backwards. Reported-by: Saravanan D Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 11 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..a5ea67454640 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1679,6 +1679,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); + if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -2609,6 +2611,13 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_sched_clock_irq(int user) { + unsigned long j; + + if (IS_ENABLED(CONFIG_PROVE_RCU)) { + j = jiffies; + WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock))); + __this_cpu_write(rcu_data.last_sched_clock, j); + } trace_rcu_utilization(TPS("Start scheduler-tick")); lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -4179,6 +4188,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..94b55f669915 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -254,6 +254,7 @@ struct rcu_data { unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ + unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ int cpu; }; From a031651ff2144a3d81d4916856c093bc1ea0a413 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Thu, 31 Mar 2022 17:16:54 +0200 Subject: [PATCH 049/155] efi: Allow to enable EFI runtime services by default on RT Commit d9f283ae71af ("efi: Disable runtime services on RT") disabled EFI runtime services by default when the CONFIG_PREEMPT_RT option is enabled. The rationale for that commit is that some EFI calls could take too much time, leading to large latencies which is an issue for Real-Time kernels. But a side effect of that change was that now is not possible anymore to enable the EFI runtime services by default when CONFIG_PREEMPT_RT is set, without passing an efi=runtime command line parameter to the kernel. Instead, let's add a new EFI_DISABLE_RUNTIME boolean Kconfig option, that would be set to n by default but to y if CONFIG_PREEMPT_RT is enabled. That way, the current behaviour is preserved but gives users a mechanism to enable the EFI runtimes services in their kernels if that is required. For example, if the firmware could guarantee bounded time for EFI calls. Also, having a separate boolean config could allow users to disable the EFI runtime services by default even when CONFIG_PREEMPT_RT is not set. Reported-by: Alexander Larsson Fixes: d9f283ae71af ("efi: Disable runtime services on RT") Signed-off-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20220331151654.184433-1-javierm@redhat.com Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 15 +++++++++++++++ drivers/firmware/efi/efi.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 2c3dac5ecb36..243882f5e5f9 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -284,3 +284,18 @@ config EFI_CUSTOM_SSDT_OVERLAYS See Documentation/admin-guide/acpi/ssdt-overlays.rst for more information. + +config EFI_DISABLE_RUNTIME + bool "Disable EFI runtime services support by default" + default y if PREEMPT_RT + help + Allow to disable the EFI runtime services support by default. This can + already be achieved by using the efi=noruntime option, but it could be + useful to have this default without any kernel command line parameter. + + The EFI runtime services are disabled by default when PREEMPT_RT is + enabled, because measurements have shown that some EFI functions calls + might take too much time to complete, causing large latencies which is + an issue for Real-Time kernels. + + This default can be overridden by using the efi=runtime option. diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 5502e176d51b..ff57db8f8d05 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -66,7 +66,7 @@ struct mm_struct efi_mm = { struct workqueue_struct *efi_rts_wq; -static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); +static bool disable_runtime = IS_ENABLED(CONFIG_EFI_DISABLE_RUNTIME); static int __init setup_noefi(char *arg) { disable_runtime = true; From 1227418989346af3af179742cf42ce842e0ad484 Mon Sep 17 00:00:00 2001 From: Dov Murik Date: Tue, 12 Apr 2022 21:21:24 +0000 Subject: [PATCH 050/155] efi: Save location of EFI confidential computing area Confidential computing (coco) hardware such as AMD SEV (Secure Encrypted Virtualization) allows a guest owner to inject secrets into the VMs memory without the host/hypervisor being able to read them. Firmware support for secret injection is available in OVMF, which reserves a memory area for secret injection and includes a pointer to it the in EFI config table entry LINUX_EFI_COCO_SECRET_TABLE_GUID. If EFI exposes such a table entry, uefi_init() will keep a pointer to the EFI config table entry in efi.coco_secret, so it can be used later by the kernel (specifically drivers/virt/coco/efi_secret). It will also appear in the kernel log as "CocoSecret=ADDRESS"; for example: [ 0.000000] efi: EFI v2.70 by EDK II [ 0.000000] efi: CocoSecret=0x7f22e680 SMBIOS=0x7f541000 ACPI=0x7f77e000 ACPI 2.0=0x7f77e014 MEMATTR=0x7ea0c018 The new functionality can be enabled with CONFIG_EFI_COCO_SECRET=y. Signed-off-by: Dov Murik Reviewed-by: Gerd Hoffmann Link: https://lore.kernel.org/r/20220412212127.154182-2-dovmurik@linux.ibm.com Signed-off-by: Ard Biesheuvel --- arch/x86/platform/efi/efi.c | 3 +++ drivers/firmware/efi/Kconfig | 16 ++++++++++++++++ drivers/firmware/efi/efi.c | 6 ++++++ include/linux/efi.h | 10 ++++++++++ 4 files changed, 35 insertions(+) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 147c30a81f15..1591d67e0bcd 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -93,6 +93,9 @@ static const unsigned long * const efi_tables[] = { #ifdef CONFIG_LOAD_UEFI_KEYS &efi.mokvar_table, #endif +#ifdef CONFIG_EFI_COCO_SECRET + &efi.coco_secret, +#endif }; u64 efi_setup; /* efi setup_data physical address */ diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 243882f5e5f9..f8ddd2259ba0 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -299,3 +299,19 @@ config EFI_DISABLE_RUNTIME an issue for Real-Time kernels. This default can be overridden by using the efi=runtime option. + +config EFI_COCO_SECRET + bool "EFI Confidential Computing Secret Area Support" + depends on EFI + help + Confidential Computing platforms (such as AMD SEV) allow the + Guest Owner to securely inject secrets during guest VM launch. + The secrets are placed in a designated EFI reserved memory area. + + In order to use the secrets in the kernel, the location of the secret + area (as published in the EFI config table) must be kept. + + If you say Y here, the address of the EFI secret area will be kept + for usage inside the kernel. This will allow the + virt/coco/efi_secret module to access the secrets, which in turn + allows userspace programs to access the injected secrets. diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index ff57db8f8d05..e0f43262d451 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -46,6 +46,9 @@ struct efi __read_mostly efi = { #ifdef CONFIG_LOAD_UEFI_KEYS .mokvar_table = EFI_INVALID_TABLE_ADDR, #endif +#ifdef CONFIG_EFI_COCO_SECRET + .coco_secret = EFI_INVALID_TABLE_ADDR, +#endif }; EXPORT_SYMBOL(efi); @@ -528,6 +531,9 @@ static const efi_config_table_type_t common_tables[] __initconst = { #endif #ifdef CONFIG_LOAD_UEFI_KEYS {LINUX_EFI_MOK_VARIABLE_TABLE_GUID, &efi.mokvar_table, "MOKvar" }, +#endif +#ifdef CONFIG_EFI_COCO_SECRET + {LINUX_EFI_COCO_SECRET_AREA_GUID, &efi.coco_secret, "CocoSecret" }, #endif {}, }; diff --git a/include/linux/efi.h b/include/linux/efi.h index ccd4d3f91c98..771d4cd06b56 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -405,6 +405,7 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_MEMRESERVE_TABLE_GUID EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5, 0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2) #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) +#define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) @@ -596,6 +597,7 @@ extern struct efi { unsigned long tpm_log; /* TPM2 Event Log table */ unsigned long tpm_final_log; /* TPM2 Final Events Log table */ unsigned long mokvar_table; /* MOK variable config table */ + unsigned long coco_secret; /* Confidential computing secret table */ efi_get_time_t *get_time; efi_set_time_t *set_time; @@ -1335,4 +1337,12 @@ extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { } #endif +struct linux_efi_coco_secret_area { + u64 base_pa; + u64 size; +}; + +/* Header of a populated EFI secret area */ +#define EFI_SECRET_TABLE_HEADER_GUID EFI_GUID(0x1e74f542, 0x71dd, 0x4d66, 0x96, 0x3e, 0xef, 0x42, 0x87, 0xff, 0x17, 0x3b) + #endif /* _LINUX_EFI_H */ From cbabf03c3ef3cce74a97f140cf57611a9e8a21bc Mon Sep 17 00:00:00 2001 From: Dov Murik Date: Tue, 12 Apr 2022 21:21:25 +0000 Subject: [PATCH 051/155] virt: Add efi_secret module to expose confidential computing secrets The new efi_secret module exposes the confidential computing (coco) EFI secret area via securityfs interface. When the module is loaded (and securityfs is mounted, typically under /sys/kernel/security), a "secrets/coco" directory is created in securityfs. In it, a file is created for each secret entry. The name of each such file is the GUID of the secret entry, and its content is the secret data. This allows applications running in a confidential computing setting to read secrets provided by the guest owner via a secure secret injection mechanism (such as AMD SEV's LAUNCH_SECRET command). Removing (unlinking) files in the "secrets/coco" directory will zero out the secret in memory, and remove the filesystem entry. If the module is removed and loaded again, that secret will not appear in the filesystem. Signed-off-by: Dov Murik Reviewed-by: Gerd Hoffmann Link: https://lore.kernel.org/r/20220412212127.154182-3-dovmurik@linux.ibm.com Signed-off-by: Ard Biesheuvel --- .../ABI/testing/securityfs-secrets-coco | 51 +++ drivers/virt/Kconfig | 3 + drivers/virt/Makefile | 1 + drivers/virt/coco/efi_secret/Kconfig | 16 + drivers/virt/coco/efi_secret/Makefile | 2 + drivers/virt/coco/efi_secret/efi_secret.c | 349 ++++++++++++++++++ 6 files changed, 422 insertions(+) create mode 100644 Documentation/ABI/testing/securityfs-secrets-coco create mode 100644 drivers/virt/coco/efi_secret/Kconfig create mode 100644 drivers/virt/coco/efi_secret/Makefile create mode 100644 drivers/virt/coco/efi_secret/efi_secret.c diff --git a/Documentation/ABI/testing/securityfs-secrets-coco b/Documentation/ABI/testing/securityfs-secrets-coco new file mode 100644 index 000000000000..f2b6909155f9 --- /dev/null +++ b/Documentation/ABI/testing/securityfs-secrets-coco @@ -0,0 +1,51 @@ +What: security/secrets/coco +Date: February 2022 +Contact: Dov Murik +Description: + Exposes confidential computing (coco) EFI secrets to + userspace via securityfs. + + EFI can declare memory area used by confidential computing + platforms (such as AMD SEV and SEV-ES) for secret injection by + the Guest Owner during VM's launch. The secrets are encrypted + by the Guest Owner and decrypted inside the trusted enclave, + and therefore are not readable by the untrusted host. + + The efi_secret module exposes the secrets to userspace. Each + secret appears as a file under /secrets/coco, + where the filename is the GUID of the entry in the secrets + table. This module is loaded automatically by the EFI driver + if the EFI secret area is populated. + + Two operations are supported for the files: read and unlink. + Reading the file returns the content of secret entry. + Unlinking the file overwrites the secret data with zeroes and + removes the entry from the filesystem. A secret cannot be read + after it has been unlinked. + + For example, listing the available secrets:: + + # modprobe efi_secret + # ls -l /sys/kernel/security/secrets/coco + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + -r--r----- 1 root root 0 Jun 28 11:54 e6f5a162-d67f-4750-a67c-5d065f2a9910 + + Reading the secret data by reading a file:: + + # cat /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + the-content-of-the-secret-data + + Wiping a secret by unlinking a file:: + + # rm /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + # ls -l /sys/kernel/security/secrets/coco + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + + Note: The binary format of the secrets table injected by the + Guest Owner is described in + drivers/virt/coco/efi_secret/efi_secret.c under "Structure of + the EFI secret area". diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index 121b9293c737..c877da072d4d 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -47,4 +47,7 @@ source "drivers/virt/vboxguest/Kconfig" source "drivers/virt/nitro_enclaves/Kconfig" source "drivers/virt/acrn/Kconfig" + +source "drivers/virt/coco/efi_secret/Kconfig" + endif diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile index 108d0ffcc9aa..067b5427f40f 100644 --- a/drivers/virt/Makefile +++ b/drivers/virt/Makefile @@ -9,3 +9,4 @@ obj-y += vboxguest/ obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ obj-$(CONFIG_ACRN_HSM) += acrn/ +obj-$(CONFIG_EFI_SECRET) += coco/efi_secret/ diff --git a/drivers/virt/coco/efi_secret/Kconfig b/drivers/virt/coco/efi_secret/Kconfig new file mode 100644 index 000000000000..4404d198f3b2 --- /dev/null +++ b/drivers/virt/coco/efi_secret/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +config EFI_SECRET + tristate "EFI secret area securityfs support" + depends on EFI && X86_64 + select EFI_COCO_SECRET + select SECURITYFS + help + This is a driver for accessing the EFI secret area via securityfs. + The EFI secret area is a memory area designated by the firmware for + confidential computing secret injection (for example for AMD SEV + guests). The driver exposes the secrets as files in + /secrets/coco. Files can be read and deleted (deleting + a file wipes the secret from memory). + + To compile this driver as a module, choose M here. + The module will be called efi_secret. diff --git a/drivers/virt/coco/efi_secret/Makefile b/drivers/virt/coco/efi_secret/Makefile new file mode 100644 index 000000000000..c7047ce804f7 --- /dev/null +++ b/drivers/virt/coco/efi_secret/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_EFI_SECRET) += efi_secret.o diff --git a/drivers/virt/coco/efi_secret/efi_secret.c b/drivers/virt/coco/efi_secret/efi_secret.c new file mode 100644 index 000000000000..e700a5ef7043 --- /dev/null +++ b/drivers/virt/coco/efi_secret/efi_secret.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * efi_secret module + * + * Copyright (C) 2022 IBM Corporation + * Author: Dov Murik + */ + +/** + * DOC: efi_secret: Allow reading EFI confidential computing (coco) secret area + * via securityfs interface. + * + * When the module is loaded (and securityfs is mounted, typically under + * /sys/kernel/security), a "secrets/coco" directory is created in securityfs. + * In it, a file is created for each secret entry. The name of each such file + * is the GUID of the secret entry, and its content is the secret data. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define EFI_SECRET_NUM_FILES 64 + +struct efi_secret { + struct dentry *secrets_dir; + struct dentry *fs_dir; + struct dentry *fs_files[EFI_SECRET_NUM_FILES]; + void __iomem *secret_data; + u64 secret_data_len; +}; + +/* + * Structure of the EFI secret area + * + * Offset Length + * (bytes) (bytes) Usage + * ------- ------- ----- + * 0 16 Secret table header GUID (must be 1e74f542-71dd-4d66-963e-ef4287ff173b) + * 16 4 Length of bytes of the entire secret area + * + * 20 16 First secret entry's GUID + * 36 4 First secret entry's length in bytes (= 16 + 4 + x) + * 40 x First secret entry's data + * + * 40+x 16 Second secret entry's GUID + * 56+x 4 Second secret entry's length in bytes (= 16 + 4 + y) + * 60+x y Second secret entry's data + * + * (... and so on for additional entries) + * + * The GUID of each secret entry designates the usage of the secret data. + */ + +/** + * struct secret_header - Header of entire secret area; this should be followed + * by instances of struct secret_entry. + * @guid: Must be EFI_SECRET_TABLE_HEADER_GUID + * @len: Length in bytes of entire secret area, including header + */ +struct secret_header { + efi_guid_t guid; + u32 len; +} __attribute((packed)); + +/** + * struct secret_entry - Holds one secret entry + * @guid: Secret-specific GUID (or NULL_GUID if this secret entry was deleted) + * @len: Length of secret entry, including its guid and len fields + * @data: The secret data (full of zeros if this secret entry was deleted) + */ +struct secret_entry { + efi_guid_t guid; + u32 len; + u8 data[]; +} __attribute((packed)); + +static size_t secret_entry_data_len(struct secret_entry *e) +{ + return e->len - sizeof(*e); +} + +static struct efi_secret the_efi_secret; + +static inline struct efi_secret *efi_secret_get(void) +{ + return &the_efi_secret; +} + +static int efi_secret_bin_file_show(struct seq_file *file, void *data) +{ + struct secret_entry *e = file->private; + + if (e) + seq_write(file, e->data, secret_entry_data_len(e)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(efi_secret_bin_file); + +/* + * Overwrite memory content with zeroes, and ensure that dirty cache lines are + * actually written back to memory, to clear out the secret. + */ +static void wipe_memory(void *addr, size_t size) +{ + memzero_explicit(addr, size); +#ifdef CONFIG_X86 + clflush_cache_range(addr, size); +#endif +} + +static int efi_secret_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efi_secret *s = efi_secret_get(); + struct inode *inode = d_inode(dentry); + struct secret_entry *e = (struct secret_entry *)inode->i_private; + int i; + + if (e) { + /* Zero out the secret data */ + wipe_memory(e->data, secret_entry_data_len(e)); + e->guid = NULL_GUID; + } + + inode->i_private = NULL; + + for (i = 0; i < EFI_SECRET_NUM_FILES; i++) + if (s->fs_files[i] == dentry) + s->fs_files[i] = NULL; + + /* + * securityfs_remove tries to lock the directory's inode, but we reach + * the unlink callback when it's already locked + */ + inode_unlock(dir); + securityfs_remove(dentry); + inode_lock(dir); + + return 0; +} + +static const struct inode_operations efi_secret_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = efi_secret_unlink, +}; + +static int efi_secret_map_area(struct platform_device *dev) +{ + int ret; + struct efi_secret *s = efi_secret_get(); + struct linux_efi_coco_secret_area *secret_area; + + if (efi.coco_secret == EFI_INVALID_TABLE_ADDR) { + dev_err(&dev->dev, "Secret area address is not available\n"); + return -EINVAL; + } + + secret_area = memremap(efi.coco_secret, sizeof(*secret_area), MEMREMAP_WB); + if (secret_area == NULL) { + dev_err(&dev->dev, "Could not map secret area EFI config entry\n"); + return -ENOMEM; + } + if (!secret_area->base_pa || secret_area->size < sizeof(struct secret_header)) { + dev_err(&dev->dev, + "Invalid secret area memory location (base_pa=0x%llx size=0x%llx)\n", + secret_area->base_pa, secret_area->size); + ret = -EINVAL; + goto unmap; + } + + s->secret_data = ioremap_encrypted(secret_area->base_pa, secret_area->size); + if (s->secret_data == NULL) { + dev_err(&dev->dev, "Could not map secret area\n"); + ret = -ENOMEM; + goto unmap; + } + + s->secret_data_len = secret_area->size; + ret = 0; + +unmap: + memunmap(secret_area); + return ret; +} + +static void efi_secret_securityfs_teardown(struct platform_device *dev) +{ + struct efi_secret *s = efi_secret_get(); + int i; + + for (i = (EFI_SECRET_NUM_FILES - 1); i >= 0; i--) { + securityfs_remove(s->fs_files[i]); + s->fs_files[i] = NULL; + } + + securityfs_remove(s->fs_dir); + s->fs_dir = NULL; + + securityfs_remove(s->secrets_dir); + s->secrets_dir = NULL; + + dev_dbg(&dev->dev, "Removed securityfs entries\n"); +} + +static int efi_secret_securityfs_setup(struct platform_device *dev) +{ + struct efi_secret *s = efi_secret_get(); + int ret = 0, i = 0, bytes_left; + unsigned char *ptr; + struct secret_header *h; + struct secret_entry *e; + struct dentry *dent; + char guid_str[EFI_VARIABLE_GUID_LEN + 1]; + + ptr = (void __force *)s->secret_data; + h = (struct secret_header *)ptr; + if (efi_guidcmp(h->guid, EFI_SECRET_TABLE_HEADER_GUID)) { + /* + * This is not an error: it just means that EFI defines secret + * area but it was not populated by the Guest Owner. + */ + dev_dbg(&dev->dev, "EFI secret area does not start with correct GUID\n"); + return -ENODEV; + } + if (h->len < sizeof(*h)) { + dev_err(&dev->dev, "EFI secret area reported length is too small\n"); + return -EINVAL; + } + if (h->len > s->secret_data_len) { + dev_err(&dev->dev, "EFI secret area reported length is too big\n"); + return -EINVAL; + } + + s->secrets_dir = NULL; + s->fs_dir = NULL; + memset(s->fs_files, 0, sizeof(s->fs_files)); + + dent = securityfs_create_dir("secrets", NULL); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating secrets securityfs directory entry err=%ld\n", + PTR_ERR(dent)); + return PTR_ERR(dent); + } + s->secrets_dir = dent; + + dent = securityfs_create_dir("coco", s->secrets_dir); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating coco securityfs directory entry err=%ld\n", + PTR_ERR(dent)); + return PTR_ERR(dent); + } + d_inode(dent)->i_op = &efi_secret_dir_inode_operations; + s->fs_dir = dent; + + bytes_left = h->len - sizeof(*h); + ptr += sizeof(*h); + while (bytes_left >= (int)sizeof(*e) && i < EFI_SECRET_NUM_FILES) { + e = (struct secret_entry *)ptr; + if (e->len < sizeof(*e) || e->len > (unsigned int)bytes_left) { + dev_err(&dev->dev, "EFI secret area is corrupted\n"); + ret = -EINVAL; + goto err_cleanup; + } + + /* Skip deleted entries (which will have NULL_GUID) */ + if (efi_guidcmp(e->guid, NULL_GUID)) { + efi_guid_to_str(&e->guid, guid_str); + + dent = securityfs_create_file(guid_str, 0440, s->fs_dir, (void *)e, + &efi_secret_bin_file_fops); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating efi_secret securityfs entry\n"); + ret = PTR_ERR(dent); + goto err_cleanup; + } + + s->fs_files[i++] = dent; + } + ptr += e->len; + bytes_left -= e->len; + } + + dev_info(&dev->dev, "Created %d entries in securityfs secrets/coco\n", i); + return 0; + +err_cleanup: + efi_secret_securityfs_teardown(dev); + return ret; +} + +static void efi_secret_unmap_area(void) +{ + struct efi_secret *s = efi_secret_get(); + + if (s->secret_data) { + iounmap(s->secret_data); + s->secret_data = NULL; + s->secret_data_len = 0; + } +} + +static int efi_secret_probe(struct platform_device *dev) +{ + int ret; + + ret = efi_secret_map_area(dev); + if (ret) + return ret; + + ret = efi_secret_securityfs_setup(dev); + if (ret) + goto err_unmap; + + return ret; + +err_unmap: + efi_secret_unmap_area(); + return ret; +} + +static int efi_secret_remove(struct platform_device *dev) +{ + efi_secret_securityfs_teardown(dev); + efi_secret_unmap_area(); + return 0; +} + +static struct platform_driver efi_secret_driver = { + .probe = efi_secret_probe, + .remove = efi_secret_remove, + .driver = { + .name = "efi_secret", + }, +}; + +module_platform_driver(efi_secret_driver); + +MODULE_DESCRIPTION("Confidential computing EFI secret area access"); +MODULE_AUTHOR("IBM"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:efi_secret"); From 20ffd9205ef60fca8912bc9df1602bb627756602 Mon Sep 17 00:00:00 2001 From: Dov Murik Date: Tue, 12 Apr 2022 21:21:26 +0000 Subject: [PATCH 052/155] efi: Register efi_secret platform device if EFI secret area is declared During efi initialization, check if coco_secret is defined in the EFI configuration table; in such case, register platform device "efi_secret". This allows udev to automatically load the efi_secret module (platform driver), which in turn will populate the /secrets/coco directory in guests into which secrets were injected. Note that a declared address of an EFI secret area doesn't mean that secrets where indeed injected to that area; if the secret area is not populated, the driver will not load (but the platform device will still be registered). Signed-off-by: Dov Murik Reviewed-by: Gerd Hoffmann Link: https://lore.kernel.org/r/20220412212127.154182-4-dovmurik@linux.ibm.com Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index e0f43262d451..860534bcfdac 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -425,6 +425,11 @@ static int __init efisubsys_init(void) if (efi_enabled(EFI_DBG) && efi_enabled(EFI_PRESERVE_BS_REGIONS)) efi_debugfs_init(); +#ifdef CONFIG_EFI_COCO_SECRET + if (efi.coco_secret != EFI_INVALID_TABLE_ADDR) + platform_device_register_simple("efi_secret", 0, NULL, 0); +#endif + return 0; err_remove_group: From 7419995a331c24a239bc0bce5df24360544c7038 Mon Sep 17 00:00:00 2001 From: Dov Murik Date: Tue, 12 Apr 2022 21:21:27 +0000 Subject: [PATCH 053/155] docs: security: Add secrets/coco documentation Add documentation for the efi_secret module which allows access to Confidential Computing injected secrets. Signed-off-by: Dov Murik Reviewed-by: Gerd Hoffmann Link: https://lore.kernel.org/r/20220412212127.154182-5-dovmurik@linux.ibm.com Signed-off-by: Ard Biesheuvel --- Documentation/security/index.rst | 1 + Documentation/security/secrets/coco.rst | 103 +++++++++++++++++++++++ Documentation/security/secrets/index.rst | 9 ++ 3 files changed, 113 insertions(+) create mode 100644 Documentation/security/secrets/coco.rst create mode 100644 Documentation/security/secrets/index.rst diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst index 16335de04e8c..6ed8d2fa6f9e 100644 --- a/Documentation/security/index.rst +++ b/Documentation/security/index.rst @@ -17,3 +17,4 @@ Security Documentation tpm/index digsig landlock + secrets/index diff --git a/Documentation/security/secrets/coco.rst b/Documentation/security/secrets/coco.rst new file mode 100644 index 000000000000..262e7abb1b24 --- /dev/null +++ b/Documentation/security/secrets/coco.rst @@ -0,0 +1,103 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +Confidential Computing secrets +============================== + +This document describes how Confidential Computing secret injection is handled +from the firmware to the operating system, in the EFI driver and the efi_secret +kernel module. + + +Introduction +============ + +Confidential Computing (coco) hardware such as AMD SEV (Secure Encrypted +Virtualization) allows guest owners to inject secrets into the VMs +memory without the host/hypervisor being able to read them. In SEV, +secret injection is performed early in the VM launch process, before the +guest starts running. + +The efi_secret kernel module allows userspace applications to access these +secrets via securityfs. + + +Secret data flow +================ + +The guest firmware may reserve a designated memory area for secret injection, +and publish its location (base GPA and length) in the EFI configuration table +under a ``LINUX_EFI_COCO_SECRET_AREA_GUID`` entry +(``adf956ad-e98c-484c-ae11-b51c7d336447``). This memory area should be marked +by the firmware as ``EFI_RESERVED_TYPE``, and therefore the kernel should not +be use it for its own purposes. + +During the VM's launch, the virtual machine manager may inject a secret to that +area. In AMD SEV and SEV-ES this is performed using the +``KVM_SEV_LAUNCH_SECRET`` command (see [sev]_). The strucutre of the injected +Guest Owner secret data should be a GUIDed table of secret values; the binary +format is described in ``drivers/virt/coco/efi_secret/efi_secret.c`` under +"Structure of the EFI secret area". + +On kernel start, the kernel's EFI driver saves the location of the secret area +(taken from the EFI configuration table) in the ``efi.coco_secret`` field. +Later it checks if the secret area is populated: it maps the area and checks +whether its content begins with ``EFI_SECRET_TABLE_HEADER_GUID`` +(``1e74f542-71dd-4d66-963e-ef4287ff173b``). If the secret area is populated, +the EFI driver will autoload the efi_secret kernel module, which exposes the +secrets to userspace applications via securityfs. The details of the +efi_secret filesystem interface are in [secrets-coco-abi]_. + + +Application usage example +========================= + +Consider a guest performing computations on encrypted files. The Guest Owner +provides the decryption key (= secret) using the secret injection mechanism. +The guest application reads the secret from the efi_secret filesystem and +proceeds to decrypt the files into memory and then performs the needed +computations on the content. + +In this example, the host can't read the files from the disk image +because they are encrypted. Host can't read the decryption key because +it is passed using the secret injection mechanism (= secure channel). +Host can't read the decrypted content from memory because it's a +confidential (memory-encrypted) guest. + +Here is a simple example for usage of the efi_secret module in a guest +to which an EFI secret area with 4 secrets was injected during launch:: + + # ls -la /sys/kernel/security/secrets/coco + total 0 + drwxr-xr-x 2 root root 0 Jun 28 11:54 . + drwxr-xr-x 3 root root 0 Jun 28 11:54 .. + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + -r--r----- 1 root root 0 Jun 28 11:54 e6f5a162-d67f-4750-a67c-5d065f2a9910 + + # hd /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + 00000000 74 68 65 73 65 2d 61 72 65 2d 74 68 65 2d 6b 61 |these-are-the-ka| + 00000010 74 61 2d 73 65 63 72 65 74 73 00 01 02 03 04 05 |ta-secrets......| + 00000020 06 07 |..| + 00000022 + + # rm /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + + # ls -la /sys/kernel/security/secrets/coco + total 0 + drwxr-xr-x 2 root root 0 Jun 28 11:55 . + drwxr-xr-x 3 root root 0 Jun 28 11:54 .. + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + + +References +========== + +See [sev-api-spec]_ for more info regarding SEV ``LAUNCH_SECRET`` operation. + +.. [sev] Documentation/virt/kvm/amd-memory-encryption.rst +.. [secrets-coco-abi] Documentation/ABI/testing/securityfs-secrets-coco +.. [sev-api-spec] https://www.amd.com/system/files/TechDocs/55766_SEV-KM_API_Specification.pdf diff --git a/Documentation/security/secrets/index.rst b/Documentation/security/secrets/index.rst new file mode 100644 index 000000000000..ced34e9c43bd --- /dev/null +++ b/Documentation/security/secrets/index.rst @@ -0,0 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Secrets documentation +===================== + +.. toctree:: + + coco From aa480379d8bdb33920d68acfd90f823c8af32578 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 4 Mar 2022 07:36:37 +0100 Subject: [PATCH 054/155] efi: Add missing prototype for efi_capsule_setup_info Fixes "no previous declaration for 'efi_capsule_setup_info'" warnings under W=1. Fixes: 2959c95d510c ("efi/capsule: Add support for Quark security header") Signed-off-by: Jan Kiszka Link: https://lore.kernel.org/r/c28d3f86-dd72-27d1-e2c2-40971b8da6bd@siemens.com Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/efi.h b/include/linux/efi.h index 771d4cd06b56..fd266198f9d8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -213,6 +213,8 @@ struct capsule_info { size_t page_bytes_remain; }; +int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, + size_t hdr_bytes); int __efi_capsule_setup_info(struct capsule_info *cap_info); /* From 6172de3c7f1171e55314bfc5ee4ae6edd225b048 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Feb 2022 12:26:00 -0800 Subject: [PATCH 055/155] docs: Add documentation for rude and trace RCU flavors This commit belatedly adds documentation of Tasks Rude RCU and Tasks Trace RCU to RCU's requirements document. Tested-by: Bagas Sanjaya Reviewed-by: Steven Rostedt (Google) Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.rst | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 45278e2974c0..ff2be1ac54c4 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -2654,6 +2654,38 @@ synchronize_rcu(), and rcu_barrier(), respectively. In three APIs are therefore implemented by separate functions that check for voluntary context switches. +Tasks Rude RCU +~~~~~~~~~~~~~~ + +Some forms of tracing need to wait for all preemption-disabled regions +of code running on any online CPU, including those executed when RCU is +not watching. This means that synchronize_rcu() is insufficient, and +Tasks Rude RCU must be used instead. This flavor of RCU does its work by +forcing a workqueue to be scheduled on each online CPU, hence the "Rude" +moniker. And this operation is considered to be quite rude by real-time +workloads that don't want their ``nohz_full`` CPUs receiving IPIs and +by battery-powered systems that don't want their idle CPUs to be awakened. + +The tasks-rude-RCU API is also reader-marking-free and thus quite compact, +consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(), +and rcu_barrier_tasks_rude(). + +Tasks Trace RCU +~~~~~~~~~~~~~~~ + +Some forms of tracing need to sleep in readers, but cannot tolerate +SRCU's read-side overhead, which includes a full memory barrier in both +srcu_read_lock() and srcu_read_unlock(). This need is handled by a +Tasks Trace RCU that uses scheduler locking and IPIs to synchronize with +readers. Real-time systems that cannot tolerate IPIs may build their +kernels with ``CONFIG_TASKS_TRACE_RCU_READ_MB=y``, which avoids the IPIs at +the expense of adding full memory barriers to the read-side primitives. + +The tasks-trace-RCU API is also reasonably compact, +consisting of rcu_read_lock_trace(), rcu_read_unlock_trace(), +rcu_read_lock_trace_held(), call_rcu_tasks_trace(), +synchronize_rcu_tasks_trace(), and rcu_barrier_tasks_trace(). + Possible Future Changes ----------------------- From 404147faaaf28319ba8e60392ba9d4f3b6055ad5 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Wed, 30 Mar 2022 23:41:00 +0900 Subject: [PATCH 056/155] docs: Update RCU cross-references as suggested in doc-guide The RCU documentation contains old-style cross references which do not follow the best practices outlined in doc-guide/sphinx.rst. In addition, some of the cross references use URLs that should be replaced by pathnames. Update all of these cross references and adjust the surrounding words. Summary of changes: - out-of-date plaintext file names (*.txt) -> *.rst - references by :ref: tags -> path names of *.rst * use relative paths to .rst files under the RCU/ subdirectory * use abs paths of Documentation/xxx for other .rst files - references by URL under https://www.kernel.org/ -> paths of *.rst - adjust surrounding words of some of updated references. Note: The automarkup.py script interprets references via "*.txt" as if they were via "*.rst", so the *.txt -> *.rst changes should be regarded as cleanups rather than bug fixes. Cc: rcu@vger.kernel.org Cc: linux-doc@vger.kernel.org Tested-by: Bagas Sanjaya Reviewed-by: Steven Rostedt (Google) Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- .../Design/Data-Structures/Data-Structures.rst | 2 +- .../Expedited-Grace-Periods.rst | 2 +- .../RCU/Design/Requirements/Requirements.rst | 4 ++-- Documentation/RCU/arrayRCU.rst | 4 ++-- Documentation/RCU/checklist.rst | 9 ++++----- Documentation/RCU/rcu.rst | 13 ++++++------- Documentation/RCU/rculist_nulls.rst | 2 +- Documentation/RCU/whatisRCU.rst | 18 ++++++++---------- 8 files changed, 25 insertions(+), 29 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst index f4efd6897b09..b34990c7c377 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst @@ -973,7 +973,7 @@ The ``->dynticks`` field counts the corresponding CPU's transitions to and from either dyntick-idle or user mode, so that this counter has an even value when the CPU is in dyntick-idle mode or user mode and an odd value otherwise. The transitions to/from user mode need to be counted -for user mode adaptive-ticks support (see timers/NO_HZ.txt). +for user mode adaptive-ticks support (see Documentation/timers/no_hz.rst). The ``->rcu_need_heavy_qs`` field is used to record the fact that the RCU core code would really like to see a quiescent state from the diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst index 6f89cf1e567d..c9c957c85bac 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst @@ -406,7 +406,7 @@ In earlier implementations, the task requesting the expedited grace period also drove it to completion. This straightforward approach had the disadvantage of needing to account for POSIX signals sent to user tasks, so more recent implemementations use the Linux kernel's -`workqueues `__. +workqueues (see Documentation/core-api/workqueue.rst). The requesting task still does counter snapshotting and funnel-lock processing, but the task reaching the top of the funnel lock does a diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index ff2be1ac54c4..04ed8bf27a0e 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -370,8 +370,8 @@ pointer fetched by rcu_dereference() may not be used outside of the outermost RCU read-side critical section containing that rcu_dereference(), unless protection of the corresponding data element has been passed from RCU to some other synchronization -mechanism, most commonly locking or `reference -counting `__. +mechanism, most commonly locking or reference counting +(see ../../rcuref.rst). .. |high-quality implementation of C11 memory_order_consume [PDF]| replace:: high-quality implementation of C11 ``memory_order_consume`` [PDF] .. _high-quality implementation of C11 memory_order_consume [PDF]: http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf diff --git a/Documentation/RCU/arrayRCU.rst b/Documentation/RCU/arrayRCU.rst index 4051ea3871ef..a5f2ff8fc54c 100644 --- a/Documentation/RCU/arrayRCU.rst +++ b/Documentation/RCU/arrayRCU.rst @@ -33,8 +33,8 @@ Situation 1: Hash Tables Hash tables are often implemented as an array, where each array entry has a linked-list hash chain. Each hash chain can be protected by RCU -as described in the listRCU.txt document. This approach also applies -to other array-of-list situations, such as radix trees. +as described in listRCU.rst. This approach also applies to other +array-of-list situations, such as radix trees. .. _static_arrays: diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index f4545b7c9a63..42cc5d891bd2 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -140,8 +140,7 @@ over a rather long period of time, but improvements are always welcome! prevents destructive compiler optimizations. However, with a bit of devious creativity, it is possible to mishandle the return value from rcu_dereference(). - Please see rcu_dereference.txt in this directory for - more information. + Please see rcu_dereference.rst for more information. The rcu_dereference() primitive is used by the various "_rcu()" list-traversal primitives, such @@ -151,7 +150,7 @@ over a rather long period of time, but improvements are always welcome! primitives. This is particularly useful in code that is common to readers and updaters. However, lockdep will complain if you access rcu_dereference() outside - of an RCU read-side critical section. See lockdep.txt + of an RCU read-side critical section. See lockdep.rst to learn what to do about this. Of course, neither rcu_dereference() nor the "_rcu()" @@ -323,7 +322,7 @@ over a rather long period of time, but improvements are always welcome! primitives when the update-side lock is held is that doing so can be quite helpful in reducing code bloat when common code is shared between readers and updaters. Additional primitives - are provided for this case, as discussed in lockdep.txt. + are provided for this case, as discussed in lockdep.rst. One exception to this rule is when data is only ever added to the linked data structure, and is never removed during any @@ -480,4 +479,4 @@ over a rather long period of time, but improvements are always welcome! both rcu_barrier() and synchronize_rcu(), if necessary, using something like workqueues to to execute them concurrently. - See rcubarrier.txt for more information. + See rcubarrier.rst for more information. diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index 0e03c6ef3147..3cfe01ba9a49 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -10,9 +10,8 @@ A "grace period" must elapse between the two parts, and this grace period must be long enough that any readers accessing the item being deleted have since dropped their references. For example, an RCU-protected deletion from a linked list would first remove the item from the list, wait for -a grace period to elapse, then free the element. See the -:ref:`Documentation/RCU/listRCU.rst ` for more information on -using RCU with linked lists. +a grace period to elapse, then free the element. See listRCU.rst for more +information on using RCU with linked lists. Frequently Asked Questions -------------------------- @@ -50,7 +49,7 @@ Frequently Asked Questions - If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? - See :ref:`Documentation/RCU/UP.rst ` for more information. + See UP.rst for more information. - How can I see where RCU is currently used in the Linux kernel? @@ -64,13 +63,13 @@ Frequently Asked Questions - What guidelines should I follow when writing code that uses RCU? - See the checklist.txt file in this directory. + See checklist.rst. - Why the name "RCU"? "RCU" stands for "read-copy update". - :ref:`Documentation/RCU/listRCU.rst ` has more information on where - this name came from, search for "read-copy update" to find it. + listRCU.rst has more information on where this name came from, search + for "read-copy update" to find it. - I hear that RCU is patented? What is with that? diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst index a9fc774bc400..ca4692775ad4 100644 --- a/Documentation/RCU/rculist_nulls.rst +++ b/Documentation/RCU/rculist_nulls.rst @@ -8,7 +8,7 @@ This section describes how to use hlist_nulls to protect read-mostly linked lists and objects using SLAB_TYPESAFE_BY_RCU allocations. -Please read the basics in Documentation/RCU/listRCU.rst +Please read the basics in listRCU.rst. Using 'nulls' ============= diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index c34d2212eaca..77ea260efd12 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -224,7 +224,7 @@ synchronize_rcu() be delayed. This property results in system resilience in face of denial-of-service attacks. Code using call_rcu() should limit update rate in order to gain this same sort of resilience. See - checklist.txt for some approaches to limiting the update rate. + checklist.rst for some approaches to limiting the update rate. rcu_assign_pointer() ^^^^^^^^^^^^^^^^^^^^ @@ -318,7 +318,7 @@ rcu_dereference() must prohibit. The rcu_dereference_protected() variant takes a lockdep expression to indicate which locks must be acquired by the caller. If the indicated protection is not provided, - a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst + a lockdep splat is emitted. See Design/Requirements/Requirements.rst and the API's code comments for more details and example usage. .. [2] If the list_for_each_entry_rcu() instance might be used by @@ -399,8 +399,7 @@ for specialized uses, but are relatively uncommon. This section shows a simple use of the core RCU API to protect a global pointer to a dynamically allocated structure. More-typical -uses of RCU may be found in :ref:`listRCU.rst `, -:ref:`arrayRCU.rst `, and :ref:`NMI-RCU.rst `. +uses of RCU may be found in listRCU.rst, arrayRCU.rst, and NMI-RCU.rst. :: struct foo { @@ -482,10 +481,9 @@ So, to sum up: RCU read-side critical sections that might be referencing that data item. -See checklist.txt for additional rules to follow when using RCU. -And again, more-typical uses of RCU may be found in :ref:`listRCU.rst -`, :ref:`arrayRCU.rst `, and :ref:`NMI-RCU.rst -`. +See checklist.rst for additional rules to follow when using RCU. +And again, more-typical uses of RCU may be found in listRCU.rst, +arrayRCU.rst, and NMI-RCU.rst. .. _4_whatisRCU: @@ -579,7 +577,7 @@ to avoid having to write your own callback:: kfree_rcu(old_fp, rcu); -Again, see checklist.txt for additional rules governing the use of RCU. +Again, see checklist.rst for additional rules governing the use of RCU. .. _5_whatisRCU: @@ -663,7 +661,7 @@ been able to write-acquire the lock otherwise. The smp_mb__after_spinlock() promotes synchronize_rcu() to a full memory barrier in compliance with the "Memory-Barrier Guarantees" listed in: - Documentation/RCU/Design/Requirements/Requirements.rst + Design/Requirements/Requirements.rst It is possible to nest rcu_read_lock(), since reader-writer locks may be recursively acquired. Note also that rcu_read_lock() is immune From 3791a22374715b36ad806db13d8b2afb1b57fd36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Feb 2022 18:08:33 -0800 Subject: [PATCH 057/155] kernel/smp: Provide boot-time timeout for CSD lock diagnostics Debugging of problems involving insanely long-running SMI handlers proceeds better if the CSD-lock timeout can be adjusted. This commit therefore provides a new smp.csd_lock_timeout kernel boot parameter that specifies the timeout in milliseconds. The default remains at the previously hard-coded value of five seconds. [ paulmck: Apply feedback from Juergen Gross. ] Cc: Rik van Riel Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Reviewed-by: Juergen Gross Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 11 +++++++++++ kernel/smp.c | 7 +++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..645c4c001b16 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5377,6 +5377,17 @@ smart2= [HW] Format: [,[,...,]] + smp.csd_lock_timeout= [KNL] + Specify the period of time in milliseconds + that smp_call_function() and friends will wait + for a CPU to release the CSD lock. This is + useful when diagnosing bugs involving CPUs + disabling interrupts for extended periods + of time. Defaults to 5,000 milliseconds, and + setting a value of zero disables this feature. + This feature may be more efficiently disabled + using the csdlock_debug- kernel parameter. + smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port smsc-ircc2.ircc_sir= [HW] SIR base I/O port diff --git a/kernel/smp.c b/kernel/smp.c index 01a7c1706a58..6a1f1daa3dc4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -183,7 +183,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); -#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ +module_param(csd_lock_timeout, ulong, 0444); + static atomic_t csd_bug_count = ATOMIC_INIT(0); static u64 cfd_seq; @@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); + unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * ts2 = sched_clock(); ts_delta = ts2 - *ts1; - if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; From 75182a4eaaf8b697f66d68ad039f021f461dd2a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Mar 2022 11:01:37 -0800 Subject: [PATCH 058/155] rcu: Add comments to final rcu_gp_cleanup() "if" statement The final "if" statement in rcu_gp_cleanup() has proven to be rather confusing, straightforward though it might have seemed when initially written. This commit therefore adds comments to its "then" and "else" clauses to at least provide a more elevated form of confusion. Reported-by: Boqun Feng Reported-by: Frederic Weisbecker Reported-by: Neeraj Upadhyay Reported-by: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a5ea67454640..29669070348e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2098,14 +2098,29 @@ static noinline void rcu_gp_cleanup(void) /* Advance CBs to reduce false positives below. */ offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { + + // We get here if a grace period was needed (“needgp”) + // and the above call to rcu_accelerate_cbs() did not set + // the RCU_GP_FLAG_INIT bit in ->gp_state (which records + // the need for another grace period).  The purpose + // of the “offloaded” check is to avoid invoking + // rcu_accelerate_cbs() on an offloaded CPU because we do not + // hold the ->nocb_lock needed to safely access an offloaded + // ->cblist.  We do not want to acquire that lock because + // it can be heavily contended during callback floods. + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); - trace_rcu_grace_period(rcu_state.name, - rcu_state.gp_seq, - TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); } else { - WRITE_ONCE(rcu_state.gp_flags, - rcu_state.gp_flags & RCU_GP_FLAG_INIT); + + // We get here either if there is no need for an + // additional grace period or if rcu_accelerate_cbs() has + // already set the RCU_GP_FLAG_INIT bit in ->gp_flags.  + // So all we need to do is to clear all of the other + // ->gp_flags bits. + + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); From 80d530b47da41642fab317a9485d58dfbe1e8896 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Mar 2022 10:10:35 -0800 Subject: [PATCH 059/155] rcu: Print number of online CPUs in RCU CPU stall-warning messages RCU's synchronous grace periods act quite differently when there is only one online CPU, especially in the no-op case in kernels built with CONFIG_PREEMPTION=n. This change in behavior can be important debugging information, so this commit adds the number of online CPUs to the RCU CPU stall warning messages. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..268dd79c58e7 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -565,9 +565,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); if (ndetected) { rcu_dump_cpu_stacks(); @@ -626,9 +626,9 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); From 70ae7b0ce03347fab35d6d8df81e1165d7ea8045 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 14 Mar 2022 14:37:38 +0100 Subject: [PATCH 060/155] rcu: Fix preemption mode check on synchronize_rcu[_expedited]() An early check on synchronize_rcu[_expedited]() tries to determine if the current CPU is in UP mode on an SMP no-preempt kernel, in which case there is no need to start a grace period since the current assumed quiescent state is all we need. However the preemption mode doesn't take into account the boot selected preemption mode under CONFIG_PREEMPT_DYNAMIC=y, missing a possible early return if the running flavour is "none" or "voluntary". Use the shiny new preempt mode accessors to fix this. However, avoid invoking them during early boot because doing so triggers a WARN_ON_ONCE(). [ paulmck: Update for mainlined API. ] Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Boqun Feng Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Valentin Schneider Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 29669070348e..d3caa82b9954 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3741,7 +3741,9 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPTION)) + // Invoking preempt_model_*() too early gets a splat. + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || + preempt_model_full() || preempt_model_rt()) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); From 88ca472f80604c070526eb58b977ea0a9c3c2e1f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 24 Mar 2022 19:15:15 +0800 Subject: [PATCH 061/155] rcu: Check for successful spawn of ->boost_kthread_task For the spawning of the priority-boost kthreads can fail, improbable though this might seem. This commit therefore refrains from attemoting to initiate RCU priority boosting when The ->boost_kthread_task pointer is NULL. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 176639c6215f..5c23aceecd62 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1125,7 +1125,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + if (!rnp->boost_kthread_task || + (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } From f1efe84d6fd2af163989025bc285ff9b3b0c764f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 29 Mar 2022 15:26:13 -0700 Subject: [PATCH 062/155] rcu_sync: Fix comment to properly reflect rcu_sync_exit() behavior The rcu_sync_enter() function is used by updaters to force RCU readers (e.g. percpu-rwsem) to use their slow paths during an update. This is accomplished by setting the ->gp_state of the rcu_sync structure to GP_ENTER. In the case of percpu-rwsem, the readers' slow path waits on a semaphore instead of just incrementing a reader count. Each updater invokes the rcu_sync_exit() function to signal to readers that they may again take their fastpaths. The rcu_sync_exit() function sets the ->gp_state of the rcu_sync structure to GP_EXIT, and if all goes well, after a grace period the ->gp_state reverts back to GP_IDLE. Unfortunately, the rcu_sync_enter() function currently has a comment incorrectly stating that rcu_sync_exit() (by an updater) will re-enable reader "slowpaths". This patch changes the comment to state that this function re-enables reader fastpaths. Signed-off-by: David Vernet Signed-off-by: Paul E. McKenney --- kernel/rcu/sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 33d896d85902..5cefc702158f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * a slowpath during the update. After this function returns, all * subsequent calls to rcu_sync_is_idle() will return false, which * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. + * rcu_sync_exit() re-enables reader fastpaths. * * When called in isolation, rcu_sync_enter() must wait for a grace * period, however, closely spaced calls to rcu_sync_enter() can From f596e2ce1c0f250bb3ecc179f611be37e862635f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 4 Apr 2022 07:59:32 +0800 Subject: [PATCH 063/155] rcu: Use IRQ_WORK_INIT_HARD() to avoid rcu_read_unlock() hangs When booting kernels built with both CONFIG_RCU_STRICT_GRACE_PERIOD=y and CONFIG_PREEMPT_RT=y, the rcu_read_unlock_special() function's invocation of irq_work_queue_on() the init_irq_work() causes the rcu_preempt_deferred_qs_handler() function to work execute in SCHED_FIFO irq_work kthreads. Because rcu_read_unlock_special() is invoked on each rcu_read_unlock() in such kernels, the amount of work just keeps piling up, resulting in a boot-time hang. This commit therefore avoids this hang by using IRQ_WORK_INIT_HARD() instead of init_irq_work(), but only in kernels built with both CONFIG_PREEMPT_RT=y and CONFIG_RCU_STRICT_GRACE_PERIOD=y. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5c23aceecd62..2a3715419073 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -661,7 +661,13 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + IS_ENABLED(CONFIG_PREEMPT_RT)) + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( + rcu_preempt_deferred_qs_handler); + else + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } From 835f14ed53076384f0e1dad2fddb4881315f124f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 11:05:09 -0700 Subject: [PATCH 064/155] rcu: Make the TASKS_RCU Kconfig option be selected Currently, any kernel built with CONFIG_PREEMPTION=y also gets CONFIG_TASKS_RCU=y, which is not helpful to people trying to build preemptible kernels of minimal size. Because CONFIG_TASKS_RCU=y is needed only in kernels doing tracing of one form or another, this commit moves from TASKS_RCU deciding when it should be enabled to the tracing Kconfig options explicitly selecting it. This allows building preemptible kernels without TASKS_RCU, if desired. This commit also updates the SRCU-N and TREE09 rcutorture scenarios in order to avoid Kconfig errors that would otherwise result from CONFIG_TASKS_RCU being selected without its CONFIG_RCU_EXPERT dependency being met. [ paulmck: Apply BPF_SYSCALL feedback from Andrii Nakryiko. ] Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Zhouyi Zhou Cc: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Steven Rostedt Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu Signed-off-by: Paul E. McKenney --- arch/Kconfig | 1 + kernel/bpf/Kconfig | 1 + kernel/rcu/Kconfig | 3 ++- kernel/trace/Kconfig | 1 + tools/testing/selftests/rcutorture/configs/rcu/SRCU-N | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/TREE09 | 2 ++ 6 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 29b0167c088b..1bf29ce754af 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -35,6 +35,7 @@ config KPROBES depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d56ee177d5f8..2dfe1079f772 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -27,6 +27,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_RCU if PREEMPTION select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index f559870fbf8b..4f665ae0cf55 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -78,7 +78,8 @@ config TASKS_RCU_GENERIC task-based RCU implementations. Not for manual selection. config TASKS_RCU - def_bool PREEMPTION + def_bool 0 + select IRQ_WORK help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2c43e327a619..bf5da6c4e999 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -144,6 +144,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK + select TASKS_RCU if PREEMPTION config GENERIC_TRACER bool diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N index 2da8b49589a0..07f5e0a70ae7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N @@ -6,3 +6,5 @@ CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index 8523a7515cbf..fc45645bb5f4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -13,3 +13,5 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n From 40c1278aa7cd51d4f8627f7adc66aa73e01aff81 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 13:29:59 -0700 Subject: [PATCH 065/155] rcutorture: Allow rcutorture without RCU Tasks Trace Unless a kernel builds rcutorture, whether built-in or as a module, that kernel is also built with CONFIG_TASKS_TRACE_RCU, whether anything else needs Tasks Trace RCU or not. This unnecessarily increases kernel size. This commit therefore decouples the presence of rcutorture from the presence of RCU Tasks Trace. However, there is a need to select CONFIG_TASKS_TRACE_RCU for testing purposes. Except that casual users must not be bothered with questions -- for them, this needs to be fully automated. There is thus a CONFIG_FORCE_TASKS_TRACE_RCU that selects CONFIG_TASKS_TRACE_RCU, is user-selectable, but which depends on CONFIG_RCU_EXPERT. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 22 ++-- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcutorture.c | 101 ++++++++++-------- .../selftests/rcutorture/configs/rcu/TRACE01 | 2 + .../selftests/rcutorture/configs/rcu/TRACE02 | 2 + 5 files changed, 75 insertions(+), 53 deletions(-) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 4f665ae0cf55..2befd328e6a0 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -95,15 +95,23 @@ config TASKS_RUDE_RCU switches on all online CPUs, including idle ones, so use with caution. -config TASKS_TRACE_RCU - def_bool 0 - select IRQ_WORK +config FORCE_TASKS_TRACE_RCU + bool "Force selection of Tasks Trace RCU" + depends on RCU_EXPERT + select TASKS_TRACE_RCU + default n help This option enables a task-based RCU implementation that uses explicit rcu_read_lock_trace() read-side markers, and allows - these readers to appear in the idle loop as well as on the CPU - hotplug code paths. It can force IPIs on online CPUs, including - idle ones, so use with caution. + these readers to appear in the idle loop as well as on the + CPU hotplug code paths. It can force IPIs on online CPUs, + including idle ones, so use with caution. Not for manual + selection in most cases. + +config TASKS_TRACE_RCU + bool + default n + select IRQ_WORK config RCU_STALL_COMMON def_bool TREE_RCU @@ -227,7 +235,7 @@ config RCU_NOCB_CPU config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT + depends on RCU_EXPERT && TASKS_TRACE_RCU default PREEMPT_RT || NR_CPUS < 8 help Use this option to further reduce the number of IPIs sent diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..d7f4bb1c4979 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -49,7 +49,6 @@ config RCU_TORTURE_TEST select SRCU select TASKS_RCU select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 55d049c39608..7dd3e14ec907 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -737,6 +737,48 @@ static struct rcu_torture_ops busted_srcud_ops = { .name = "busted_srcud" }; +/* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + /* * Definitions for RCU-tasks torture testing. */ @@ -780,48 +822,6 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -/* - * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. - */ - -static void synchronize_rcu_trivial(void) -{ - int cpu; - - for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); - WARN_ON_ONCE(raw_smp_processor_id() != cpu); - } -} - -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) -{ - preempt_disable(); - return 0; -} - -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) -{ - preempt_enable(); -} - -static struct rcu_torture_ops trivial_ops = { - .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock_trivial, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock_trivial, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_no_completed, - .sync = synchronize_rcu_trivial, - .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, - .irq_capable = 1, - .name = "trivial" -}; - /* * Definitions for rude RCU-tasks torture testing. */ @@ -851,6 +851,8 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for tracing RCU-tasks torture testing. */ @@ -893,6 +895,15 @@ static struct rcu_torture_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU + + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -3096,9 +3107,9 @@ rcu_torture_init(void) int flags = 0; unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, - &tasks_tracing_ops, &trivial_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, + &tasks_ops, &tasks_rude_ops, TASKS_TRACING_OPS + &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index e4d74e5fc1d0..0f5605ed1e48 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -7,5 +7,7 @@ CONFIG_PREEMPT=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n #CHECK#CONFIG_PROVE_RCU=n +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index 77541eeb4e9f..093ea6e8e65c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -7,5 +7,7 @@ CONFIG_PREEMPT=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y From 3b6e1dd42317ec366dab3205f99280e2ab1ad85a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 15:18:27 -0700 Subject: [PATCH 066/155] rcutorture: Allow rcutorture without RCU Tasks Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks. Unless that kernel builds rcutorture, whether built-in or as a module, in which case RCU Tasks is (unnecessarily) used. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of rcutorture from the presence of RCU Tasks. However, there is a need to select CONFIG_TASKS_RCU for testing purposes. Except that casual users must not be bothered with questions -- for them, this needs to be fully automated. There is thus a CONFIG_FORCE_TASKS_RCU that selects CONFIG_TASKS_RCU, is user-selectable, but which depends on CONFIG_RCU_EXPERT. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 20 +++++++++++++------ kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcutorture.c | 13 +++++++++++- .../selftests/rcutorture/configs/rcu/TASKS01 | 1 + .../selftests/rcutorture/configs/rcu/TASKS02 | 3 +++ .../selftests/rcutorture/configs/rcu/TASKS03 | 2 ++ 6 files changed, 32 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 2befd328e6a0..8eac165db09f 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -77,13 +77,21 @@ config TASKS_RCU_GENERIC This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. -config TASKS_RCU - def_bool 0 - select IRQ_WORK +config FORCE_TASKS_RCU + bool "Force selection of TASKS_RCU" + depends on RCU_EXPERT + select TASKS_RCU + default n help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. Not for manual selection. + This option force-enables a task-based RCU implementation + that uses only voluntary context switch (not preemption!), + idle, and user-mode execution as quiescent states. Not for + manual selection in most cases. + +config TASKS_RCU + bool + default n + select IRQ_WORK config TASKS_RUDE_RCU def_bool 0 diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index d7f4bb1c4979..c217a5e655a4 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -47,7 +47,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU select TASKS_RUDE_RCU default n help diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7dd3e14ec907..65d045ff9766 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -779,6 +779,8 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks torture testing. */ @@ -822,6 +824,15 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else #ifdef CONFIG_TASKS_RCU + + /* * Definitions for rude RCU-tasks torture testing. */ @@ -3108,7 +3119,7 @@ rcu_torture_init(void) unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, - &tasks_ops, &tasks_rude_ops, TASKS_TRACING_OPS + TASKS_OPS &tasks_rude_ops, TASKS_TRACING_OPS &trivial_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 index 3ca112444ce7..d84801b9a7ae 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 @@ -7,4 +7,5 @@ CONFIG_PREEMPT=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_TASKS_RCU=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 index ad2be91e5ee7..d333b69bc831 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 @@ -2,3 +2,6 @@ CONFIG_SMP=n CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +#CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_RCU=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 index dc02083803ce..dea26c568678 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 @@ -7,3 +7,5 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_TASKS_RCU=y +CONFIG_RCU_EXPERT=y From 4c3f7b0e1e880e892d4bc4f50bf627b251b6e2cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Mar 2022 16:16:45 -0700 Subject: [PATCH 067/155] rcutorture: Allow rcutorture without RCU Tasks Rude Unless a kernel builds rcutorture, whether built-in or as a module, that kernel is also built with CONFIG_TASKS_RUDE_RCU, whether anything else needs Tasks Rude RCU or not. This unnecessarily increases kernel size. This commit therefore decouples the presence of rcutorture from the presence of RCU Tasks Rude. However, there is a need to select CONFIG_TASKS_RUDE_RCU for testing purposes. Except that casual users must not be bothered with questions -- for them, this needs to be fully automated. There is thus a CONFIG_FORCE_TASKS_RUDE_RCU that selects CONFIG_TASKS_RUDE_RCU, is user-selectable, but which depends on CONFIG_RCU_EXPERT. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 23 ++++++++++++------- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcutorture.c | 13 ++++++++++- .../selftests/rcutorture/configs/rcu/RUDE01 | 2 ++ 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 8eac165db09f..65d45c00fd1b 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -93,15 +93,22 @@ config TASKS_RCU default n select IRQ_WORK -config TASKS_RUDE_RCU - def_bool 0 - select IRQ_WORK +config FORCE_TASKS_RUDE_RCU + bool "Force selection of Tasks Rude RCU" + depends on RCU_EXPERT + select TASKS_RUDE_RCU + default n help - This option enables a task-based RCU implementation that uses - only context switch (including preemption) and user-mode - execution as quiescent states. It forces IPIs and context - switches on all online CPUs, including idle ones, so use - with caution. + This option force-enables a task-based RCU implementation + that uses only context switch (including preemption) and + user-mode execution as quiescent states. It forces IPIs and + context switches on all online CPUs, including idle ones, + so use with caution. Not for manual selection in most cases. + +config TASKS_RUDE_RCU + bool + default n + select IRQ_WORK config FORCE_TASKS_TRACE_RCU bool "Force selection of Tasks Trace RCU" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index c217a5e655a4..f4a4468cbf03 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -47,7 +47,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RUDE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 65d045ff9766..d528245108c2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -833,6 +833,8 @@ static struct rcu_torture_ops tasks_ops = { #endif // #else #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RUDE_RCU + /* * Definitions for rude RCU-tasks torture testing. */ @@ -862,6 +864,15 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU + + #ifdef CONFIG_TASKS_TRACE_RCU /* @@ -3119,7 +3130,7 @@ rcu_torture_init(void) unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, - TASKS_OPS &tasks_rude_ops, TASKS_TRACING_OPS + TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS &trivial_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 index 7093422050f6..6fd6acb94518 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 @@ -8,3 +8,5 @@ CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_RCU_EXPERT=y +CONFIG_FORCE_TASKS_RUDE_RCU=y +#CHECK#CONFIG_TASKS_RUDE_RCU=y From 3831fc02f496cd8a8e6c75217b290fe5158a3f36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Mar 2022 08:02:11 -0700 Subject: [PATCH 068/155] rcutorture: Add CONFIG_PREEMPT_DYNAMIC=n to TASKS02 scenario Now that CONFIG_PREEMPT_DYNAMIC=y is the default, TASKS02 no longer builds a pure non-preemptible kernel that uses Tiny RCU. This commit therefore fixes this new hole in rcutorture testing by adding CONFIG_PREEMPT_DYNAMIC=n to the TASKS02 rcutorture scenario. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TASKS02 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 index d333b69bc831..2f9fcffff5ae 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 @@ -2,6 +2,7 @@ CONFIG_SMP=n CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TASKS_RCU=y CONFIG_FORCE_TASKS_RCU=y CONFIG_RCU_EXPERT=y From 58524e0fed6a4509651005c06dc1a4ecb3ed0a61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Mar 2022 08:10:18 -0700 Subject: [PATCH 069/155] rcutorture: Allow specifying per-scenario stat_interval The rcutorture test suite makes double use of the rcutorture.stat_interval module parameter. As its name suggests, it controls the frequency of statistics printing, but it also controls the rcu_torture_writer() stall timeout. The current setting of 15 seconds works surprisingly well. However, given that the RCU tasks stall-warning timeout is ten -minutes-, 15 seconds is too short for TASKS02, which runs a non-preemptible kernel on a single CPU. This commit therefore adds checks for per-scenario specification of the rcutorture.stat_interval module parameter. Signed-off-by: Paul E. McKenney --- .../rcutorture/configs/rcu/TASKS02.boot | 1 + .../rcutorture/configs/rcu/ver_functions.sh | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot index cd2a188eeb6d..b9b6d67cbc5f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot @@ -1 +1,2 @@ rcutorture.torture_type=tasks +rcutorture.stat_interval=60 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh index effa415f9b92..e2bc99c785e7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh @@ -9,7 +9,7 @@ # rcutorture_param_n_barrier_cbs bootparam-string # -# Adds n_barrier_cbs rcutorture module parameter to kernels having it. +# Adds n_barrier_cbs rcutorture module parameter if not already specified. rcutorture_param_n_barrier_cbs () { if echo $1 | grep -q "rcutorture\.n_barrier_cbs" then @@ -30,13 +30,25 @@ rcutorture_param_onoff () { fi } +# rcutorture_param_stat_interval bootparam-string +# +# Adds stat_interval rcutorture module parameter if not already specified. +rcutorture_param_stat_interval () { + if echo $1 | grep -q "rcutorture\.stat_interval" + then + : + else + echo rcutorture.stat_interval=15 + fi +} + # per_version_boot_params bootparam-string config-file seconds # # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { echo $1 `rcutorture_param_onoff "$1" "$2"` \ `rcutorture_param_n_barrier_cbs "$1"` \ - rcutorture.stat_interval=15 \ + `rcutorture_param_stat_interval "$1"` \ rcutorture.shutdown_secs=$3 \ rcutorture.test_no_idle_hz=1 \ rcutorture.verbose=1 From 5f654af150fd5aeb9fff138c7cbd72cea016b863 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 14:39:54 -0700 Subject: [PATCH 070/155] refscale: Allow refscale without RCU Tasks Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks. Unless that kernel builds refscale, whether built-in or as a module, in which case RCU Tasks is (unnecessarily) built in. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of refscale from the presence of RCU Tasks. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/refscale.c | 12 +++++++++++- .../selftests/rcutorture/configs/refscale/CFcommon | 2 ++ .../selftests/rcutorture/configs/refscale/NOPREEMPT | 2 ++ 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index f4a4468cbf03..454924e03ef3 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -63,7 +63,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU select TASKS_RUDE_RCU select TASKS_TRACE_RCU default n diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5489ff7f478e..5079e47b3d18 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = { .name = "srcu" }; +#ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks ref scale testing: Empty read markers. // These definitions also work for RCU Rude readers. static void rcu_tasks_ref_scale_read_section(const int nloops) @@ -232,6 +234,14 @@ static struct ref_scale_ops rcu_tasks_ops = { .name = "rcu-tasks" }; +#define RCU_TASKS_OPS &rcu_tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define RCU_TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -790,7 +800,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon index a98b58b54bb1..14fdafc576ce 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon @@ -1,2 +1,4 @@ CONFIG_RCU_REF_SCALE_TEST=y CONFIG_PRINTK_TIME=y +CONFIG_FORCE_TASKS_RCU=y +#CHECK#CONFIG_TASKS_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT index 7f06838a91e6..ef2b501a6971 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT +++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT @@ -15,3 +15,5 @@ CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n From dec86781a54f4a527386a0b86b22e99e2ac67a09 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 15:21:07 -0700 Subject: [PATCH 071/155] refscale: Allow refscale without RCU Tasks Rude/Trace Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks Rude and RCU Tasks Trace. Unless that kernel builds refscale, whether built-in or as a module, in which case these RCU Tasks flavors are (unnecessarily) built in. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of refscale from the presence of RCU Tasks Rude and RCU Tasks Trace. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 -- kernel/rcu/refscale.c | 12 +++++++++++- .../selftests/rcutorture/configs/refscale/CFcommon | 2 ++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 454924e03ef3..dceaa3e754e5 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -63,8 +63,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance tests diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5079e47b3d18..909644abee67 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -242,6 +242,8 @@ static struct ref_scale_ops rcu_tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -271,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = { .name = "rcu-trace" }; +#define RCU_TRACE_OPS &rcu_trace_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define RCU_TRACE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for reference count static atomic_t refcnt; @@ -800,7 +810,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon index 14fdafc576ce..fbea3b13baba 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon @@ -2,3 +2,5 @@ CONFIG_RCU_REF_SCALE_TEST=y CONFIG_PRINTK_TIME=y CONFIG_FORCE_TASKS_RCU=y #CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y From 4df002d908796c1ff87b985af1d31a0e36e6c66f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 16:39:01 -0700 Subject: [PATCH 072/155] rcuscale: Allow rcuscale without RCU Tasks Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks. Unless that kernel builds rcuscale, whether built-in or as a module, in which case RCU Tasks is (unnecessarily) built. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of rcuscale from the presence of RCU Tasks. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 1 - kernel/rcu/rcuscale.c | 12 +++++++++++- .../selftests/rcutorture/configs/rcuscale/CFcommon | 4 ++-- .../selftests/rcutorture/configs/rcuscale/TREE | 2 ++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index dceaa3e754e5..71e73fceff87 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,7 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU select TASKS_RUDE_RCU select TASKS_TRACE_RCU default n diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5e4f1f83d38e..311dbcb064ed 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = { .name = "srcud" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks scalability testing. */ @@ -295,6 +297,14 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -797,7 +807,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS &tasks_tracing_ops }; if (!torture_init_begin(scale_type, verbose)) diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 90942bb5bebc..2ed3b46a9c37 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,5 +1,5 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y CONFIG_TASKS_RCU_GENERIC=y -CONFIG_TASKS_RCU=y -CONFIG_TASKS_TRACE_RCU=y +CONFIG_FORCE_TASKS_RCU=y +#CHECK#CONFIG_TASKS_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE index f110d9ffbe4c..b10706fd03a4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE @@ -16,3 +16,5 @@ CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y CONFIG_RCU_TRACE=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n From 5ce027f4cd0e2f28ea5574ede9eef290e2ede5c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Mar 2022 17:05:40 -0700 Subject: [PATCH 073/155] rcuscale: Allow rcuscale without RCU Tasks Rude/Trace Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for RCU Tasks Rude and RCU Tasks Trace. Unless that kernel builds rcuscale, whether built-in or as a module, in which case these RCU Tasks flavors are (unnecessarily) built in. This both increases kernel size and increases the complexity of certain tracing operations. This commit therefore decouples the presence of rcuscale from the presence of RCU Tasks Rude and RCU Tasks Trace. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 -- kernel/rcu/rcuscale.c | 12 +++++++++++- .../selftests/rcutorture/configs/rcuscale/CFcommon | 3 ++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 71e73fceff87..68092e1db64b 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,8 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 311dbcb064ed..277a5bfb37d4 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -305,6 +305,8 @@ static struct rcu_scale_ops tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -334,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -807,7 +817,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 2ed3b46a9c37..6a00157bee5b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,5 +1,6 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_TASKS_RCU_GENERIC=y CONFIG_FORCE_TASKS_RCU=y #CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y From bf5e7a2f4609db6cd65c0cad22ab2fbb52f1927e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 27 Mar 2022 12:13:30 -0700 Subject: [PATCH 074/155] scftorture: Adjust for TASKS_RCU Kconfig option being selected This commit adjusts the scftorture PREEMPT and NOPREEMPT scenarios to account for the TASKS_RCU Kconfig option being explicitly selected rather than computed in isolation. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT | 2 ++ tools/testing/selftests/rcutorture/configs/scf/PREEMPT | 1 + 2 files changed, 3 insertions(+) diff --git a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT index b8429d6c6ebc..3a59346b3de7 100644 --- a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT +++ b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT @@ -7,3 +7,5 @@ CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/scf/PREEMPT b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT index ae4992b141b0..cb37e08037d6 100644 --- a/tools/testing/selftests/rcutorture/configs/scf/PREEMPT +++ b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT @@ -7,3 +7,4 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y +CONFIG_RCU_EXPERT=y From 00f3133b7f9598304c1fe25ad2d5c12b91199761 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Feb 2022 16:08:48 -0800 Subject: [PATCH 075/155] torture: Skip vmlinux check for kvm-again.sh runs The kvm-again.sh script reruns an previously built set of kernels, so the vmlinux files are associated with that previous run, not this on. This results in kvm-find_errors.sh reporting spurious failed-build errors. This commit therefore omits the vmlinux check for kvm-again.sh runs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 5f682fc892dd..88983cba7956 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -36,7 +36,7 @@ do then egrep "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags files="$files $i.diags $i" - elif ! test -f ${scenariobasedir}/vmlinux + elif ! test -f ${scenariobasedir}/vmlinux && ! test -f "${rundir}/re-run" then echo No ${scenariobasedir}/vmlinux file > $i.diags files="$files $i.diags $i" From 3e112a39f7ad6d4cb1110585b11c5e74294e9578 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Mar 2022 06:36:52 -0800 Subject: [PATCH 076/155] torture: Enable CSD-lock stall reports for scftorture This commit passes the csdlock_debug=1 kernel parameter in order to enable CSD-lock stall reports for torture.sh scftorure runs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index bfe09e2829c8..e84db823a50d 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -350,7 +350,7 @@ fi if test "$do_scftorture" = "yes" then - torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot" + torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make fi From eec52c7fb51e5a1b89508bdc0e91d955256ec5f1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Mar 2022 10:23:46 -0800 Subject: [PATCH 077/155] rcutorture: Adjust scenarios' Kconfig options for CONFIG_PREEMPT_DYNAMIC Now that CONFIG_PREEMPT_DYNAMIC=y is the default, kernels that are ostensibly built with CONFIG_PREEMPT_NONE=y or CONFIG_PREEMPT_VOLUNTARY=y are now actually built with CONFIG_PREEMPT=y, but are by default booted so as to disable preemption. Although this allows much more flexibility from a single kernel binary, it means that the current rcutorture scenarios won't find build errors that happen only when preemption is fully disabled at build time. This commit therefore adds CONFIG_PREEMPT_DYNAMIC=n to several scenarios, and while in the area switches one from CONFIG_PREEMPT_NONE=y to CONFIG_PREEMPT_VOLUNTARY=y to add coverage of this Kconfig option. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TRACE01 | 1 + tools/testing/selftests/rcutorture/configs/rcu/TREE04 | 5 +++-- tools/testing/selftests/rcutorture/configs/rcu/TREE07 | 1 + tools/testing/selftests/rcutorture/configs/rcu/TREE10 | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index e4d74e5fc1d0..b54cefde6e87 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -4,6 +4,7 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n #CHECK#CONFIG_PROVE_RCU=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 22ad0261728d..ae395981b5e5 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -1,8 +1,9 @@ CONFIG_SMP=y CONFIG_NR_CPUS=8 -CONFIG_PREEMPT_NONE=y -CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=y CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index 2789b47e4ecd..d30922d8c883 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -3,6 +3,7 @@ CONFIG_NR_CPUS=16 CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 index 4a00539bfdd7..a323d8948b7c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -3,6 +3,7 @@ CONFIG_NR_CPUS=56 CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y From f877e3993b53e2dd1bdfadfc2bca68619d8a3f23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 Mar 2022 21:12:41 -0800 Subject: [PATCH 078/155] scftorture: Remove extraneous "scf" from per_version_boot_params There is an extraneous "scf" in the per_version_boot_params shell function used by scftorture. No harm done in that it is just passed as an argument to the /init program in initrd, but this commit nevertheless removes it. Signed-off-by: Paul E. McKenney --- .../testing/selftests/rcutorture/configs/scf/ver_functions.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh index d3d9e35d3d55..2d949e58f5a5 100644 --- a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh @@ -25,6 +25,5 @@ per_version_boot_params () { echo $1 `scftorture_param_onoff "$1" "$2"` \ scftorture.stat_interval=15 \ scftorture.shutdown_secs=$3 \ - scftorture.verbose=1 \ - scf + scftorture.verbose=1 } From c7756fff4fa11611f81f0f3b1cb13f63b5d0f87e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 Mar 2022 21:32:55 -0800 Subject: [PATCH 079/155] torture: Save "make allmodconfig" .config file Currently, torture.sh saves only the build output and exit code from the "make allmodconfig" test. This commit also saves the .config file. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index e84db823a50d..c5b3dedc6dc4 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -322,6 +322,7 @@ then echo " --- make clean" > "$amcdir/Make.out" 2>&1 make -j$MAKE_ALLOTED_CPUS clean >> "$amcdir/Make.out" 2>&1 echo " --- make allmodconfig" >> "$amcdir/Make.out" 2>&1 + cp .config $amcdir make -j$MAKE_ALLOTED_CPUS allmodconfig >> "$amcdir/Make.out" 2>&1 echo " --- make " >> "$amcdir/Make.out" 2>&1 make -j$MAKE_ALLOTED_CPUS >> "$amcdir/Make.out" 2>&1 From 31015625768e6d8bc808a892b221b69afaaa8d07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 27 Mar 2022 10:06:53 -0700 Subject: [PATCH 080/155] rcutorture: Make kvm.sh allow more memory for --kasan runs KASAN allots significant memory to track allocation state, and the amount of memory has increased recently, which results in frequent OOMs on a few of the rcutorture scenarios. This commit therefore provides 2G of memory for --kasan runs, up from the 512M default. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index af58b86a503a..263e16aeca0e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -44,6 +44,7 @@ TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 +torture_qemu_mem_default=1 TORTURE_REMOTE= TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu @@ -180,6 +181,10 @@ do ;; --kasan) TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + if test -n "$torture_qemu_mem_default" + then + TORTURE_QEMU_MEM=2G + fi ;; --kconfig|--kconfigs) checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$' @@ -202,6 +207,7 @@ do --memory) checkarg --memory "(memory size)" $# "$2" '^[0-9]\+[MG]\?$' error TORTURE_QEMU_MEM=$2 + torture_qemu_mem_default= shift ;; --no-initrd) From d69e048b27cceec20b637ae8ec72102c79ae673c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Mar 2022 13:16:18 -0700 Subject: [PATCH 081/155] rcutorture: Make torture.sh refscale and rcuscale specify Tasks Trace RCU Now that the Tasks RCU flavors are selected by their users rather than by the rcutorture scenarios, torture.sh fails when attempting to run NOPREEMPT scenarios for refscale and rcuscale. This commit therefore makes torture.sh specify CONFIG_TASKS_TRACE_RCU=y to avoid such failure. Why not also CONFIG_TASKS_RCU? Because tracing selects this one. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index c5b3dedc6dc4..f9c1437b9a19 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -364,7 +364,7 @@ fi for prim in $primlist do torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make done if test "$do_rcuscale" = yes @@ -376,7 +376,7 @@ fi for prim in $primlist do torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make done if test "$do_kvfree" = "yes" From fb036ad7db108649189d6577051a87e3d3741cf4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Mar 2022 13:30:15 -0700 Subject: [PATCH 082/155] rcutorture: Make torture.sh allow for --kasan The torture.sh script provides extra memory for scftorture and rcuscale. However, the total memory provided is only 1G, which is less than the 2G that is required for KASAN testing. This commit therefore ups the torture.sh script's 1G to 2G. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index f9c1437b9a19..3be9cfab93b5 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -352,7 +352,7 @@ fi if test "$do_scftorture" = "yes" then torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" - torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make + torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_refscale" = yes @@ -382,7 +382,7 @@ done if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" From 967cce191f50090d5cbd3841ee2bbb7835afeae2 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:15 +0100 Subject: [PATCH 083/155] tools/nolibc/std: move the standard type definitions to std.h The ordering of includes and definitions for now is a bit of a mess, as for example asm/signal.h is included after int definitions, but plenty of structures are defined later as they rely on other includes. Let's move the standard type definitions to a dedicated file that is included first. We also move NULL there. This way all other includes are aware of it, and we can bring asm/signal.h back to the top of the file. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 42 ++++-------------------------- tools/include/nolibc/std.h | 49 +++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 37 deletions(-) create mode 100644 tools/include/nolibc/std.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 4660637d9b17..186a78c25326 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -83,7 +83,12 @@ #ifndef _NOLIBC_H #define _NOLIBC_H +/* standard type definitions */ +#include "std.h" + +/* system includes */ #include +#include // for SIGCHLD #include #include #include @@ -106,40 +111,6 @@ static int errno; */ #define MAX_ERRNO 4095 -/* Declare a few quite common macros and types that usually are in stdlib.h, - * stdint.h, ctype.h, unistd.h and a few other common locations. - */ - -#define NULL ((void *)0) - -/* stdint types */ -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef unsigned short uint16_t; -typedef signed short int16_t; -typedef unsigned int uint32_t; -typedef signed int int32_t; -typedef unsigned long long uint64_t; -typedef signed long long int64_t; -typedef unsigned long size_t; -typedef signed long ssize_t; -typedef unsigned long uintptr_t; -typedef signed long intptr_t; -typedef signed long ptrdiff_t; - -/* for stat() */ -typedef unsigned int dev_t; -typedef unsigned long ino_t; -typedef unsigned int mode_t; -typedef signed int pid_t; -typedef unsigned int uid_t; -typedef unsigned int gid_t; -typedef unsigned long nlink_t; -typedef signed long off_t; -typedef signed long blksize_t; -typedef signed long blkcnt_t; -typedef signed long time_t; - /* for poll() */ struct pollfd { int fd; @@ -248,9 +219,6 @@ struct stat { #define WEXITSTATUS(status) (((status) & 0xff00) >> 8) #define WIFEXITED(status) (((status) & 0x7f) == 0) -/* for SIGCHLD */ -#include - /* Below comes the architecture-specific code. For each architecture, we have * the syscall declarations and the _start code definition. This is the only * global part. On all architectures the kernel puts everything in the stack diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h new file mode 100644 index 000000000000..1747ae125392 --- /dev/null +++ b/tools/include/nolibc/std.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Standard definitions and types for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + */ + +#ifndef _NOLIBC_STD_H +#define _NOLIBC_STD_H + +/* Declare a few quite common macros and types that usually are in stdlib.h, + * stdint.h, ctype.h, unistd.h and a few other common locations. Please place + * integer type definitions and generic macros here, but avoid OS-specific and + * syscall-specific stuff, as this file is expected to be included very early. + */ + +/* note: may already be defined */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +/* stdint types */ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +typedef unsigned long size_t; +typedef signed long ssize_t; +typedef unsigned long uintptr_t; +typedef signed long intptr_t; +typedef signed long ptrdiff_t; + +/* those are commonly provided by sys/types.h */ +typedef unsigned int dev_t; +typedef unsigned long ino_t; +typedef unsigned int mode_t; +typedef signed int pid_t; +typedef unsigned int uid_t; +typedef unsigned int gid_t; +typedef unsigned long nlink_t; +typedef signed long off_t; +typedef signed long blksize_t; +typedef signed long blkcnt_t; +typedef signed long time_t; + +#endif /* _NOLIBC_STD_H */ From cc7a492ad0a076dff5cb4281b1516676d7924fcf Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:16 +0100 Subject: [PATCH 084/155] tools/nolibc/types: split syscall-specific definitions into their own files The macros and type definitions used by a number of syscalls were moved to types.h where they will be easier to maintain. A few of them are arch-specific and must not be moved there (e.g. O_*, sys_stat_struct). A warning about them was placed at the top of the file. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 110 +--------------------------- tools/include/nolibc/types.h | 133 ++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 108 deletions(-) create mode 100644 tools/include/nolibc/types.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 186a78c25326..3719959e6f57 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -94,7 +94,9 @@ #include #include #include +#include "types.h" +/* Used by programs to avoid std includes */ #define NOLIBC /* this way it will be removed if unused */ @@ -111,114 +113,6 @@ static int errno; */ #define MAX_ERRNO 4095 -/* for poll() */ -struct pollfd { - int fd; - short int events; - short int revents; -}; - -/* for getdents64() */ -struct linux_dirent64 { - uint64_t d_ino; - int64_t d_off; - unsigned short d_reclen; - unsigned char d_type; - char d_name[]; -}; - -/* commonly an fd_set represents 256 FDs */ -#define FD_SETSIZE 256 -typedef struct { uint32_t fd32[FD_SETSIZE/32]; } fd_set; - -/* needed by wait4() */ -struct rusage { - struct timeval ru_utime; - struct timeval ru_stime; - long ru_maxrss; - long ru_ixrss; - long ru_idrss; - long ru_isrss; - long ru_minflt; - long ru_majflt; - long ru_nswap; - long ru_inblock; - long ru_oublock; - long ru_msgsnd; - long ru_msgrcv; - long ru_nsignals; - long ru_nvcsw; - long ru_nivcsw; -}; - -/* stat flags (WARNING, octal here) */ -#define S_IFDIR 0040000 -#define S_IFCHR 0020000 -#define S_IFBLK 0060000 -#define S_IFREG 0100000 -#define S_IFIFO 0010000 -#define S_IFLNK 0120000 -#define S_IFSOCK 0140000 -#define S_IFMT 0170000 - -#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) -#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) -#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) -#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) -#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) -#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) -#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) - -#define DT_UNKNOWN 0 -#define DT_FIFO 1 -#define DT_CHR 2 -#define DT_DIR 4 -#define DT_BLK 6 -#define DT_REG 8 -#define DT_LNK 10 -#define DT_SOCK 12 - -/* all the *at functions */ -#ifndef AT_FDCWD -#define AT_FDCWD -100 -#endif - -/* lseek */ -#define SEEK_SET 0 -#define SEEK_CUR 1 -#define SEEK_END 2 - -/* reboot */ -#define LINUX_REBOOT_MAGIC1 0xfee1dead -#define LINUX_REBOOT_MAGIC2 0x28121969 -#define LINUX_REBOOT_CMD_HALT 0xcdef0123 -#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc -#define LINUX_REBOOT_CMD_RESTART 0x01234567 -#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 - - -/* The format of the struct as returned by the libc to the application, which - * significantly differs from the format returned by the stat() syscall flavours. - */ -struct stat { - dev_t st_dev; /* ID of device containing file */ - ino_t st_ino; /* inode number */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device ID (if special file) */ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for file system I/O */ - blkcnt_t st_blocks; /* number of 512B blocks allocated */ - time_t st_atime; /* time of last access */ - time_t st_mtime; /* time of last modification */ - time_t st_ctime; /* time of last status change */ -}; - -#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) -#define WIFEXITED(status) (((status) & 0x7f) == 0) - /* Below comes the architecture-specific code. For each architecture, we have * the syscall declarations and the _start code definition. This is the only * global part. On all architectures the kernel puts everything in the stack diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h new file mode 100644 index 000000000000..2f09abaf95f1 --- /dev/null +++ b/tools/include/nolibc/types.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Special types used by various syscalls for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + */ + +#ifndef _NOLIBC_TYPES_H +#define _NOLIBC_TYPES_H + +#include "std.h" +#include + + +/* Only the generic macros and types may be defined here. The arch-specific + * ones such as the O_RDONLY and related macros used by fcntl() and open(), or + * the layout of sys_stat_struct must not be defined here. + */ + +/* stat flags (WARNING, octal here) */ +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFBLK 0060000 +#define S_IFREG 0100000 +#define S_IFIFO 0010000 +#define S_IFLNK 0120000 +#define S_IFSOCK 0140000 +#define S_IFMT 0170000 + +#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) +#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) +#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) +#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) +#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) +#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) +#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) + +/* dirent types */ +#define DT_UNKNOWN 0x0 +#define DT_FIFO 0x1 +#define DT_CHR 0x2 +#define DT_DIR 0x4 +#define DT_BLK 0x6 +#define DT_REG 0x8 +#define DT_LNK 0xa +#define DT_SOCK 0xc + +/* commonly an fd_set represents 256 FDs */ +#define FD_SETSIZE 256 + +/* Special FD used by all the *at functions */ +#ifndef AT_FDCWD +#define AT_FDCWD (-100) +#endif + +/* whence values for lseek() */ +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 + +/* cmd for reboot() */ +#define LINUX_REBOOT_MAGIC1 0xfee1dead +#define LINUX_REBOOT_MAGIC2 0x28121969 +#define LINUX_REBOOT_CMD_HALT 0xcdef0123 +#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc +#define LINUX_REBOOT_CMD_RESTART 0x01234567 +#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 + +/* Macros used on waitpid()'s return status */ +#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) +#define WIFEXITED(status) (((status) & 0x7f) == 0) + + +/* for select() */ +typedef struct { + uint32_t fd32[FD_SETSIZE / 32]; +} fd_set; + +/* for poll() */ +struct pollfd { + int fd; + short int events; + short int revents; +}; + +/* for getdents64() */ +struct linux_dirent64 { + uint64_t d_ino; + int64_t d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[]; +}; + +/* needed by wait4() */ +struct rusage { + struct timeval ru_utime; + struct timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +/* The format of the struct as returned by the libc to the application, which + * significantly differs from the format returned by the stat() syscall flavours. + */ +struct stat { + dev_t st_dev; /* ID of device containing file */ + ino_t st_ino; /* inode number */ + mode_t st_mode; /* protection */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + dev_t st_rdev; /* device ID (if special file) */ + off_t st_size; /* total size, in bytes */ + blksize_t st_blksize; /* blocksize for file system I/O */ + blkcnt_t st_blocks; /* number of 512B blocks allocated */ + time_t st_atime; /* time of last access */ + time_t st_mtime; /* time of last modification */ + time_t st_ctime; /* time of last status change */ +}; + +#endif /* _NOLIBC_TYPES_H */ From 271661c1cde5ff47eb7af9946866cd66b70dc328 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:17 +0100 Subject: [PATCH 085/155] tools/nolibc/arch: split arch-specific code into individual files In order to ease maintenance, this splits the arch-specific code into one file per architecture. A common file "arch.h" is used to include the right file among arch-* based on the detected architecture. Projects which are already split per architecture could simply rename these files to $arch/arch.h and get rid of the common arch.h. For this reason, include guards were placed into each arch-specific file. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/arch-aarch64.h | 199 +++++ tools/include/nolibc/arch-arm.h | 204 +++++ tools/include/nolibc/arch-i386.h | 196 +++++ tools/include/nolibc/arch-mips.h | 215 +++++ tools/include/nolibc/arch-riscv.h | 204 +++++ tools/include/nolibc/arch-x86_64.h | 215 +++++ tools/include/nolibc/arch.h | 32 + tools/include/nolibc/nolibc.h | 1187 +-------------------------- 8 files changed, 1266 insertions(+), 1186 deletions(-) create mode 100644 tools/include/nolibc/arch-aarch64.h create mode 100644 tools/include/nolibc/arch-arm.h create mode 100644 tools/include/nolibc/arch-i386.h create mode 100644 tools/include/nolibc/arch-mips.h create mode 100644 tools/include/nolibc/arch-riscv.h create mode 100644 tools/include/nolibc/arch-x86_64.h create mode 100644 tools/include/nolibc/arch.h diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h new file mode 100644 index 000000000000..443de5fb7f54 --- /dev/null +++ b/tools/include/nolibc/arch-aarch64.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * AARCH64 specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_ARCH_AARCH64_H +#define _NOLIBC_ARCH_AARCH64_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the newfstatat() syscall. Differs slightly from the + * x86_64's stat one by field ordering, so be careful. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + + unsigned long st_rdev; + unsigned long __pad1; + long st_size; + int st_blksize; + int __pad2; + + long st_blocks; + long st_atime; + unsigned long st_atime_nsec; + long st_mtime; + + unsigned long st_mtime_nsec; + long st_ctime; + unsigned long st_ctime_nsec; + unsigned int __unused[2]; +}; + +/* Syscalls for AARCH64 : + * - registers are 64-bit + * - stack is 16-byte aligned + * - syscall number is passed in x8 + * - arguments are in x0, x1, x2, x3, x4, x5 + * - the system call is performed by calling svc 0 + * - syscall return comes in x0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * + * On aarch64, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + register long _arg6 asm("x5") = (long)(arg6); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "ldr x0, [sp]\n" // argc (x0) was in the stack + "add x1, sp, 8\n" // argv (x1) = sp + "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... + "add x2, x2, 8\n" // + 8 (skip null) + "add x2, x2, x1\n" // + argv + "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee + "bl main\n" // main() returns the status code, we'll exit with it. + "mov x8, 93\n" // NR_exit == 93 + "svc #0\n" + ""); + +#endif // _NOLIBC_ARCH_AARCH64_H diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h new file mode 100644 index 000000000000..66f687ad987f --- /dev/null +++ b/tools/include/nolibc/arch-arm.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * ARM specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_ARCH_ARM_H +#define _NOLIBC_ARCH_ARM_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). In big endian, the format + * differs as devices are returned as short only. + */ +struct sys_stat_struct { +#if defined(__ARMEB__) + unsigned short st_dev; + unsigned short __pad1; +#else + unsigned long st_dev; +#endif + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + +#if defined(__ARMEB__) + unsigned short st_rdev; + unsigned short __pad2; +#else + unsigned long st_rdev; +#endif + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +/* Syscalls for ARM in ARM or Thumb modes : + * - registers are 32-bit + * - stack is 8-byte aligned + * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) + * - syscall number is passed in r7 + * - arguments are in r0, r1, r2, r3, r4, r5 + * - the system call is performed by calling svc #0 + * - syscall return comes in r0. + * - only lr is clobbered. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, ARM supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + register long _arg5 asm("r4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" +#if defined(__THUMBEB__) || defined(__THUMBEL__) + /* We enter here in 32-bit mode but if some previous functions were in + * 16-bit mode, the assembler cannot know, so we need to tell it we're in + * 32-bit now, then switch to 16-bit (is there a better way to do it than + * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that + * it generates correct instructions. Note that we do not support thumb1. + */ + ".code 32\n" + "add r0, pc, #1\n" + "bx r0\n" + ".code 16\n" +#endif + "pop {%r0}\n" // argc was in the stack + "mov %r1, %sp\n" // argv = sp + "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... + "add %r2, %r2, $4\n" // ... + 4 + "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the + "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) + "bl main\n" // main() returns the status code, we'll exit with it. + "movs r7, $1\n" // NR_exit == 1 + "svc $0x00\n" + ""); + +#endif // _NOLIBC_ARCH_ARM_H diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h new file mode 100644 index 000000000000..32f42e2cee26 --- /dev/null +++ b/tools/include/nolibc/arch-i386.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * i386 specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_ARCH_I386_H +#define _NOLIBC_ARCH_I386_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +/* Syscalls for i386 : + * - mostly similar to x86_64 + * - registers are 32-bit + * - syscall number is passed in eax + * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively + * - all registers are preserved (except eax of course) + * - the system call is performed by calling int $0x80 + * - syscall return comes in eax + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, i386 supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + register long _arg5 asm("edi") = (long)(arg5); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +/* + * i386 System V ABI mandates: + * 1) last pushed argument must be 16-byte aligned. + * 2) The deepest stack frame should be set to zero + * + */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %eax\n" // argc (first arg, %eax) + "mov %esp, %ebx\n" // argv[] (second arg, %ebx) + "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) + "xor %ebp, %ebp\n" // zero the stack frame + "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned before + "sub $4, %esp\n" // the call instruction (args are aligned) + "push %ecx\n" // push all registers on the stack so that we + "push %ebx\n" // support both regparm and plain stack modes + "push %eax\n" + "call main\n" // main() returns the status code in %eax + "mov %eax, %ebx\n" // retrieve exit code (32-bit int) + "movl $1, %eax\n" // NR_exit == 1 + "int $0x80\n" // exit now + "hlt\n" // ensure it does not + ""); + +#endif // _NOLIBC_ARCH_I386_H diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h new file mode 100644 index 000000000000..e330201dde6a --- /dev/null +++ b/tools/include/nolibc/arch-mips.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * MIPS specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_ARCH_MIPS_H +#define _NOLIBC_ARCH_MIPS_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_APPEND 0x0008 +#define O_NONBLOCK 0x0080 +#define O_CREAT 0x0100 +#define O_TRUNC 0x0200 +#define O_EXCL 0x0400 +#define O_NOCTTY 0x0800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall. 88 bytes are returned by the + * syscall. + */ +struct sys_stat_struct { + unsigned int st_dev; + long st_pad1[3]; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + unsigned int st_rdev; + long st_pad2[2]; + long st_size; + long st_pad3; + + long st_atime; + long st_atime_nsec; + long st_mtime; + long st_mtime_nsec; + + long st_ctime; + long st_ctime_nsec; + long st_blksize; + long st_blocks; + long st_pad4[14]; +}; + +/* Syscalls for MIPS ABI O32 : + * - WARNING! there's always a delayed slot! + * - WARNING again, the syntax is different, registers take a '$' and numbers + * do not. + * - registers are 32-bit + * - stack is 8-byte aligned + * - syscall number is passed in v0 (starts at 0xfa0). + * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to + * leave some room in the stack for the callee to save a0..a3 if needed. + * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are + * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as + * scall32-o32.S in the kernel sources. + * - the system call is performed by calling "syscall" + * - syscall return comes in v0, and register a3 needs to be checked to know + * if an error occurred, in which case errno is in v0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + */ + +#define my_syscall0(num) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "r"(_num) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + register long _arg5 = (long)(arg5); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "sw %7, 16($sp)\n" \ + "syscall\n " \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +/* startup code, note that it's called __start on MIPS */ +asm(".section .text\n" + ".set nomips16\n" + ".global __start\n" + ".set noreorder\n" + ".option pic0\n" + ".ent __start\n" + "__start:\n" + "lw $a0,($sp)\n" // argc was in the stack + "addiu $a1, $sp, 4\n" // argv = sp + 4 + "sll $a2, $a0, 2\n" // a2 = argc * 4 + "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... + "addiu $a2, $a2, 4\n" // ... + 4 + "li $t0, -8\n" + "and $sp, $sp, $t0\n" // sp must be 8-byte aligned + "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! + "jal main\n" // main() returns the status code, we'll exit with it. + "nop\n" // delayed slot + "move $a0, $v0\n" // retrieve 32-bit exit code from v0 + "li $v0, 4001\n" // NR_exit == 4001 + "syscall\n" + ".end __start\n" + ""); + +#endif // _NOLIBC_ARCH_MIPS_H diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h new file mode 100644 index 000000000000..9d5ff78f606b --- /dev/null +++ b/tools/include/nolibc/arch-riscv.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * RISCV (32 and 64) specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_ARCH_RISCV_H +#define _NOLIBC_ARCH_RISCV_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x100 +#define O_EXCL 0x200 +#define O_NOCTTY 0x400 +#define O_TRUNC 0x1000 +#define O_APPEND 0x2000 +#define O_NONBLOCK 0x4000 +#define O_DIRECTORY 0x200000 + +struct sys_stat_struct { + unsigned long st_dev; /* Device. */ + unsigned long st_ino; /* File serial number. */ + unsigned int st_mode; /* File mode. */ + unsigned int st_nlink; /* Link count. */ + unsigned int st_uid; /* User ID of the file's owner. */ + unsigned int st_gid; /* Group ID of the file's group. */ + unsigned long st_rdev; /* Device number, if device. */ + unsigned long __pad1; + long st_size; /* Size of file, in bytes. */ + int st_blksize; /* Optimal block size for I/O. */ + int __pad2; + long st_blocks; /* Number 512-byte blocks allocated. */ + long st_atime; /* Time of last access. */ + unsigned long st_atime_nsec; + long st_mtime; /* Time of last modification. */ + unsigned long st_mtime_nsec; + long st_ctime; /* Time of last status change. */ + unsigned long st_ctime_nsec; + unsigned int __unused4; + unsigned int __unused5; +}; + +#if __riscv_xlen == 64 +#define PTRLOG "3" +#define SZREG "8" +#elif __riscv_xlen == 32 +#define PTRLOG "2" +#define SZREG "4" +#endif + +/* Syscalls for RISCV : + * - stack is 16-byte aligned + * - syscall number is passed in a7 + * - arguments are in a0, a1, a2, a3, a4, a5 + * - the system call is performed by calling ecall + * - syscall return comes in a0 + * - the arguments are cast to long and assigned into the target + * registers which are then simply passed as registers to the asm code, + * so that we don't have to experience issues with register constraints. + * + * On riscv, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num asm("a7") = (num); \ + register long _arg1 asm("a0"); \ + \ + asm volatile ( \ + "ecall\n\t" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("a7") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + \ + asm volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("a7") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + \ + asm volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("a7") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + \ + asm volatile ( \ + "ecall\n\t" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("a7") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + \ + asm volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("a7") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + register long _arg5 asm("a4") = (long)(arg5); \ + \ + asm volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num asm("a7") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + register long _arg5 asm("a4") = (long)(arg5); \ + register long _arg6 asm("a5") = (long)(arg6); \ + \ + asm volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), "r"(_arg6), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + ".option push\n" + ".option norelax\n" + "lla gp, __global_pointer$\n" + ".option pop\n" + "ld a0, 0(sp)\n" // argc (a0) was in the stack + "add a1, sp, "SZREG"\n" // argv (a1) = sp + "slli a2, a0, "PTRLOG"\n" // envp (a2) = SZREG*argc ... + "add a2, a2, "SZREG"\n" // + SZREG (skip null) + "add a2,a2,a1\n" // + argv + "andi sp,a1,-16\n" // sp must be 16-byte aligned + "call main\n" // main() returns the status code, we'll exit with it. + "li a7, 93\n" // NR_exit == 93 + "ecall\n" + ""); + +#endif // _NOLIBC_ARCH_RISCV_H diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h new file mode 100644 index 000000000000..83c4b458ada7 --- /dev/null +++ b/tools/include/nolibc/arch-x86_64.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * x86_64 specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_ARCH_X86_64_H +#define _NOLIBC_ARCH_X86_64_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, equivalent to stat64(). The + * syscall returns 116 bytes and stops in the middle of __unused. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned int st_mode; + unsigned int st_uid; + + unsigned int st_gid; + unsigned int __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + + long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + long __unused[3]; +}; + +/* Syscalls for x86_64 : + * - registers are 64-bit + * - syscall number is passed in rax + * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively + * - the system call is performed by calling the syscall instruction + * - syscall return comes in rax + * - rcx and r11 are clobbered, others are preserved. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1 + * Calling Conventions. + * + * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI + * + */ + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + register long _arg6 asm("r9") = (long)(arg6); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +/* + * x86-64 System V ABI mandates: + * 1) %rsp must be 16-byte aligned right before the function call. + * 2) The deepest stack frame should be zero (the %rbp). + * + */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %rdi\n" // argc (first arg, %rdi) + "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) + "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) + "xor %ebp, %ebp\n" // zero the stack frame + "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call + "call main\n" // main() returns the status code, we'll exit with it. + "mov %eax, %edi\n" // retrieve exit code (32 bit) + "mov $60, %eax\n" // NR_exit == 60 + "syscall\n" // really exit + "hlt\n" // ensure it does not return + ""); + +#endif // _NOLIBC_ARCH_X86_64_H diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h new file mode 100644 index 000000000000..4c6992321b0d --- /dev/null +++ b/tools/include/nolibc/arch.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Copyright (C) 2017-2022 Willy Tarreau + */ + +/* Below comes the architecture-specific code. For each architecture, we have + * the syscall declarations and the _start code definition. This is the only + * global part. On all architectures the kernel puts everything in the stack + * before jumping to _start just above us, without any return address (_start + * is not a function but an entry pint). So at the stack pointer we find argc. + * Then argv[] begins, and ends at the first NULL. Then we have envp which + * starts and ends with a NULL as well. So envp=argv+argc+1. + */ + +#ifndef _NOLIBC_ARCH_H +#define _NOLIBC_ARCH_H + +#if defined(__x86_64__) +#include "arch-x86_64.h" +#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) +#include "arch-i386.h" +#elif defined(__ARM_EABI__) +#include "arch-arm.h" +#elif defined(__aarch64__) +#include "arch-aarch64.h" +#elif defined(__mips__) && defined(_ABIO32) +#include "arch-mips.h" +#elif defined(__riscv) +#include "arch-riscv.h" +#endif + +#endif /* _NOLIBC_ARCH_H */ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 3719959e6f57..ccad3998d824 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -94,6 +94,7 @@ #include #include #include +#include "arch.h" #include "types.h" /* Used by programs to avoid std includes */ @@ -113,1192 +114,6 @@ static int errno; */ #define MAX_ERRNO 4095 -/* Below comes the architecture-specific code. For each architecture, we have - * the syscall declarations and the _start code definition. This is the only - * global part. On all architectures the kernel puts everything in the stack - * before jumping to _start just above us, without any return address (_start - * is not a function but an entry pint). So at the stack pointer we find argc. - * Then argv[] begins, and ends at the first NULL. Then we have envp which - * starts and ends with a NULL as well. So envp=argv+argc+1. - */ - -#if defined(__x86_64__) -/* Syscalls for x86_64 : - * - registers are 64-bit - * - syscall number is passed in rax - * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively - * - the system call is performed by calling the syscall instruction - * - syscall return comes in rax - * - rcx and r11 are clobbered, others are preserved. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1 - * Calling Conventions. - * - * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI - * - */ - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - register long _arg6 asm("r9") = (long)(arg6); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -/* - * x86-64 System V ABI mandates: - * 1) %rsp must be 16-byte aligned right before the function call. - * 2) The deepest stack frame should be zero (the %rbp). - * - */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %rdi\n" // argc (first arg, %rdi) - "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) - "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) - "xor %ebp, %ebp\n" // zero the stack frame - "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call - "call main\n" // main() returns the status code, we'll exit with it. - "mov %eax, %edi\n" // retrieve exit code (32 bit) - "mov $60, %eax\n" // NR_exit == 60 - "syscall\n" // really exit - "hlt\n" // ensure it does not return - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, equivalent to stat64(). The - * syscall returns 116 bytes and stops in the middle of __unused. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned long st_nlink; - unsigned int st_mode; - unsigned int st_uid; - - unsigned int st_gid; - unsigned int __pad0; - unsigned long st_rdev; - long st_size; - long st_blksize; - - long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - long __unused[3]; -}; - -#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) -/* Syscalls for i386 : - * - mostly similar to x86_64 - * - registers are 32-bit - * - syscall number is passed in eax - * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively - * - all registers are preserved (except eax of course) - * - the system call is performed by calling int $0x80 - * - syscall return comes in eax - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, i386 supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - register long _arg5 asm("edi") = (long)(arg5); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -/* - * i386 System V ABI mandates: - * 1) last pushed argument must be 16-byte aligned. - * 2) The deepest stack frame should be set to zero - * - */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %eax\n" // argc (first arg, %eax) - "mov %esp, %ebx\n" // argv[] (second arg, %ebx) - "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) - "xor %ebp, %ebp\n" // zero the stack frame - "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned before - "sub $4, %esp\n" // the call instruction (args are aligned) - "push %ecx\n" // push all registers on the stack so that we - "push %ebx\n" // support both regparm and plain stack modes - "push %eax\n" - "call main\n" // main() returns the status code in %eax - "mov %eax, %ebx\n" // retrieve exit code (32-bit int) - "movl $1, %eax\n" // NR_exit == 1 - "int $0x80\n" // exit now - "hlt\n" // ensure it does not - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - - unsigned long st_rdev; - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__ARM_EABI__) -/* Syscalls for ARM in ARM or Thumb modes : - * - registers are 32-bit - * - stack is 8-byte aligned - * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) - * - syscall number is passed in r7 - * - arguments are in r0, r1, r2, r3, r4, r5 - * - the system call is performed by calling svc #0 - * - syscall return comes in r0. - * - only lr is clobbered. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, ARM supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - register long _arg5 asm("r4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" -#if defined(__THUMBEB__) || defined(__THUMBEL__) - /* We enter here in 32-bit mode but if some previous functions were in - * 16-bit mode, the assembler cannot know, so we need to tell it we're in - * 32-bit now, then switch to 16-bit (is there a better way to do it than - * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that - * it generates correct instructions. Note that we do not support thumb1. - */ - ".code 32\n" - "add r0, pc, #1\n" - "bx r0\n" - ".code 16\n" -#endif - "pop {%r0}\n" // argc was in the stack - "mov %r1, %sp\n" // argv = sp - "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... - "add %r2, %r2, $4\n" // ... + 4 - "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the - "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) - "bl main\n" // main() returns the status code, we'll exit with it. - "movs r7, $1\n" // NR_exit == 1 - "svc $0x00\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). In big endian, the format - * differs as devices are returned as short only. - */ -struct sys_stat_struct { -#if defined(__ARMEB__) - unsigned short st_dev; - unsigned short __pad1; -#else - unsigned long st_dev; -#endif - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; -#if defined(__ARMEB__) - unsigned short st_rdev; - unsigned short __pad2; -#else - unsigned long st_rdev; -#endif - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__aarch64__) -/* Syscalls for AARCH64 : - * - registers are 64-bit - * - stack is 16-byte aligned - * - syscall number is passed in x8 - * - arguments are in x0, x1, x2, x3, x4, x5 - * - the system call is performed by calling svc 0 - * - syscall return comes in x0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - * On aarch64, select() is not implemented so we have to use pselect6(). - */ -#define __ARCH_WANT_SYS_PSELECT6 - -#define my_syscall0(num) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - register long _arg6 asm("x5") = (long)(arg6); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "ldr x0, [sp]\n" // argc (x0) was in the stack - "add x1, sp, 8\n" // argv (x1) = sp - "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... - "add x2, x2, 8\n" // + 8 (skip null) - "add x2, x2, x1\n" // + argv - "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee - "bl main\n" // main() returns the status code, we'll exit with it. - "mov x8, 93\n" // NR_exit == 93 - "svc #0\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the newfstatat() syscall. Differs slightly from the - * x86_64's stat one by field ordering, so be careful. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - - unsigned long st_rdev; - unsigned long __pad1; - long st_size; - int st_blksize; - int __pad2; - - long st_blocks; - long st_atime; - unsigned long st_atime_nsec; - long st_mtime; - - unsigned long st_mtime_nsec; - long st_ctime; - unsigned long st_ctime_nsec; - unsigned int __unused[2]; -}; - -#elif defined(__mips__) && defined(_ABIO32) -/* Syscalls for MIPS ABI O32 : - * - WARNING! there's always a delayed slot! - * - WARNING again, the syntax is different, registers take a '$' and numbers - * do not. - * - registers are 32-bit - * - stack is 8-byte aligned - * - syscall number is passed in v0 (starts at 0xfa0). - * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to - * leave some room in the stack for the callee to save a0..a3 if needed. - * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are - * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as - * scall32-o32.S in the kernel sources. - * - the system call is performed by calling "syscall" - * - syscall return comes in v0, and register a3 needs to be checked to know - * if an error occurred, in which case errno is in v0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - */ - -#define my_syscall0(num) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "r"(_num) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 = (long)(arg5); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "sw %7, 16($sp)\n" \ - "syscall\n " \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -/* startup code, note that it's called __start on MIPS */ -asm(".section .text\n" - ".set nomips16\n" - ".global __start\n" - ".set noreorder\n" - ".option pic0\n" - ".ent __start\n" - "__start:\n" - "lw $a0,($sp)\n" // argc was in the stack - "addiu $a1, $sp, 4\n" // argv = sp + 4 - "sll $a2, $a0, 2\n" // a2 = argc * 4 - "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... - "addiu $a2, $a2, 4\n" // ... + 4 - "li $t0, -8\n" - "and $sp, $sp, $t0\n" // sp must be 8-byte aligned - "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! - "jal main\n" // main() returns the status code, we'll exit with it. - "nop\n" // delayed slot - "move $a0, $v0\n" // retrieve 32-bit exit code from v0 - "li $v0, 4001\n" // NR_exit == 4001 - "syscall\n" - ".end __start\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_APPEND 0x0008 -#define O_NONBLOCK 0x0080 -#define O_CREAT 0x0100 -#define O_TRUNC 0x0200 -#define O_EXCL 0x0400 -#define O_NOCTTY 0x0800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall. 88 bytes are returned by the - * syscall. - */ -struct sys_stat_struct { - unsigned int st_dev; - long st_pad1[3]; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - unsigned int st_rdev; - long st_pad2[2]; - long st_size; - long st_pad3; - long st_atime; - long st_atime_nsec; - long st_mtime; - long st_mtime_nsec; - long st_ctime; - long st_ctime_nsec; - long st_blksize; - long st_blocks; - long st_pad4[14]; -}; - -#elif defined(__riscv) - -#if __riscv_xlen == 64 -#define PTRLOG "3" -#define SZREG "8" -#elif __riscv_xlen == 32 -#define PTRLOG "2" -#define SZREG "4" -#endif - -/* Syscalls for RISCV : - * - stack is 16-byte aligned - * - syscall number is passed in a7 - * - arguments are in a0, a1, a2, a3, a4, a5 - * - the system call is performed by calling ecall - * - syscall return comes in a0 - * - the arguments are cast to long and assigned into the target - * registers which are then simply passed as registers to the asm code, - * so that we don't have to experience issues with register constraints. - * - * On riscv, select() is not implemented so we have to use pselect6(). - */ -#define __ARCH_WANT_SYS_PSELECT6 - -#define my_syscall0(num) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0"); \ - \ - asm volatile ( \ - "ecall\n\t" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - \ - asm volatile ( \ - "ecall\n\t" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 asm("a4") = (long)(arg5); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 asm("a4") = (long)(arg5); \ - register long _arg6 asm("a5") = (long)(arg6); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), "r"(_arg6), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - ".option push\n" - ".option norelax\n" - "lla gp, __global_pointer$\n" - ".option pop\n" - "ld a0, 0(sp)\n" // argc (a0) was in the stack - "add a1, sp, "SZREG"\n" // argv (a1) = sp - "slli a2, a0, "PTRLOG"\n" // envp (a2) = SZREG*argc ... - "add a2, a2, "SZREG"\n" // + SZREG (skip null) - "add a2,a2,a1\n" // + argv - "andi sp,a1,-16\n" // sp must be 16-byte aligned - "call main\n" // main() returns the status code, we'll exit with it. - "li a7, 93\n" // NR_exit == 93 - "ecall\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x100 -#define O_EXCL 0x200 -#define O_NOCTTY 0x400 -#define O_TRUNC 0x1000 -#define O_APPEND 0x2000 -#define O_NONBLOCK 0x4000 -#define O_DIRECTORY 0x200000 - -struct sys_stat_struct { - unsigned long st_dev; /* Device. */ - unsigned long st_ino; /* File serial number. */ - unsigned int st_mode; /* File mode. */ - unsigned int st_nlink; /* Link count. */ - unsigned int st_uid; /* User ID of the file's owner. */ - unsigned int st_gid; /* Group ID of the file's group. */ - unsigned long st_rdev; /* Device number, if device. */ - unsigned long __pad1; - long st_size; /* Size of file, in bytes. */ - int st_blksize; /* Optimal block size for I/O. */ - int __pad2; - long st_blocks; /* Number 512-byte blocks allocated. */ - long st_atime; /* Time of last access. */ - unsigned long st_atime_nsec; - long st_mtime; /* Time of last modification. */ - unsigned long st_mtime_nsec; - long st_ctime; /* Time of last status change. */ - unsigned long st_ctime_nsec; - unsigned int __unused4; - unsigned int __unused5; -}; - -#endif - /* Below are the C functions used to declare the raw syscalls. They try to be * architecture-agnostic, and return either a success or -errno. Declaring them From bd8c8fbb866fe524b769a853f2b3525c227165fa Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:18 +0100 Subject: [PATCH 086/155] tools/nolibc/sys: split the syscall definitions into their own file The syscall definitions were moved to sys.h. They were arranged in a more easily maintainable order, whereby the sys_xxx() and xxx() functions were grouped together, which also enlights the occasional mappings such as wait relying on wait4(). Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 965 +------------------------- tools/include/nolibc/sys.h | 1189 +++++++++++++++++++++++++++++++++ 2 files changed, 1192 insertions(+), 962 deletions(-) create mode 100644 tools/include/nolibc/sys.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index ccad3998d824..2af56ec760e2 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -83,869 +83,18 @@ #ifndef _NOLIBC_H #define _NOLIBC_H -/* standard type definitions */ #include "std.h" - -/* system includes */ -#include -#include // for SIGCHLD -#include -#include -#include -#include -#include #include "arch.h" #include "types.h" +#include "sys.h" /* Used by programs to avoid std includes */ #define NOLIBC -/* this way it will be removed if unused */ -static int errno; - -#ifndef NOLIBC_IGNORE_ERRNO -#define SET_ERRNO(v) do { errno = (v); } while (0) -#else -#define SET_ERRNO(v) do { } while (0) -#endif - -/* errno codes all ensure that they will not conflict with a valid pointer - * because they all correspond to the highest addressable memory page. - */ -#define MAX_ERRNO 4095 - - -/* Below are the C functions used to declare the raw syscalls. They try to be - * architecture-agnostic, and return either a success or -errno. Declaring them - * static will lead to them being inlined in most cases, but it's still possible - * to reference them by a pointer if needed. - */ static __attribute__((unused)) -void *sys_brk(void *addr) +int tcsetpgrp(int fd, pid_t pid) { - return (void *)my_syscall1(__NR_brk, addr); -} - -static __attribute__((noreturn,unused)) -void sys_exit(int status) -{ - my_syscall1(__NR_exit, status & 255); - while(1); // shut the "noreturn" warnings. -} - -static __attribute__((unused)) -int sys_chdir(const char *path) -{ - return my_syscall1(__NR_chdir, path); -} - -static __attribute__((unused)) -int sys_chmod(const char *path, mode_t mode) -{ -#ifdef __NR_fchmodat - return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); -#elif defined(__NR_chmod) - return my_syscall2(__NR_chmod, path, mode); -#else -#error Neither __NR_fchmodat nor __NR_chmod defined, cannot implement sys_chmod() -#endif -} - -static __attribute__((unused)) -int sys_chown(const char *path, uid_t owner, gid_t group) -{ -#ifdef __NR_fchownat - return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); -#elif defined(__NR_chown) - return my_syscall3(__NR_chown, path, owner, group); -#else -#error Neither __NR_fchownat nor __NR_chown defined, cannot implement sys_chown() -#endif -} - -static __attribute__((unused)) -int sys_chroot(const char *path) -{ - return my_syscall1(__NR_chroot, path); -} - -static __attribute__((unused)) -int sys_close(int fd) -{ - return my_syscall1(__NR_close, fd); -} - -static __attribute__((unused)) -int sys_dup(int fd) -{ - return my_syscall1(__NR_dup, fd); -} - -#ifdef __NR_dup3 -static __attribute__((unused)) -int sys_dup3(int old, int new, int flags) -{ - return my_syscall3(__NR_dup3, old, new, flags); -} -#endif - -static __attribute__((unused)) -int sys_dup2(int old, int new) -{ -#ifdef __NR_dup3 - return my_syscall3(__NR_dup3, old, new, 0); -#elif defined(__NR_dup2) - return my_syscall2(__NR_dup2, old, new); -#else -#error Neither __NR_dup3 nor __NR_dup2 defined, cannot implement sys_dup2() -#endif -} - -static __attribute__((unused)) -int sys_execve(const char *filename, char *const argv[], char *const envp[]) -{ - return my_syscall3(__NR_execve, filename, argv, envp); -} - -static __attribute__((unused)) -pid_t sys_fork(void) -{ -#ifdef __NR_clone - /* note: some archs only have clone() and not fork(). Different archs - * have a different API, but most archs have the flags on first arg and - * will not use the rest with no other flag. - */ - return my_syscall5(__NR_clone, SIGCHLD, 0, 0, 0, 0); -#elif defined(__NR_fork) - return my_syscall0(__NR_fork); -#else -#error Neither __NR_clone nor __NR_fork defined, cannot implement sys_fork() -#endif -} - -static __attribute__((unused)) -int sys_fsync(int fd) -{ - return my_syscall1(__NR_fsync, fd); -} - -static __attribute__((unused)) -int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - return my_syscall3(__NR_getdents64, fd, dirp, count); -} - -static __attribute__((unused)) -pid_t sys_getpgid(pid_t pid) -{ - return my_syscall1(__NR_getpgid, pid); -} - -static __attribute__((unused)) -pid_t sys_getpgrp(void) -{ - return sys_getpgid(0); -} - -static __attribute__((unused)) -pid_t sys_getpid(void) -{ - return my_syscall0(__NR_getpid); -} - -static __attribute__((unused)) -pid_t sys_gettid(void) -{ - return my_syscall0(__NR_gettid); -} - -static __attribute__((unused)) -int sys_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - return my_syscall2(__NR_gettimeofday, tv, tz); -} - -static __attribute__((unused)) -int sys_ioctl(int fd, unsigned long req, void *value) -{ - return my_syscall3(__NR_ioctl, fd, req, value); -} - -static __attribute__((unused)) -int sys_kill(pid_t pid, int signal) -{ - return my_syscall2(__NR_kill, pid, signal); -} - -static __attribute__((unused)) -int sys_link(const char *old, const char *new) -{ -#ifdef __NR_linkat - return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); -#elif defined(__NR_link) - return my_syscall2(__NR_link, old, new); -#else -#error Neither __NR_linkat nor __NR_link defined, cannot implement sys_link() -#endif -} - -static __attribute__((unused)) -off_t sys_lseek(int fd, off_t offset, int whence) -{ - return my_syscall3(__NR_lseek, fd, offset, whence); -} - -static __attribute__((unused)) -int sys_mkdir(const char *path, mode_t mode) -{ -#ifdef __NR_mkdirat - return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); -#elif defined(__NR_mkdir) - return my_syscall2(__NR_mkdir, path, mode); -#else -#error Neither __NR_mkdirat nor __NR_mkdir defined, cannot implement sys_mkdir() -#endif -} - -static __attribute__((unused)) -long sys_mknod(const char *path, mode_t mode, dev_t dev) -{ -#ifdef __NR_mknodat - return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); -#elif defined(__NR_mknod) - return my_syscall3(__NR_mknod, path, mode, dev); -#else -#error Neither __NR_mknodat nor __NR_mknod defined, cannot implement sys_mknod() -#endif -} - -static __attribute__((unused)) -int sys_mount(const char *src, const char *tgt, const char *fst, - unsigned long flags, const void *data) -{ - return my_syscall5(__NR_mount, src, tgt, fst, flags, data); -} - -static __attribute__((unused)) -int sys_open(const char *path, int flags, mode_t mode) -{ -#ifdef __NR_openat - return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); -#elif defined(__NR_open) - return my_syscall3(__NR_open, path, flags, mode); -#else -#error Neither __NR_openat nor __NR_open defined, cannot implement sys_open() -#endif -} - -static __attribute__((unused)) -int sys_pivot_root(const char *new, const char *old) -{ - return my_syscall2(__NR_pivot_root, new, old); -} - -static __attribute__((unused)) -int sys_poll(struct pollfd *fds, int nfds, int timeout) -{ -#if defined(__NR_ppoll) - struct timespec t; - - if (timeout >= 0) { - t.tv_sec = timeout / 1000; - t.tv_nsec = (timeout % 1000) * 1000000; - } - return my_syscall4(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL); -#elif defined(__NR_poll) - return my_syscall3(__NR_poll, fds, nfds, timeout); -#else -#error Neither __NR_ppoll nor __NR_poll defined, cannot implement sys_poll() -#endif -} - -static __attribute__((unused)) -ssize_t sys_read(int fd, void *buf, size_t count) -{ - return my_syscall3(__NR_read, fd, buf, count); -} - -static __attribute__((unused)) -ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) -{ - return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); -} - -static __attribute__((unused)) -int sys_sched_yield(void) -{ - return my_syscall0(__NR_sched_yield); -} - -static __attribute__((unused)) -int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ -#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) - struct sel_arg_struct { - unsigned long n; - fd_set *r, *w, *e; - struct timeval *t; - } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; - return my_syscall1(__NR_select, &arg); -#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) - struct timespec t; - - if (timeout) { - t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; - } - return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#elif defined(__NR__newselect) || defined(__NR_select) -#ifndef __NR__newselect -#define __NR__newselect __NR_select -#endif - return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); -#else -#error None of __NR_select, __NR_pselect6, nor __NR__newselect defined, cannot implement sys_select() -#endif -} - -static __attribute__((unused)) -int sys_setpgid(pid_t pid, pid_t pgid) -{ - return my_syscall2(__NR_setpgid, pid, pgid); -} - -static __attribute__((unused)) -pid_t sys_setsid(void) -{ - return my_syscall0(__NR_setsid); -} - -static __attribute__((unused)) -int sys_stat(const char *path, struct stat *buf) -{ - struct sys_stat_struct stat; - long ret; - -#ifdef __NR_newfstatat - /* only solution for arm64 */ - ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); -#elif defined(__NR_stat) - ret = my_syscall2(__NR_stat, path, &stat); -#else -#error Neither __NR_newfstatat nor __NR_stat defined, cannot implement sys_stat() -#endif - buf->st_dev = stat.st_dev; - buf->st_ino = stat.st_ino; - buf->st_mode = stat.st_mode; - buf->st_nlink = stat.st_nlink; - buf->st_uid = stat.st_uid; - buf->st_gid = stat.st_gid; - buf->st_rdev = stat.st_rdev; - buf->st_size = stat.st_size; - buf->st_blksize = stat.st_blksize; - buf->st_blocks = stat.st_blocks; - buf->st_atime = stat.st_atime; - buf->st_mtime = stat.st_mtime; - buf->st_ctime = stat.st_ctime; - return ret; -} - - -static __attribute__((unused)) -int sys_symlink(const char *old, const char *new) -{ -#ifdef __NR_symlinkat - return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); -#elif defined(__NR_symlink) - return my_syscall2(__NR_symlink, old, new); -#else -#error Neither __NR_symlinkat nor __NR_symlink defined, cannot implement sys_symlink() -#endif -} - -static __attribute__((unused)) -mode_t sys_umask(mode_t mode) -{ - return my_syscall1(__NR_umask, mode); -} - -static __attribute__((unused)) -int sys_umount2(const char *path, int flags) -{ - return my_syscall2(__NR_umount2, path, flags); -} - -static __attribute__((unused)) -int sys_unlink(const char *path) -{ -#ifdef __NR_unlinkat - return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); -#elif defined(__NR_unlink) - return my_syscall1(__NR_unlink, path); -#else -#error Neither __NR_unlinkat nor __NR_unlink defined, cannot implement sys_unlink() -#endif -} - -static __attribute__((unused)) -pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - return my_syscall4(__NR_wait4, pid, status, options, rusage); -} - -static __attribute__((unused)) -pid_t sys_waitpid(pid_t pid, int *status, int options) -{ - return sys_wait4(pid, status, options, 0); -} - -static __attribute__((unused)) -pid_t sys_wait(int *status) -{ - return sys_waitpid(-1, status, 0); -} - -static __attribute__((unused)) -ssize_t sys_write(int fd, const void *buf, size_t count) -{ - return my_syscall3(__NR_write, fd, buf, count); -} - - -/* Below are the libc-compatible syscalls which return x or -1 and set errno. - * They rely on the functions above. Similarly they're marked static so that it - * is possible to assign pointers to them if needed. - */ - -static __attribute__((unused)) -int brk(void *addr) -{ - void *ret = sys_brk(addr); - - if (!ret) { - SET_ERRNO(ENOMEM); - return -1; - } - return 0; -} - -static __attribute__((noreturn,unused)) -void exit(int status) -{ - sys_exit(status); -} - -static __attribute__((unused)) -int chdir(const char *path) -{ - int ret = sys_chdir(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chmod(const char *path, mode_t mode) -{ - int ret = sys_chmod(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chown(const char *path, uid_t owner, gid_t group) -{ - int ret = sys_chown(path, owner, group); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chroot(const char *path) -{ - int ret = sys_chroot(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int close(int fd) -{ - int ret = sys_close(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int dup(int fd) -{ - int ret = sys_dup(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int dup2(int old, int new) -{ - int ret = sys_dup2(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -#ifdef __NR_dup3 -static __attribute__((unused)) -int dup3(int old, int new, int flags) -{ - int ret = sys_dup3(old, new, flags); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} -#endif - -static __attribute__((unused)) -int execve(const char *filename, char *const argv[], char *const envp[]) -{ - int ret = sys_execve(filename, argv, envp); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t fork(void) -{ - pid_t ret = sys_fork(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int fsync(int fd) -{ - int ret = sys_fsync(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - int ret = sys_getdents64(fd, dirp, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpgid(pid_t pid) -{ - pid_t ret = sys_getpgid(pid); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpgrp(void) -{ - pid_t ret = sys_getpgrp(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpid(void) -{ - pid_t ret = sys_getpid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t gettid(void) -{ - pid_t ret = sys_gettid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret = sys_gettimeofday(tv, tz); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int ioctl(int fd, unsigned long req, void *value) -{ - int ret = sys_ioctl(fd, req, value); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int kill(pid_t pid, int signal) -{ - int ret = sys_kill(pid, signal); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int link(const char *old, const char *new) -{ - int ret = sys_link(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -off_t lseek(int fd, off_t offset, int whence) -{ - off_t ret = sys_lseek(fd, offset, whence); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mkdir(const char *path, mode_t mode) -{ - int ret = sys_mkdir(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mknod(const char *path, mode_t mode, dev_t dev) -{ - int ret = sys_mknod(path, mode, dev); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mount(const char *src, const char *tgt, - const char *fst, unsigned long flags, - const void *data) -{ - int ret = sys_mount(src, tgt, fst, flags, data); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int open(const char *path, int flags, mode_t mode) -{ - int ret = sys_open(path, flags, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int pivot_root(const char *new, const char *old) -{ - int ret = sys_pivot_root(new, old); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int poll(struct pollfd *fds, int nfds, int timeout) -{ - int ret = sys_poll(fds, nfds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t read(int fd, void *buf, size_t count) -{ - ssize_t ret = sys_read(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int reboot(int cmd) -{ - int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -void *sbrk(intptr_t inc) -{ - void *ret; - - /* first call to find current end */ - if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) - return ret + inc; - - SET_ERRNO(ENOMEM); - return (void *)-1; -} - -static __attribute__((unused)) -int sched_yield(void) -{ - int ret = sys_sched_yield(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ - int ret = sys_select(nfds, rfds, wfds, efds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int setpgid(pid_t pid, pid_t pgid) -{ - int ret = sys_setpgid(pid, pgid); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t setsid(void) -{ - pid_t ret = sys_setsid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; + return ioctl(fd, TIOCSPGRP, &pid); } static __attribute__((unused)) @@ -972,114 +121,6 @@ int msleep(unsigned int msecs) return 0; } -static __attribute__((unused)) -int stat(const char *path, struct stat *buf) -{ - int ret = sys_stat(path, buf); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int symlink(const char *old, const char *new) -{ - int ret = sys_symlink(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int tcsetpgrp(int fd, pid_t pid) -{ - return ioctl(fd, TIOCSPGRP, &pid); -} - -static __attribute__((unused)) -mode_t umask(mode_t mode) -{ - return sys_umask(mode); -} - -static __attribute__((unused)) -int umount2(const char *path, int flags) -{ - int ret = sys_umount2(path, flags); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int unlink(const char *path) -{ - int ret = sys_unlink(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - pid_t ret = sys_wait4(pid, status, options, rusage); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t waitpid(pid_t pid, int *status, int options) -{ - pid_t ret = sys_waitpid(pid, status, options); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait(int *status) -{ - pid_t ret = sys_wait(status); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t write(int fd, const void *buf, size_t count) -{ - ssize_t ret = sys_write(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - /* some size-optimized reimplementations of a few common str* and mem* * functions. They're marked static, except memcpy() and raise() which are used * by libgcc on ARM, so they are marked weak instead in order not to cause an diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h new file mode 100644 index 000000000000..98689f668ed3 --- /dev/null +++ b/tools/include/nolibc/sys.h @@ -0,0 +1,1189 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Syscall definitions for NOLIBC (those in man(2)) + * Copyright (C) 2017-2021 Willy Tarreau + */ + +#ifndef _NOLIBC_SYS_H +#define _NOLIBC_SYS_H + +#include "std.h" + +/* system includes */ +#include +#include // for SIGCHLD +#include +#include +#include +#include +#include + +#include "arch.h" +#include "types.h" + +/* this way it will be removed if unused */ +static int errno; + +#ifndef NOLIBC_IGNORE_ERRNO +#define SET_ERRNO(v) do { errno = (v); } while (0) +#else +#define SET_ERRNO(v) do { } while (0) +#endif + + +/* errno codes all ensure that they will not conflict with a valid pointer + * because they all correspond to the highest addressable memory page. + */ +#define MAX_ERRNO 4095 + + +/* Functions in this file only describe syscalls. They're declared static so + * that the compiler usually decides to inline them while still being allowed + * to pass a pointer to one of their instances. Each syscall exists in two + * versions: + * - the "internal" ones, which matches the raw syscall interface at the + * kernel level, which may sometimes slightly differ from the documented + * libc-level ones. For example most of them return either a valid value + * or -errno. All of these are prefixed with "sys_". They may be called + * by non-portable applications if desired. + * + * - the "exported" ones, whose interface must closely match the one + * documented in man(2), that applications are supposed to expect. These + * ones rely on the internal ones, and set errno. + * + * Each syscall will be defined with the two functions, sorted in alphabetical + * order applied to the exported names. + * + * In case of doubt about the relevance of a function here, only those which + * set errno should be defined here. Wrappers like those appearing in man(3) + * should not be placed here. + */ + + +/* + * int brk(void *addr); + * void *sbrk(intptr_t inc) + */ + +static __attribute__((unused)) +void *sys_brk(void *addr) +{ + return (void *)my_syscall1(__NR_brk, addr); +} + +static __attribute__((unused)) +int brk(void *addr) +{ + void *ret = sys_brk(addr); + + if (!ret) { + SET_ERRNO(ENOMEM); + return -1; + } + return 0; +} + +static __attribute__((unused)) +void *sbrk(intptr_t inc) +{ + void *ret; + + /* first call to find current end */ + if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) + return ret + inc; + + SET_ERRNO(ENOMEM); + return (void *)-1; +} + + +/* + * int chdir(const char *path); + */ + +static __attribute__((unused)) +int sys_chdir(const char *path) +{ + return my_syscall1(__NR_chdir, path); +} + +static __attribute__((unused)) +int chdir(const char *path) +{ + int ret = sys_chdir(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int chmod(const char *path, mode_t mode); + */ + +static __attribute__((unused)) +int sys_chmod(const char *path, mode_t mode) +{ +#ifdef __NR_fchmodat + return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); +#elif defined(__NR_chmod) + return my_syscall2(__NR_chmod, path, mode); +#else +#error Neither __NR_fchmodat nor __NR_chmod defined, cannot implement sys_chmod() +#endif +} + +static __attribute__((unused)) +int chmod(const char *path, mode_t mode) +{ + int ret = sys_chmod(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int chown(const char *path, uid_t owner, gid_t group); + */ + +static __attribute__((unused)) +int sys_chown(const char *path, uid_t owner, gid_t group) +{ +#ifdef __NR_fchownat + return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); +#elif defined(__NR_chown) + return my_syscall3(__NR_chown, path, owner, group); +#else +#error Neither __NR_fchownat nor __NR_chown defined, cannot implement sys_chown() +#endif +} + +static __attribute__((unused)) +int chown(const char *path, uid_t owner, gid_t group) +{ + int ret = sys_chown(path, owner, group); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int chroot(const char *path); + */ + +static __attribute__((unused)) +int sys_chroot(const char *path) +{ + return my_syscall1(__NR_chroot, path); +} + +static __attribute__((unused)) +int chroot(const char *path) +{ + int ret = sys_chroot(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int close(int fd); + */ + +static __attribute__((unused)) +int sys_close(int fd) +{ + return my_syscall1(__NR_close, fd); +} + +static __attribute__((unused)) +int close(int fd) +{ + int ret = sys_close(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int dup(int fd); + */ + +static __attribute__((unused)) +int sys_dup(int fd) +{ + return my_syscall1(__NR_dup, fd); +} + +static __attribute__((unused)) +int dup(int fd) +{ + int ret = sys_dup(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int dup2(int old, int new); + */ + +static __attribute__((unused)) +int sys_dup2(int old, int new) +{ +#ifdef __NR_dup3 + return my_syscall3(__NR_dup3, old, new, 0); +#elif defined(__NR_dup2) + return my_syscall2(__NR_dup2, old, new); +#else +#error Neither __NR_dup3 nor __NR_dup2 defined, cannot implement sys_dup2() +#endif +} + +static __attribute__((unused)) +int dup2(int old, int new) +{ + int ret = sys_dup2(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int dup3(int old, int new, int flags); + */ + +#ifdef __NR_dup3 +static __attribute__((unused)) +int sys_dup3(int old, int new, int flags) +{ + return my_syscall3(__NR_dup3, old, new, flags); +} + +static __attribute__((unused)) +int dup3(int old, int new, int flags) +{ + int ret = sys_dup3(old, new, flags); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} +#endif + + +/* + * int execve(const char *filename, char *const argv[], char *const envp[]); + */ + +static __attribute__((unused)) +int sys_execve(const char *filename, char *const argv[], char *const envp[]) +{ + return my_syscall3(__NR_execve, filename, argv, envp); +} + +static __attribute__((unused)) +int execve(const char *filename, char *const argv[], char *const envp[]) +{ + int ret = sys_execve(filename, argv, envp); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * void exit(int status); + */ + +static __attribute__((noreturn,unused)) +void sys_exit(int status) +{ + my_syscall1(__NR_exit, status & 255); + while(1); // shut the "noreturn" warnings. +} + +static __attribute__((noreturn,unused)) +void exit(int status) +{ + sys_exit(status); +} + + +/* + * pid_t fork(void); + */ + +static __attribute__((unused)) +pid_t sys_fork(void) +{ +#ifdef __NR_clone + /* note: some archs only have clone() and not fork(). Different archs + * have a different API, but most archs have the flags on first arg and + * will not use the rest with no other flag. + */ + return my_syscall5(__NR_clone, SIGCHLD, 0, 0, 0, 0); +#elif defined(__NR_fork) + return my_syscall0(__NR_fork); +#else +#error Neither __NR_clone nor __NR_fork defined, cannot implement sys_fork() +#endif +} + +static __attribute__((unused)) +pid_t fork(void) +{ + pid_t ret = sys_fork(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int fsync(int fd); + */ + +static __attribute__((unused)) +int sys_fsync(int fd) +{ + return my_syscall1(__NR_fsync, fd); +} + +static __attribute__((unused)) +int fsync(int fd) +{ + int ret = sys_fsync(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int getdents64(int fd, struct linux_dirent64 *dirp, int count); + */ + +static __attribute__((unused)) +int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + return my_syscall3(__NR_getdents64, fd, dirp, count); +} + +static __attribute__((unused)) +int getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + int ret = sys_getdents64(fd, dirp, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t getpgid(pid_t pid); + */ + +static __attribute__((unused)) +pid_t sys_getpgid(pid_t pid) +{ + return my_syscall1(__NR_getpgid, pid); +} + +static __attribute__((unused)) +pid_t getpgid(pid_t pid) +{ + pid_t ret = sys_getpgid(pid); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t getpgrp(void); + */ + +static __attribute__((unused)) +pid_t sys_getpgrp(void) +{ + return sys_getpgid(0); +} + +static __attribute__((unused)) +pid_t getpgrp(void) +{ + pid_t ret = sys_getpgrp(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t getpid(void); + */ + +static __attribute__((unused)) +pid_t sys_getpid(void) +{ + return my_syscall0(__NR_getpid); +} + +static __attribute__((unused)) +pid_t getpid(void) +{ + pid_t ret = sys_getpid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t gettid(void); + */ + +static __attribute__((unused)) +pid_t sys_gettid(void) +{ + return my_syscall0(__NR_gettid); +} + +static __attribute__((unused)) +pid_t gettid(void) +{ + pid_t ret = sys_gettid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int gettimeofday(struct timeval *tv, struct timezone *tz); + */ + +static __attribute__((unused)) +int sys_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return my_syscall2(__NR_gettimeofday, tv, tz); +} + +static __attribute__((unused)) +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret = sys_gettimeofday(tv, tz); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int ioctl(int fd, unsigned long req, void *value); + */ + +static __attribute__((unused)) +int sys_ioctl(int fd, unsigned long req, void *value) +{ + return my_syscall3(__NR_ioctl, fd, req, value); +} + +static __attribute__((unused)) +int ioctl(int fd, unsigned long req, void *value) +{ + int ret = sys_ioctl(fd, req, value); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +/* + * int kill(pid_t pid, int signal); + */ + +static __attribute__((unused)) +int sys_kill(pid_t pid, int signal) +{ + return my_syscall2(__NR_kill, pid, signal); +} + +static __attribute__((unused)) +int kill(pid_t pid, int signal) +{ + int ret = sys_kill(pid, signal); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int link(const char *old, const char *new); + */ + +static __attribute__((unused)) +int sys_link(const char *old, const char *new) +{ +#ifdef __NR_linkat + return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); +#elif defined(__NR_link) + return my_syscall2(__NR_link, old, new); +#else +#error Neither __NR_linkat nor __NR_link defined, cannot implement sys_link() +#endif +} + +static __attribute__((unused)) +int link(const char *old, const char *new) +{ + int ret = sys_link(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * off_t lseek(int fd, off_t offset, int whence); + */ + +static __attribute__((unused)) +off_t sys_lseek(int fd, off_t offset, int whence) +{ + return my_syscall3(__NR_lseek, fd, offset, whence); +} + +static __attribute__((unused)) +off_t lseek(int fd, off_t offset, int whence) +{ + off_t ret = sys_lseek(fd, offset, whence); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int mkdir(const char *path, mode_t mode); + */ + +static __attribute__((unused)) +int sys_mkdir(const char *path, mode_t mode) +{ +#ifdef __NR_mkdirat + return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); +#elif defined(__NR_mkdir) + return my_syscall2(__NR_mkdir, path, mode); +#else +#error Neither __NR_mkdirat nor __NR_mkdir defined, cannot implement sys_mkdir() +#endif +} + +static __attribute__((unused)) +int mkdir(const char *path, mode_t mode) +{ + int ret = sys_mkdir(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int mknod(const char *path, mode_t mode, dev_t dev); + */ + +static __attribute__((unused)) +long sys_mknod(const char *path, mode_t mode, dev_t dev) +{ +#ifdef __NR_mknodat + return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); +#elif defined(__NR_mknod) + return my_syscall3(__NR_mknod, path, mode, dev); +#else +#error Neither __NR_mknodat nor __NR_mknod defined, cannot implement sys_mknod() +#endif +} + +static __attribute__((unused)) +int mknod(const char *path, mode_t mode, dev_t dev) +{ + int ret = sys_mknod(path, mode, dev); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int mount(const char *source, const char *target, + * const char *fstype, unsigned long flags, + * const void *data); + */ +static __attribute__((unused)) +int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return my_syscall5(__NR_mount, src, tgt, fst, flags, data); +} + +static __attribute__((unused)) +int mount(const char *src, const char *tgt, + const char *fst, unsigned long flags, + const void *data) +{ + int ret = sys_mount(src, tgt, fst, flags, data); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int open(const char *path, int flags, mode_t mode); + */ + +static __attribute__((unused)) +int sys_open(const char *path, int flags, mode_t mode) +{ +#ifdef __NR_openat + return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); +#elif defined(__NR_open) + return my_syscall3(__NR_open, path, flags, mode); +#else +#error Neither __NR_openat nor __NR_open defined, cannot implement sys_open() +#endif +} + +static __attribute__((unused)) +int open(const char *path, int flags, mode_t mode) +{ + int ret = sys_open(path, flags, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int pivot_root(const char *new, const char *old); + */ + +static __attribute__((unused)) +int sys_pivot_root(const char *new, const char *old) +{ + return my_syscall2(__NR_pivot_root, new, old); +} + +static __attribute__((unused)) +int pivot_root(const char *new, const char *old) +{ + int ret = sys_pivot_root(new, old); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int poll(struct pollfd *fds, int nfds, int timeout); + */ + +static __attribute__((unused)) +int sys_poll(struct pollfd *fds, int nfds, int timeout) +{ +#if defined(__NR_ppoll) + struct timespec t; + + if (timeout >= 0) { + t.tv_sec = timeout / 1000; + t.tv_nsec = (timeout % 1000) * 1000000; + } + return my_syscall4(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL); +#elif defined(__NR_poll) + return my_syscall3(__NR_poll, fds, nfds, timeout); +#else +#error Neither __NR_ppoll nor __NR_poll defined, cannot implement sys_poll() +#endif +} + +static __attribute__((unused)) +int poll(struct pollfd *fds, int nfds, int timeout) +{ + int ret = sys_poll(fds, nfds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * ssize_t read(int fd, void *buf, size_t count); + */ + +static __attribute__((unused)) +ssize_t sys_read(int fd, void *buf, size_t count) +{ + return my_syscall3(__NR_read, fd, buf, count); +} + +static __attribute__((unused)) +ssize_t read(int fd, void *buf, size_t count) +{ + ssize_t ret = sys_read(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int reboot(int cmd); + * is among LINUX_REBOOT_CMD_* + */ + +static __attribute__((unused)) +ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) +{ + return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); +} + +static __attribute__((unused)) +int reboot(int cmd) +{ + int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int sched_yield(void); + */ + +static __attribute__((unused)) +int sys_sched_yield(void) +{ + return my_syscall0(__NR_sched_yield); +} + +static __attribute__((unused)) +int sched_yield(void) +{ + int ret = sys_sched_yield(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int select(int nfds, fd_set *read_fds, fd_set *write_fds, + * fd_set *except_fds, struct timeval *timeout); + */ + +static __attribute__((unused)) +int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ +#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) + struct sel_arg_struct { + unsigned long n; + fd_set *r, *w, *e; + struct timeval *t; + } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; + return my_syscall1(__NR_select, &arg); +#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) + struct timespec t; + + if (timeout) { + t.tv_sec = timeout->tv_sec; + t.tv_nsec = timeout->tv_usec * 1000; + } + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); +#elif defined(__NR__newselect) || defined(__NR_select) +#ifndef __NR__newselect +#define __NR__newselect __NR_select +#endif + return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); +#else +#error None of __NR_select, __NR_pselect6, nor __NR__newselect defined, cannot implement sys_select() +#endif +} + +static __attribute__((unused)) +int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ + int ret = sys_select(nfds, rfds, wfds, efds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int setpgid(pid_t pid, pid_t pgid); + */ + +static __attribute__((unused)) +int sys_setpgid(pid_t pid, pid_t pgid) +{ + return my_syscall2(__NR_setpgid, pid, pgid); +} + +static __attribute__((unused)) +int setpgid(pid_t pid, pid_t pgid) +{ + int ret = sys_setpgid(pid, pgid); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t setsid(void); + */ + +static __attribute__((unused)) +pid_t sys_setsid(void) +{ + return my_syscall0(__NR_setsid); +} + +static __attribute__((unused)) +pid_t setsid(void) +{ + pid_t ret = sys_setsid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int stat(const char *path, struct stat *buf); + * Warning: the struct stat's layout is arch-dependent. + */ + +static __attribute__((unused)) +int sys_stat(const char *path, struct stat *buf) +{ + struct sys_stat_struct stat; + long ret; + +#ifdef __NR_newfstatat + /* only solution for arm64 */ + ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); +#elif defined(__NR_stat) + ret = my_syscall2(__NR_stat, path, &stat); +#else +#error Neither __NR_newfstatat nor __NR_stat defined, cannot implement sys_stat() +#endif + buf->st_dev = stat.st_dev; + buf->st_ino = stat.st_ino; + buf->st_mode = stat.st_mode; + buf->st_nlink = stat.st_nlink; + buf->st_uid = stat.st_uid; + buf->st_gid = stat.st_gid; + buf->st_rdev = stat.st_rdev; + buf->st_size = stat.st_size; + buf->st_blksize = stat.st_blksize; + buf->st_blocks = stat.st_blocks; + buf->st_atime = stat.st_atime; + buf->st_mtime = stat.st_mtime; + buf->st_ctime = stat.st_ctime; + return ret; +} + +static __attribute__((unused)) +int stat(const char *path, struct stat *buf) +{ + int ret = sys_stat(path, buf); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int symlink(const char *old, const char *new); + */ + +static __attribute__((unused)) +int sys_symlink(const char *old, const char *new) +{ +#ifdef __NR_symlinkat + return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); +#elif defined(__NR_symlink) + return my_syscall2(__NR_symlink, old, new); +#else +#error Neither __NR_symlinkat nor __NR_symlink defined, cannot implement sys_symlink() +#endif +} + +static __attribute__((unused)) +int symlink(const char *old, const char *new) +{ + int ret = sys_symlink(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * mode_t umask(mode_t mode); + */ + +static __attribute__((unused)) +mode_t sys_umask(mode_t mode) +{ + return my_syscall1(__NR_umask, mode); +} + +static __attribute__((unused)) +mode_t umask(mode_t mode) +{ + return sys_umask(mode); +} + + +/* + * int umount2(const char *path, int flags); + */ + +static __attribute__((unused)) +int sys_umount2(const char *path, int flags) +{ + return my_syscall2(__NR_umount2, path, flags); +} + +static __attribute__((unused)) +int umount2(const char *path, int flags) +{ + int ret = sys_umount2(path, flags); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int unlink(const char *path); + */ + +static __attribute__((unused)) +int sys_unlink(const char *path) +{ +#ifdef __NR_unlinkat + return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); +#elif defined(__NR_unlink) + return my_syscall1(__NR_unlink, path); +#else +#error Neither __NR_unlinkat nor __NR_unlink defined, cannot implement sys_unlink() +#endif +} + +static __attribute__((unused)) +int unlink(const char *path) +{ + int ret = sys_unlink(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t wait(int *status); + * pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage); + * pid_t waitpid(pid_t pid, int *status, int options); + */ + +static __attribute__((unused)) +pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + return my_syscall4(__NR_wait4, pid, status, options, rusage); +} + +static __attribute__((unused)) +pid_t wait(int *status) +{ + pid_t ret = sys_wait4(-1, status, 0, NULL); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + pid_t ret = sys_wait4(pid, status, options, rusage); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +static __attribute__((unused)) +pid_t waitpid(pid_t pid, int *status, int options) +{ + pid_t ret = sys_wait4(pid, status, options, NULL); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * ssize_t write(int fd, const void *buf, size_t count); + */ + +static __attribute__((unused)) +ssize_t sys_write(int fd, const void *buf, size_t count) +{ + return my_syscall3(__NR_write, fd, buf, count); +} + +static __attribute__((unused)) +ssize_t write(int fd, const void *buf, size_t count) +{ + ssize_t ret = sys_write(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +#endif /* _NOLIBC_SYS_H */ From 06fdba53e0a9a897ba00c3602f14b3498b321655 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:19 +0100 Subject: [PATCH 087/155] tools/nolibc/stdlib: extract the stdlib-specific functions to their own file The new file stdlib.h contains the definitions of functions that are usually found in stdlib.h. Many more could certainly be added. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 67 +-------------------------- tools/include/nolibc/stdlib.h | 85 +++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 66 deletions(-) create mode 100644 tools/include/nolibc/stdlib.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 2af56ec760e2..ed909a8daa1a 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -87,40 +87,11 @@ #include "arch.h" #include "types.h" #include "sys.h" +#include "stdlib.h" /* Used by programs to avoid std includes */ #define NOLIBC -static __attribute__((unused)) -int tcsetpgrp(int fd, pid_t pid) -{ - return ioctl(fd, TIOCSPGRP, &pid); -} - -static __attribute__((unused)) -unsigned int sleep(unsigned int seconds) -{ - struct timeval my_timeval = { seconds, 0 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return my_timeval.tv_sec + !!my_timeval.tv_usec; - else - return 0; -} - -static __attribute__((unused)) -int msleep(unsigned int msecs) -{ - struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return (my_timeval.tv_sec * 1000) + - (my_timeval.tv_usec / 1000) + - !!(my_timeval.tv_usec % 1000); - else - return 0; -} - /* some size-optimized reimplementations of a few common str* and mem* * functions. They're marked static, except memcpy() and raise() which are used * by libgcc on ARM, so they are marked weak instead in order not to cause an @@ -216,35 +187,6 @@ int isdigit(int c) return (unsigned int)(c - '0') <= 9; } -static __attribute__((unused)) -long atol(const char *s) -{ - unsigned long ret = 0; - unsigned long d; - int neg = 0; - - if (*s == '-') { - neg = 1; - s++; - } - - while (1) { - d = (*s++) - '0'; - if (d > 9) - break; - ret *= 10; - ret += d; - } - - return neg ? -ret : ret; -} - -static __attribute__((unused)) -int atoi(const char *s) -{ - return atol(s); -} - static __attribute__((unused)) const char *ltoa(long in) { @@ -273,13 +215,6 @@ void *memcpy(void *dst, const void *src, size_t len) return memmove(dst, src, len); } -/* needed by libgcc for divide by zero */ -__attribute__((weak,unused)) -int raise(int signal) -{ - return kill(getpid(), signal); -} - /* Here come a few helper functions */ static __attribute__((unused)) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h new file mode 100644 index 000000000000..09a506aadbbe --- /dev/null +++ b/tools/include/nolibc/stdlib.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * stdlib function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + */ + +#ifndef _NOLIBC_STDLIB_H +#define _NOLIBC_STDLIB_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" + +/* + * As much as possible, please keep functions alphabetically sorted. + */ + +static __attribute__((unused)) +long atol(const char *s) +{ + unsigned long ret = 0; + unsigned long d; + int neg = 0; + + if (*s == '-') { + neg = 1; + s++; + } + + while (1) { + d = (*s++) - '0'; + if (d > 9) + break; + ret *= 10; + ret += d; + } + + return neg ? -ret : ret; +} + +static __attribute__((unused)) +int atoi(const char *s) +{ + return atol(s); +} + +static __attribute__((unused)) +int msleep(unsigned int msecs) +{ + struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return (my_timeval.tv_sec * 1000) + + (my_timeval.tv_usec / 1000) + + !!(my_timeval.tv_usec % 1000); + else + return 0; +} + +/* This one is not marked static as it's needed by libgcc for divide by zero */ +__attribute__((weak,unused)) +int raise(int signal) +{ + return kill(getpid(), signal); +} + +static __attribute__((unused)) +unsigned int sleep(unsigned int seconds) +{ + struct timeval my_timeval = { seconds, 0 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return my_timeval.tv_sec + !!my_timeval.tv_usec; + else + return 0; +} + +static __attribute__((unused)) +int tcsetpgrp(int fd, pid_t pid) +{ + return ioctl(fd, TIOCSPGRP, &pid); +} + +#endif /* _NOLIBC_STDLIB_H */ From c91eb033895593a998c5a49d562fb760ab2fa5bb Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:20 +0100 Subject: [PATCH 088/155] tools/nolibc/string: split the string functions into string.h The string manipulation functions (mem*, str*) are now found in string.h. The file depends on almost nothing and will be usable from other includes if needed. Maybe more functions could be added. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 96 +----------------------------- tools/include/nolibc/string.h | 107 ++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 95 deletions(-) create mode 100644 tools/include/nolibc/string.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index ed909a8daa1a..b06bd5cb5651 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -88,99 +88,11 @@ #include "types.h" #include "sys.h" #include "stdlib.h" +#include "string.h" /* Used by programs to avoid std includes */ #define NOLIBC -/* some size-optimized reimplementations of a few common str* and mem* - * functions. They're marked static, except memcpy() and raise() which are used - * by libgcc on ARM, so they are marked weak instead in order not to cause an - * error when building a program made of multiple files (not recommended). - */ - -static __attribute__((unused)) -void *memmove(void *dst, const void *src, size_t len) -{ - ssize_t pos = (dst <= src) ? -1 : (long)len; - void *ret = dst; - - while (len--) { - pos += (dst <= src) ? 1 : -1; - ((char *)dst)[pos] = ((char *)src)[pos]; - } - return ret; -} - -static __attribute__((unused)) -void *memset(void *dst, int b, size_t len) -{ - char *p = dst; - - while (len--) - *(p++) = b; - return dst; -} - -static __attribute__((unused)) -int memcmp(const void *s1, const void *s2, size_t n) -{ - size_t ofs = 0; - char c1 = 0; - - while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { - ofs++; - } - return c1; -} - -static __attribute__((unused)) -char *strcpy(char *dst, const char *src) -{ - char *ret = dst; - - while ((*dst++ = *src++)); - return ret; -} - -static __attribute__((unused)) -char *strchr(const char *s, int c) -{ - while (*s) { - if (*s == (char)c) - return (char *)s; - s++; - } - return NULL; -} - -static __attribute__((unused)) -char *strrchr(const char *s, int c) -{ - const char *ret = NULL; - - while (*s) { - if (*s == (char)c) - ret = s; - s++; - } - return (char *)ret; -} - -static __attribute__((unused)) -size_t nolibc_strlen(const char *str) -{ - size_t len; - - for (len = 0; str[len]; len++); - return len; -} - -#define strlen(str) ({ \ - __builtin_constant_p((str)) ? \ - __builtin_strlen((str)) : \ - nolibc_strlen((str)); \ -}) - static __attribute__((unused)) int isdigit(int c) { @@ -209,12 +121,6 @@ const char *ltoa(long in) return pos + 1; } -__attribute__((weak,unused)) -void *memcpy(void *dst, const void *src, size_t len) -{ - return memmove(dst, src, len); -} - /* Here come a few helper functions */ static __attribute__((unused)) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h new file mode 100644 index 000000000000..8a23cda2d450 --- /dev/null +++ b/tools/include/nolibc/string.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * string function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + */ + +#ifndef _NOLIBC_STRING_H +#define _NOLIBC_STRING_H + +#include "std.h" + +/* + * As much as possible, please keep functions alphabetically sorted. + */ + +static __attribute__((unused)) +int memcmp(const void *s1, const void *s2, size_t n) +{ + size_t ofs = 0; + char c1 = 0; + + while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { + ofs++; + } + return c1; +} + +static __attribute__((unused)) +void *memmove(void *dst, const void *src, size_t len) +{ + ssize_t pos = (dst <= src) ? -1 : (long)len; + void *ret = dst; + + while (len--) { + pos += (dst <= src) ? 1 : -1; + ((char *)dst)[pos] = ((char *)src)[pos]; + } + return ret; +} + +/* must be exported, as it's used by libgcc on ARM */ +__attribute__((weak,unused)) +void *memcpy(void *dst, const void *src, size_t len) +{ + return memmove(dst, src, len); +} + +static __attribute__((unused)) +void *memset(void *dst, int b, size_t len) +{ + char *p = dst; + + while (len--) + *(p++) = b; + return dst; +} + +static __attribute__((unused)) +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + s++; + } + return NULL; +} + +static __attribute__((unused)) +char *strcpy(char *dst, const char *src) +{ + char *ret = dst; + + while ((*dst++ = *src++)); + return ret; +} + +/* this function is only used with arguments that are not constants */ +static __attribute__((unused)) +size_t nolibc_strlen(const char *str) +{ + size_t len; + + for (len = 0; str[len]; len++); + return len; +} + +#define strlen(str) ({ \ + __builtin_constant_p((str)) ? \ + __builtin_strlen((str)) : \ + nolibc_strlen((str)); \ +}) + +static __attribute__((unused)) +char *strrchr(const char *s, int c) +{ + const char *ret = NULL; + + while (*s) { + if (*s == (char)c) + ret = s; + s++; + } + return (char *)ret; +} + +#endif /* _NOLIBC_STRING_H */ From 62a2af077493df4ebea22df4de8df4aea3c882cc Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:21 +0100 Subject: [PATCH 089/155] tools/nolibc/ctype: split the is* functions to ctype.h In fact there's only isdigit() for now. More should definitely be added. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/ctype.h | 22 ++++++++++++++++++++++ tools/include/nolibc/nolibc.h | 7 +------ 2 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 tools/include/nolibc/ctype.h diff --git a/tools/include/nolibc/ctype.h b/tools/include/nolibc/ctype.h new file mode 100644 index 000000000000..6735bd906f25 --- /dev/null +++ b/tools/include/nolibc/ctype.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * ctype function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + */ + +#ifndef _NOLIBC_CTYPE_H +#define _NOLIBC_CTYPE_H + +#include "std.h" + +/* + * As much as possible, please keep functions alphabetically sorted. + */ + +static __attribute__((unused)) +int isdigit(int c) +{ + return (unsigned int)(c - '0') <= 9; +} + +#endif /* _NOLIBC_CTYPE_H */ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index b06bd5cb5651..c96c6cb7f3ae 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -87,18 +87,13 @@ #include "arch.h" #include "types.h" #include "sys.h" +#include "ctype.h" #include "stdlib.h" #include "string.h" /* Used by programs to avoid std includes */ #define NOLIBC -static __attribute__((unused)) -int isdigit(int c) -{ - return (unsigned int)(c - '0') <= 9; -} - static __attribute__((unused)) const char *ltoa(long in) { From 50850c38b290c22da37a77e24c9382933b200c8c Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:22 +0100 Subject: [PATCH 090/155] tools/nolibc/ctype: add the missing is* functions There was only isdigit, this commit adds the other ones. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/ctype.h | 79 +++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/tools/include/nolibc/ctype.h b/tools/include/nolibc/ctype.h index 6735bd906f25..e3000b2992d7 100644 --- a/tools/include/nolibc/ctype.h +++ b/tools/include/nolibc/ctype.h @@ -13,10 +13,87 @@ * As much as possible, please keep functions alphabetically sorted. */ +static __attribute__((unused)) +int isascii(int c) +{ + /* 0x00..0x7f */ + return (unsigned int)c <= 0x7f; +} + +static __attribute__((unused)) +int isblank(int c) +{ + return c == '\t' || c == ' '; +} + +static __attribute__((unused)) +int iscntrl(int c) +{ + /* 0x00..0x1f, 0x7f */ + return (unsigned int)c < 0x20 || c == 0x7f; +} + static __attribute__((unused)) int isdigit(int c) { - return (unsigned int)(c - '0') <= 9; + return (unsigned int)(c - '0') < 10; +} + +static __attribute__((unused)) +int isgraph(int c) +{ + /* 0x21..0x7e */ + return (unsigned int)(c - 0x21) < 0x5e; +} + +static __attribute__((unused)) +int islower(int c) +{ + return (unsigned int)(c - 'a') < 26; +} + +static __attribute__((unused)) +int isprint(int c) +{ + /* 0x20..0x7e */ + return (unsigned int)(c - 0x20) < 0x5f; +} + +static __attribute__((unused)) +int isspace(int c) +{ + /* \t is 0x9, \n is 0xA, \v is 0xB, \f is 0xC, \r is 0xD */ + return ((unsigned int)c == ' ') || (unsigned int)(c - 0x09) < 5; +} + +static __attribute__((unused)) +int isupper(int c) +{ + return (unsigned int)(c - 'A') < 26; +} + +static __attribute__((unused)) +int isxdigit(int c) +{ + return isdigit(c) || (unsigned int)(c - 'A') < 6 || (unsigned int)(c - 'a') < 6; +} + +static __attribute__((unused)) +int isalpha(int c) +{ + return islower(c) || isupper(c); +} + +static __attribute__((unused)) +int isalnum(int c) +{ + return isalpha(c) || isdigit(c); +} + +static __attribute__((unused)) +int ispunct(int c) +{ + return isgraph(c) && !isalnum(c); } #endif /* _NOLIBC_CTYPE_H */ From 8cb98b3fce152e8ba46d1e25515deab08d7ec271 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 13 Feb 2022 09:52:10 +0100 Subject: [PATCH 091/155] tools/nolibc/types: move the FD_* functions to macros in types.h FD_SET, FD_CLR, FD_ISSET, FD_ZERO are often expected to be macros and not functions. In addition we already have a file dedicated to such macros and types used by syscalls, it's types.h, so let's move them there and turn them to macros. FD_CLR() and FD_ISSET() were missing, so they were added. FD_ZERO() now deals with its own loop so that it doesn't rely on memset() that sets one byte at a time. Cc: David Laight Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 14 -------------- tools/include/nolibc/types.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index c96c6cb7f3ae..2267d98337ea 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -118,20 +118,6 @@ const char *ltoa(long in) /* Here come a few helper functions */ -static __attribute__((unused)) -void FD_ZERO(fd_set *set) -{ - memset(set, 0, sizeof(*set)); -} - -static __attribute__((unused)) -void FD_SET(int fd, fd_set *set) -{ - if (fd < 0 || fd >= FD_SETSIZE) - return; - set->fd32[fd / 32] |= 1 << (fd & 31); -} - /* WARNING, it only deals with the 4096 first majors and 256 first minors */ static __attribute__((unused)) dev_t makedev(unsigned int major, unsigned int minor) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index 2f09abaf95f1..a4dda0a22fc2 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -75,6 +75,36 @@ typedef struct { uint32_t fd32[FD_SETSIZE / 32]; } fd_set; +#define FD_CLR(fd, set) do { \ + fd_set *__set = (set); \ + int __fd = (fd); \ + if (__fd >= 0) \ + __set->fd32[__fd / 32] &= ~(1U << (__fd & 31)); \ + } while (0) + +#define FD_SET(fd, set) do { \ + fd_set *__set = (set); \ + int __fd = (fd); \ + if (__fd >= 0) \ + __set->fd32[__fd / 32] |= 1U << (__fd & 31); \ + } while (0) + +#define FD_ISSET(fd, set) ({ \ + fd_set *__set = (set); \ + int __fd = (fd); \ + int __r = 0; \ + if (__fd >= 0) \ + __r = !!(__set->fd32[__fd / 32] & 1U << (__fd & 31)); \ + __r; \ + }) + +#define FD_ZERO(set) do { \ + fd_set *__set = (set); \ + int __idx; \ + for (__idx = 0; __idx < FD_SETSIZE / 32; __idx ++) \ + __set->fd32[__idx] = 0; \ + } while (0) + /* for poll() */ struct pollfd { int fd; From 306c9fd4c686eebf4e0487bc4ad5dca8e68c19be Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 13 Feb 2022 09:53:01 +0100 Subject: [PATCH 092/155] tools/nolibc/types: make FD_SETSIZE configurable The macro was hard-coded to 256 but it's common to see it redefined. Let's support this and make sure we always allocate enough entries for the cases where it wouldn't be multiple of 32. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/types.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index a4dda0a22fc2..563dbbad0496 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -45,7 +45,9 @@ #define DT_SOCK 0xc /* commonly an fd_set represents 256 FDs */ +#ifndef FD_SETSIZE #define FD_SETSIZE 256 +#endif /* Special FD used by all the *at functions */ #ifndef AT_FDCWD @@ -72,7 +74,7 @@ /* for select() */ typedef struct { - uint32_t fd32[FD_SETSIZE / 32]; + uint32_t fd32[(FD_SETSIZE + 31) / 32]; } fd_set; #define FD_CLR(fd, set) do { \ @@ -101,7 +103,7 @@ typedef struct { #define FD_ZERO(set) do { \ fd_set *__set = (set); \ int __idx; \ - for (__idx = 0; __idx < FD_SETSIZE / 32; __idx ++) \ + for (__idx = 0; __idx < (FD_SETSIZE+31) / 32; __idx ++) \ __set->fd32[__idx] = 0; \ } while (0) From eba6d00d38e7c26dd5b948ed75200d05f67d8266 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:25 +0100 Subject: [PATCH 093/155] tools/nolibc/types: move makedev to types.h and make it a macro The makedev() man page says it's supposed to be a macro and that some OSes have it with the other ones in sys/types.h so it now makes sense to move it to types.h as a macro. Let's also define major() and minor() that perform the reverse operation. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 9 --------- tools/include/nolibc/types.h | 5 +++++ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 2267d98337ea..23fb81414b1b 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -116,13 +116,4 @@ const char *ltoa(long in) return pos + 1; } -/* Here come a few helper functions */ - -/* WARNING, it only deals with the 4096 first majors and 256 first minors */ -static __attribute__((unused)) -dev_t makedev(unsigned int major, unsigned int minor) -{ - return ((major & 0xfff) << 8) | (minor & 0xff); -} - #endif /* _NOLIBC_H */ diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index 563dbbad0496..b1bff5e717b6 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -162,4 +162,9 @@ struct stat { time_t st_ctime; /* time of last status change */ }; +/* WARNING, it only deals with the 4096 first majors and 256 first minors */ +#define makedev(major, minor) ((dev_t)((((major) & 0xfff) << 8) | ((minor) & 0xff))) +#define major(dev) ((unsigned int)(((dev) >> 8) & 0xfff)) +#define minor(dev) ((unsigned int)(((dev) & 0xff)) + #endif /* _NOLIBC_TYPES_H */ From 56d68a3c1f41ca0843fd9151654c35f4925d911b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:26 +0100 Subject: [PATCH 094/155] tools/nolibc/stdlib: move ltoa() to stdlib.h This function is not standard and performs the opposite of atol(). Let's move it with atol(). It's been split between a reentrant function and one using a static buffer. There's no more definition in nolibc.h anymore now. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 22 ---------------------- tools/include/nolibc/stdlib.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 23fb81414b1b..a349c88c45ff 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -94,26 +94,4 @@ /* Used by programs to avoid std includes */ #define NOLIBC -static __attribute__((unused)) -const char *ltoa(long in) -{ - /* large enough for -9223372036854775808 */ - static char buffer[21]; - char *pos = buffer + sizeof(buffer) - 1; - int neg = in < 0; - unsigned long n = neg ? -in : in; - - *pos-- = '\0'; - do { - *pos-- = '0' + n % 10; - n /= 10; - if (pos < buffer) - return pos + 1; - } while (n); - - if (neg) - *pos-- = '-'; - return pos + 1; -} - #endif /* _NOLIBC_H */ diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 09a506aadbbe..84fc4353fb01 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -45,6 +45,38 @@ int atoi(const char *s) return atol(s); } +/* performs the opposite of atol() using a user-fed buffer. The buffer must be + * at least 21 bytes long (large enough for "-9223372036854775808"). + */ +static __attribute__((unused)) +const char *ltoa_r(long in, char *buffer) +{ + char *pos = buffer + 21 - 1; + int neg = in < 0; + unsigned long n = neg ? -in : in; + + *pos-- = '\0'; + do { + *pos-- = '0' + n % 10; + n /= 10; + if (pos < buffer) + return pos + 1; + } while (n); + + if (neg) + *pos-- = '-'; + return pos + 1; +} + +/* performs the opposite of atol() using a statically allocated buffer */ +static __attribute__((unused)) +const char *ltoa(long in) +{ + /* large enough for -9223372036854775808 */ + static char buffer[21]; + return ltoa_r(in, buffer); +} + static __attribute__((unused)) int msleep(unsigned int msecs) { From 66c397c4d2e15871c50940c168b7d4a76aaa08a9 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:27 +0100 Subject: [PATCH 095/155] tools/nolibc/stdlib: replace the ltoa() function with more efficient ones The original ltoa() function and the reentrant one ltoa_r() present a number of drawbacks. The divide by 10 generates calls to external code from libgcc_s, and the number does not necessarily start at the beginning of the buffer. Let's rewrite these functions so that they do not involve a divide and only use loops on powers of 10, and implement both signed and unsigned variants, always starting from the buffer's first character. Instead of using a static buffer for each function, we're now using a common one. In order to avoid confusion with the ltoa() name, the new functions are called itoa_r() and utoa_r() to distinguish the signed and unsigned versions, and for convenience for their callers, these functions now reutrn the number of characters emitted. The ltoa_r() function is just an inline mapping to the signed one and which returns the buffer. The functions are quite small (86 bytes on x86_64, 68 on armv7) and do not depend anymore on external code. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 107 +++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 20 deletions(-) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 84fc4353fb01..dbb45631c7ca 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -12,6 +12,13 @@ #include "types.h" #include "sys.h" + +/* Buffer used to store int-to-ASCII conversions. Will only be implemented if + * any of the related functions is implemented. The area is large enough to + * store "18446744073709551615" or "-9223372036854775808" and the final zero. + */ +static __attribute__((unused)) char itoa_buffer[21]; + /* * As much as possible, please keep functions alphabetically sorted. */ @@ -45,36 +52,96 @@ int atoi(const char *s) return atol(s); } -/* performs the opposite of atol() using a user-fed buffer. The buffer must be - * at least 21 bytes long (large enough for "-9223372036854775808"). +/* Converts the unsigned long integer to its string representation into + * buffer , which must be long enough to store the number and the + * trailing zero (21 bytes for 18446744073709551615 in 64-bit, 11 for + * 4294967295 in 32-bit). The buffer is filled from the first byte, and the + * number of characters emitted (not counting the trailing zero) is returned. + * The function is constructed in a way to optimize the code size and avoid + * any divide that could add a dependency on large external functions. */ static __attribute__((unused)) -const char *ltoa_r(long in, char *buffer) +int utoa_r(unsigned long in, char *buffer) { - char *pos = buffer + 21 - 1; - int neg = in < 0; - unsigned long n = neg ? -in : in; + unsigned long lim; + int digits = 0; + int pos = (~0UL > 0xfffffffful) ? 19 : 9; + int dig; - *pos-- = '\0'; do { - *pos-- = '0' + n % 10; - n /= 10; - if (pos < buffer) - return pos + 1; - } while (n); + for (dig = 0, lim = 1; dig < pos; dig++) + lim *= 10; - if (neg) - *pos-- = '-'; - return pos + 1; + if (digits || in >= lim || !pos) { + for (dig = 0; in >= lim; dig++) + in -= lim; + buffer[digits++] = '0' + dig; + } + } while (pos--); + + buffer[digits] = 0; + return digits; } -/* performs the opposite of atol() using a statically allocated buffer */ +/* Converts the signed long integer to its string representation into + * buffer , which must be long enough to store the number and the + * trailing zero (21 bytes for -9223372036854775808 in 64-bit, 12 for + * -2147483648 in 32-bit). The buffer is filled from the first byte, and the + * number of characters emitted (not counting the trailing zero) is returned. + */ static __attribute__((unused)) -const char *ltoa(long in) +int itoa_r(long in, char *buffer) { - /* large enough for -9223372036854775808 */ - static char buffer[21]; - return ltoa_r(in, buffer); + char *ptr = buffer; + int len = 0; + + if (in < 0) { + in = -in; + *(ptr++) = '-'; + len++; + } + len += utoa_r(in, ptr); + return len; +} + +/* for historical compatibility, same as above but returns the pointer to the + * buffer. + */ +static inline __attribute__((unused)) +char *ltoa_r(long in, char *buffer) +{ + itoa_r(in, buffer); + return buffer; +} + +/* converts long integer to a string using the static itoa_buffer and + * returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *itoa(long in) +{ + itoa_r(in, itoa_buffer); + return itoa_buffer; +} + +/* converts long integer to a string using the static itoa_buffer and + * returns the pointer to that string. Same as above, for compatibility. + */ +static inline __attribute__((unused)) +char *ltoa(long in) +{ + itoa_r(in, itoa_buffer); + return itoa_buffer; +} + +/* converts unsigned long integer to a string using the static itoa_buffer + * and returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *utoa(unsigned long in) +{ + utoa_r(in, itoa_buffer); + return itoa_buffer; } static __attribute__((unused)) From b1c21e7d99cdab987fb858679fbc012868caac40 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:28 +0100 Subject: [PATCH 096/155] tools/nolibc/stdlib: add i64toa() and u64toa() These are 64-bit variants of the itoa() and utoa() functions. They also support reentrant ones, and use the same itoa_buffer. The functions are a bit larger than the previous ones in 32-bit mode (86 and 98 bytes on x86_64 and armv7 respectively), which is why we continue to provide them as separate functions. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 72 +++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index dbb45631c7ca..d972871bf2ba 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -144,6 +144,78 @@ char *utoa(unsigned long in) return itoa_buffer; } +/* Converts the unsigned 64-bit integer to its string representation into + * buffer , which must be long enough to store the number and the + * trailing zero (21 bytes for 18446744073709551615). The buffer is filled from + * the first byte, and the number of characters emitted (not counting the + * trailing zero) is returned. The function is constructed in a way to optimize + * the code size and avoid any divide that could add a dependency on large + * external functions. + */ +static __attribute__((unused)) +int u64toa_r(uint64_t in, char *buffer) +{ + unsigned long long lim; + int digits = 0; + int pos = 19; /* start with the highest possible digit */ + int dig; + + do { + for (dig = 0, lim = 1; dig < pos; dig++) + lim *= 10; + + if (digits || in >= lim || !pos) { + for (dig = 0; in >= lim; dig++) + in -= lim; + buffer[digits++] = '0' + dig; + } + } while (pos--); + + buffer[digits] = 0; + return digits; +} + +/* Converts the signed 64-bit integer to its string representation into + * buffer , which must be long enough to store the number and the + * trailing zero (21 bytes for -9223372036854775808). The buffer is filled from + * the first byte, and the number of characters emitted (not counting the + * trailing zero) is returned. + */ +static __attribute__((unused)) +int i64toa_r(int64_t in, char *buffer) +{ + char *ptr = buffer; + int len = 0; + + if (in < 0) { + in = -in; + *(ptr++) = '-'; + len++; + } + len += u64toa_r(in, ptr); + return len; +} + +/* converts int64_t to a string using the static itoa_buffer and returns + * the pointer to that string. + */ +static inline __attribute__((unused)) +char *i64toa(int64_t in) +{ + i64toa_r(in, itoa_buffer); + return itoa_buffer; +} + +/* converts uint64_t to a string using the static itoa_buffer and returns + * the pointer to that string. + */ +static inline __attribute__((unused)) +char *u64toa(uint64_t in) +{ + u64toa_r(in, itoa_buffer); + return itoa_buffer; +} + static __attribute__((unused)) int msleep(unsigned int msecs) { From 5f493178ef3187b939d3068563e5da6045085c2a Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:29 +0100 Subject: [PATCH 097/155] tools/nolibc/stdlib: add utoh() and u64toh() This adds a pair of functions to emit hex values. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 80 +++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index d972871bf2ba..82a4cf606d3c 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -52,6 +52,46 @@ int atoi(const char *s) return atol(s); } +/* Converts the unsigned long integer to its hex representation into + * buffer , which must be long enough to store the number and the + * trailing zero (17 bytes for "ffffffffffffffff" or 9 for "ffffffff"). The + * buffer is filled from the first byte, and the number of characters emitted + * (not counting the trailing zero) is returned. The function is constructed + * in a way to optimize the code size and avoid any divide that could add a + * dependency on large external functions. + */ +static __attribute__((unused)) +int utoh_r(unsigned long in, char *buffer) +{ + signed char pos = (~0UL > 0xfffffffful) ? 60 : 28; + int digits = 0; + int dig; + + do { + dig = in >> pos; + in -= (uint64_t)dig << pos; + pos -= 4; + if (dig || digits || pos < 0) { + if (dig > 9) + dig += 'a' - '0' - 10; + buffer[digits++] = '0' + dig; + } + } while (pos >= 0); + + buffer[digits] = 0; + return digits; +} + +/* converts unsigned long to an hex string using the static itoa_buffer + * and returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *utoh(unsigned long in) +{ + utoh_r(in, itoa_buffer); + return itoa_buffer; +} + /* Converts the unsigned long integer to its string representation into * buffer , which must be long enough to store the number and the * trailing zero (21 bytes for 18446744073709551615 in 64-bit, 11 for @@ -144,6 +184,46 @@ char *utoa(unsigned long in) return itoa_buffer; } +/* Converts the unsigned 64-bit integer to its hex representation into + * buffer , which must be long enough to store the number and the + * trailing zero (17 bytes for "ffffffffffffffff"). The buffer is filled from + * the first byte, and the number of characters emitted (not counting the + * trailing zero) is returned. The function is constructed in a way to optimize + * the code size and avoid any divide that could add a dependency on large + * external functions. + */ +static __attribute__((unused)) +int u64toh_r(uint64_t in, char *buffer) +{ + signed char pos = 60; + int digits = 0; + int dig; + + do { + dig = in >> pos; + in -= (uint64_t)dig << pos; + pos -= 4; + if (dig || digits || pos < 0) { + if (dig > 9) + dig += 'a' - '0' - 10; + buffer[digits++] = '0' + dig; + } + } while (pos >= 0); + + buffer[digits] = 0; + return digits; +} + +/* converts uint64_t to an hex string using the static itoa_buffer and + * returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *u64toh(uint64_t in) +{ + u64toh_r(in, itoa_buffer); + return itoa_buffer; +} + /* Converts the unsigned 64-bit integer to its string representation into * buffer , which must be long enough to store the number and the * trailing zero (21 bytes for 18446744073709551615). The buffer is filled from From 4e383a66acfe16827ce7fbc0e60c56782a83fc92 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:30 +0100 Subject: [PATCH 098/155] tools/nolibc/stdio: add a minimal set of stdio functions This only provides getchar(), putchar(), and puts(). Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 1 + tools/include/nolibc/stdio.h | 57 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 tools/include/nolibc/stdio.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index a349c88c45ff..7eaa09fe9f4d 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -88,6 +88,7 @@ #include "types.h" #include "sys.h" #include "ctype.h" +#include "stdio.h" #include "stdlib.h" #include "string.h" diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h new file mode 100644 index 000000000000..4c6af3016e2e --- /dev/null +++ b/tools/include/nolibc/stdio.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * minimal stdio function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + */ + +#ifndef _NOLIBC_STDIO_H +#define _NOLIBC_STDIO_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" +#include "stdlib.h" +#include "string.h" + +#ifndef EOF +#define EOF (-1) +#endif + +static __attribute__((unused)) +int getchar(void) +{ + unsigned char ch; + + if (read(0, &ch, 1) <= 0) + return EOF; + return ch; +} + +static __attribute__((unused)) +int putchar(int c) +{ + unsigned char ch = c; + + if (write(1, &ch, 1) <= 0) + return EOF; + return ch; +} + +static __attribute__((unused)) +int puts(const char *s) +{ + size_t len = strlen(s); + ssize_t ret; + + while (len > 0) { + ret = write(1, s, len); + if (ret <= 0) + return EOF; + s += ret; + len -= ret; + } + return putchar('\n'); +} + +#endif /* _NOLIBC_STDIO_H */ From 99b037cbd5a22e34202d32aad15bcfa1c06d1d80 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:31 +0100 Subject: [PATCH 099/155] tools/nolibc/stdio: add stdin/stdout/stderr and fget*/fput* functions The standard puts() function always emits the trailing LF which makes it unconvenient for small string concatenation. fputs() ought to be used instead but it requires a FILE*. This adds 3 dummy FILE* values (stdin, stdout, stderr) which are in fact pointers to struct FILE of one byte. We reserve 3 pointer values for them, -3, -2 and -1, so that they are ordered, easing the tests and mapping to integer. >From this, fgetc(), fputc(), fgets() and fputs() were implemented, and the previous putchar() and getchar() now remap to these. The standard getc() and putc() macros were also implemented as pointing to these ones. There is absolutely no buffering, fgetc() and fgets() read one byte at a time, fputc() writes one byte at a time, and only fputs() which knows the string's length writes all of it at once. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdio.h | 101 +++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 9 deletions(-) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 4c6af3016e2e..149c5ca59aad 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -18,12 +18,61 @@ #define EOF (-1) #endif +/* just define FILE as a non-empty type */ +typedef struct FILE { + char dummy[1]; +} FILE; + +/* We define the 3 common stdio files as constant invalid pointers that + * are easily recognized. + */ +static __attribute__((unused)) FILE* const stdin = (FILE*)-3; +static __attribute__((unused)) FILE* const stdout = (FILE*)-2; +static __attribute__((unused)) FILE* const stderr = (FILE*)-1; + +/* getc(), fgetc(), getchar() */ + +#define getc(stream) fgetc(stream) + +static __attribute__((unused)) +int fgetc(FILE* stream) +{ + unsigned char ch; + int fd; + + if (stream < stdin || stream > stderr) + return EOF; + + fd = 3 + (long)stream; + + if (read(fd, &ch, 1) <= 0) + return EOF; + return ch; +} + static __attribute__((unused)) int getchar(void) { - unsigned char ch; + return fgetc(stdin); +} - if (read(0, &ch, 1) <= 0) + +/* putc(), fputc(), putchar() */ + +#define putc(c, stream) fputc(c, stream) + +static __attribute__((unused)) +int fputc(int c, FILE* stream) +{ + unsigned char ch = c; + int fd; + + if (stream < stdin || stream > stderr) + return EOF; + + fd = 3 + (long)stream; + + if (write(fd, &ch, 1) <= 0) return EOF; return ch; } @@ -31,27 +80,61 @@ int getchar(void) static __attribute__((unused)) int putchar(int c) { - unsigned char ch = c; - - if (write(1, &ch, 1) <= 0) - return EOF; - return ch; + return fputc(c, stdout); } + +/* puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ + static __attribute__((unused)) -int puts(const char *s) +int fputs(const char *s, FILE *stream) { size_t len = strlen(s); ssize_t ret; + int fd; + + if (stream < stdin || stream > stderr) + return EOF; + + fd = 3 + (long)stream; while (len > 0) { - ret = write(1, s, len); + ret = write(fd, s, len); if (ret <= 0) return EOF; s += ret; len -= ret; } + return 0; +} + +static __attribute__((unused)) +int puts(const char *s) +{ + if (fputs(s, stdout) == EOF) + return EOF; return putchar('\n'); } + +/* fgets() */ +static __attribute__((unused)) +char *fgets(char *s, int size, FILE *stream) +{ + int ofs; + int c; + + for (ofs = 0; ofs + 1 < size;) { + c = fgetc(stream); + if (c == EOF) + break; + s[ofs++] = c; + if (c == '\n') + break; + } + if (ofs < size) + s[ofs] = 0; + return ofs ? s : NULL; +} + #endif /* _NOLIBC_STDIO_H */ From e3e19052d54db8d492b8f82f8516139469bad5d3 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:32 +0100 Subject: [PATCH 100/155] tools/nolibc/stdio: add fwrite() to stdio We'll use it to write substrings. It relies on a simpler _fwrite() that only takes one size. fputs() was also modified to rely on it. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdio.h | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 149c5ca59aad..996bf89a30d2 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -84,12 +84,14 @@ int putchar(int c) } -/* puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ +/* fwrite(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ +/* internal fwrite()-like function which only takes a size and returns 0 on + * success or EOF on error. It automatically retries on short writes. + */ static __attribute__((unused)) -int fputs(const char *s, FILE *stream) +int _fwrite(const void *buf, size_t size, FILE *stream) { - size_t len = strlen(s); ssize_t ret; int fd; @@ -98,16 +100,35 @@ int fputs(const char *s, FILE *stream) fd = 3 + (long)stream; - while (len > 0) { - ret = write(fd, s, len); + while (size) { + ret = write(fd, buf, size); if (ret <= 0) return EOF; - s += ret; - len -= ret; + size -= ret; + buf += ret; } return 0; } +static __attribute__((unused)) +size_t fwrite(const void *s, size_t size, size_t nmemb, FILE *stream) +{ + size_t written; + + for (written = 0; written < nmemb; written++) { + if (_fwrite(s, size, stream) != 0) + break; + s += size; + } + return written; +} + +static __attribute__((unused)) +int fputs(const char *s, FILE *stream) +{ + return _fwrite(s, strlen(s), stream); +} + static __attribute__((unused)) int puts(const char *s) { From 7e4346f4a3a611a6233ba388080c1426545da4fc Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:33 +0100 Subject: [PATCH 101/155] tools/nolibc/stdio: add a minimal [vf]printf() implementation This adds a minimal vfprintf() implementation as well as the commonly used fprintf() and printf() that rely on it. For now the function supports: - formats: %s, %c, %u, %d, %x - modifiers: %l and %ll - unknown chars are considered as modifiers and are ignored It is designed to remain minimalist, despite this printf() is 549 bytes on x86_64. It would be wise not to add too many formats. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdio.h | 128 +++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 996bf89a30d2..a73cf24cb68d 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -7,6 +7,8 @@ #ifndef _NOLIBC_STDIO_H #define _NOLIBC_STDIO_H +#include + #include "std.h" #include "arch.h" #include "types.h" @@ -158,4 +160,130 @@ char *fgets(char *s, int size, FILE *stream) return ofs ? s : NULL; } + +/* minimal vfprintf(). It supports the following formats: + * - %[l*]{d,u,c,x} + * - %s + * - unknown modifiers are ignored. + */ +static __attribute__((unused)) +int vfprintf(FILE *stream, const char *fmt, va_list args) +{ + char escape, lpref, c; + unsigned long long v; + unsigned int written; + size_t len, ofs; + char tmpbuf[21]; + const char *outstr; + + written = ofs = escape = lpref = 0; + while (1) { + c = fmt[ofs++]; + + if (escape) { + /* we're in an escape sequence, ofs == 1 */ + escape = 0; + if (c == 'c' || c == 'd' || c == 'u' || c == 'x') { + if (lpref) { + if (lpref > 1) + v = va_arg(args, unsigned long long); + else + v = va_arg(args, unsigned long); + } else + v = va_arg(args, unsigned int); + + if (c == 'd') { + /* sign-extend the value */ + if (lpref == 0) + v = (long long)(int)v; + else if (lpref == 1) + v = (long long)(long)v; + } + + switch (c) { + case 'd': + i64toa_r(v, tmpbuf); + break; + case 'u': + u64toa_r(v, tmpbuf); + break; + case 'x': + u64toh_r(v, tmpbuf); + break; + default: /* 'c' */ + tmpbuf[0] = v; + tmpbuf[1] = 0; + break; + } + outstr = tmpbuf; + } + else if (c == 's') { + outstr = va_arg(args, char *); + } + else if (c == '%') { + /* queue it verbatim */ + continue; + } + else { + /* modifiers or final 0 */ + if (c == 'l') { + /* long format prefix, maintain the escape */ + lpref++; + } + escape = 1; + goto do_escape; + } + len = strlen(outstr); + goto flush_str; + } + + /* not an escape sequence */ + if (c == 0 || c == '%') { + /* flush pending data on escape or end */ + escape = 1; + lpref = 0; + outstr = fmt; + len = ofs - 1; + flush_str: + if (_fwrite(outstr, len, stream) != 0) + break; + + written += len; + do_escape: + if (c == 0) + break; + fmt += ofs; + ofs = 0; + continue; + } + + /* literal char, just queue it */ + } + return written; +} + +static __attribute__((unused)) +int fprintf(FILE *stream, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = vfprintf(stream, fmt, args); + va_end(args); + return ret; +} + +static __attribute__((unused)) +int printf(const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = vfprintf(stdout, fmt, args); + va_end(args); + return ret; +} + #endif /* _NOLIBC_STDIO_H */ From 51469d5ab38fd1ac2182da8cd49eea3420b8000b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:34 +0100 Subject: [PATCH 102/155] tools/nolibc/types: define EXIT_SUCCESS and EXIT_FAILURE These ones are found in some examples found in man pages and ease portability tests. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/types.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index b1bff5e717b6..e0749dc0bd06 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -71,6 +71,9 @@ #define WEXITSTATUS(status) (((status) & 0xff00) >> 8) #define WIFEXITED(status) (((status) & 0x7f) == 0) +/* standard exit() codes */ +#define EXIT_SUCCESS 0 +#define EXIT_FAILURE 1 /* for select() */ typedef struct { From acab7bcdb1bc14d5a6a0c3c1d2b9bd681172cf47 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:35 +0100 Subject: [PATCH 103/155] tools/nolibc/stdio: add perror() to report the errno value It doesn't contain the text for the error codes, but instead displays "errno=" followed by the errno value. Just like the regular errno, if a non-empty message is passed, it's placed followed with ": " on the output before the errno code. The message is emitted on stderr. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdio.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index a73cf24cb68d..5f1cf32470d3 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -286,4 +286,10 @@ int printf(const char *fmt, ...) return ret; } +static __attribute__((unused)) +void perror(const char *msg) +{ + fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "", errno); +} + #endif /* _NOLIBC_STDIO_H */ From a7604ba149e76d0449484116e7bf9cd0c26dafb2 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:36 +0100 Subject: [PATCH 104/155] tools/nolibc/sys: make open() take a vararg on the 3rd argument Let's pass a vararg to open() so that it remains compatible with existing code. The arg is only dereferenced when flags contain O_CREAT. The function is generally not inlined anymore, causing an extra call (total 16 extra bytes) but it's still optimized for constant propagation, limiting the excess to no more than 16 bytes in practice when open() is called without O_CREAT, and ~40 with O_CREAT, which remains reasonable. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/sys.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 98689f668ed3..539af457a91b 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -7,6 +7,7 @@ #ifndef _NOLIBC_SYS_H #define _NOLIBC_SYS_H +#include #include "std.h" /* system includes */ @@ -719,7 +720,7 @@ int mount(const char *src, const char *tgt, /* - * int open(const char *path, int flags, mode_t mode); + * int open(const char *path, int flags[, mode_t mode]); */ static __attribute__((unused)) @@ -735,9 +736,20 @@ int sys_open(const char *path, int flags, mode_t mode) } static __attribute__((unused)) -int open(const char *path, int flags, mode_t mode) +int open(const char *path, int flags, ...) { - int ret = sys_open(path, flags, mode); + mode_t mode = 0; + int ret; + + if (flags & O_CREAT) { + va_list args; + + va_start(args, flags); + mode = va_arg(args, mode_t); + va_end(args); + } + + ret = sys_open(path, flags, mode); if (ret < 0) { SET_ERRNO(-ret); From ac90226d53051c1ae0f0e1b71596fb038ddb6cf6 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:37 +0100 Subject: [PATCH 105/155] tools/nolibc/stdlib: avoid a 64-bit shift in u64toh_r() The build of printf() on mips requires libgcc for functions __ashldi3 and __lshrdi3 due to 64-bit shifts when scanning the input number. These are not really needed in fact since we scan the number 4 bits at a time. Let's arrange the loop to perform two 32-bit shifts instead on 32-bit platforms. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 82a4cf606d3c..db47362a750f 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -200,14 +200,18 @@ int u64toh_r(uint64_t in, char *buffer) int dig; do { - dig = in >> pos; - in -= (uint64_t)dig << pos; - pos -= 4; - if (dig || digits || pos < 0) { - if (dig > 9) - dig += 'a' - '0' - 10; - buffer[digits++] = '0' + dig; + if (sizeof(long) >= 8) { + dig = (in >> pos) & 0xF; + } else { + /* 32-bit platforms: avoid a 64-bit shift */ + uint32_t d = (pos >= 32) ? (in >> 32) : in; + dig = (d >> (pos & 31)) & 0xF; } + if (dig > 9) + dig += 'a' - '0' - 10; + pos -= 4; + if (dig || digits || pos < 0) + buffer[digits++] = '0' + dig; } while (pos >= 0); buffer[digits] = 0; From 6e277371a5c4c73deb799aabfd5613ee3758e810 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:38 +0100 Subject: [PATCH 106/155] tools/nolibc/stdlib: make raise() use the lower level syscalls only raise() doesn't set errno, so there's no point calling kill(), better call sys_kill(), which also reduces the function's size. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index db47362a750f..4cc1fdf6791e 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -317,7 +317,7 @@ int msleep(unsigned int msecs) __attribute__((weak,unused)) int raise(int signal) { - return kill(getpid(), signal); + return sys_kill(sys_getpid(), signal); } static __attribute__((unused)) From 830acd088edc1604ee46916188491ef634441fc6 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:39 +0100 Subject: [PATCH 107/155] tools/nolibc/sys: make getpgrp(), getpid(), gettid() not set errno These syscalls never fail so there is no need to extract and set errno for them. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/sys.h | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 539af457a91b..ef017cc0a580 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -458,13 +458,7 @@ pid_t sys_getpgrp(void) static __attribute__((unused)) pid_t getpgrp(void) { - pid_t ret = sys_getpgrp(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; + return sys_getpgrp(); } @@ -481,13 +475,7 @@ pid_t sys_getpid(void) static __attribute__((unused)) pid_t getpid(void) { - pid_t ret = sys_getpid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; + return sys_getpid(); } @@ -504,13 +492,7 @@ pid_t sys_gettid(void) static __attribute__((unused)) pid_t gettid(void) { - pid_t ret = sys_gettid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; + return sys_gettid(); } From d8dcc2d8d93e5d4263ca9f8c5bdfb713f82fe923 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:40 +0100 Subject: [PATCH 108/155] tools/nolibc/string: use unidirectional variants for memcpy() Till now memcpy() relies on memmove(), but it's always included for libgcc, so we have a larger than needed function. Let's implement two unidirectional variants to copy from bottom to top and from top to bottom, and use the former for memcpy(). The variants are optimized to be compact, and at the same time the compiler is sometimes able to detect the loop and to replace it with a "rep movsb". The new function is 24 bytes instead of 52 on x86_64. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 8a23cda2d450..6d8fad7a92e6 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -25,6 +25,28 @@ int memcmp(const void *s1, const void *s2, size_t n) return c1; } +static __attribute__((unused)) +void *_nolibc_memcpy_up(void *dst, const void *src, size_t len) +{ + size_t pos = 0; + + while (pos < len) { + ((char *)dst)[pos] = ((const char *)src)[pos]; + pos++; + } + return dst; +} + +static __attribute__((unused)) +void *_nolibc_memcpy_down(void *dst, const void *src, size_t len) +{ + while (len) { + len--; + ((char *)dst)[len] = ((const char *)src)[len]; + } + return dst; +} + static __attribute__((unused)) void *memmove(void *dst, const void *src, size_t len) { @@ -42,7 +64,7 @@ void *memmove(void *dst, const void *src, size_t len) __attribute__((weak,unused)) void *memcpy(void *dst, const void *src, size_t len) { - return memmove(dst, src, len); + return _nolibc_memcpy_up(dst, src, len); } static __attribute__((unused)) From d76232ff8be662b8851e975d13d59cad4bf423d3 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:41 +0100 Subject: [PATCH 109/155] tools/nolibc/string: slightly simplify memmove() The direction test inside the loop was not always completely optimized, resulting in a larger than necessary function. This change adds a direction variable that is set out of the loop. Now the function is down to 48 bytes on x86, 32 on ARM and 68 on mips. It's worth noting that other approaches were attempted (including relying on the up and down functions) but they were only slightly beneficial on x86 and cost more on others. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 6d8fad7a92e6..b831a02de83f 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -50,14 +50,22 @@ void *_nolibc_memcpy_down(void *dst, const void *src, size_t len) static __attribute__((unused)) void *memmove(void *dst, const void *src, size_t len) { - ssize_t pos = (dst <= src) ? -1 : (long)len; - void *ret = dst; + size_t dir, pos; - while (len--) { - pos += (dst <= src) ? 1 : -1; - ((char *)dst)[pos] = ((char *)src)[pos]; + pos = len; + dir = -1; + + if (dst < src) { + pos = -1; + dir = 1; } - return ret; + + while (len) { + pos += dir; + ((char *)dst)[pos] = ((const char *)src)[pos]; + len--; + } + return dst; } /* must be exported, as it's used by libgcc on ARM */ From b312eb0b8711dbfbe2b45681926eb553e8ac8de3 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:42 +0100 Subject: [PATCH 110/155] tools/nolibc/string: add strncpy() and strlcpy() These are minimal variants. strncpy() always fills the destination for chars, while strlcpy() copies no more than including the zero and returns the source's length. The respective sizes on various archs are: strncpy(): x86:0x1f mips:0x30 arm:0x20 strlcpy(): x86:0x17 mips:0x34 arm:0x1a Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index b831a02de83f..7c274efcdfae 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -121,6 +121,34 @@ size_t nolibc_strlen(const char *str) nolibc_strlen((str)); \ }) +static __attribute__((unused)) +size_t strlcpy(char *dst, const char *src, size_t size) +{ + size_t len; + char c; + + for (len = 0;;) { + c = src[len]; + if (len < size) + dst[len] = c; + if (!c) + break; + len++; + } + return len; +} + +static __attribute__((unused)) +char *strncpy(char *dst, const char *src, size_t size) +{ + size_t len; + + for (len = 0; len < size; len++) + if ((dst[len] = *src)) + src++; + return dst; +} + static __attribute__((unused)) char *strrchr(const char *s, int c) { From d9390de638cd9788090c6299273a41a8cfa0b499 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:43 +0100 Subject: [PATCH 111/155] tools/nolibc/string: add tiny versions of strncat() and strlcat() While these functions are often dangerous, forcing the user to work around their absence is often much worse. Let's provide small versions of each of them. The respective sizes in bytes on a few architectures are: strncat(): x86:0x33 mips:0x68 arm:0x3c strlcat(): x86:0x25 mips:0x4c arm:0x2c The two are quite different, and strncat() is even different from strncpy() in that it limits the amount of data it copies and will always terminate the output by one zero, while strlcat() will always limit the total output to the specified size and will put a zero if possible. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 41 +++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 7c274efcdfae..c550c9ba8f4c 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -121,6 +121,28 @@ size_t nolibc_strlen(const char *str) nolibc_strlen((str)); \ }) +static __attribute__((unused)) +size_t strlcat(char *dst, const char *src, size_t size) +{ + size_t len; + char c; + + for (len = 0; dst[len]; len++) + ; + + for (;;) { + c = *src; + if (len < size) + dst[len] = c; + if (!c) + break; + len++; + src++; + } + + return len; +} + static __attribute__((unused)) size_t strlcpy(char *dst, const char *src, size_t size) { @@ -138,6 +160,25 @@ size_t strlcpy(char *dst, const char *src, size_t size) return len; } +static __attribute__((unused)) +char *strncat(char *dst, const char *src, size_t size) +{ + char *orig = dst; + + while (*dst) + dst++; + + while (size && (*dst = *src)) { + src++; + dst++; + size--; + } + + *dst = 0; + return orig; +} + + static __attribute__((unused)) char *strncpy(char *dst, const char *src, size_t size) { From 07f47ea06fe9d38c5e8d9068fba2468ed8bb8b59 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:44 +0100 Subject: [PATCH 112/155] tools/nolibc: move exported functions to their own section Some functions like raise() and memcpy() are permanently exported because they're needed by libgcc on certain platforms. However most of the time they are not needed and needlessly take space. Let's move them to their own sub-section, called .text.nolibc_. This allows ld to get rid of them if unused when passed --gc-sections. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 2 +- tools/include/nolibc/string.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 4cc1fdf6791e..da08ff30c15a 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -314,7 +314,7 @@ int msleep(unsigned int msecs) } /* This one is not marked static as it's needed by libgcc for divide by zero */ -__attribute__((weak,unused)) +__attribute__((weak,unused,section(".text.nolibc_raise"))) int raise(int signal) { return sys_kill(sys_getpid(), signal); diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index c550c9ba8f4c..c1661589cb3c 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -69,7 +69,7 @@ void *memmove(void *dst, const void *src, size_t len) } /* must be exported, as it's used by libgcc on ARM */ -__attribute__((weak,unused)) +__attribute__((weak,unused,section(".text.nolibc_memcpy"))) void *memcpy(void *dst, const void *src, size_t len) { return _nolibc_memcpy_up(dst, src, len); From dffeb81af5fe5eedccf5ea4a8a120d8c3accd26e Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:45 +0100 Subject: [PATCH 113/155] tools/nolibc/arch: mark the _start symbol as weak By doing so we can link together multiple C files that have been compiled with nolibc and which each have a _start symbol. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/arch-aarch64.h | 1 + tools/include/nolibc/arch-arm.h | 1 + tools/include/nolibc/arch-i386.h | 1 + tools/include/nolibc/arch-mips.h | 1 + tools/include/nolibc/arch-riscv.h | 1 + tools/include/nolibc/arch-x86_64.h | 1 + 6 files changed, 6 insertions(+) diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h index 443de5fb7f54..87d9e434820c 100644 --- a/tools/include/nolibc/arch-aarch64.h +++ b/tools/include/nolibc/arch-aarch64.h @@ -183,6 +183,7 @@ struct sys_stat_struct { /* startup code */ asm(".section .text\n" + ".weak _start\n" ".global _start\n" "_start:\n" "ldr x0, [sp]\n" // argc (x0) was in the stack diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index 66f687ad987f..001a3c8c9ad5 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -176,6 +176,7 @@ struct sys_stat_struct { /* startup code */ asm(".section .text\n" + ".weak _start\n" ".global _start\n" "_start:\n" #if defined(__THUMBEB__) || defined(__THUMBEL__) diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index 32f42e2cee26..d7e4d53325a3 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -175,6 +175,7 @@ struct sys_stat_struct { * */ asm(".section .text\n" + ".weak _start\n" ".global _start\n" "_start:\n" "pop %eax\n" // argc (first arg, %eax) diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index e330201dde6a..c9a6aac87c6d 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -190,6 +190,7 @@ struct sys_stat_struct { /* startup code, note that it's called __start on MIPS */ asm(".section .text\n" + ".weak __start\n" ".set nomips16\n" ".global __start\n" ".set noreorder\n" diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 9d5ff78f606b..bc10b7b5706d 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -184,6 +184,7 @@ struct sys_stat_struct { /* startup code */ asm(".section .text\n" + ".weak _start\n" ".global _start\n" "_start:\n" ".option push\n" diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 83c4b458ada7..fe517c16cd4d 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -198,6 +198,7 @@ struct sys_stat_struct { * */ asm(".section .text\n" + ".weak _start\n" ".global _start\n" "_start:\n" "pop %rdi\n" // argc (first arg, %rdi) From 023033fe343cdf2ba83ab762f8de69241c7fc086 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:46 +0100 Subject: [PATCH 114/155] tools/nolibc/types: define PATH_MAX and MAXPATHLEN These ones are often used and commonly set by applications to fallback values. Let's fix them both to agree on PATH_MAX=4096 by default, as is already present in linux/limits.h. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/types.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index e0749dc0bd06..e4026e740b56 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -49,6 +49,17 @@ #define FD_SETSIZE 256 #endif +/* PATH_MAX and MAXPATHLEN are often used and found with plenty of different + * values. + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#ifndef MAXPATHLEN +#define MAXPATHLEN (PATH_MAX) +#endif + /* Special FD used by all the *at functions */ #ifndef AT_FDCWD #define AT_FDCWD (-100) From 8d304a3740232f018fca19d529cbc8d13afac755 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:47 +0100 Subject: [PATCH 115/155] tools/nolibc/string: export memset() and memmove() "clang -Os" and "gcc -Ofast" without -ffreestanding may ignore memset() and memmove(), hoping to provide their builtin equivalents, and finally not find them. Thus we must export these functions for these rare cases. Note that as they're set in their own sections, they will be eliminated by the linker if not used. In addition, they do not prevent gcc from identifying them and replacing them with the shorter "rep movsb" or "rep stosb" when relevant. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index c1661589cb3c..4554b6fcb400 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -47,7 +47,10 @@ void *_nolibc_memcpy_down(void *dst, const void *src, size_t len) return dst; } -static __attribute__((unused)) +/* might be ignored by the compiler without -ffreestanding, then found as + * missing. + */ +__attribute__((weak,unused,section(".text.nolibc_memmove"))) void *memmove(void *dst, const void *src, size_t len) { size_t dir, pos; @@ -75,7 +78,10 @@ void *memcpy(void *dst, const void *src, size_t len) return _nolibc_memcpy_up(dst, src, len); } -static __attribute__((unused)) +/* might be ignored by the compiler without -ffreestanding, then found as + * missing. + */ +__attribute__((weak,unused,section(".text.nolibc_memset"))) void *memset(void *dst, int b, size_t len) { char *p = dst; From 45a794bf7cee2988278802aeb64f7fc075f45f7f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:48 +0100 Subject: [PATCH 116/155] tools/nolibc/errno: extract errno.h from sys.h This allows us to provide a minimal errno.h to ease porting applications that use it. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/errno.h | 27 +++++++++++++++++++++++++++ tools/include/nolibc/stdio.h | 1 + tools/include/nolibc/sys.h | 17 +---------------- 3 files changed, 29 insertions(+), 16 deletions(-) create mode 100644 tools/include/nolibc/errno.h diff --git a/tools/include/nolibc/errno.h b/tools/include/nolibc/errno.h new file mode 100644 index 000000000000..06893d6dfb7a --- /dev/null +++ b/tools/include/nolibc/errno.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Minimal errno definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_ERRNO_H +#define _NOLIBC_ERRNO_H + +#include + +/* this way it will be removed if unused */ +static int errno; + +#ifndef NOLIBC_IGNORE_ERRNO +#define SET_ERRNO(v) do { errno = (v); } while (0) +#else +#define SET_ERRNO(v) do { } while (0) +#endif + + +/* errno codes all ensure that they will not conflict with a valid pointer + * because they all correspond to the highest addressable memory page. + */ +#define MAX_ERRNO 4095 + +#endif /* _NOLIBC_ERRNO_H */ diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 5f1cf32470d3..cb4d3ab3a565 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -11,6 +11,7 @@ #include "std.h" #include "arch.h" +#include "errno.h" #include "types.h" #include "sys.h" #include "stdlib.h" diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index ef017cc0a580..28437863c63f 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -14,29 +14,14 @@ #include #include // for SIGCHLD #include -#include #include #include #include #include "arch.h" +#include "errno.h" #include "types.h" -/* this way it will be removed if unused */ -static int errno; - -#ifndef NOLIBC_IGNORE_ERRNO -#define SET_ERRNO(v) do { errno = (v); } while (0) -#else -#define SET_ERRNO(v) do { } while (0) -#endif - - -/* errno codes all ensure that they will not conflict with a valid pointer - * because they all correspond to the highest addressable memory page. - */ -#define MAX_ERRNO 4095 - /* Functions in this file only describe syscalls. They're declared static so * that the compiler usually decides to inline them while still being allowed From 4619de344657c23101e8976b816466910dfb2759 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:49 +0100 Subject: [PATCH 117/155] tools/nolibc/unistd: extract msleep(), sleep(), tcsetpgrp() to unistd.h These functions are normally provided by unistd.h. For ease of porting, let's create the file and move them there. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 1 + tools/include/nolibc/stdlib.h | 30 ----------------------- tools/include/nolibc/unistd.h | 46 +++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 30 deletions(-) create mode 100644 tools/include/nolibc/unistd.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 7eaa09fe9f4d..686726518431 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -91,6 +91,7 @@ #include "stdio.h" #include "stdlib.h" #include "string.h" +#include "unistd.h" /* Used by programs to avoid std includes */ #define NOLIBC diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index da08ff30c15a..0e6bca4ee089 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -300,19 +300,6 @@ char *u64toa(uint64_t in) return itoa_buffer; } -static __attribute__((unused)) -int msleep(unsigned int msecs) -{ - struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return (my_timeval.tv_sec * 1000) + - (my_timeval.tv_usec / 1000) + - !!(my_timeval.tv_usec % 1000); - else - return 0; -} - /* This one is not marked static as it's needed by libgcc for divide by zero */ __attribute__((weak,unused,section(".text.nolibc_raise"))) int raise(int signal) @@ -320,21 +307,4 @@ int raise(int signal) return sys_kill(sys_getpid(), signal); } -static __attribute__((unused)) -unsigned int sleep(unsigned int seconds) -{ - struct timeval my_timeval = { seconds, 0 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return my_timeval.tv_sec + !!my_timeval.tv_usec; - else - return 0; -} - -static __attribute__((unused)) -int tcsetpgrp(int fd, pid_t pid) -{ - return ioctl(fd, TIOCSPGRP, &pid); -} - #endif /* _NOLIBC_STDLIB_H */ diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h new file mode 100644 index 000000000000..87b448ff2191 --- /dev/null +++ b/tools/include/nolibc/unistd.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * unistd function definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_UNISTD_H +#define _NOLIBC_UNISTD_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" + + +static __attribute__((unused)) +int msleep(unsigned int msecs) +{ + struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return (my_timeval.tv_sec * 1000) + + (my_timeval.tv_usec / 1000) + + !!(my_timeval.tv_usec % 1000); + else + return 0; +} + +static __attribute__((unused)) +unsigned int sleep(unsigned int seconds) +{ + struct timeval my_timeval = { seconds, 0 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return my_timeval.tv_sec + !!my_timeval.tv_usec; + else + return 0; +} + +static __attribute__((unused)) +int tcsetpgrp(int fd, pid_t pid) +{ + return ioctl(fd, TIOCSPGRP, &pid); +} + +#endif /* _NOLIBC_UNISTD_H */ From 180a9797b03472e6881066d9752f2f0d81e1880f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:50 +0100 Subject: [PATCH 118/155] tools/nolibc/unistd: add usleep() This call is trivial to implement based on select() to complete sleep() and msleep(), let's add it. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/unistd.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h index 87b448ff2191..1c25e20ee360 100644 --- a/tools/include/nolibc/unistd.h +++ b/tools/include/nolibc/unistd.h @@ -37,6 +37,14 @@ unsigned int sleep(unsigned int seconds) return 0; } +static __attribute__((unused)) +int usleep(unsigned int usecs) +{ + struct timeval my_timeval = { usecs / 1000000, usecs % 1000000 }; + + return sys_select(0, 0, 0, 0, &my_timeval); +} + static __attribute__((unused)) int tcsetpgrp(int fd, pid_t pid) { From 99cb50ab94b2aa11c756c4ba14ef09197a7ebe7b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:51 +0100 Subject: [PATCH 119/155] tools/nolibc/signal: move raise() to signal.h This function is normally found in signal.h, and providing the file eases porting of existing programs. Let's move it there. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 1 + tools/include/nolibc/signal.h | 22 ++++++++++++++++++++++ tools/include/nolibc/stdlib.h | 7 ------- 3 files changed, 23 insertions(+), 7 deletions(-) create mode 100644 tools/include/nolibc/signal.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 686726518431..0f375e901a36 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -88,6 +88,7 @@ #include "types.h" #include "sys.h" #include "ctype.h" +#include "signal.h" #include "stdio.h" #include "stdlib.h" #include "string.h" diff --git a/tools/include/nolibc/signal.h b/tools/include/nolibc/signal.h new file mode 100644 index 000000000000..ef47e71e2be3 --- /dev/null +++ b/tools/include/nolibc/signal.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * signal function definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_SIGNAL_H +#define _NOLIBC_SIGNAL_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" + +/* This one is not marked static as it's needed by libgcc for divide by zero */ +__attribute__((weak,unused,section(".text.nolibc_raise"))) +int raise(int signal) +{ + return sys_kill(sys_getpid(), signal); +} + +#endif /* _NOLIBC_SIGNAL_H */ diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 0e6bca4ee089..b46bebd48ba2 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -300,11 +300,4 @@ char *u64toa(uint64_t in) return itoa_buffer; } -/* This one is not marked static as it's needed by libgcc for divide by zero */ -__attribute__((weak,unused,section(".text.nolibc_raise"))) -int raise(int signal) -{ - return sys_kill(sys_getpid(), signal); -} - #endif /* _NOLIBC_STDLIB_H */ From cec1505321020287c3acc5a63dd75859ebf5ad0d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:52 +0100 Subject: [PATCH 120/155] tools/nolibc/time: create time.h with time() The time() syscall is used by a few simple applications, and is trivial to implement based on gettimeofday() that we already have. Let's create the file to ease porting and provide the function. It never returns any error, though it may segfault in case of invalid pointer, like other implementations relying on gettimeofday(). Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 1 + tools/include/nolibc/time.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tools/include/nolibc/time.h diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 0f375e901a36..561dcdb83cee 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -92,6 +92,7 @@ #include "stdio.h" #include "stdlib.h" #include "string.h" +#include "time.h" #include "unistd.h" /* Used by programs to avoid std includes */ diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h new file mode 100644 index 000000000000..d18b7661fdd7 --- /dev/null +++ b/tools/include/nolibc/time.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * time function definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau + */ + +#ifndef _NOLIBC_TIME_H +#define _NOLIBC_TIME_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" + +static __attribute__((unused)) +time_t time(time_t *tptr) +{ + struct timeval tv; + + /* note, cannot fail here */ + sys_gettimeofday(&tv, NULL); + + if (tptr) + *tptr = tv.tv_sec; + return tv.tv_sec; +} + +#endif /* _NOLIBC_TIME_H */ From c4486e97283d0b3d72a8eeda91e53e2ea9b62fbe Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:53 +0100 Subject: [PATCH 121/155] tools/nolibc: also mention how to build by just setting the include path Now that a few basic include files are provided, some simple portable programs may build, which will save them from having to surround their includes with #ifndef NOLIBC. This patch mentions how to proceed, and enumerates the list of files that are covered. A comprehensive list of required include files is available here: https://en.cppreference.com/w/c/header Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 561dcdb83cee..b2bc48d3cfe4 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -57,22 +57,32 @@ * having to specify anything. * * Finally some very common libc-level functions are provided. It is the case - * for a few functions usually found in string.h, ctype.h, or stdlib.h. Nothing - * is currently provided regarding stdio emulation. + * for a few functions usually found in string.h, ctype.h, or stdlib.h. * - * The macro NOLIBC is always defined, so that it is possible for a program to - * check this macro to know if it is being built against and decide to disable - * some features or simply not to include some standard libc files. - * - * Ideally this file should be split in multiple files for easier long term - * maintenance, but provided as a single file as it is now, it's quite - * convenient to use. Maybe some variations involving a set of includes at the - * top could work. + * The nolibc.h file is only a convenient entry point which includes all other + * files. It also defines the NOLIBC macro, so that it is possible for a + * program to check this macro to know if it is being built against and decide + * to disable some features or simply not to include some standard libc files. * * A simple static executable may be built this way : * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ * -static -include nolibc.h -o hello hello.c -lgcc * + * Simple programs meant to be reasonably portable to various libc and using + * only a few common includes, may also be built by simply making the include + * path point to the nolibc directory: + * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + * -I../nolibc -o hello hello.c -lgcc + * + * The available standard (but limited) include files are: + * ctype.h, errno.h, signal.h, stdio.h, stdlib.h, string.h, time.h + * + * In addition, the following ones are expected to be provided by the compiler: + * float.h, stdarg.h, stddef.h + * + * The following ones which are part to the C standard are not provided: + * assert.h, locale.h, math.h, setjmp.h, limits.h + * * A very useful calling convention table may be found here : * http://man7.org/linux/man-pages/man2/syscall.2.html * From f0f04f28d5ae483b3b354cb3b63a3bab0b00cee4 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 7 Feb 2022 17:23:54 +0100 Subject: [PATCH 122/155] tools/nolibc/stdlib: implement abort() libgcc uses it for certain divide functions, so it must be exported. Like for memset() we do that in its own section so that the linker can strip it when not needed. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index b46bebd48ba2..733105c574ee 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -23,6 +23,14 @@ static __attribute__((unused)) char itoa_buffer[21]; * As much as possible, please keep functions alphabetically sorted. */ +/* must be exported, as it's used by libgcc for various divide functions */ +__attribute__((weak,unused,noreturn,section(".text.nolibc_abort"))) +void abort(void) +{ + sys_kill(sys_getpid(), SIGABRT); + for (;;); +} + static __attribute__((unused)) long atol(const char *s) { From 170b230d22e89681ebca1a3d972dca441c8e4be5 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:07 +0100 Subject: [PATCH 123/155] tools/nolibc/stdio: make printf(%s) accept NULL It's often convenient to support this, especially in test programs where a NULL may correspond to an allocation error or a non-existing value. Let's make printf("%s") support being passed a NULL. In this case it prints "(null)" like glibc's printf(). Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdio.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index cb4d3ab3a565..559ebe052a75 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -220,6 +220,8 @@ int vfprintf(FILE *stream, const char *fmt, va_list args) } else if (c == 's') { outstr = va_arg(args, char *); + if (!outstr) + outstr="(null)"; } else if (c == '%') { /* queue it verbatim */ From 077d0a392446981cde2e8dd23090140bdd9fb728 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:08 +0100 Subject: [PATCH 124/155] tools/nolibc/stdlib: add a simple getenv() implementation This implementation relies on an extern definition of the environ variable, that the caller must declare and initialize from envp. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 733105c574ee..aca8616335e3 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -60,6 +60,29 @@ int atoi(const char *s) return atol(s); } +/* Tries to find the environment variable named in the environment array + * pointed to by global variable "environ" which must be declared as a char **, + * and must be terminated by a NULL (it is recommended to set this variable to + * the "envp" argument of main()). If the requested environment variable exists + * its value is returned otherwise NULL is returned. + */ +static __attribute__((unused)) +char *getenv(const char *name) +{ + extern char **environ; + int idx, i; + + if (environ) { + for (idx = 0; environ[idx]; idx++) { + for (i = 0; name[i] && name[i] == environ[idx][i];) + i++; + if (!name[i] && environ[idx][i] == '=') + return &environ[idx][i+1]; + } + } + return NULL; +} + /* Converts the unsigned long integer to its hex representation into * buffer , which must be long enough to store the number and the * trailing zero (17 bytes for "ffffffffffffffff" or 9 for "ffffffff"). The From bd845a193aae4b99da5215fcfff0e9014b9f7b96 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:09 +0100 Subject: [PATCH 125/155] tools/nolibc/stdio: add support for '%p' to vfprintf() %p remains quite useful in test code, and the code path can easily be merged with the existing "%x" thus only adds ~50 bytes, thus let's add it. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdio.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 559ebe052a75..15dedf8d0902 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -163,7 +163,7 @@ char *fgets(char *s, int size, FILE *stream) /* minimal vfprintf(). It supports the following formats: - * - %[l*]{d,u,c,x} + * - %[l*]{d,u,c,x,p} * - %s * - unknown modifiers are ignored. */ @@ -184,8 +184,12 @@ int vfprintf(FILE *stream, const char *fmt, va_list args) if (escape) { /* we're in an escape sequence, ofs == 1 */ escape = 0; - if (c == 'c' || c == 'd' || c == 'u' || c == 'x') { - if (lpref) { + if (c == 'c' || c == 'd' || c == 'u' || c == 'x' || c == 'p') { + char *out = tmpbuf; + + if (c == 'p') + v = va_arg(args, unsigned long); + else if (lpref) { if (lpref > 1) v = va_arg(args, unsigned long long); else @@ -202,18 +206,22 @@ int vfprintf(FILE *stream, const char *fmt, va_list args) } switch (c) { + case 'c': + out[0] = v; + out[1] = 0; + break; case 'd': - i64toa_r(v, tmpbuf); + i64toa_r(v, out); break; case 'u': - u64toa_r(v, tmpbuf); + u64toa_r(v, out); break; - case 'x': - u64toh_r(v, tmpbuf); - break; - default: /* 'c' */ - tmpbuf[0] = v; - tmpbuf[1] = 0; + case 'p': + *(out++) = '0'; + *(out++) = 'x'; + /* fall through */ + default: /* 'x' and 'p' above */ + u64toh_r(v, out); break; } outstr = tmpbuf; From 0e7b492943ec8cfdc7fffd9304d496315f781ea7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:10 +0100 Subject: [PATCH 126/155] tools/nolibc/string: add strcmp() and strncmp() We need these functions all the time, including when checking environment variables and parsing command-line arguments. These implementations were optimized to show optimal code size on a wide range of compilers (22 bytes return included for strcmp(), 33 for strncmp()). Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 4554b6fcb400..0d5e870c7c0b 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -102,6 +102,17 @@ char *strchr(const char *s, int c) return NULL; } +static __attribute__((unused)) +int strcmp(const char *a, const char *b) +{ + unsigned int c; + int diff; + + while (!(diff = (unsigned char)*a++ - (c = (unsigned char)*b++)) && c) + ; + return diff; +} + static __attribute__((unused)) char *strcpy(char *dst, const char *src) { @@ -184,6 +195,18 @@ char *strncat(char *dst, const char *src, size_t size) return orig; } +static __attribute__((unused)) +int strncmp(const char *a, const char *b, size_t size) +{ + unsigned int c; + int diff = 0; + + while (size-- && + !(diff = (unsigned char)*a++ - (c = (unsigned char)*b++)) && c) + ; + + return diff; +} static __attribute__((unused)) char *strncpy(char *dst, const char *src, size_t size) From 54abe3590fd350afb9cde2398dd3f4ba5bf6d167 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:11 +0100 Subject: [PATCH 127/155] tools/nolibc/sys: add syscall definition for getppid() This is essentially for completeness as it's not the most often used in regtests. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/sys.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 28437863c63f..4d4308d5d111 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -464,6 +464,23 @@ pid_t getpid(void) } +/* + * pid_t getppid(void); + */ + +static __attribute__((unused)) +pid_t sys_getppid(void) +{ + return my_syscall0(__NR_getppid); +} + +static __attribute__((unused)) +pid_t getppid(void) +{ + return sys_getppid(); +} + + /* * pid_t gettid(void); */ From 96d2a1313fe00927b33bb0ccbc3e8cd731826f7d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:12 +0100 Subject: [PATCH 128/155] tools/nolibc/types: add poll() and waitpid() flag definitions - POLLIN etc were missing, so poll() could only be used with timeouts. - WNOHANG was not defined and is convenient to check if a child is still running Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/types.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index e4026e740b56..357e60ad38a8 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -82,6 +82,9 @@ #define WEXITSTATUS(status) (((status) & 0xff00) >> 8) #define WIFEXITED(status) (((status) & 0x7f) == 0) +/* waitpid() flags */ +#define WNOHANG 1 + /* standard exit() codes */ #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 @@ -122,6 +125,13 @@ typedef struct { } while (0) /* for poll() */ +#define POLLIN 0x0001 +#define POLLPRI 0x0002 +#define POLLOUT 0x0004 +#define POLLERR 0x0008 +#define POLLHUP 0x0010 +#define POLLNVAL 0x0020 + struct pollfd { int fd; short int events; From 24326164687b303e3a7a5b8ef83c0f34c5582b2c Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:13 +0100 Subject: [PATCH 129/155] tools/nolibc: add a makefile to install headers This provides a target "headers_standalone" which installs the nolibc's arch-specific headers with "arch.h" taken from the current arch (or a concatenation of both i386 and x86_64 for arch=x86), then installs kernel headers. This creates a convenient sysroot which is directly usable by a bare-metal compiler to create any executable. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/Makefile | 42 +++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tools/include/nolibc/Makefile diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile new file mode 100644 index 000000000000..7a16d917c185 --- /dev/null +++ b/tools/include/nolibc/Makefile @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for nolibc installation and tests +include ../../scripts/Makefile.include + +# we're in ".../tools/include/nolibc" +ifeq ($(srctree),) +srctree := $(patsubst %/tools/include/,%,$(dir $(CURDIR))) +endif + +nolibc_arch := $(patsubst arm64,aarch64,$(ARCH)) +arch_file := arch-$(nolibc_arch).h +all_files := ctype.h errno.h nolibc.h signal.h std.h stdio.h stdlib.h string.h \ + sys.h time.h types.h unistd.h + +# install all headers needed to support a bare-metal compiler +all: + +# Note: when ARCH is "x86" we concatenate both x86_64 and i386 +headers: + $(Q)mkdir -p $(OUTPUT)sysroot + $(Q)mkdir -p $(OUTPUT)sysroot/include + $(Q)cp $(all_files) $(OUTPUT)sysroot/include/ + $(Q)if [ "$(ARCH)" = "x86" ]; then \ + sed -e \ + 's,^#ifndef _NOLIBC_ARCH_X86_64_H,#if !defined(_NOLIBC_ARCH_X86_64_H) \&\& defined(__x86_64__),' \ + arch-x86_64.h; \ + sed -e \ + 's,^#ifndef _NOLIBC_ARCH_I386_H,#if !defined(_NOLIBC_ARCH_I386_H) \&\& !defined(__x86_64__),' \ + arch-i386.h; \ + elif [ -e "$(arch_file)" ]; then \ + cat $(arch_file); \ + else \ + echo "Fatal: architecture $(ARCH) not yet supported by nolibc." >&2; \ + exit 1; \ + fi > $(OUTPUT)sysroot/include/arch.h + +headers_standalone: headers + $(Q)$(MAKE) -C $(srctree) headers + $(Q)$(MAKE) -C $(srctree) headers_install INSTALL_HDR_PATH=$(OUTPUT)/sysroot + +clean: + $(call QUIET_CLEAN, nolibc) rm -rf "$(OUTPUT)sysroot" From 0b37dff10bc05576f9594a10e8ef9c718dab931f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 21 Mar 2022 18:33:14 +0100 Subject: [PATCH 130/155] tools/nolibc: add the nolibc subdir to the common Makefile The Makefile in tools/ is used to forward options to the makefiles in the various subdirs. Let's add nolibc there so that it becomes possible to make tools/nolibc_headers_standalone from the main tree to simply create a completely usable sysroot. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/Makefile b/tools/Makefile index db2f7b8ebed5..724134f0e56c 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -24,6 +24,7 @@ help: @echo ' intel-speed-select - Intel Speed Select tool' @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' leds - LEDs tools' + @echo ' nolibc - nolibc headers testing and installation' @echo ' objtool - an ELF object analysis tool' @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @@ -74,6 +75,9 @@ bpf/%: FORCE libapi: FORCE $(call descend,lib/api) +nolibc_%: FORCE + $(call descend,include/nolibc,$(patsubst nolibc_%,%,$@)) + # The perf build does not follow the descend function setup, # invoking it via it's own make rule. PERF_O = $(if $(O),$(O)/tools/perf,) From 96980b833a21c6dc29d0dfdc8f211fb8a10256a7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 23 Mar 2022 08:18:06 +0100 Subject: [PATCH 131/155] tools/nolibc/string: do not use __builtin_strlen() at -O0 clang wants to use strlen() for __builtin_strlen() at -O0. We don't really care about -O0 but it at least ought to build, so let's make sure we don't choke on this, by dropping the optimizationn for constant strings in this case. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 0d5e870c7c0b..75a453870498 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -122,7 +122,9 @@ char *strcpy(char *dst, const char *src) return ret; } -/* this function is only used with arguments that are not constants */ +/* this function is only used with arguments that are not constants or when + * it's not known because optimizations are disabled. + */ static __attribute__((unused)) size_t nolibc_strlen(const char *str) { @@ -132,11 +134,18 @@ size_t nolibc_strlen(const char *str) return len; } +/* do not trust __builtin_constant_p() at -O0, as clang will emit a test and + * the two branches, then will rely on an external definition of strlen(). + */ +#if defined(__OPTIMIZE__) #define strlen(str) ({ \ __builtin_constant_p((str)) ? \ __builtin_strlen((str)) : \ nolibc_strlen((str)); \ }) +#else +#define strlen(str) nolibc_strlen((str)) +#endif static __attribute__((unused)) size_t strlcat(char *dst, const char *src, size_t size) From 2475d37ac30b8a850d3dd4fcbcb20895928b73fd Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 23 Mar 2022 08:18:07 +0100 Subject: [PATCH 132/155] tools/nolibc/stdlib: only reference the external environ when inlined When building with gcc at -O0 we're seeing link errors due to the "environ" variable being referenced by getenv(). The problem is that at -O0 gcc will not inline getenv() and will not drop the external reference. One solution would be to locally declare the variable as weak, but then it would appear in all programs even those not using it, and would be confusing to users of getenv() who would forget to set environ to envp. An alternate approach used in this patch consists in always inlining the outer part of getenv() that references this extern so that it's always dropped when not used. The biggest part of the function was now moved to a new function called _getenv() that's still not inlined by default. Reported-by: Ammar Faizi Signed-off-by: Willy Tarreau Tested-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index aca8616335e3..8a07e263f0d0 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -60,16 +60,17 @@ int atoi(const char *s) return atol(s); } -/* Tries to find the environment variable named in the environment array - * pointed to by global variable "environ" which must be declared as a char **, - * and must be terminated by a NULL (it is recommended to set this variable to - * the "envp" argument of main()). If the requested environment variable exists - * its value is returned otherwise NULL is returned. +/* getenv() tries to find the environment variable named in the + * environment array pointed to by global variable "environ" which must be + * declared as a char **, and must be terminated by a NULL (it is recommended + * to set this variable to the "envp" argument of main()). If the requested + * environment variable exists its value is returned otherwise NULL is + * returned. getenv() is forcefully inlined so that the reference to "environ" + * will be dropped if unused, even at -O0. */ static __attribute__((unused)) -char *getenv(const char *name) +char *_getenv(const char *name, char **environ) { - extern char **environ; int idx, i; if (environ) { @@ -83,6 +84,13 @@ char *getenv(const char *name) return NULL; } +static inline __attribute__((unused,always_inline)) +char *getenv(const char *name) +{ + extern char **environ; + return _getenv(name, environ); +} + /* Converts the unsigned long integer to its hex representation into * buffer , which must be long enough to store the number and the * trailing zero (17 bytes for "ffffffffffffffff" or 9 for "ffffffff"). The From 5312aaa5d567f0dfc11681ad991a78e9da43fe7b Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:29 +0700 Subject: [PATCH 133/155] tools/nolibc: x86-64: Update System V ABI document link The old link no longer works, update it. Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/arch-x86_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index fe517c16cd4d..a7b70ea51b68 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -61,7 +61,7 @@ struct sys_stat_struct { * - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1 * Calling Conventions. * - * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI + * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/home * */ From 37d62758e773939636b8fa64a1a39a8a0d3a9f8c Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:30 +0700 Subject: [PATCH 134/155] tools/nolibc: Replace `asm` with `__asm__` Replace `asm` with `__asm__` to support compilation with -std flag. Using `asm` with -std flag makes GCC think `asm()` is a function call instead of an inline assembly. GCC doc says: For the C language, the `asm` keyword is a GNU extension. When writing C code that can be compiled with `-ansi` and the `-std` options that select C dialects without GNU extensions, use `__asm__` instead of `asm`. Link: https://gcc.gnu.org/onlinedocs/gcc/Basic-Asm.html Reported-by: Alviro Iskandar Setiawan Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/arch-aarch64.h | 74 ++++++++++++++--------------- tools/include/nolibc/arch-arm.h | 58 +++++++++++----------- tools/include/nolibc/arch-i386.h | 56 +++++++++++----------- tools/include/nolibc/arch-mips.h | 62 ++++++++++++------------ tools/include/nolibc/arch-riscv.h | 74 ++++++++++++++--------------- tools/include/nolibc/arch-x86_64.h | 72 ++++++++++++++-------------- 6 files changed, 198 insertions(+), 198 deletions(-) diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h index 87d9e434820c..b68443f63980 100644 --- a/tools/include/nolibc/arch-aarch64.h +++ b/tools/include/nolibc/arch-aarch64.h @@ -64,10 +64,10 @@ struct sys_stat_struct { #define my_syscall0(num) \ ({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0"); \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0"); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_num) \ @@ -78,10 +78,10 @@ struct sys_stat_struct { #define my_syscall1(num, arg1) \ ({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), \ @@ -93,11 +93,11 @@ struct sys_stat_struct { #define my_syscall2(num, arg1, arg2) \ ({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), "r"(_arg2), \ @@ -109,12 +109,12 @@ struct sys_stat_struct { #define my_syscall3(num, arg1, arg2, arg3) \ ({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ @@ -126,13 +126,13 @@ struct sys_stat_struct { #define my_syscall4(num, arg1, arg2, arg3, arg4) \ ({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ + register long _arg4 __asm__ ("x3") = (long)(arg4); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ @@ -144,14 +144,14 @@ struct sys_stat_struct { #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ ({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ + register long _arg4 __asm__ ("x3") = (long)(arg4); \ + register long _arg5 __asm__ ("x4") = (long)(arg5); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r" (_arg1) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ @@ -163,15 +163,15 @@ struct sys_stat_struct { #define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ ({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - register long _arg6 asm("x5") = (long)(arg6); \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ + register long _arg4 __asm__ ("x3") = (long)(arg4); \ + register long _arg5 __asm__ ("x4") = (long)(arg5); \ + register long _arg6 __asm__ ("x5") = (long)(arg6); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r" (_arg1) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ @@ -182,7 +182,7 @@ struct sys_stat_struct { }) /* startup code */ -asm(".section .text\n" +__asm__ (".section .text\n" ".weak _start\n" ".global _start\n" "_start:\n" diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index 001a3c8c9ad5..55fd9439b2e2 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -77,10 +77,10 @@ struct sys_stat_struct { #define my_syscall0(num) \ ({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0"); \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0"); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_num) \ @@ -91,10 +91,10 @@ struct sys_stat_struct { #define my_syscall1(num, arg1) \ ({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), \ @@ -106,11 +106,11 @@ struct sys_stat_struct { #define my_syscall2(num, arg1, arg2) \ ({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), "r"(_arg2), \ @@ -122,12 +122,12 @@ struct sys_stat_struct { #define my_syscall3(num, arg1, arg2, arg3) \ ({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ + register long _arg3 __asm__ ("r2") = (long)(arg3); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ @@ -139,13 +139,13 @@ struct sys_stat_struct { #define my_syscall4(num, arg1, arg2, arg3, arg4) \ ({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ + register long _arg3 __asm__ ("r2") = (long)(arg3); \ + register long _arg4 __asm__ ("r3") = (long)(arg4); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r"(_arg1) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ @@ -157,14 +157,14 @@ struct sys_stat_struct { #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ ({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - register long _arg5 asm("r4") = (long)(arg5); \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ + register long _arg3 __asm__ ("r2") = (long)(arg3); \ + register long _arg4 __asm__ ("r3") = (long)(arg4); \ + register long _arg5 __asm__ ("r4") = (long)(arg5); \ \ - asm volatile ( \ + __asm__ volatile ( \ "svc #0\n" \ : "=r" (_arg1) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ @@ -175,7 +175,7 @@ struct sys_stat_struct { }) /* startup code */ -asm(".section .text\n" +__asm__ (".section .text\n" ".weak _start\n" ".global _start\n" "_start:\n" diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index d7e4d53325a3..136d5739e456 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -66,9 +66,9 @@ struct sys_stat_struct { #define my_syscall0(num) \ ({ \ long _ret; \ - register long _num asm("eax") = (num); \ + register long _num __asm__ ("eax") = (num); \ \ - asm volatile ( \ + __asm__ volatile ( \ "int $0x80\n" \ : "=a" (_ret) \ : "0"(_num) \ @@ -80,10 +80,10 @@ struct sys_stat_struct { #define my_syscall1(num, arg1) \ ({ \ long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ \ - asm volatile ( \ + __asm__ volatile ( \ "int $0x80\n" \ : "=a" (_ret) \ : "r"(_arg1), \ @@ -96,11 +96,11 @@ struct sys_stat_struct { #define my_syscall2(num, arg1, arg2) \ ({ \ long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ \ - asm volatile ( \ + __asm__ volatile ( \ "int $0x80\n" \ : "=a" (_ret) \ : "r"(_arg1), "r"(_arg2), \ @@ -113,12 +113,12 @@ struct sys_stat_struct { #define my_syscall3(num, arg1, arg2, arg3) \ ({ \ long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ + register long _arg3 __asm__ ("edx") = (long)(arg3); \ \ - asm volatile ( \ + __asm__ volatile ( \ "int $0x80\n" \ : "=a" (_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ @@ -131,13 +131,13 @@ struct sys_stat_struct { #define my_syscall4(num, arg1, arg2, arg3, arg4) \ ({ \ long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ + register long _arg3 __asm__ ("edx") = (long)(arg3); \ + register long _arg4 __asm__ ("esi") = (long)(arg4); \ \ - asm volatile ( \ + __asm__ volatile ( \ "int $0x80\n" \ : "=a" (_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ @@ -150,14 +150,14 @@ struct sys_stat_struct { #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ ({ \ long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - register long _arg5 asm("edi") = (long)(arg5); \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ + register long _arg3 __asm__ ("edx") = (long)(arg3); \ + register long _arg4 __asm__ ("esi") = (long)(arg4); \ + register long _arg5 __asm__ ("edi") = (long)(arg5); \ \ - asm volatile ( \ + __asm__ volatile ( \ "int $0x80\n" \ : "=a" (_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ @@ -174,7 +174,7 @@ struct sys_stat_struct { * 2) The deepest stack frame should be set to zero * */ -asm(".section .text\n" +__asm__ (".section .text\n" ".weak _start\n" ".global _start\n" "_start:\n" diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index c9a6aac87c6d..cd587e539d7d 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -69,10 +69,10 @@ struct sys_stat_struct { #define my_syscall0(num) \ ({ \ - register long _num asm("v0") = (num); \ - register long _arg4 asm("a3"); \ + register long _num __asm__ ("v0") = (num); \ + register long _arg4 __asm__ ("a3"); \ \ - asm volatile ( \ + __asm__ volatile ( \ "addiu $sp, $sp, -32\n" \ "syscall\n" \ "addiu $sp, $sp, 32\n" \ @@ -86,11 +86,11 @@ struct sys_stat_struct { #define my_syscall1(num, arg1) \ ({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg4 asm("a3"); \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg4 __asm__ ("a3"); \ \ - asm volatile ( \ + __asm__ volatile ( \ "addiu $sp, $sp, -32\n" \ "syscall\n" \ "addiu $sp, $sp, 32\n" \ @@ -105,12 +105,12 @@ struct sys_stat_struct { #define my_syscall2(num, arg1, arg2) \ ({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg4 asm("a3"); \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg4 __asm__ ("a3"); \ \ - asm volatile ( \ + __asm__ volatile ( \ "addiu $sp, $sp, -32\n" \ "syscall\n" \ "addiu $sp, $sp, 32\n" \ @@ -125,13 +125,13 @@ struct sys_stat_struct { #define my_syscall3(num, arg1, arg2, arg3) \ ({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3"); \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3"); \ \ - asm volatile ( \ + __asm__ volatile ( \ "addiu $sp, $sp, -32\n" \ "syscall\n" \ "addiu $sp, $sp, 32\n" \ @@ -146,13 +146,13 @@ struct sys_stat_struct { #define my_syscall4(num, arg1, arg2, arg3, arg4) \ ({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ \ - asm volatile ( \ + __asm__ volatile ( \ "addiu $sp, $sp, -32\n" \ "syscall\n" \ "addiu $sp, $sp, 32\n" \ @@ -167,14 +167,14 @@ struct sys_stat_struct { #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ ({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ register long _arg5 = (long)(arg5); \ \ - asm volatile ( \ + __asm__ volatile ( \ "addiu $sp, $sp, -32\n" \ "sw %7, 16($sp)\n" \ "syscall\n " \ @@ -189,7 +189,7 @@ struct sys_stat_struct { }) /* startup code, note that it's called __start on MIPS */ -asm(".section .text\n" +__asm__ (".section .text\n" ".weak __start\n" ".set nomips16\n" ".global __start\n" diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index bc10b7b5706d..8ec4c05fa69b 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -66,10 +66,10 @@ struct sys_stat_struct { #define my_syscall0(num) \ ({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0"); \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0"); \ \ - asm volatile ( \ + __asm__ volatile ( \ "ecall\n\t" \ : "=r"(_arg1) \ : "r"(_num) \ @@ -80,10 +80,10 @@ struct sys_stat_struct { #define my_syscall1(num, arg1) \ ({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ \ - asm volatile ( \ + __asm__ volatile ( \ "ecall\n" \ : "+r"(_arg1) \ : "r"(_num) \ @@ -94,11 +94,11 @@ struct sys_stat_struct { #define my_syscall2(num, arg1, arg2) \ ({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ \ - asm volatile ( \ + __asm__ volatile ( \ "ecall\n" \ : "+r"(_arg1) \ : "r"(_arg2), \ @@ -110,12 +110,12 @@ struct sys_stat_struct { #define my_syscall3(num, arg1, arg2, arg3) \ ({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ \ - asm volatile ( \ + __asm__ volatile ( \ "ecall\n\t" \ : "+r"(_arg1) \ : "r"(_arg2), "r"(_arg3), \ @@ -127,13 +127,13 @@ struct sys_stat_struct { #define my_syscall4(num, arg1, arg2, arg3, arg4) \ ({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ \ - asm volatile ( \ + __asm__ volatile ( \ "ecall\n" \ : "+r"(_arg1) \ : "r"(_arg2), "r"(_arg3), "r"(_arg4), \ @@ -145,14 +145,14 @@ struct sys_stat_struct { #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ ({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 asm("a4") = (long)(arg5); \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ \ - asm volatile ( \ + __asm__ volatile ( \ "ecall\n" \ : "+r"(_arg1) \ : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ @@ -164,15 +164,15 @@ struct sys_stat_struct { #define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ ({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 asm("a4") = (long)(arg5); \ - register long _arg6 asm("a5") = (long)(arg6); \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + register long _arg6 __asm__ ("a5") = (long)(arg6); \ \ - asm volatile ( \ + __asm__ volatile ( \ "ecall\n" \ : "+r"(_arg1) \ : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), "r"(_arg6), \ @@ -183,7 +183,7 @@ struct sys_stat_struct { }) /* startup code */ -asm(".section .text\n" +__asm__ (".section .text\n" ".weak _start\n" ".global _start\n" "_start:\n" diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index a7b70ea51b68..490530429ac9 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -68,9 +68,9 @@ struct sys_stat_struct { #define my_syscall0(num) \ ({ \ long _ret; \ - register long _num asm("rax") = (num); \ + register long _num __asm__ ("rax") = (num); \ \ - asm volatile ( \ + __asm__ volatile ( \ "syscall\n" \ : "=a"(_ret) \ : "0"(_num) \ @@ -82,10 +82,10 @@ struct sys_stat_struct { #define my_syscall1(num, arg1) \ ({ \ long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ \ - asm volatile ( \ + __asm__ volatile ( \ "syscall\n" \ : "=a"(_ret) \ : "r"(_arg1), \ @@ -98,11 +98,11 @@ struct sys_stat_struct { #define my_syscall2(num, arg1, arg2) \ ({ \ long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ \ - asm volatile ( \ + __asm__ volatile ( \ "syscall\n" \ : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), \ @@ -115,12 +115,12 @@ struct sys_stat_struct { #define my_syscall3(num, arg1, arg2, arg3) \ ({ \ long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ \ - asm volatile ( \ + __asm__ volatile ( \ "syscall\n" \ : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ @@ -133,13 +133,13 @@ struct sys_stat_struct { #define my_syscall4(num, arg1, arg2, arg3, arg4) \ ({ \ long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ + register long _arg4 __asm__ ("r10") = (long)(arg4); \ \ - asm volatile ( \ + __asm__ volatile ( \ "syscall\n" \ : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ @@ -152,14 +152,14 @@ struct sys_stat_struct { #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ ({ \ long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ + register long _arg4 __asm__ ("r10") = (long)(arg4); \ + register long _arg5 __asm__ ("r8") = (long)(arg5); \ \ - asm volatile ( \ + __asm__ volatile ( \ "syscall\n" \ : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ @@ -172,15 +172,15 @@ struct sys_stat_struct { #define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ ({ \ long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - register long _arg6 asm("r9") = (long)(arg6); \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ + register long _arg4 __asm__ ("r10") = (long)(arg4); \ + register long _arg5 __asm__ ("r8") = (long)(arg5); \ + register long _arg6 __asm__ ("r9") = (long)(arg6); \ \ - asm volatile ( \ + __asm__ volatile ( \ "syscall\n" \ : "=a"(_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ @@ -197,7 +197,7 @@ struct sys_stat_struct { * 2) The deepest stack frame should be zero (the %rbp). * */ -asm(".section .text\n" +__asm__ (".section .text\n" ".weak _start\n" ".global _start\n" "_start:\n" From 1590c59836dace3a20945bad049fe8802c4e6f3f Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:31 +0700 Subject: [PATCH 135/155] tools/nolibc: Remove .global _start from the entry point code Building with clang yields the following error: ``` :3:1: error: _start changed binding to STB_GLOBAL .global _start ^ 1 error generated. ``` Make sure only specify one between `.global _start` and `.weak _start`. Remove `.global _start`. Cc: llvm@lists.linux.dev Reviewed-by: Nick Desaulniers Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/arch-aarch64.h | 1 - tools/include/nolibc/arch-arm.h | 1 - tools/include/nolibc/arch-i386.h | 1 - tools/include/nolibc/arch-mips.h | 1 - tools/include/nolibc/arch-riscv.h | 1 - tools/include/nolibc/arch-x86_64.h | 1 - 6 files changed, 6 deletions(-) diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h index b68443f63980..f68baf8f395f 100644 --- a/tools/include/nolibc/arch-aarch64.h +++ b/tools/include/nolibc/arch-aarch64.h @@ -184,7 +184,6 @@ struct sys_stat_struct { /* startup code */ __asm__ (".section .text\n" ".weak _start\n" - ".global _start\n" "_start:\n" "ldr x0, [sp]\n" // argc (x0) was in the stack "add x1, sp, 8\n" // argv (x1) = sp diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index 55fd9439b2e2..f31be8e967d6 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -177,7 +177,6 @@ struct sys_stat_struct { /* startup code */ __asm__ (".section .text\n" ".weak _start\n" - ".global _start\n" "_start:\n" #if defined(__THUMBEB__) || defined(__THUMBEL__) /* We enter here in 32-bit mode but if some previous functions were in diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index 136d5739e456..10aada40680d 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -176,7 +176,6 @@ struct sys_stat_struct { */ __asm__ (".section .text\n" ".weak _start\n" - ".global _start\n" "_start:\n" "pop %eax\n" // argc (first arg, %eax) "mov %esp, %ebx\n" // argv[] (second arg, %ebx) diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index cd587e539d7d..5fc5b8029bff 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -192,7 +192,6 @@ struct sys_stat_struct { __asm__ (".section .text\n" ".weak __start\n" ".set nomips16\n" - ".global __start\n" ".set noreorder\n" ".option pic0\n" ".ent __start\n" diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 8ec4c05fa69b..95e2b7924925 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -185,7 +185,6 @@ struct sys_stat_struct { /* startup code */ __asm__ (".section .text\n" ".weak _start\n" - ".global _start\n" "_start:\n" ".option push\n" ".option norelax\n" diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 490530429ac9..0e1e9eb8545d 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -199,7 +199,6 @@ struct sys_stat_struct { */ __asm__ (".section .text\n" ".weak _start\n" - ".global _start\n" "_start:\n" "pop %rdi\n" // argc (first arg, %rdi) "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) From f4738ff74c74241c2458269bb6afb64000ec1001 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:32 +0700 Subject: [PATCH 136/155] tools/nolibc: i386: Implement syscall with 6 arguments On i386, the 6th argument of syscall goes in %ebp. However, both Clang and GCC cannot use %ebp in the clobber list and in the "r" constraint without using -fomit-frame-pointer. To make it always available for any kind of compilation, the below workaround is implemented. 1) Push the 6-th argument. 2) Push %ebp. 3) Load the 6-th argument from 4(%esp) to %ebp. 4) Do the syscall (int $0x80). 5) Pop %ebp (restore the old value of %ebp). 6) Add %esp by 4 (undo the stack pointer). Cc: x86@kernel.org Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/lkml/2e335ac54db44f1d8496583d97f9dab0@AcuMS.aculab.com Suggested-by: David Laight Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/arch-i386.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index 10aada40680d..d7e7212346e2 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -167,6 +167,29 @@ struct sys_stat_struct { _ret; \ }) +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + long _eax = (long)(num); \ + long _arg6 = (long)(arg6); /* Always in memory */ \ + __asm__ volatile ( \ + "pushl %[_arg6]\n\t" \ + "pushl %%ebp\n\t" \ + "movl 4(%%esp),%%ebp\n\t" \ + "int $0x80\n\t" \ + "popl %%ebp\n\t" \ + "addl $4,%%esp\n\t" \ + : "+a"(_eax) /* %eax */ \ + : "b"(arg1), /* %ebx */ \ + "c"(arg2), /* %ecx */ \ + "d"(arg3), /* %edx */ \ + "S"(arg4), /* %esi */ \ + "D"(arg5), /* %edi */ \ + [_arg6]"m"(_arg6) /* memory */ \ + : "memory", "cc" \ + ); \ + _eax; \ +}) + /* startup code */ /* * i386 System V ABI mandates: From 544fa1a2d3e61c954ab531f2c790bc79c1745606 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:33 +0700 Subject: [PATCH 137/155] tools/nolibc/sys: Implement `mmap()` and `munmap()` Implement mmap() and munmap(). Currently, they are only available for architecures that have my_syscall6 macro. For architectures that don't have, this function will return -1 with errno set to ENOSYS (Function not implemented). This has been tested on x86 and i386. Notes for i386: 1) The common mmap() syscall implementation uses __NR_mmap2 instead of __NR_mmap. 2) The offset must be shifted-right by 12-bit. Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/sys.h | 62 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 4d4308d5d111..08491070387b 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -14,6 +14,7 @@ #include #include // for SIGCHLD #include +#include #include #include #include @@ -675,6 +676,67 @@ int mknod(const char *path, mode_t mode, dev_t dev) return ret; } +#ifndef MAP_SHARED +#define MAP_SHARED 0x01 /* Share changes */ +#define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +#endif + +#ifndef MAP_FAILED +#define MAP_FAILED ((void *)-1) +#endif + +static __attribute__((unused)) +void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd, + off_t offset) +{ +#ifndef my_syscall6 + /* Function not implemented. */ + return -ENOSYS; +#else + + int n; + +#if defined(__i386__) + n = __NR_mmap2; + offset >>= 12; +#else + n = __NR_mmap; +#endif + + return (void *)my_syscall6(n, addr, length, prot, flags, fd, offset); +#endif +} + +static __attribute__((unused)) +void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + void *ret = sys_mmap(addr, length, prot, flags, fd, offset); + + if ((unsigned long)ret >= -4095UL) { + SET_ERRNO(-(long)ret); + ret = MAP_FAILED; + } + return ret; +} + +static __attribute__((unused)) +int sys_munmap(void *addr, size_t length) +{ + return my_syscall2(__NR_munmap, addr, length); +} + +static __attribute__((unused)) +int munmap(void *addr, size_t length) +{ + int ret = sys_munmap(addr, length); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} /* * int mount(const char *source, const char *target, From 5a18d07ce3006dbcb3c4cfc7bf1c094a5da19540 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:34 +0700 Subject: [PATCH 138/155] tools/nolibc/types: Implement `offsetof()` and `container_of()` macro Implement `offsetof()` and `container_of()` macro. The first use case of these macros is for `malloc()`, `realloc()` and `free()`. Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/types.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index 357e60ad38a8..959997034e55 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -191,4 +191,15 @@ struct stat { #define major(dev) ((unsigned int)(((dev) >> 8) & 0xfff)) #define minor(dev) ((unsigned int)(((dev) & 0xff)) +#ifndef offsetof +#define offsetof(TYPE, FIELD) ((size_t) &((TYPE *)0)->FIELD) +#endif + +#ifndef container_of +#define container_of(PTR, TYPE, FIELD) ({ \ + __typeof__(((TYPE *)0)->FIELD) *__FIELD_PTR = (PTR); \ + (TYPE *)((char *) __FIELD_PTR - offsetof(TYPE, FIELD)); \ +}) +#endif + #endif /* _NOLIBC_TYPES_H */ From 0e0ff638400be8f497a35b51a4751fd823f6bd6a Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:35 +0700 Subject: [PATCH 139/155] tools/nolibc/stdlib: Implement `malloc()`, `calloc()`, `realloc()` and `free()` Implement basic dynamic allocator functions. These functions are currently only available on architectures that have nolibc mmap() syscall implemented. These are not a super-fast memory allocator, but at least they can satisfy basic needs for having heap without libc. Cc: David Laight Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 81 +++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 8a07e263f0d0..8fd32eaf8037 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -11,7 +11,12 @@ #include "arch.h" #include "types.h" #include "sys.h" +#include "string.h" +struct nolibc_heap { + size_t len; + char user_p[] __attribute__((__aligned__)); +}; /* Buffer used to store int-to-ASCII conversions. Will only be implemented if * any of the related functions is implemented. The area is large enough to @@ -60,6 +65,18 @@ int atoi(const char *s) return atol(s); } +static __attribute__((unused)) +void free(void *ptr) +{ + struct nolibc_heap *heap; + + if (!ptr) + return; + + heap = container_of(ptr, struct nolibc_heap, user_p); + munmap(heap, heap->len); +} + /* getenv() tries to find the environment variable named in the * environment array pointed to by global variable "environ" which must be * declared as a char **, and must be terminated by a NULL (it is recommended @@ -91,6 +108,70 @@ char *getenv(const char *name) return _getenv(name, environ); } +static __attribute__((unused)) +void *malloc(size_t len) +{ + struct nolibc_heap *heap; + + /* Always allocate memory with size multiple of 4096. */ + len = sizeof(*heap) + len; + len = (len + 4095UL) & -4096UL; + heap = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, + -1, 0); + if (__builtin_expect(heap == MAP_FAILED, 0)) + return NULL; + + heap->len = len; + return heap->user_p; +} + +static __attribute__((unused)) +void *calloc(size_t size, size_t nmemb) +{ + void *orig; + size_t res = 0; + + if (__builtin_expect(__builtin_mul_overflow(nmemb, size, &res), 0)) { + SET_ERRNO(ENOMEM); + return NULL; + } + + /* + * No need to zero the heap, the MAP_ANONYMOUS in malloc() + * already does it. + */ + return malloc(res); +} + +static __attribute__((unused)) +void *realloc(void *old_ptr, size_t new_size) +{ + struct nolibc_heap *heap; + size_t user_p_len; + void *ret; + + if (!old_ptr) + return malloc(new_size); + + heap = container_of(old_ptr, struct nolibc_heap, user_p); + user_p_len = heap->len - sizeof(*heap); + /* + * Don't realloc() if @user_p_len >= @new_size, this block of + * memory is still enough to handle the @new_size. Just return + * the same pointer. + */ + if (user_p_len >= new_size) + return old_ptr; + + ret = malloc(new_size); + if (__builtin_expect(!ret, 0)) + return NULL; + + memcpy(ret, heap->user_p, heap->len); + munmap(heap, heap->len); + return ret; +} + /* Converts the unsigned long integer to its hex representation into * buffer , which must be long enough to store the number and the * trailing zero (17 bytes for "ffffffffffffffff" or 9 for "ffffffff"). The From b26823c19a12d9a06207ad3051e3d1059a9e1005 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:36 +0700 Subject: [PATCH 140/155] tools/nolibc/string: Implement `strnlen()` size_t strnlen(const char *str, size_t maxlen); The strnlen() function returns the number of bytes in the string pointed to by sstr, excluding the terminating null byte ('\0'), but at most maxlen. In doing this, strnlen() looks only at the first maxlen characters in the string pointed to by str and never beyond str[maxlen-1]. The first use case of this function is for determining the memory allocation size in the strndup() function. Link: https://lore.kernel.org/lkml/CAOG64qMpEMh+EkOfjNdAoueC+uQyT2Uv3689_sOr37-JxdJf4g@mail.gmail.com Suggested-by: Alviro Iskandar Setiawan Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 75a453870498..f43d52a44d09 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -147,6 +147,15 @@ size_t nolibc_strlen(const char *str) #define strlen(str) nolibc_strlen((str)) #endif +static __attribute__((unused)) +size_t strnlen(const char *str, size_t maxlen) +{ + size_t len; + + for (len = 0; (len < maxlen) && str[len]; len++); + return len; +} + static __attribute__((unused)) size_t strlcat(char *dst, const char *src, size_t size) { From 11dbdaeff41d9ec9376476889651fac4838bff99 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:17:37 +0700 Subject: [PATCH 141/155] tools/nolibc/string: Implement `strdup()` and `strndup()` These functions are currently only available on architectures that have my_syscall6() macro implemented. Since these functions use malloc(), malloc() uses mmap(), mmap() depends on my_syscall6() macro. On architectures that don't support my_syscall6(), these function will always return NULL with errno set to ENOSYS. Acked-by: Willy Tarreau Signed-off-by: Ammar Faizi Signed-off-by: Paul E. McKenney --- tools/include/nolibc/string.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index f43d52a44d09..bef35bee9c44 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -9,6 +9,8 @@ #include "std.h" +static void *malloc(size_t len); + /* * As much as possible, please keep functions alphabetically sorted. */ @@ -156,6 +158,36 @@ size_t strnlen(const char *str, size_t maxlen) return len; } +static __attribute__((unused)) +char *strdup(const char *str) +{ + size_t len; + char *ret; + + len = strlen(str); + ret = malloc(len + 1); + if (__builtin_expect(ret != NULL, 1)) + memcpy(ret, str, len + 1); + + return ret; +} + +static __attribute__((unused)) +char *strndup(const char *str, size_t maxlen) +{ + size_t len; + char *ret; + + len = strnlen(str, maxlen); + ret = malloc(len + 1); + if (__builtin_expect(ret != NULL, 1)) { + memcpy(ret, str, len); + ret[len] = '\0'; + } + + return ret; +} + static __attribute__((unused)) size_t strlcat(char *dst, const char *src, size_t size) { From 3ba75c1316390b2bc39c19cb8f0f85922ab3f9ed Mon Sep 17 00:00:00 2001 From: Baskov Evgeniy Date: Thu, 3 Mar 2022 17:21:19 +0300 Subject: [PATCH 142/155] efi: libstub: declare DXE services table UEFI DXE services are not yet used in kernel code but are required to manipulate page table memory protection flags. Add required declarations to use DXE services functions. Signed-off-by: Baskov Evgeniy Link: https://lore.kernel.org/r/20220303142120.1975-2-baskov@ispras.ru [ardb: ignore absent DXE table but warn if the signature check fails] Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 5 ++ drivers/firmware/efi/libstub/efistub.h | 74 +++++++++++++++++++++++++ drivers/firmware/efi/libstub/x86-stub.c | 9 ++- include/linux/efi.h | 2 + 4 files changed, 89 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 98938a68251c..bed74a0f2932 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -357,6 +357,11 @@ static inline u32 efi64_convert_status(efi_status_t status) runtime), \ func, __VA_ARGS__)) +#define efi_dxe_call(func, ...) \ + (efi_is_native() \ + ? efi_dxe_table->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) + #else /* CONFIG_EFI_MIXED */ static inline bool efi_is_64bit(void) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index edb77b0621ea..2dc24776899a 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -36,6 +36,9 @@ extern bool efi_novamap; extern const efi_system_table_t *efi_system_table; +typedef union efi_dxe_services_table efi_dxe_services_table_t; +extern const efi_dxe_services_table_t *efi_dxe_table; + efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg); @@ -44,6 +47,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, #define efi_is_native() (true) #define efi_bs_call(func, ...) efi_system_table->boottime->func(__VA_ARGS__) #define efi_rt_call(func, ...) efi_system_table->runtime->func(__VA_ARGS__) +#define efi_dxe_call(func, ...) efi_dxe_table->func(__VA_ARGS__) #define efi_table_attr(inst, attr) (inst->attr) #define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__) @@ -329,6 +333,76 @@ union efi_boot_services { } mixed_mode; }; +typedef enum { + EfiGcdMemoryTypeNonExistent, + EfiGcdMemoryTypeReserved, + EfiGcdMemoryTypeSystemMemory, + EfiGcdMemoryTypeMemoryMappedIo, + EfiGcdMemoryTypePersistent, + EfiGcdMemoryTypeMoreReliable, + EfiGcdMemoryTypeMaximum +} efi_gcd_memory_type_t; + +typedef struct { + efi_physical_addr_t base_address; + u64 length; + u64 capabilities; + u64 attributes; + efi_gcd_memory_type_t gcd_memory_type; + void *image_handle; + void *device_handle; +} efi_gcd_memory_space_desc_t; + +/* + * EFI DXE Services table + */ +union efi_dxe_services_table { + struct { + efi_table_hdr_t hdr; + void *add_memory_space; + void *allocate_memory_space; + void *free_memory_space; + void *remove_memory_space; + efi_status_t (__efiapi *get_memory_space_descriptor)(efi_physical_addr_t, + efi_gcd_memory_space_desc_t *); + efi_status_t (__efiapi *set_memory_space_attributes)(efi_physical_addr_t, + u64, u64); + void *get_memory_space_map; + void *add_io_space; + void *allocate_io_space; + void *free_io_space; + void *remove_io_space; + void *get_io_space_descriptor; + void *get_io_space_map; + void *dispatch; + void *schedule; + void *trust; + void *process_firmware_volume; + void *set_memory_space_capabilities; + }; + struct { + efi_table_hdr_t hdr; + u32 add_memory_space; + u32 allocate_memory_space; + u32 free_memory_space; + u32 remove_memory_space; + u32 get_memory_space_descriptor; + u32 set_memory_space_attributes; + u32 get_memory_space_map; + u32 add_io_space; + u32 allocate_io_space; + u32 free_io_space; + u32 remove_io_space; + u32 get_io_space_descriptor; + u32 get_io_space_map; + u32 dispatch; + u32 schedule; + u32 trust; + u32 process_firmware_volume; + u32 set_memory_space_capabilities; + } mixed_mode; +}; + typedef union efi_uga_draw_protocol efi_uga_draw_protocol_t; union efi_uga_draw_protocol { diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 01ddd4502e28..cda60aba2f12 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -22,6 +22,7 @@ #define MAXMEM_X86_64_4LEVEL (1ull << 46) const efi_system_table_t *efi_system_table; +const efi_dxe_services_table_t *efi_dxe_table; extern u32 image_offset; static efi_loaded_image_t *image = NULL; @@ -677,11 +678,17 @@ unsigned long efi_main(efi_handle_t handle, efi_status_t status; efi_system_table = sys_table_arg; - /* Check if we were booted by the EFI firmware */ if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); + efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID); + if (efi_dxe_table && + efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) { + efi_warn("Ignoring DXE services table: invalid signature\n"); + efi_dxe_table = NULL; + } + /* * If the kernel isn't already loaded at a suitable address, * relocate it. diff --git a/include/linux/efi.h b/include/linux/efi.h index fd266198f9d8..b1f7c6a3e5bf 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -385,6 +385,7 @@ void efi_native_runtime_setup(void); #define EFI_LOAD_FILE_PROTOCOL_GUID EFI_GUID(0x56ec3091, 0x954c, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_LOAD_FILE2_PROTOCOL_GUID EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e, 0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d) #define EFI_RT_PROPERTIES_TABLE_GUID EFI_GUID(0xeb66918a, 0x7eef, 0x402a, 0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9) +#define EFI_DXE_SERVICES_TABLE_GUID EFI_GUID(0x05ad34ba, 0x6f02, 0x4214, 0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9) #define EFI_IMAGE_SECURITY_DATABASE_GUID EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f) #define EFI_SHIM_LOCK_GUID EFI_GUID(0x605dab50, 0xe046, 0x4300, 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23) @@ -438,6 +439,7 @@ typedef struct { } efi_config_table_type_t; #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL) +#define EFI_DXE_SERVICES_TABLE_SIGNATURE ((u64)0x565245535f455844ULL) #define EFI_2_30_SYSTEM_TABLE_REVISION ((2 << 16) | (30)) #define EFI_2_20_SYSTEM_TABLE_REVISION ((2 << 16) | (20)) From 82e0d6d76a2a74bd6a31141d555d53b4cc22c2a3 Mon Sep 17 00:00:00 2001 From: Baskov Evgeniy Date: Thu, 3 Mar 2022 17:21:20 +0300 Subject: [PATCH 143/155] efi: libstub: ensure allocated memory to be executable There are UEFI versions that restrict execution of memory regions, preventing the kernel from booting. Parts that needs to be executable are: * Area used for trampoline placement. * All memory regions that the kernel may be relocated before and during extraction. Use DXE services to ensure aforementioned address ranges to be executable. Only modify attributes that does not have appropriate attributes. Signed-off-by: Baskov Evgeniy Link: https://lore.kernel.org/r/20220303142120.1975-3-baskov@ispras.ru Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 12 +++ drivers/firmware/efi/libstub/x86-stub.c | 110 +++++++++++++++++++++++- 2 files changed, 118 insertions(+), 4 deletions(-) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index f8ddd2259ba0..4720ba98cec3 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -91,6 +91,18 @@ config EFI_SOFT_RESERVE If unsure, say Y. +config EFI_DXE_MEM_ATTRIBUTES + bool "Adjust memory attributes in EFISTUB" + depends on EFI && EFI_STUB && X86 + default y + help + UEFI specification does not guarantee all memory to be + accessible for both write and execute as the kernel expects + it to be. + Use DXE services to check and alter memory protection + attributes during boot via EFISTUB to ensure that memory + ranges used by the kernel are writable and executable. + config EFI_PARAMS_FROM_FDT bool help diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index cda60aba2f12..b14e88ccefca 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -212,9 +212,110 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) } } +static void +adjust_memory_range_protection(unsigned long start, unsigned long size) +{ + efi_status_t status; + efi_gcd_memory_space_desc_t desc; + unsigned long end, next; + unsigned long rounded_start, rounded_end; + unsigned long unprotect_start, unprotect_size; + int has_system_memory = 0; + + if (efi_dxe_table == NULL) + return; + + rounded_start = rounddown(start, EFI_PAGE_SIZE); + rounded_end = roundup(start + size, EFI_PAGE_SIZE); + + /* + * Don't modify memory region attributes, they are + * already suitable, to lower the possibility to + * encounter firmware bugs. + */ + + for (end = start + size; start < end; start = next) { + + status = efi_dxe_call(get_memory_space_descriptor, start, &desc); + + if (status != EFI_SUCCESS) + return; + + next = desc.base_address + desc.length; + + /* + * Only system memory is suitable for trampoline/kernel image placement, + * so only this type of memory needs its attributes to be modified. + */ + + if (desc.gcd_memory_type != EfiGcdMemoryTypeSystemMemory || + (desc.attributes & (EFI_MEMORY_RO | EFI_MEMORY_XP)) == 0) + continue; + + unprotect_start = max(rounded_start, (unsigned long)desc.base_address); + unprotect_size = min(rounded_end, next) - unprotect_start; + + status = efi_dxe_call(set_memory_space_attributes, + unprotect_start, unprotect_size, + EFI_MEMORY_WB); + + if (status != EFI_SUCCESS) { + efi_warn("Unable to unprotect memory range [%08lx,%08lx]: %d\n", + unprotect_start, + unprotect_start + unprotect_size, + (int)status); + } + } +} + +/* + * Trampoline takes 2 pages and can be loaded in first megabyte of memory + * with its end placed between 128k and 640k where BIOS might start. + * (see arch/x86/boot/compressed/pgtable_64.c) + * + * We cannot find exact trampoline placement since memory map + * can be modified by UEFI, and it can alter the computed address. + */ + +#define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024) +#define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024) + +void startup_32(struct boot_params *boot_params); + +static void +setup_memory_protection(unsigned long image_base, unsigned long image_size) +{ + /* + * Allow execution of possible trampoline used + * for switching between 4- and 5-level page tables + * and relocated kernel image. + */ + + adjust_memory_range_protection(TRAMPOLINE_PLACEMENT_BASE, + TRAMPOLINE_PLACEMENT_SIZE); + +#ifdef CONFIG_64BIT + if (image_base != (unsigned long)startup_32) + adjust_memory_range_protection(image_base, image_size); +#else + /* + * Clear protection flags on a whole range of possible + * addresses used for KASLR. We don't need to do that + * on x86_64, since KASLR/extraction is performed after + * dedicated identity page tables are built and we only + * need to remove possible protection on relocated image + * itself disregarding further relocations. + */ + adjust_memory_range_protection(LOAD_PHYSICAL_ADDR, + KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR); +#endif +} + static const efi_char16_t apple[] = L"Apple"; -static void setup_quirks(struct boot_params *boot_params) +static void setup_quirks(struct boot_params *boot_params, + unsigned long image_base, + unsigned long image_size) { efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) efi_table_attr(efi_system_table, fw_vendor); @@ -223,6 +324,9 @@ static void setup_quirks(struct boot_params *boot_params) if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) retrieve_apple_device_properties(boot_params); } + + if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) + setup_memory_protection(image_base, image_size); } /* @@ -342,8 +446,6 @@ static void __noreturn efi_exit(efi_handle_t handle, efi_status_t status) asm("hlt"); } -void startup_32(struct boot_params *boot_params); - void __noreturn efi_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, struct boot_params *boot_params); @@ -798,7 +900,7 @@ unsigned long efi_main(efi_handle_t handle, setup_efi_pci(boot_params); - setup_quirks(boot_params); + setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start); status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) { From 24b72bb12e84c75e297a5a81f24b921d7a011575 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Tue, 29 Mar 2022 14:47:43 -0400 Subject: [PATCH 144/155] efi: x86: Set the NX-compatibility flag in the PE header Following Baskov Evgeniy's "Handle UEFI NX-restricted page tables" patches, it's safe to set this compatibility flag to let loaders know they don't need to make special accommodations for kernel to load if pre-boot NX is enabled. Signed-off-by: Peter Jones Link: https://lore.kernel.org/all/20220329184743.798513-1-pjones@redhat.com/ Signed-off-by: Ard Biesheuvel --- arch/x86/boot/header.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6dbd7e9f74c9..0352e4589efa 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -163,7 +163,11 @@ extra_header_fields: .long 0x200 # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) +#ifdef CONFIG_DXE_MEM_ATTRIBUTES + .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics +#else .word 0 # DllCharacteristics +#endif #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve .long 0 # SizeOfStackCommit From 416a9f84a77cf826fed1bf9a1908b2d066c17430 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 19 Mar 2022 18:35:53 +0100 Subject: [PATCH 145/155] efi: libstub: pass image handle to handle_kernel_image() In a future patch, arm64's implementation of handle_kernel_image() will omit randomizing the placement of the kernel if the load address was chosen randomly by the loader. In order to do this, it needs to locate a protocol on the image handle, so pass it to handle_kernel_image(). Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/arm32-stub.c | 3 ++- drivers/firmware/efi/libstub/arm64-stub.c | 3 ++- drivers/firmware/efi/libstub/efi-stub.c | 2 +- drivers/firmware/efi/libstub/efistub.h | 3 ++- drivers/firmware/efi/libstub/riscv-stub.c | 3 ++- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 4b5b2403b3a0..0131e3aaa605 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -117,7 +117,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image) + efi_loaded_image_t *image, + efi_handle_t image_handle) { const int slack = TEXT_OFFSET - 5 * PAGE_SIZE; int alloc_size = MAX_UNCOMP_KERNEL_SIZE + EFI_PHYS_ALIGN; diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 9cc556013d08..00c91a3807ea 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -83,7 +83,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image) + efi_loaded_image_t *image, + efi_handle_t image_handle) { efi_status_t status; unsigned long kernel_size, kernel_memsize = 0; diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index da93864d7abc..f515394cce6e 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -198,7 +198,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, status = handle_kernel_image(&image_addr, &image_size, &reserve_addr, &reserve_size, - image); + image, handle); if (status != EFI_SUCCESS) { efi_err("Failed to relocate kernel\n"); goto fail_free_screeninfo; diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 2dc24776899a..a0477afaa55f 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -939,7 +939,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image); + efi_loaded_image_t *image, + efi_handle_t image_handle); asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint, unsigned long fdt_addr, diff --git a/drivers/firmware/efi/libstub/riscv-stub.c b/drivers/firmware/efi/libstub/riscv-stub.c index 9c460843442f..eec043873354 100644 --- a/drivers/firmware/efi/libstub/riscv-stub.c +++ b/drivers/firmware/efi/libstub/riscv-stub.c @@ -80,7 +80,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image) + efi_loaded_image_t *image, + efi_handle_t image_handle) { unsigned long kernel_size = 0; unsigned long preferred_addr; From 07768c55f9c2ad64ccae3ed82447a87d8af8a687 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 19 Mar 2022 19:00:20 +0100 Subject: [PATCH 146/155] efi/arm64: libstub: run image in place if randomized by the loader If the loader has already placed the EFI kernel image randomly in physical memory, and indicates having done so by installing the 'fixed placement' protocol onto the image handle, don't bother randomizing the placement again in the EFI stub. Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/arm64-stub.c | 12 +++++++++--- include/linux/efi.h | 11 +++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 00c91a3807ea..577173ee1f83 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -101,7 +101,15 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, u64 min_kimg_align = efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - if (!efi_nokaslr) { + efi_guid_t li_fixed_proto = LINUX_EFI_LOADED_IMAGE_FIXED_GUID; + void *p; + + if (efi_nokaslr) { + efi_info("KASLR disabled on kernel command line\n"); + } else if (efi_bs_call(handle_protocol, image_handle, + &li_fixed_proto, &p) == EFI_SUCCESS) { + efi_info("Image placement fixed by loader\n"); + } else { status = efi_get_random_bytes(sizeof(phys_seed), (u8 *)&phys_seed); if (status == EFI_NOT_FOUND) { @@ -112,8 +120,6 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, status); efi_nokaslr = true; } - } else { - efi_info("KASLR disabled on kernel command line\n"); } } diff --git a/include/linux/efi.h b/include/linux/efi.h index b1f7c6a3e5bf..580ce607d6f5 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -410,6 +410,17 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) +/* + * This GUID may be installed onto the kernel image's handle as a NULL protocol + * to signal to the stub that the placement of the image should be respected, + * and moving the image in physical memory is undesirable. To ensure + * compatibility with 64k pages kernels with virtually mapped stacks, and to + * avoid defeating physical randomization, this protocol should only be + * installed if the image was placed at a randomized 128k aligned address in + * memory. + */ +#define LINUX_EFI_LOADED_IMAGE_FIXED_GUID EFI_GUID(0xf5a37b6d, 0x3344, 0x42a5, 0xb6, 0xbb, 0x97, 0x86, 0x48, 0xc1, 0x89, 0x0a) + /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) #define AMD_SEV_MEM_ENCRYPT_GUID EFI_GUID(0x0cf29b71, 0x9e51, 0x433a, 0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75) From 5b759db44195bb779828a188bad6b745c18dcd55 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Mon, 2 May 2022 21:05:09 +0900 Subject: [PATCH 147/155] tools/memory-model/README: Update klitmus7 compat table EXPORT_SYMBOL of do_exec() was removed in v5.17. Unfortunately, kernel modules from klitmus7 7.56 have do_exec() at the end of each kthread. herdtools7 7.56.1 has addressed the issue. Update the compatibility table accordingly. Signed-off-by: Akira Yokosawa Cc: Luc Maranget Cc: Jade Alglave Cc: stable@vger.kernel.org # v5.17+ Signed-off-by: Paul E. McKenney --- tools/memory-model/README | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/README b/tools/memory-model/README index 9edd402704c4..dab38904206a 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -54,7 +54,8 @@ klitmus7 Compatibility Table -- 4.14 7.48 -- 4.15 -- 4.19 7.49 -- 4.20 -- 5.5 7.54 -- - 5.6 -- 7.56 -- + 5.6 -- 5.16 7.56 -- + 5.17 -- 7.56.1 -- ============ ========== From a57ffb3c6b67e59e8632f731414b792eacc6cca0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Jan 2022 11:21:30 -0800 Subject: [PATCH 148/155] srcu: Automatically determine size-transition strategy at boot This commit adds a srcutree.convert_to_big option of zero that causes SRCU to decide at boot whether to wait for contention (small systems) or immediately expand to large (large systems). A new srcutree.big_cpu_lim (defaulting to 128) defines how many CPUs constitute a large system. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 10 ++++++++ kernel/rcu/srcutree.c | 23 ++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 177e688768c0..0a094bb2d722 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5608,6 +5608,15 @@ off: Disable mitigation and remove performance impact to RDRAND and RDSEED + srcutree.big_cpu_lim [KNL] + Specifies the number of CPUs constituting a + large system, such that srcu_struct structures + should immediately allocate an srcu_node array. + This kernel-boot parameter defaults to 128, + but takes effect only when the low-order four + bits of srcutree.convert_to_big is equal to 3 + (decide at boot). + srcutree.convert_to_big [KNL] Specifies under what conditions an SRCU tree srcu_struct structure will be converted to big @@ -5616,6 +5625,7 @@ 0: Never. 1: At init_srcu_struct() time. 2: When rcutorture decides to. + 3: Decide at boot time (default). 0x1X: Above plus if high contention. Either way, the srcu_node tree will be sized based diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0bc6a0a3edee..b9dec26245e0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -41,10 +41,10 @@ module_param(counter_wrap_check, ulong, 0444); /* * Control conversion to SRCU_SIZE_BIG: - * 0: Don't convert at all (default). + * 0: Don't convert at all. * 1: Convert at init_srcu_struct() time. * 2: Convert when rcutorture invokes srcu_torture_stats_print(). - * 3: Decide at boot time based on system shape. + * 3: Decide at boot time based on system shape (default). * 0x1x: Convert when excessive contention encountered. */ #define SRCU_SIZING_NONE 0 @@ -57,9 +57,13 @@ module_param(counter_wrap_check, ulong, 0444); #define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) #define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) #define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) -static int convert_to_big = SRCU_SIZING_NONE; +static int convert_to_big = SRCU_SIZING_AUTO; module_param(convert_to_big, int, 0444); +/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ +static int big_cpu_lim __read_mostly = 128; +module_param(big_cpu_lim, int, 0444); + /* Contention events per jiffy to initiate transition to big. */ static int small_contention_lim __read_mostly = 100; module_param(small_contention_lim, int, 0444); @@ -1619,6 +1623,17 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* Decide on srcu_struct-size strategy. */ + if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { + if (nr_cpu_ids >= big_cpu_lim) { + convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. + pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); + } else { + convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; + pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); + } + } + /* * Once that is set, call_srcu() can follow the normal path and * queue delayed work. This must follow RCU workqueues creation @@ -1629,6 +1644,8 @@ void __init srcu_init(void) ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); list_del_init(&ssp->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) + ssp->srcu_size_state = SRCU_SIZE_ALLOC; queue_work(rcu_gp_wq, &ssp->work.work); } } From c2445d38785086422e56dcbe049b73a53b2ba81f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Jan 2022 13:27:15 -0800 Subject: [PATCH 149/155] srcu: Add contention check to call_srcu() srcu_data ->lock acquisition This commit increases the sensitivity of contention detection by adding checks to the acquisition of the srcu_data structure's lock on the call_srcu() code path. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 45 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b9dec26245e0..862008c147b0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -330,18 +330,13 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) } /* - * Acquire the specified srcu_struct structure's ->lock, but check for - * excessive contention, which results in initiation of a transition - * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module - * parameter permits this. + * Check to see if the just-encountered contention event justifies + * a transition to SRCU_SIZE_BIG. */ -static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; - if (spin_trylock_irqsave_rcu_node(ssp, *flags)) - return; - spin_lock_irqsave_rcu_node(ssp, *flags); if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) return; j = jiffies; @@ -354,6 +349,38 @@ static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned l __srcu_transition_to_big(ssp); } +/* + * Acquire the specified srcu_data structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +{ + struct srcu_struct *ssp = sdp->ssp; + + if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); + spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(sdp, *flags); +} + +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); +} + /* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be @@ -1010,7 +1037,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp = per_cpu_ptr(ssp->sda, 0); else sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, From 282d8998e9979c2186af7f7d22366f2fc3149838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Mar 2022 15:45:33 -0800 Subject: [PATCH 150/155] srcu: Prevent expedited GPs and blocking readers from consuming CPU If an SRCU reader blocks while a synchronize_srcu_expedited() waits for that same reader, then that grace period will spawn an endless series of workqueue handlers, consuming a full CPU. This quickly gets pointless because consuming more CPU isn't going to make that reader get done faster, especially if it is blocked waiting for an external event. This commit therefore spawns at most one pair of back-to-back workqueue handlers per expedited grace period phase, instead inserting increasing delays as that grace period phase grows older, but capped at 10 jiffies. In any case, if there have been at least 100 back-to-back workqueue handlers within a single jiffy, regardless of grace period or grace-period phase, then a one-jiffy delay is inserted. [ paulmck: Apply feedback from kernel test robot. ] Cc: Neeraj Upadhyay Reported-by: Song Liu Tested-by: kernel test robot Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 4 ++++ kernel/rcu/srcutree.c | 44 ++++++++++++++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 1b9ff4ed37e4..e3014319d1ad 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -71,9 +71,11 @@ struct srcu_struct { unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ + unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ @@ -83,6 +85,8 @@ struct srcu_struct { atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ /* callback for the barrier */ /* operation. */ + unsigned long reschedule_jiffies; + unsigned long reschedule_count; struct delayed_work work; struct lockdep_map dep_map; }; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 862008c147b0..6dd44e759f12 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -511,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. +#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. /* * Return grace-period delay, zero if there are expedited grace @@ -519,9 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { + unsigned long jbase = SRCU_INTERVAL; + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) - return 0; - return SRCU_INTERVAL; + jbase = 0; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) + jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) + jbase = 1; + } + return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } /** @@ -623,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp) (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ + WRITE_ONCE(ssp->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); state = rcu_seq_state(ssp->srcu_gp_seq); @@ -706,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(ssp); + cbdelay = !!srcu_get_delay(ssp); WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -893,7 +907,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, - srcu_get_delay(ssp)); + !!srcu_get_delay(ssp)); else if (list_empty(&ssp->work.work.entry)) list_add(&ssp->work.work.entry, &srcu_boot_list); } @@ -1448,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_flip(ssp); spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp); } @@ -1462,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } + ssp->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1552,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { + unsigned long curdelay; + unsigned long j; struct srcu_struct *ssp; ssp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(ssp); - srcu_reschedule(ssp, srcu_get_delay(ssp)); + curdelay = srcu_get_delay(ssp); + if (curdelay) { + WRITE_ONCE(ssp->reschedule_count, 0); + } else { + j = jiffies; + if (READ_ONCE(ssp->reschedule_jiffies) == j) { + WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); + if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + curdelay = 1; + } else { + WRITE_ONCE(ssp->reschedule_count, 1); + WRITE_ONCE(ssp->reschedule_jiffies, j); + } + } + srcu_reschedule(ssp, curdelay); } void srcutorture_get_gp_data(enum rcutorture_type test_type, From 586e31d59c436cda65a2e8ac04ff954bed247023 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 15 Mar 2022 09:55:49 +0100 Subject: [PATCH 151/155] srcu: Drop needless initialization of sdp in srcu_gp_start() Commit 9c7ef4c30f12 ("srcu: Make Tree SRCU able to operate without snp_node array") initializes the local variable sdp differently depending on the srcu's state in srcu_gp_start(). Either way, this initialization overwrites the value used when sdp is defined. This commit therefore drops this pointless definition-time initialization. Although there is no functional change, compiler code generation may be affected. Signed-off-by: Lukas Bulwahn Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6dd44e759f12..50ba70f019de 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(ssp->sda); + struct srcu_data *sdp; int state; if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) From a6cfe03c34bad8c7f51aba49a73403e348c51d1f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 3 May 2022 14:29:39 +0200 Subject: [PATCH 152/155] efi: stub: prefer mirrored memory for randomized allocations If the system exposes memory regions with the EFI_MORE_RELIABLE attribute, it is implied that it is intended to be used for allocations that are relatively important, such as the kernel's static image. Since efi_random_alloc() is mostly (only) used for allocating space for the kernel image, let's update it to take this into account, and disregard all memory without the EFI_MORE_RELIABLE attribute if there is sufficient memory available that does have this attribute. Note that this change only affects booting with randomization enabled. In other cases, the EFI stub runs the kernel image in place unless its placement is unsuitable for some reason (i.e., misaligned, or its BSS overlaps with another allocation), and it is left to the bootloader to ensure that the kernel was loaded into EFI_MORE_RELIABLE memory if this is desired. Signed-off-by: Ard Biesheuvel Reviewed-by: Kefeng Wang --- drivers/firmware/efi/libstub/randomalloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 724155b9e10d..715f37479154 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -56,6 +56,7 @@ efi_status_t efi_random_alloc(unsigned long size, unsigned long random_seed) { unsigned long map_size, desc_size, total_slots = 0, target_slot; + unsigned long total_mirrored_slots = 0; unsigned long buff_size; efi_status_t status; efi_memory_desc_t *memory_map; @@ -86,8 +87,14 @@ efi_status_t efi_random_alloc(unsigned long size, slots = get_entry_num_slots(md, size, ilog2(align)); MD_NUM_SLOTS(md) = slots; total_slots += slots; + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) + total_mirrored_slots += slots; } + /* consider only mirrored slots for randomization if any exist */ + if (total_mirrored_slots > 0) + total_slots = total_mirrored_slots; + /* find a random number between 0 and total_slots */ target_slot = (total_slots * (u64)(random_seed & U32_MAX)) >> 32; @@ -107,6 +114,10 @@ efi_status_t efi_random_alloc(unsigned long size, efi_physical_addr_t target; unsigned long pages; + if (total_mirrored_slots > 0 && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + if (target_slot >= MD_NUM_SLOTS(md)) { target_slot -= MD_NUM_SLOTS(md); continue; From 28b3ae426598e722cf5d5ab9cc7038791b955a56 Mon Sep 17 00:00:00 2001 From: Uladzislau Rezki Date: Wed, 16 Feb 2022 14:52:09 +0100 Subject: [PATCH 153/155] rcu: Introduce CONFIG_RCU_EXP_CPU_STALL_TIMEOUT Currently both expedited and regular grace period stall warnings use a single timeout value that with units of seconds. However, recent Android use cases problem require a sub-100-millisecond expedited RCU CPU stall warning. Given that expedited RCU grace periods normally complete in far less than a single millisecond, especially for small systems, this is not unreasonable. Therefore introduce the CONFIG_RCU_EXP_CPU_STALL_TIMEOUT kernel configuration that defaults to 20 msec on Android and remains the same as that of the non-expedited stall warnings otherwise. It also can be changed in run-time via: /sys/.../parameters/rcu_exp_cpu_stall_timeout. [ paulmck: Default of zero to use CONFIG_RCU_STALL_TIMEOUT. ] Signed-off-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.rst | 20 +++++++++++++ .../admin-guide/kernel-parameters.txt | 12 ++++++++ kernel/rcu/Kconfig.debug | 14 ++++++++++ kernel/rcu/rcu.h | 2 ++ kernel/rcu/tree_exp.h | 4 +-- kernel/rcu/tree_stall.h | 28 +++++++++++++++++++ kernel/rcu/update.c | 2 ++ 7 files changed, 80 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 78404625bad2..794837eb519b 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -162,6 +162,26 @@ CONFIG_RCU_CPU_STALL_TIMEOUT Stall-warning messages may be enabled and disabled completely via /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. +CONFIG_RCU_EXP_CPU_STALL_TIMEOUT +-------------------------------- + + Same as the CONFIG_RCU_CPU_STALL_TIMEOUT parameter but only for + the expedited grace period. This parameter defines the period + of time that RCU will wait from the beginning of an expedited + grace period until it issues an RCU CPU stall warning. This time + period is normally 20 milliseconds on Android devices. A zero + value causes the CONFIG_RCU_CPU_STALL_TIMEOUT value to be used, + after conversion to milliseconds. + + This configuration parameter may be changed at runtime via the + /sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout, however + this parameter is checked only at the beginning of a cycle. If you + are in a current stall cycle, setting it to a new value will change + the timeout for the -next- stall. + + Stall-warning messages may be enabled and disabled completely via + /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. + RCU_STALL_DELAY_DELTA --------------------- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..5e21a3fb57c4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4893,6 +4893,18 @@ rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. + The value is in seconds and the maximum allowed + value is 300 seconds. + + rcupdate.rcu_exp_cpu_stall_timeout= [KNL] + Set timeout for expedited RCU CPU stall warning + messages. The value is in milliseconds + and the maximum allowed value is 21000 + milliseconds. Please note that this value is + adjusted to an arch timer tick resolution. + Setting this to zero causes the value from + rcupdate.rcu_cpu_stall_timeout to be used (after + conversion from seconds to milliseconds). rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..0b397b5bf846 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -91,6 +91,20 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. +config RCU_EXP_CPU_STALL_TIMEOUT + int "Expedited RCU CPU stall timeout in milliseconds" + depends on RCU_STALL_COMMON + range 0 21000 + default 20 if ANDROID + default 0 if !ANDROID + help + If a given expedited RCU grace period extends more than the + specified number of milliseconds, a CPU stall warning is printed. + If the RCU grace period persists, additional CPU stall warnings + are printed at more widely spaced intervals. A value of zero + says to use the RCU_CPU_STALL_TIMEOUT value converted from + seconds to milliseconds. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..20f0300f6cb1 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -210,7 +210,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; +extern int rcu_exp_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +int rcu_exp_jiffies_till_stall_check(void); static inline bool rcu_stall_is_suppressed(void) { diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 60197ea24ceb..b1f52b59fa4b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -496,7 +496,7 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_node *rnp_root = rcu_get_root(); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); - jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_stall = rcu_exp_jiffies_till_stall_check(); jiffies_start = jiffies; if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) @@ -571,7 +571,7 @@ static void synchronize_rcu_expedited_wait(void) dump_cpu_task(cpu); } } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; } } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..009d3f9305cf 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly; #define RCU_STALL_MIGHT_DIV 8 #define RCU_STALL_MIGHT_MIN (2 * HZ) +int rcu_exp_jiffies_till_stall_check(void) +{ + int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout); + int exp_stall_delay_delta = 0; + int till_stall_check; + + // Zero says to use rcu_cpu_stall_timeout, but in milliseconds. + if (!cpu_stall_timeout) + cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check()); + + // Limit check must be consistent with the Kconfig limits for + // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. + // The minimum clamped value is "2UL", because at least one full + // tick has to be guaranteed. + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + + if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) + WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); + +#ifdef CONFIG_PROVE_RCU + /* Add extra ~25% out of till_stall_check. */ + exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1; +#endif + + return till_stall_check + exp_stall_delay_delta; +} +EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check); + /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 180ff9c41fa8..fc7fef575606 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -506,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); +int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; +module_param(rcu_exp_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall From 9621fbee44df940e2e1b94b0676460a538dffefa Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Fri, 8 Apr 2022 17:35:27 -0700 Subject: [PATCH 154/155] rcu: Move expedited grace period (GP) work to RT kthread_worker Enabling CONFIG_RCU_BOOST did not reduce RCU expedited grace-period latency because its workqueues run at SCHED_OTHER, and thus can be delayed by normal processes. This commit avoids these delays by moving the expedited GP work items to a real-time-priority kthread_worker. This option is controlled by CONFIG_RCU_EXP_KTHREAD and disabled by default on PREEMPT_RT=y kernels which disable expedited grace periods after boot by unconditionally setting rcupdate.rcu_normal_after_boot=1. The results were evaluated on arm64 Android devices (6GB ram) running 5.10 kernel, and capturing trace data in critical user-level code. The table below shows the resulting order-of-magnitude improvements in synchronize_rcu_expedited() latency: ------------------------------------------------------------------------ | | workqueues | kthread_worker | Diff | ------------------------------------------------------------------------ | Count | 725 | 688 | | ------------------------------------------------------------------------ | Min Duration (ns) | 326 | 447 | 37.12% | ------------------------------------------------------------------------ | Q1 (ns) | 39,428 | 38,971 | -1.16% | ------------------------------------------------------------------------ | Q2 - Median (ns) | 98,225 | 69,743 | -29.00% | ------------------------------------------------------------------------ | Q3 (ns) | 342,122 | 126,638 | -62.98% | ------------------------------------------------------------------------ | Max Duration (ns) | 372,766,967 | 2,329,671 | -99.38% | ------------------------------------------------------------------------ | Avg Duration (ns) | 2,746,353 | 151,242 | -94.49% | ------------------------------------------------------------------------ | Standard Deviation (ns) | 19,327,765 | 294,408 | | ------------------------------------------------------------------------ The below table show the range of maximums/minimums for synchronize_rcu_expedited() latency from all experiments: ------------------------------------------------------------------------ | | workqueues | kthread_worker | Diff | ------------------------------------------------------------------------ | Total No. of Experiments | 25 | 23 | | ------------------------------------------------------------------------ | Largest Maximum (ns) | 372,766,967 | 2,329,671 | -99.38% | ------------------------------------------------------------------------ | Smallest Maximum (ns) | 38,819 | 86,954 | 124.00% | ------------------------------------------------------------------------ | Range of Maximums (ns) | 372,728,148 | 2,242,717 | | ------------------------------------------------------------------------ | Largest Minimum (ns) | 88,623 | 27,588 | -68.87% | ------------------------------------------------------------------------ | Smallest Minimum (ns) | 326 | 447 | 37.12% | ------------------------------------------------------------------------ | Range of Minimums (ns) | 88,297 | 27,141 | | ------------------------------------------------------------------------ Cc: "Paul E. McKenney" Cc: Tejun Heo Reported-by: Tim Murray Reported-by: Wei Wang Tested-by: Kyle Lin Tested-by: Chunwei Lu Tested-by: Lulu Wang Signed-off-by: Kalesh Singh Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 14 ++++ kernel/rcu/rcu.h | 5 ++ kernel/rcu/tree.c | 51 ++++++++++++++- kernel/rcu/tree.h | 5 ++ kernel/rcu/tree_exp.h | 147 +++++++++++++++++++++++++++++++++--------- 5 files changed, 188 insertions(+), 34 deletions(-) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..fd64a75823cb 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -195,6 +195,20 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. + config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 20f0300f6cb1..e27bf7d1e3a4 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -536,7 +536,12 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +#ifdef CONFIG_RCU_EXP_KTHREAD +extern struct kthread_worker *rcu_exp_gp_kworker; +extern struct kthread_worker *rcu_exp_par_gp_kworker; +#else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; +#endif /* CONFIG_RCU_EXP_KTHREAD */ #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..763e45fdf49b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4471,6 +4471,51 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + kthread_destroy_worker(rcu_exp_gp_kworker); + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4500,6 +4545,8 @@ static int __init rcu_spawn_gp_kthread(void) rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworkers(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4745,7 +4792,6 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { @@ -4811,8 +4857,7 @@ void __init rcu_init(void) /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..b577cdfdc851 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -23,7 +24,11 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { unsigned long rew_s; +#ifdef CONFIG_RCU_EXP_KTHREAD + struct kthread_work rew_work; +#else struct work_struct rew_work; +#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index b1f52b59fa4b..0f70f62039a9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -334,15 +334,13 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -417,13 +415,119 @@ retry_ipi: rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } +static void rcu_exp_sel_wait_wake(unsigned long s); + +#ifdef CONFIG_RCU_EXP_KTHREAD +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_par_gp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_par_gp_wq); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); + + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi - rnp->grplo)) + cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ + destroy_work_on_stack(&rew->rew_work); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_gp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); } /* @@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - #ifdef CONFIG_PREEMPT_RCU /* @@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void) } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); + synchronize_rcu_expedited_queue_work(&rew); } /* Wait for expedited grace period to complete. */ rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ + smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); if (likely(!boottime)) - destroy_work_on_stack(&rew.rew_work); + synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 3f68e69520d3d52d66a6ad872a75b7d8f2ea7665 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 19 May 2022 10:45:12 +0530 Subject: [PATCH 155/155] riscv/efi_stub: Add support for RISCV_EFI_BOOT_PROTOCOL Add support for getting the boot hart ID from the Linux EFI stub using RISCV_EFI_BOOT_PROTOCOL. This method is preferred over the existing DT based approach since it works irrespective of DT or ACPI. The specification of the protocol is hosted at: https://github.com/riscv-non-isa/riscv-uefi Signed-off-by: Sunil V L Acked-by: Palmer Dabbelt Reviewed-by: Heinrich Schuchardt Link: https://lore.kernel.org/r/20220519051512.136724-2-sunilvl@ventanamicro.com [ardb: minor tweaks for coding style and whitespace] Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/efistub.h | 7 ++++++ drivers/firmware/efi/libstub/riscv-stub.c | 29 ++++++++++++++++++----- include/linux/efi.h | 2 ++ 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index a0477afaa55f..b0ae0a454404 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -794,6 +794,13 @@ union efi_tcg2_protocol { } mixed_mode; }; +struct riscv_efi_boot_protocol { + u64 revision; + + efi_status_t (__efiapi *get_boot_hartid)(struct riscv_efi_boot_protocol *, + unsigned long *boot_hartid); +}; + typedef union efi_load_file_protocol efi_load_file_protocol_t; typedef union efi_load_file_protocol efi_load_file2_protocol_t; diff --git a/drivers/firmware/efi/libstub/riscv-stub.c b/drivers/firmware/efi/libstub/riscv-stub.c index eec043873354..9e85e58d1f27 100644 --- a/drivers/firmware/efi/libstub/riscv-stub.c +++ b/drivers/firmware/efi/libstub/riscv-stub.c @@ -21,9 +21,9 @@ #define MIN_KIMG_ALIGN SZ_4M #endif -typedef void __noreturn (*jump_kernel_func)(unsigned int, unsigned long); +typedef void __noreturn (*jump_kernel_func)(unsigned long, unsigned long); -static u32 hartid; +static unsigned long hartid; static int get_boot_hartid_from_fdt(void) { @@ -47,14 +47,31 @@ static int get_boot_hartid_from_fdt(void) return 0; } +static efi_status_t get_boot_hartid_from_efi(void) +{ + efi_guid_t boot_protocol_guid = RISCV_EFI_BOOT_PROTOCOL_GUID; + struct riscv_efi_boot_protocol *boot_protocol; + efi_status_t status; + + status = efi_bs_call(locate_protocol, &boot_protocol_guid, NULL, + (void **)&boot_protocol); + if (status != EFI_SUCCESS) + return status; + return efi_call_proto(boot_protocol, get_boot_hartid, &hartid); +} + efi_status_t check_platform_features(void) { + efi_status_t status; int ret; - ret = get_boot_hartid_from_fdt(); - if (ret) { - efi_err("/chosen/boot-hartid missing or invalid!\n"); - return EFI_UNSUPPORTED; + status = get_boot_hartid_from_efi(); + if (status != EFI_SUCCESS) { + ret = get_boot_hartid_from_fdt(); + if (ret) { + efi_err("Failed to get boot hartid!\n"); + return EFI_UNSUPPORTED; + } } return EFI_SUCCESS; } diff --git a/include/linux/efi.h b/include/linux/efi.h index 580ce607d6f5..0412304ce34e 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -410,6 +410,8 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) +#define RISCV_EFI_BOOT_PROTOCOL_GUID EFI_GUID(0xccd15fec, 0x6f73, 0x4eec, 0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf) + /* * This GUID may be installed onto the kernel image's handle as a NULL protocol * to signal to the stub that the placement of the image should be respected,