From 5e124f74325d8181b3275a704e87e44246c484f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:33 -0400 Subject: [PATCH 01/11] blk-iocost: use local[64]_t for percpu stat blk-iocost has been reading percpu stat counters from remote cpus which on some archs can lead to torn reads in really rare occassions. Use local[64]_t for those counters. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d37b55db2409..e2266e7692b4 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -179,6 +179,8 @@ #include #include #include +#include +#include #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" @@ -373,8 +375,8 @@ struct ioc_params { }; struct ioc_missed { - u32 nr_met; - u32 nr_missed; + local_t nr_met; + local_t nr_missed; u32 last_met; u32 last_missed; }; @@ -382,7 +384,7 @@ struct ioc_missed { struct ioc_pcpu_stat { struct ioc_missed missed[2]; - u64 rq_wait_ns; + local64_t rq_wait_ns; u64 last_rq_wait_ns; }; @@ -1278,8 +1280,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { - u32 this_met = READ_ONCE(stat->missed[rw].nr_met); - u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); + u32 this_met = local_read(&stat->missed[rw].nr_met); + u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; @@ -1287,7 +1289,7 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p stat->missed[rw].last_missed = this_missed; } - this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); + this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } @@ -1908,6 +1910,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; @@ -1931,13 +1934,17 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); + ccs = get_cpu_ptr(ioc->pcpu_stat); + if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); + local_inc(&ccs->missed[rw].nr_met); else - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); + local_inc(&ccs->missed[rw].nr_missed); - this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); + local64_add(rq_wait_ns, &ccs->rq_wait_ns); + + put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) @@ -1977,7 +1984,7 @@ static int blk_iocost_init(struct request_queue *q) { struct ioc *ioc; struct rq_qos *rqos; - int ret; + int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) @@ -1989,6 +1996,16 @@ static int blk_iocost_init(struct request_queue *q) return -ENOMEM; } + for_each_possible_cpu(cpu) { + struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); + + for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { + local_set(&ccs->missed[i].nr_met, 0); + local_set(&ccs->missed[i].nr_missed, 0); + } + local64_set(&ccs->rq_wait_ns, 0); + } + rqos = &ioc->rqos; rqos->id = RQ_QOS_COST; rqos->ops = &ioc_rqos_ops; From 00410f1b09fe7c9a12bde07f0bb4b978a3367f3a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:34 -0400 Subject: [PATCH 02/11] blk-iocost: rename propagate_active_weights() to propagate_weights() It already propagates two weights - active and inuse - and there will be another soon. Let's drop the confusing misnomers. Rename [__]propagate_active_weights() to [__]propagate_weights() and commit_active_weights() to commit_weights(). This is pure rename. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index e2266e7692b4..78e6919153d8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -479,7 +479,7 @@ struct ioc_gq { atomic64_t active_period; struct list_head active_list; - /* see __propagate_active_weight() and current_hweight() for details */ + /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; int hweight_gen; @@ -890,7 +890,7 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) * Update @iocg's `active` and `inuse` to @active and @inuse, update level * weight sums and propagate upwards accordingly. */ -static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) { struct ioc *ioc = iocg->ioc; int lvl; @@ -935,7 +935,7 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse ioc->weights_updated = true; } -static void commit_active_weights(struct ioc *ioc) +static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); @@ -947,10 +947,10 @@ static void commit_active_weights(struct ioc *ioc) } } -static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) { - __propagate_active_weight(iocg, active, inuse); - commit_active_weights(iocg->ioc); + __propagate_weights(iocg, active, inuse); + commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) @@ -966,9 +966,9 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep goto out; /* - * Paired with wmb in commit_active_weights(). If we saw the - * updated hweight_gen, all the weight updates from - * __propagate_active_weight() are visible too. + * Paired with wmb in commit_weights(). If we saw the updated + * hweight_gen, all the weight updates from __propagate_weights() are + * visible too. * * We can race with weight updates during calculation and get it * wrong. However, hweight_gen would have changed and a future @@ -1018,7 +1018,7 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) - propagate_active_weight(iocg, weight, + propagate_weights(iocg, weight, DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); iocg->weight = weight; } @@ -1090,8 +1090,8 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); - propagate_active_weight(iocg, iocg->weight, - iocg->last_inuse ?: iocg->weight); + propagate_weights(iocg, iocg->weight, + iocg->last_inuse ?: iocg->weight); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); @@ -1384,13 +1384,13 @@ static void ioc_timer_fn(struct timer_list *timer) } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; - __propagate_active_weight(iocg, 0, 0); + __propagate_weights(iocg, 0, 0); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } - commit_active_weights(ioc); + commit_weights(ioc); /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { @@ -1483,8 +1483,8 @@ static void ioc_timer_fn(struct timer_list *timer) TRACE_IOCG_PATH(inuse_takeback, iocg, &now, iocg->inuse, new_inuse, hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, - new_inuse); + __propagate_weights(iocg, iocg->weight, + new_inuse); } } else { /* genuninely out of vtime */ @@ -1524,11 +1524,11 @@ static void ioc_timer_fn(struct timer_list *timer) TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, iocg->inuse, new_inuse, hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, new_inuse); + __propagate_weights(iocg, iocg->weight, new_inuse); } } skip_surplus_transfers: - commit_active_weights(ioc); + commit_weights(ioc); /* * If q is getting clogged or we're missing too much, we're issuing @@ -1753,7 +1753,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) TRACE_IOCG_PATH(inuse_reset, iocg, &now, iocg->inuse, iocg->weight, hw_inuse, hw_active); spin_lock_irq(&ioc->lock); - propagate_active_weight(iocg, iocg->weight, iocg->weight); + propagate_weights(iocg, iocg->weight, iocg->weight); spin_unlock_irq(&ioc->lock); current_hweight(iocg, &hw_active, &hw_inuse); } @@ -2114,7 +2114,7 @@ static void ioc_pd_free(struct blkg_policy_data *pd) if (ioc) { spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { - propagate_active_weight(iocg, 0, 0); + propagate_weights(iocg, 0, 0); list_del_init(&iocg->active_list); } spin_unlock_irqrestore(&ioc->lock, flags); From db84a72af6be422abf2089a5896293590dda5066 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:35 -0400 Subject: [PATCH 03/11] blk-iocost: clamp inuse and skip noops in __propagate_weights() __propagate_weights() currently expects the callers to clamp inuse within [1, active], which is needlessly fragile. The inuse adjustment logic is going to be revamped, in preparation, let's make __propagate_weights() clamp inuse on entry. Also, make it avoid weight updates altogether if neither active or inuse is changed. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 78e6919153d8..8dfe73dde2a8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -897,7 +897,10 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) lockdep_assert_held(&ioc->lock); - inuse = min(active, inuse); + inuse = clamp_t(u32, inuse, 1, active); + + if (active == iocg->active && inuse == iocg->inuse) + return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; From 6ef20f787b0aa337dc901d42efa44854e09c87e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:36 -0400 Subject: [PATCH 04/11] blk-iocost: move iocg_kick_delay() above iocg_kick_waitq() We'll make iocg_kick_waitq() call iocg_kick_delay(). Reorder them in preparation. This is pure code reorganization. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 120 ++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8dfe73dde2a8..ac22d761a350 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1115,6 +1115,66 @@ fail_unlock: return false; } +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + u64 vtime = atomic64_read(&iocg->vtime); + u64 vmargin = ioc->margin_us * now->vrate; + u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; + u64 delta_ns, expires, oexpires; + u32 hw_inuse; + + lockdep_assert_held(&iocg->waitq.lock); + + /* debt-adjust vtime */ + current_hweight(iocg, NULL, &hw_inuse); + vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + + /* + * Clear or maintain depending on the overage. Non-zero vdebt is what + * guarantees that @iocg is online and future iocg_kick_delay() will + * clear use_delay. Don't leave it on when there's no vdebt. + */ + if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { + blkcg_clear_delay(blkg); + return false; + } + if (!atomic_read(&blkg->use_delay) && + time_before_eq64(vtime, now->vnow + vmargin)) + return false; + + /* use delay */ + delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, + now->vrate) * NSEC_PER_USEC; + blkcg_set_delay(blkg, delta_ns); + expires = now->now_ns + delta_ns; + + /* if already active and close enough, don't bother */ + oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); + if (hrtimer_is_queued(&iocg->delay_timer) && + abs(oexpires - expires) <= margin_ns / 4) + return true; + + hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), + margin_ns / 4, HRTIMER_MODE_ABS); + return true; +} + +static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) +{ + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); + struct ioc_now now; + unsigned long flags; + + spin_lock_irqsave(&iocg->waitq.lock, flags); + ioc_now(iocg->ioc, &now); + iocg_kick_delay(iocg, &now); + spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + return HRTIMER_NORESTART; +} + static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { @@ -1211,66 +1271,6 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) -{ - struct ioc *ioc = iocg->ioc; - struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 vtime = atomic64_read(&iocg->vtime); - u64 vmargin = ioc->margin_us * now->vrate; - u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 delta_ns, expires, oexpires; - u32 hw_inuse; - - lockdep_assert_held(&iocg->waitq.lock); - - /* debt-adjust vtime */ - current_hweight(iocg, NULL, &hw_inuse); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - - /* - * Clear or maintain depending on the overage. Non-zero vdebt is what - * guarantees that @iocg is online and future iocg_kick_delay() will - * clear use_delay. Don't leave it on when there's no vdebt. - */ - if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { - blkcg_clear_delay(blkg); - return false; - } - if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + vmargin)) - return false; - - /* use delay */ - delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; - blkcg_set_delay(blkg, delta_ns); - expires = now->now_ns + delta_ns; - - /* if already active and close enough, don't bother */ - oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); - if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= margin_ns / 4) - return true; - - hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); - return true; -} - -static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) -{ - struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); - struct ioc_now now; - unsigned long flags; - - spin_lock_irqsave(&iocg->waitq.lock, flags); - ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); - - return HRTIMER_NORESTART; -} - static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) { u32 nr_met[2] = { }; From 7b84b49e381a8538626f200785dd918a402c62d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:37 -0400 Subject: [PATCH 05/11] blk-iocost: make iocg_kick_waitq() call iocg_kick_delay() after paying debt iocg_kick_waitq() is the function which pays debt and iocg_kick_delay() updates the actual delay status accordingly. If iocg_kick_delay() is not called after iocg_kick_delay() updated debt, unnecessarily large delays can be applied temporarily. Let's make sure such conditions don't occur by making iocg_kick_waitq() always call iocg_kick_delay() after paying debt. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ac22d761a350..b2b8dfbeee5a 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1226,6 +1226,8 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); iocg->abs_vdebt -= abs_delta; + + iocg_kick_delay(iocg, now); } /* @@ -1383,7 +1385,6 @@ static void ioc_timer_fn(struct timer_list *timer) if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; From fe20cdb516377e6ac6300254d5ca0dd7d8621ea8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:38 -0400 Subject: [PATCH 06/11] blk-iocost: s/HWEIGHT_WHOLE/WEIGHT_ONE/g We're gonna use HWEIGHT_WHOLE for regular weights too. Let's rename it to WEIGHT_ONE. Pure rename. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index b2b8dfbeee5a..5e6d56eec1c9 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -68,7 +68,7 @@ * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add - * upto 1 (HWEIGHT_WHOLE). + * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) @@ -246,7 +246,7 @@ enum { MIN_VALID_USAGES = 2, /* 1/64k is granular enough and can easily be handled w/ u32 */ - HWEIGHT_WHOLE = 1 << 16, + WEIGHT_ONE = 1 << 16, /* * As vtime is used to calculate the cost of each IO, it needs to @@ -285,8 +285,8 @@ enum { * donate the surplus. */ SURPLUS_SCALE_PCT = 125, /* * 125% */ - SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ - SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ + SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */ + SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */ /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, @@ -491,7 +491,7 @@ struct ioc_gq { struct hrtimer waitq_timer; struct hrtimer delay_timer; - /* usage is recorded as fractions of HWEIGHT_WHOLE */ + /* usage is recorded as fractions of WEIGHT_ONE */ int usage_idx; u32 usages[NR_USAGE_SLOTS]; @@ -658,7 +658,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); + return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* @@ -666,7 +666,7 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); + return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) @@ -980,7 +980,7 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep */ smp_rmb(); - hwa = hwi = HWEIGHT_WHOLE; + hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; @@ -2088,8 +2088,8 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); - iocg->hweight_active = HWEIGHT_WHOLE; - iocg->hweight_inuse = HWEIGHT_WHOLE; + iocg->hweight_active = WEIGHT_ONE; + iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); From bd0adb91a68b2fbf384ace5287bb46f135e8d889 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:39 -0400 Subject: [PATCH 07/11] blk-iocost: use WEIGHT_ONE based fixed point number for weights To improve weight donations, we want to able to scale inuse with a greater accuracy and down below 1. Let's make non-hierarchical weights to use WEIGHT_ONE based fixed point numbers too like hierarchical ones. This doesn't cause any behavior changes yet. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5e6d56eec1c9..00c5a3ad2b5b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -984,8 +984,8 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; - u32 active_sum = READ_ONCE(parent->child_active_sum); - u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); + u64 active_sum = READ_ONCE(parent->child_active_sum); + u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); @@ -993,11 +993,11 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep if (!active_sum || !inuse_sum) continue; - active_sum = max(active, active_sum); - hwa = hwa * active / active_sum; /* max 16bits * 10000 */ + active_sum = max_t(u64, active, active_sum); + hwa = div64_u64((u64)hwa * active, active_sum); - inuse_sum = max(inuse, inuse_sum); - hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */ + inuse_sum = max_t(u64, inuse, inuse_sum); + hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); @@ -1022,7 +1022,8 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) propagate_weights(iocg, weight, - DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); + DIV64_U64_ROUND_UP((u64)iocg->inuse * weight, + iocg->weight)); iocg->weight = weight; } @@ -2050,7 +2051,7 @@ static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) if (!iocc) return NULL; - iocc->dfl_weight = CGROUP_WEIGHT_DFL; + iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } @@ -2136,7 +2137,7 @@ static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) - seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } @@ -2146,7 +2147,7 @@ static int ioc_weight_show(struct seq_file *sf, void *v) struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); - seq_printf(sf, "default %u\n", iocc->dfl_weight); + seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; @@ -2172,7 +2173,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, return -EINVAL; spin_lock(&blkcg->lock); - iocc->dfl_weight = v; + iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); @@ -2203,7 +2204,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, } spin_lock(&iocg->ioc->lock); - iocg->cfg_weight = v; + iocg->cfg_weight = v * WEIGHT_ONE; weight_updated(iocg); spin_unlock(&iocg->ioc->lock); From ce95570acf741ea306baddcb43aba0b59b920a21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:40 -0400 Subject: [PATCH 08/11] blk-iocost: make ioc_now->now and ioc->period_at 64bit They are in microseconds and wrap in around 1.2 hours with u32. While unlikely, confusions from wraparounds are still possible. We aren't saving anything meaningful by keeping these u32. Let's make them u64. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 00c5a3ad2b5b..dc72cd965837 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -409,7 +409,7 @@ struct ioc { atomic64_t vtime_rate; seqcount_spinlock_t period_seqcount; - u32 period_at; /* wallclock starttime */ + u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ @@ -508,7 +508,7 @@ struct ioc_cgrp { struct ioc_now { u64 now_ns; - u32 now; + u64 now; u64 vnow; u64 vrate; }; From 7ca5b2e60bfa4aa5b8d52e9c3e2a757c581bec1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:41 -0400 Subject: [PATCH 09/11] blk-iocost: streamline vtime margin and timer slack handling The margin handling was pretty inconsistent. * ioc->margin_us and ioc->inuse_margin_vtime were used as vtime margin thresholds. However, the two are in different units with the former requiring conversion to vtime on use. * iocg_kick_waitq() was using a quarter of WAITQ_TIMER_MARGIN_PCT of period_us as the timer slack - ~1.2%. While iocg_kick_delay() was using a quarter of ioc->margin_us - ~12.5%. There aren't strong reasons to use different values for the two. This patch cleans up margin and timer slack handling: * vtime margins are now recorded in ioc->margins.{min, max} on period duration changes and used consistently. * Timer slack is now 1% of period_us and recorded in ioc->timer_slack_ns and used consistently for iocg_kick_waitq() and iocg_kick_delay(). The only functional change is shortening of timer slack. No meaningful visible change is expected. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 67 ++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index dc72cd965837..f36988657594 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -221,11 +221,11 @@ enum { * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ - MARGIN_PCT = 50, - INUSE_MARGIN_PCT = 10, + MARGIN_MIN_PCT = 10, + MARGIN_MAX_PCT = 50, - /* Have some play in waitq timer operations */ - WAITQ_TIMER_MARGIN_PCT = 5, + /* Have some play in timer operations */ + TIMER_SLACK_PCT = 1, /* * vtime can wrap well within a reasonable uptime when vrate is @@ -374,6 +374,11 @@ struct ioc_params { u32 too_slow_vrate_pct; }; +struct ioc_margins { + s64 min; + s64 max; +}; + struct ioc_missed { local_t nr_met; local_t nr_missed; @@ -395,8 +400,9 @@ struct ioc { bool enabled; struct ioc_params params; + struct ioc_margins margins; u32 period_us; - u32 margin_us; + u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; @@ -415,7 +421,6 @@ struct ioc { atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ - u64 inuse_margin_vtime; bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ @@ -678,6 +683,16 @@ static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) #define CREATE_TRACE_POINTS #include +static void ioc_refresh_margins(struct ioc *ioc) +{ + struct ioc_margins *margins = &ioc->margins; + u32 period_us = ioc->period_us; + u64 vrate = atomic64_read(&ioc->vtime_rate); + + margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate; +} + /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { @@ -711,9 +726,10 @@ static void ioc_refresh_period_us(struct ioc *ioc) /* calculate dependent params */ ioc->period_us = period_us; - ioc->margin_us = period_us * MARGIN_PCT / 100; - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); + ioc->timer_slack_ns = div64_u64( + (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, + 100); + ioc_refresh_margins(ioc); } static int ioc_autop_idx(struct ioc *ioc) @@ -1031,7 +1047,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 last_period, cur_period, max_period_delta; - u64 vtime, vmargin, vmin; + u64 vtime, vmin; int i; /* @@ -1077,8 +1093,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); vtime = atomic64_read(&iocg->vtime); - vmargin = ioc->margin_us * now->vrate; - vmin = now->vnow - vmargin; + vmin = now->vnow - ioc->margins.max; if (last_period + max_period_delta < cur_period || time_before64(vtime, vmin)) { @@ -1121,8 +1136,6 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); - u64 vmargin = ioc->margin_us * now->vrate; - u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; u64 delta_ns, expires, oexpires; u32 hw_inuse; @@ -1142,7 +1155,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return false; } if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + vmargin)) + time_before_eq64(vtime, now->vnow + ioc->margins.max)) return false; /* use delay */ @@ -1154,11 +1167,11 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= margin_ns / 4) + abs(oexpires - expires) <= ioc->timer_slack_ns) return true; hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); + ioc->timer_slack_ns, HRTIMER_MODE_ABS); return true; } @@ -1206,8 +1219,6 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; - u64 margin_ns = (u64)(ioc->period_us * - WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; u64 vdebt, vshortage, expires, oexpires; s64 vbudget; u32 hw_inuse; @@ -1243,20 +1254,20 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; - /* determine next wakeup, add a quarter margin to guarantee chunking */ + /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; - expires += margin_ns / 4; + expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && - abs(oexpires - expires) <= margin_ns / 4) + abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); + ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) @@ -1399,7 +1410,7 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, vusage, vmargin, vmin; + u64 vdone, vtime, vusage, vmin; u32 hw_active, hw_inuse, usage; /* @@ -1450,8 +1461,7 @@ static void ioc_timer_fn(struct timer_list *timer) } /* see whether there's surplus vtime */ - vmargin = ioc->margin_us * now.vrate; - vmin = now.vnow - vmargin; + vmin = now.vnow - ioc->margins.max; iocg->has_surplus = false; @@ -1623,8 +1633,7 @@ skip_surplus_transfers: nr_surpluses); atomic64_set(&ioc->vtime_rate, vrate); - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); + ioc_refresh_margins(ioc); } else if (ioc->busy_level != prev_busy_level || nr_lagging) { trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), missed_ppm, rq_wait_pct, nr_lagging, @@ -1754,7 +1763,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) current_hweight(iocg, &hw_active, &hw_inuse); if (hw_inuse < hw_active && - time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) { + time_after_eq64(vtime + ioc->margins.min, now.vnow)) { TRACE_IOCG_PATH(inuse_reset, iocg, &now, iocg->inuse, iocg->weight, hw_inuse, hw_active); spin_lock_irq(&ioc->lock); From da437b95db83106319d337a5859e6dfb464b085a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:42 -0400 Subject: [PATCH 10/11] blk-iocost: grab ioc->lock for debt handling Currently, debt handling requires only iocg->waitq.lock. In the future, we want to adjust and propagate inuse changes depending on debt status. Let's grab ioc->lock in debt handling paths in preparation. * Because ioc->lock nests outside iocg->waitq.lock, the decision to grab ioc->lock needs to be made before entering the critical sections. * Add and use iocg_[un]lock() which handles the conditional double locking. * Add @pay_debt to iocg_kick_waitq() so that debt payment happens only when the caller grabbed both locks. This patch is prepatory and the comments contain references to future changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 92 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index f36988657594..23b173e34591 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -680,6 +680,26 @@ static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) atomic64_add(cost, &iocg->vtime); } +static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) +{ + if (lock_ioc) { + spin_lock_irqsave(&iocg->ioc->lock, *flags); + spin_lock(&iocg->waitq.lock); + } else { + spin_lock_irqsave(&iocg->waitq.lock, *flags); + } +} + +static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) +{ + if (unlock_ioc) { + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&iocg->ioc->lock, *flags); + } else { + spin_unlock_irqrestore(&iocg->waitq.lock, *flags); + } +} + #define CREATE_TRACE_POINTS #include @@ -1215,11 +1235,17 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, return 0; } -static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) +/* + * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters + * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in + * addition to iocg->waitq.lock. + */ +static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, + struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; - u64 vdebt, vshortage, expires, oexpires; + u64 vshortage, expires, oexpires; s64 vbudget; u32 hw_inuse; @@ -1229,25 +1255,39 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ - vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - if (vdebt && vbudget > 0) { + if (pay_debt && iocg->abs_vdebt && vbudget > 0) { + u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); u64 delta = min_t(u64, vbudget, vdebt); u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), iocg->abs_vdebt); + lockdep_assert_held(&ioc->lock); + atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); iocg->abs_vdebt -= abs_delta; + vbudget -= vdebt; iocg_kick_delay(iocg, now); } + /* + * Debt can still be outstanding if we haven't paid all yet or the + * caller raced and called without @pay_debt. Shouldn't wake up waiters + * under debt. Make sure @vbudget reflects the outstanding amount and is + * not positive. + */ + if (iocg->abs_vdebt) { + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + vbudget = min_t(s64, 0, vbudget - vdebt); + } + /* * Wake up the ones which are due and see how much vtime we'll need * for the next one. */ ctx.hw_inuse = hw_inuse; - ctx.vbudget = vbudget - vdebt; + ctx.vbudget = vbudget; __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); if (!waitqueue_active(&iocg->waitq)) return; @@ -1273,14 +1313,15 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); + bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); - spin_lock_irqsave(&iocg->waitq.lock, flags); - iocg_kick_waitq(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + iocg_lock(iocg, pay_debt, &flags); + iocg_kick_waitq(iocg, pay_debt, &now); + iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } @@ -1396,7 +1437,7 @@ static void ioc_timer_fn(struct timer_list *timer) if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, &now); + iocg_kick_waitq(iocg, true, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; @@ -1743,6 +1784,8 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) struct iocg_wait wait; u32 hw_active, hw_inuse; u64 abs_cost, cost, vtime; + bool use_debt, ioc_locked; + unsigned long flags; /* bypass IOs if disabled or for root cgroup */ if (!ioc->enabled || !iocg->level) @@ -1786,15 +1829,26 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) } /* - * We activated above but w/o any synchronization. Deactivation is - * synchronized with waitq.lock and we won't get deactivated as long - * as we're waiting or has debt, so we're good if we're activated - * here. In the unlikely case that we aren't, just issue the IO. + * We're over budget. This can be handled in two ways. IOs which may + * cause priority inversions are punted to @ioc->aux_iocg and charged as + * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling + * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine + * whether debt handling is needed and acquire locks accordingly. */ - spin_lock_irq(&iocg->waitq.lock); + use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); + ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); + iocg_lock(iocg, ioc_locked, &flags); + + /* + * @iocg must stay activated for debt and waitq handling. Deactivation + * is synchronized against both ioc->lock and waitq.lock and we won't + * get deactivated as long as we're waiting or has debt, so we're good + * if we're activated here. In the unlikely cases that we aren't, just + * issue the IO. + */ if (unlikely(list_empty(&iocg->active_list))) { - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); iocg_commit_bio(iocg, bio, cost); return; } @@ -1816,12 +1870,12 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ - if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { + if (use_debt) { iocg->abs_vdebt += abs_cost; if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); return; } @@ -1845,9 +1899,9 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); - iocg_kick_waitq(iocg, &now); + iocg_kick_waitq(iocg, ioc_locked, &now); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); From 97eb19751f155903489f8d42ea658ace2c59faf7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:43 -0400 Subject: [PATCH 11/11] blk-iocost: add absolute usage stat Currently, iocost doesn't collect or expose any statistics punting off all monitoring duties to drgn based iocost_monitor.py. While it works for some scenarios, there are some usability and data availability challenges. For example, accurate per-cgroup usage information can't be tracked by vtime progression at all and the number available in iocg->usages[] are really short-term snapshots used for control heuristics with possibly significant errors. This patch implements per-cgroup absolute usage stat counter and exposes it through io.stat along with the current vrate. Usage stat collection and flushing employ the same method as cgroup rstat on the active iocg's and the only hot path overhead is preemption toggling and adding to a percpu counter. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 155 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 149 insertions(+), 6 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 23b173e34591..f30f9b37fcf0 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -431,6 +431,14 @@ struct ioc { bool user_cost_model:1; }; +struct iocg_pcpu_stat { + local64_t abs_vusage; +}; + +struct iocg_stat { + u64 usage_us; +}; + /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; @@ -492,10 +500,19 @@ struct ioc_gq { u32 hweight_inuse; bool has_surplus; + struct list_head walk_list; + struct wait_queue_head waitq; struct hrtimer waitq_timer; struct hrtimer delay_timer; + /* statistics */ + struct iocg_pcpu_stat __percpu *pcpu_stat; + struct iocg_stat local_stat; + struct iocg_stat desc_stat; + struct iocg_stat last_stat; + u64 last_stat_abs_vusage; + /* usage is recorded as fractions of WEIGHT_ONE */ int usage_idx; u32 usages[NR_USAGE_SLOTS]; @@ -674,10 +691,17 @@ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } -static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, + u64 abs_cost, u64 cost) { + struct iocg_pcpu_stat *gcs; + bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); } static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) @@ -1221,7 +1245,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, if (ctx->vbudget < 0) return -1; - iocg_commit_bio(ctx->iocg, wait->bio, cost); + iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); /* * autoremove_wake_function() removes the wait entry only when it @@ -1382,6 +1406,87 @@ static bool iocg_is_idle(struct ioc_gq *iocg) return true; } +/* + * Call this function on the target leaf @iocg's to build pre-order traversal + * list of all the ancestors in @inner_walk. The inner nodes are linked through + * ->walk_list and the caller is responsible for dissolving the list after use. + */ +static void iocg_build_inner_walk(struct ioc_gq *iocg, + struct list_head *inner_walk) +{ + int lvl; + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + + /* find the first ancestor which hasn't been visited yet */ + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + if (!list_empty(&iocg->ancestors[lvl]->walk_list)) + break; + } + + /* walk down and visit the inner nodes to get pre-order traversal */ + while (++lvl <= iocg->level - 1) { + struct ioc_gq *inner = iocg->ancestors[lvl]; + + /* record traversal order */ + list_add_tail(&inner->walk_list, inner_walk); + } +} + +/* collect per-cpu counters and propagate the deltas to the parent */ +static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct iocg_stat new_stat; + u64 abs_vusage = 0; + u64 vusage_delta; + int cpu; + + lockdep_assert_held(&iocg->ioc->lock); + + /* collect per-cpu counters */ + for_each_possible_cpu(cpu) { + abs_vusage += local64_read( + per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); + } + vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; + iocg->last_stat_abs_vusage = abs_vusage; + + iocg->local_stat.usage_us += div64_u64(vusage_delta, now->vrate); + + new_stat.usage_us = + iocg->local_stat.usage_us + iocg->desc_stat.usage_us; + + /* propagate the deltas to the parent */ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->desc_stat; + + parent_stat->usage_us += + new_stat.usage_us - iocg->last_stat.usage_us; + } + + iocg->last_stat = new_stat; +} + +/* get stat counters ready for reading on all active iocgs */ +static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) +{ + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg; + + /* flush leaves and build inner node walk list */ + list_for_each_entry(iocg, target_iocgs, active_list) { + iocg_flush_stat_one(iocg, now); + iocg_build_inner_walk(iocg, &inner_walk); + } + + /* keep flushing upwards by walking the inner list backwards */ + list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { + iocg_flush_stat_one(iocg, now); + list_del_init(&iocg->walk_list); + } +} + /* returns usage with margin added if surplus is large enough */ static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) { @@ -1422,6 +1527,8 @@ static void ioc_timer_fn(struct timer_list *timer) return; } + iocg_flush_stat(&ioc->active_iocgs, &now); + /* * Waiters determine the sleep durations based on the vrate they * saw at the time of sleep. If vrate has increased, some waiters @@ -1824,7 +1931,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1849,7 +1956,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (unlikely(list_empty(&iocg->active_list))) { iocg_unlock(iocg, ioc_locked, &flags); - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1948,7 +2055,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1962,7 +2069,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, iocg->abs_vdebt += abs_cost; iocg_kick_delay(iocg, &now); } else { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); } spin_unlock_irqrestore(&iocg->waitq.lock, flags); } @@ -2133,6 +2240,12 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, if (!iocg) return NULL; + iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); + if (!iocg->pcpu_stat) { + kfree(iocg); + return NULL; + } + return &iocg->pd; } @@ -2152,6 +2265,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); + INIT_LIST_HEAD(&iocg->walk_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; @@ -2181,18 +2295,46 @@ static void ioc_pd_free(struct blkg_policy_data *pd) if (ioc) { spin_lock_irqsave(&ioc->lock, flags); + if (!list_empty(&iocg->active_list)) { propagate_weights(iocg, 0, 0); list_del_init(&iocg->active_list); } + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); hrtimer_cancel(&iocg->delay_timer); } + free_percpu(iocg->pcpu_stat); kfree(iocg); } +static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + size_t pos = 0; + + if (!ioc->enabled) + return 0; + + if (iocg->level == 0) { + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( + atomic64_read(&ioc->vtime_rate) * 10000, + VTIME_PER_USEC); + pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", + vp10k / 100, vp10k % 100); + } + + pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", + iocg->last_stat.usage_us); + + return pos; +} + static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -2606,6 +2748,7 @@ static struct blkcg_policy blkcg_policy_iocost = { .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, + .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void)