From 1aa50d020c7148f5f0bde15ca80fe6f91a8c5a4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:44 -0400 Subject: [PATCH 1/8] blk-iocost: calculate iocg->usages[] from iocg->local_stat.usage_us Currently, iocg->usages[] which are used to guide inuse adjustments are calculated from vtime deltas. This, however, assumes that the hierarchical inuse weight at the time of calculation held for the entire period, which often isn't true and can lead to significant errors. Now that we have absolute usage information collected, we can derive iocg->usages[] from iocg->local_stat.usage_us so that inuse adjustment decisions are made based on actual absolute usage. The calculated usage is clamped between 1 and WEIGHT_ONE and WEIGHT_ONE is also used to signal saturation regardless of the current hierarchical inuse weight. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 70 ++++++++++++++++++++++------------- include/trace/events/iocost.h | 7 +--- 2 files changed, 46 insertions(+), 31 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index f30f9b37fcf0..2496674bbbf4 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -476,14 +476,10 @@ struct ioc_gq { * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. - * - * `last_vtime` is used to remember `vtime` at the end of the last - * period to calculate utilization. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; - u64 last_vtime; /* * The period this iocg was last active in. Used for deactivation @@ -506,6 +502,9 @@ struct ioc_gq { struct hrtimer waitq_timer; struct hrtimer delay_timer; + /* timestamp at the latest activation */ + u64 activated_at; + /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; struct iocg_stat local_stat; @@ -514,6 +513,7 @@ struct ioc_gq { u64 last_stat_abs_vusage; /* usage is recorded as fractions of WEIGHT_ONE */ + u32 usage_delta_us; int usage_idx; u32 usages[NR_USAGE_SLOTS]; @@ -1159,7 +1159,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); - iocg->last_vtime = vtime; + iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; @@ -1451,7 +1451,8 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; - iocg->local_stat.usage_us += div64_u64(vusage_delta, now->vrate); + iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate); + iocg->local_stat.usage_us += iocg->usage_delta_us; new_stat.usage_us = iocg->local_stat.usage_us + iocg->desc_stat.usage_us; @@ -1558,8 +1559,9 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, vusage, vmin; + u64 vdone, vtime, usage_us, vmin; u32 hw_active, hw_inuse, usage; + int uidx; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1583,27 +1585,44 @@ static void ioc_timer_fn(struct timer_list *timer) time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; - if (waitqueue_active(&iocg->waitq)) - vusage = now.vnow - iocg->last_vtime; - else if (time_before64(iocg->last_vtime, vtime)) - vusage = vtime - iocg->last_vtime; - else - vusage = 0; - - iocg->last_vtime += vusage; /* - * Factor in in-flight vtime into vusage to avoid - * high-latency completions appearing as idle. This should - * be done after the above ->last_time adjustment. + * Determine absolute usage factoring in pending and in-flight + * IOs to avoid stalls and high-latency completions appearing as + * idle. */ - vusage = max(vusage, vtime - vdone); + usage_us = iocg->usage_delta_us; + if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow)) + usage_us += DIV64_U64_ROUND_UP( + cost_to_abs_cost(now.vnow - vtime, hw_inuse), + now.vrate); + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + now.vrate); + usage_us = max(usage_us, inflight_us); + } - /* calculate hweight based usage ratio and record */ - if (vusage) { - usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, - period_vtime); - iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; - iocg->usages[iocg->usage_idx] = usage; + /* convert to hweight based usage ratio and record */ + uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; + + if (time_after64(vtime, now.vnow - ioc->margins.min)) { + iocg->usage_idx = uidx; + iocg->usages[uidx] = WEIGHT_ONE; + } else if (usage_us) { + u64 started_at, dur; + + if (time_after64(iocg->activated_at, ioc->period_at)) + started_at = iocg->activated_at; + else + started_at = ioc->period_at; + + dur = max_t(u64, now.now - started_at, 1); + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), + 1, WEIGHT_ONE); + + iocg->usage_idx = uidx; + iocg->usages[uidx] = usage; } else { usage = 0; } @@ -1620,7 +1639,6 @@ static void ioc_timer_fn(struct timer_list *timer) /* throw away surplus vtime */ atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); - iocg->last_vtime += delta; /* if usage is sufficiently low, maybe it can donate */ if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { iocg->has_surplus = true; diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index c2f580fd371b..a905ecc0342f 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -26,7 +26,6 @@ TRACE_EVENT(iocost_iocg_activate, __field(u64, vrate) __field(u64, last_period) __field(u64, cur_period) - __field(u64, last_vtime) __field(u64, vtime) __field(u32, weight) __field(u32, inuse) @@ -42,7 +41,6 @@ TRACE_EVENT(iocost_iocg_activate, __entry->vrate = now->vrate; __entry->last_period = last_period; __entry->cur_period = cur_period; - __entry->last_vtime = iocg->last_vtime; __entry->vtime = vtime; __entry->weight = iocg->weight; __entry->inuse = iocg->inuse; @@ -51,13 +49,12 @@ TRACE_EVENT(iocost_iocg_activate, ), TP_printk("[%s:%s] now=%llu:%llu vrate=%llu " - "period=%llu->%llu vtime=%llu->%llu " + "period=%llu->%llu vtime=%llu " "weight=%u/%u hweight=%llu/%llu", __get_str(devname), __get_str(cgroup), __entry->now, __entry->vnow, __entry->vrate, __entry->last_period, __entry->cur_period, - __entry->last_vtime, __entry->vtime, - __entry->inuse, __entry->weight, + __entry->vtime, __entry->inuse, __entry->weight, __entry->hweight_inuse, __entry->hweight_active ) ); From 8692d2db8e01f1a5718f9b67c219b823a64b2088 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:45 -0400 Subject: [PATCH 2/8] blk-iocost: replace iocg->has_surplus with ->surplus_list Instead of marking iocgs with surplus with a flag and filtering for them while walking all active iocgs, build a surpluses list. This doesn't make much difference now but will help implementing improved donation logic which will iterate iocgs with surplus multiple times. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 2496674bbbf4..c1cd66cfa2a8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -494,9 +494,9 @@ struct ioc_gq { int hweight_gen; u32 hweight_active; u32 hweight_inuse; - bool has_surplus; struct list_head walk_list; + struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; @@ -1507,6 +1507,7 @@ static void ioc_timer_fn(struct timer_list *timer) struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; + LIST_HEAD(surpluses); int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; @@ -1630,8 +1631,7 @@ static void ioc_timer_fn(struct timer_list *timer) /* see whether there's surplus vtime */ vmin = now.vnow - ioc->margins.max; - iocg->has_surplus = false; - + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (!waitqueue_active(&iocg->waitq) && time_before64(vtime, vmin)) { u64 delta = vmin - vtime; @@ -1641,7 +1641,7 @@ static void ioc_timer_fn(struct timer_list *timer) atomic64_add(delta, &iocg->done_vtime); /* if usage is sufficiently low, maybe it can donate */ if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { - iocg->has_surplus = true; + list_add(&iocg->surplus_list, &surpluses); nr_surpluses++; } } else if (hw_inuse < hw_active) { @@ -1677,13 +1677,10 @@ static void ioc_timer_fn(struct timer_list *timer) goto skip_surplus_transfers; /* there are both shortages and surpluses, transfer surpluses */ - list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + list_for_each_entry(iocg, &surpluses, surplus_list) { u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; int nr_valid = 0; - if (!iocg->has_surplus) - continue; - /* base the decision on max historical usage */ for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { if (iocg->usages[i]) { @@ -1711,6 +1708,10 @@ static void ioc_timer_fn(struct timer_list *timer) skip_surplus_transfers: commit_weights(ioc); + /* surplus list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) + list_del_init(&iocg->surplus_list); + /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing @@ -2284,6 +2285,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->walk_list); + INIT_LIST_HEAD(&iocg->surplus_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; @@ -2320,6 +2322,7 @@ static void ioc_pd_free(struct blkg_policy_data *pd) } WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); spin_unlock_irqrestore(&ioc->lock, flags); From 065655c862fedf4b04e1b28b83ca6f338d81cf0b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:46 -0400 Subject: [PATCH 3/8] blk-iocost: decouple vrate adjustment from surplus transfers Budget donations are inaccurate and could take multiple periods to converge. To prevent triggering vrate adjustments while surplus transfers were catching up, vrate adjustment was suppressed if donations were increasing, which was indicated by non-zero nr_surpluses. This entangling won't be necessary with the scheduled rewrite of donation mechanism which will make it precise and immediate. Let's decouple the two in preparation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 19 +++++++------------ include/trace/events/iocost.h | 13 ++++--------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c1cd66cfa2a8..a3889a8b0a33 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1508,7 +1508,7 @@ static void ioc_timer_fn(struct timer_list *timer) struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); - int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; + int nr_shortages = 0, nr_lagging = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; @@ -1640,10 +1640,8 @@ static void ioc_timer_fn(struct timer_list *timer) atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); /* if usage is sufficiently low, maybe it can donate */ - if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { + if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) list_add(&iocg->surplus_list, &surpluses); - nr_surpluses++; - } } else if (hw_inuse < hw_active) { u32 new_hwi, new_inuse; @@ -1673,7 +1671,7 @@ static void ioc_timer_fn(struct timer_list *timer) } } - if (!nr_shortages || !nr_surpluses) + if (!nr_shortages || list_empty(&surpluses)) goto skip_surplus_transfers; /* there are both shortages and surpluses, transfer surpluses */ @@ -1738,11 +1736,9 @@ skip_surplus_transfers: /* * If there are IOs spanning multiple periods, wait - * them out before pushing the device harder. If - * there are surpluses, let redistribution work it - * out first. + * them out before pushing the device harder. */ - if (!nr_lagging && !nr_surpluses) + if (!nr_lagging) ioc->busy_level--; } else { /* @@ -1796,15 +1792,14 @@ skip_surplus_transfers: } trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages, - nr_surpluses); + nr_lagging, nr_shortages); atomic64_set(&ioc->vtime_rate, vrate); ioc_refresh_margins(ioc); } else if (ioc->busy_level != prev_busy_level || nr_lagging) { trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages, nr_surpluses); + nr_shortages); } ioc_refresh_params(ioc, false); diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index a905ecc0342f..ee024fe8fef6 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -128,11 +128,9 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, TRACE_EVENT(iocost_ioc_vrate_adj, TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 *missed_ppm, - u32 rq_wait_pct, int nr_lagging, int nr_shortages, - int nr_surpluses), + u32 rq_wait_pct, int nr_lagging, int nr_shortages), - TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages, - nr_surpluses), + TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages), TP_STRUCT__entry ( __string(devname, ioc_name(ioc)) @@ -144,7 +142,6 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __field(u32, rq_wait_pct) __field(int, nr_lagging) __field(int, nr_shortages) - __field(int, nr_surpluses) ), TP_fast_assign( @@ -157,15 +154,13 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __entry->rq_wait_pct = rq_wait_pct; __entry->nr_lagging = nr_lagging; __entry->nr_shortages = nr_shortages; - __entry->nr_surpluses = nr_surpluses; ), - TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d", + TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d", __get_str(devname), __entry->old_vrate, __entry->new_vrate, __entry->busy_level, __entry->read_missed_ppm, __entry->write_missed_ppm, - __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages, - __entry->nr_surpluses + __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages ) ); From 93f7d2db80e4aea2731619d7b907a029e0d14259 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:47 -0400 Subject: [PATCH 4/8] blk-iocost: restructure surplus donation logic The way the surplus donation logic is structured isn't great. There are two separate paths for starting/increasing donations and decreasing them making the logic harder to follow and is prone to unnecessary behavior differences. In preparation for improved donation handling, this patch restructures the code so that * All donors - new, increasing and decreasing - are funneled through the same code path. * The target donation calculation is factored into hweight_after_donation() which is called once from the same spot for all possible donors. * Actual inuse adjustment is factored into trasnfer_surpluses(). This change introduces a few behavior differences - e.g. donation amount reduction now uses the max usage of the recent three periods just like new and increasing donations, and inuse now gets adjusted upwards the same way it gets downwards. These differences are unlikely to have severely negative implications and the whole logic will be revamped soon. This patch also removes two tracepoints. The existing TPs don't quite fit the new implementation. A later patch will update and reinstate them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 181 ++++++++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 77 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a3889a8b0a33..61b008d0801f 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -494,6 +494,7 @@ struct ioc_gq { int hweight_gen; u32 hweight_active; u32 hweight_inuse; + u32 hweight_after_donation; struct list_head walk_list; struct list_head surplus_list; @@ -1070,6 +1071,32 @@ out: *hw_inusep = iocg->hweight_inuse; } +/* + * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the + * other weights stay unchanged. + */ +static u32 current_hweight_max(struct ioc_gq *iocg) +{ + u32 hwm = WEIGHT_ONE; + u32 inuse = iocg->active; + u64 child_inuse_sum; + int lvl; + + lockdep_assert_held(&iocg->ioc->lock); + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + + child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; + hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); + inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, + parent->child_active_sum); + } + + return max_t(u32, hwm, 1); +} + static void weight_updated(struct ioc_gq *iocg) { struct ioc *ioc = iocg->ioc; @@ -1488,20 +1515,58 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) } } -/* returns usage with margin added if surplus is large enough */ -static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) +/* + * Determine what @iocg's hweight_inuse should be after donating unused + * capacity. @hwm is the upper bound and used to signal no donation. This + * function also throws away @iocg's excess budget. + */ +static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, + struct ioc_now *now) { + struct ioc *ioc = iocg->ioc; + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* see whether minimum margin requirement is met */ + if (waitqueue_active(&iocg->waitq) || + time_after64(vtime, now->vnow - ioc->margins.min)) + return hwm; + + /* throw away excess above max */ + excess = now->vnow - vtime - ioc->margins.max; + if (excess > 0) { + atomic64_add(excess, &iocg->vtime); + atomic64_add(excess, &iocg->done_vtime); + vtime += excess; + } + /* add margin */ usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); usage += SURPLUS_SCALE_ABS; /* don't bother if the surplus is too small */ - if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) - return 0; + if (usage + SURPLUS_MIN_ADJ_DELTA > hwm) + return hwm; return usage; } +static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) +{ + struct ioc_gq *iocg; + + list_for_each_entry(iocg, surpluses, surplus_list) { + u32 old_hwi, new_hwi, new_inuse; + + current_hweight(iocg, NULL, &old_hwi); + new_hwi = iocg->hweight_after_donation; + + new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, + old_hwi); + __propagate_weights(iocg, iocg->weight, new_inuse); + } +} + static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); @@ -1560,9 +1625,9 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, vmin; + u64 vdone, vtime, usage_us; u32 hw_active, hw_inuse, usage; - int uidx; + int uidx, nr_valid; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1618,92 +1683,54 @@ static void ioc_timer_fn(struct timer_list *timer) started_at = ioc->period_at; dur = max_t(u64, now.now - started_at, 1); - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), - 1, WEIGHT_ONE); iocg->usage_idx = uidx; - iocg->usages[uidx] = usage; - } else { - usage = 0; + iocg->usages[uidx] = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), + 1, WEIGHT_ONE); } - /* see whether there's surplus vtime */ - vmin = now.vnow - ioc->margins.max; - - WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); - if (!waitqueue_active(&iocg->waitq) && - time_before64(vtime, vmin)) { - u64 delta = vmin - vtime; - - /* throw away surplus vtime */ - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - /* if usage is sufficiently low, maybe it can donate */ - if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) - list_add(&iocg->surplus_list, &surpluses); - } else if (hw_inuse < hw_active) { - u32 new_hwi, new_inuse; - - /* was donating but might need to take back some */ - if (waitqueue_active(&iocg->waitq)) { - new_hwi = hw_active; - } else { - new_hwi = max(hw_inuse, - usage * SURPLUS_SCALE_PCT / 100 + - SURPLUS_SCALE_ABS); - } - - new_inuse = div64_u64((u64)iocg->inuse * new_hwi, - hw_inuse); - new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); - - if (new_inuse > iocg->inuse) { - TRACE_IOCG_PATH(inuse_takeback, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_weights(iocg, iocg->weight, - new_inuse); - } - } else { - /* genuninely out of vtime */ - nr_shortages++; - } - } - - if (!nr_shortages || list_empty(&surpluses)) - goto skip_surplus_transfers; - - /* there are both shortages and surpluses, transfer surpluses */ - list_for_each_entry(iocg, &surpluses, surplus_list) { - u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; - int nr_valid = 0; - /* base the decision on max historical usage */ - for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { + for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) { if (iocg->usages[i]) { usage = max(usage, iocg->usages[i]); nr_valid++; } } if (nr_valid < MIN_VALID_USAGES) - continue; + usage = WEIGHT_ONE; - current_hweight(iocg, &hw_active, &hw_inuse); - new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); - if (!new_hwi) - continue; + /* see whether there's surplus vtime */ + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); + if (hw_inuse < hw_active || + (!waitqueue_active(&iocg->waitq) && + time_before64(vtime, now.vnow - ioc->margins.max))) { + u32 hwm, new_hwi; - new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, - hw_inuse); - if (new_inuse < iocg->inuse) { - TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_weights(iocg, iocg->weight, new_inuse); + /* + * Already donating or accumulated enough to start. + * Determine the donation amount. + */ + hwm = current_hweight_max(iocg); + new_hwi = hweight_after_donation(iocg, hwm, usage, + &now); + if (new_hwi < hwm) { + iocg->hweight_after_donation = new_hwi; + list_add(&iocg->surplus_list, &surpluses); + } else { + __propagate_weights(iocg, iocg->active, + iocg->active); + nr_shortages++; + } + } else { + /* genuinely short on vtime */ + nr_shortages++; } } -skip_surplus_transfers: + + if (!list_empty(&surpluses) && nr_shortages) + transfer_surpluses(&surpluses, &now); + commit_weights(ioc); /* surplus list should be dissolved after use */ From e08d02aa5fc9978ed42240bd80d3fb5dd28ecac0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:48 -0400 Subject: [PATCH 5/8] blk-iocost: implement Andy's method for donation weight updates iocost implements work conservation by reducing iocg->inuse and propagating the adjustment upwards proportionally. However, while I knew the target absolute hierarchical proportion - adjusted hweight_inuse, I couldn't figure out how to determine the iocg->inuse adjustment to achieve that and approximated the adjustment by scaling iocg->inuse using the proportion of the needed hweight_inuse changes. When nested, these scalings aren't accurate even when adjusting a single node as the donating node also receives the benefit of the donated portion. When multiple nodes are donating as they often do, they can be wildly wrong. iocost employed various safety nets to combat the inaccuracies. There are ample buffers in determining how much to donate, the adjustments are conservative and gradual. While it can achieve a reasonable level of work conservation in simple scenarios, the inaccuracies can easily add up leading to significant loss of total work. This in turn makes it difficult to closely cap vrate as vrate adjustment is needed to compensate for the loss of work. The combination of inaccurate donation calculations and vrate adjustments can lead to wide fluctuations and clunky overall behaviors. Andy Newell devised a method to calculate the needed ->inuse updates to achieve the target hweight_inuse's. The method is compatible with the proportional inuse adjustment propagation which allows all hot path operations to be local to each iocg. To roughly summarize, Andy's method divides the tree into donating and non-donating parts, calculates global donation rate which is used to determine the target hweight_inuse for each node, and then derives per-level proportions. There's non-trivial amount of math involved. Please refer to the following pdfs for detailed descriptions. https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN This patch implements Andy's method in transfer_surpluses(). This makes the donation calculations accurate per cycle and enables further improvements in other parts of the donation logic. Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe --- block/blk-iocost.c | 252 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 244 insertions(+), 8 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 61b008d0801f..ecc23b827e5d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -491,9 +491,11 @@ struct ioc_gq { /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; + u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; + u32 hweight_donating; u32 hweight_after_donation; struct list_head walk_list; @@ -1551,20 +1553,252 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, return usage; } +/* + * For work-conservation, an iocg which isn't using all of its share should + * donate the leftover to other iocgs. There are two ways to achieve this - 1. + * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. + * + * #1 is mathematically simpler but has the drawback of requiring synchronous + * global hweight_inuse updates when idle iocg's get activated or inuse weights + * change due to donation snapbacks as it has the possibility of grossly + * overshooting what's allowed by the model and vrate. + * + * #2 is inherently safe with local operations. The donating iocg can easily + * snap back to higher weights when needed without worrying about impacts on + * other nodes as the impacts will be inherently correct. This also makes idle + * iocg activations safe. The only effect activations have is decreasing + * hweight_inuse of others, the right solution to which is for those iocgs to + * snap back to higher weights. + * + * So, we go with #2. The challenge is calculating how each donating iocg's + * inuse should be adjusted to achieve the target donation amounts. This is done + * using Andy's method described in the following pdf. + * + * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo + * + * Given the weights and target after-donation hweight_inuse values, Andy's + * method determines how the proportional distribution should look like at each + * sibling level to maintain the relative relationship between all non-donating + * pairs. To roughly summarize, it divides the tree into donating and + * non-donating parts, calculates global donation rate which is used to + * determine the target hweight_inuse for each node, and then derives per-level + * proportions. + * + * The following pdf shows that global distribution calculated this way can be + * achieved by scaling inuse weights of donating leaves and propagating the + * adjustments upwards proportionally. + * + * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE + * + * Combining the above two, we can determine how each leaf iocg's inuse should + * be adjusted to achieve the target donation. + * + * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN + * + * The inline comments use symbols from the last pdf. + * + * b is the sum of the absolute budgets in the subtree. 1 for the root node. + * f is the sum of the absolute budgets of non-donating nodes in the subtree. + * t is the sum of the absolute budgets of donating nodes in the subtree. + * w is the weight of the node. w = w_f + w_t + * w_f is the non-donating portion of w. w_f = w * f / b + * w_b is the donating portion of w. w_t = w * t / b + * s is the sum of all sibling weights. s = Sum(w) for siblings + * s_f and s_t are the non-donating and donating portions of s. + * + * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. + * w_pt is the donating portion of the parent's weight and w'_pt the same value + * after adjustments. Subscript r denotes the root node's values. + */ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) { - struct ioc_gq *iocg; + LIST_HEAD(over_hwa); + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg, *root_iocg; + u32 after_sum, over_sum, over_target, gamma; + /* + * It's pretty unlikely but possible for the total sum of + * hweight_after_donation's to be higher than WEIGHT_ONE, which will + * confuse the following calculations. If such condition is detected, + * scale down everyone over its full share equally to keep the sum below + * WEIGHT_ONE. + */ + after_sum = 0; + over_sum = 0; list_for_each_entry(iocg, surpluses, surplus_list) { - u32 old_hwi, new_hwi, new_inuse; + u32 hwa; - current_hweight(iocg, NULL, &old_hwi); - new_hwi = iocg->hweight_after_donation; + current_hweight(iocg, &hwa, NULL); + after_sum += iocg->hweight_after_donation; - new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, - old_hwi); - __propagate_weights(iocg, iocg->weight, new_inuse); + if (iocg->hweight_after_donation > hwa) { + over_sum += iocg->hweight_after_donation; + list_add(&iocg->walk_list, &over_hwa); + } } + + if (after_sum >= WEIGHT_ONE) { + /* + * The delta should be deducted from the over_sum, calculate + * target over_sum value. + */ + u32 over_delta = after_sum - (WEIGHT_ONE - 1); + WARN_ON_ONCE(over_sum <= over_delta); + over_target = over_sum - over_delta; + } else { + over_target = 0; + } + + list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { + if (over_target) + iocg->hweight_after_donation = + div_u64((u64)iocg->hweight_after_donation * + over_target, over_sum); + list_del_init(&iocg->walk_list); + } + + /* + * Build pre-order inner node walk list and prepare for donation + * adjustment calculations. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + iocg_build_inner_walk(iocg, &inner_walk); + } + + root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); + WARN_ON_ONCE(root_iocg->level > 0); + + list_for_each_entry(iocg, &inner_walk, walk_list) { + iocg->child_adjusted_sum = 0; + iocg->hweight_donating = 0; + iocg->hweight_after_donation = 0; + } + + /* + * Propagate the donating budget (b_t) and after donation budget (b'_t) + * up the hierarchy. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + + list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { + if (iocg->level > 0) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + } + + /* + * Calculate inner hwa's (b) and make sure the donation values are + * within the accepted ranges as we're doing low res calculations with + * roundups. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + if (iocg->level) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + iocg->hweight_active = DIV64_U64_ROUND_UP( + (u64)parent->hweight_active * iocg->active, + parent->child_active_sum); + + } + + iocg->hweight_donating = min(iocg->hweight_donating, + iocg->hweight_active); + iocg->hweight_after_donation = min(iocg->hweight_after_donation, + iocg->hweight_donating - 1); + if (WARN_ON_ONCE(iocg->hweight_active <= 1 || + iocg->hweight_donating <= 1 || + iocg->hweight_after_donation == 0)) { + pr_warn("iocg: invalid donation weights in "); + pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); + pr_cont(": active=%u donating=%u after=%u\n", + iocg->hweight_active, iocg->hweight_donating, + iocg->hweight_after_donation); + } + } + + /* + * Calculate the global donation rate (gamma) - the rate to adjust + * non-donating budgets by. No need to use 64bit multiplication here as + * the first operand is guaranteed to be smaller than WEIGHT_ONE + * (1<<16). + * + * gamma = (1 - t_r') / (1 - t_r) + */ + gamma = DIV_ROUND_UP( + (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, + WEIGHT_ONE - root_iocg->hweight_donating); + + /* + * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner + * nodes. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + struct ioc_gq *parent; + u32 inuse, wpt, wptp; + u64 st, sf; + + if (iocg->level == 0) { + /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ + iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( + iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), + WEIGHT_ONE - iocg->hweight_after_donation); + continue; + } + + parent = iocg->ancestors[iocg->level - 1]; + + /* b' = gamma * b_f + b_t' */ + iocg->hweight_inuse = DIV64_U64_ROUND_UP( + (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), + WEIGHT_ONE) + iocg->hweight_after_donation; + + /* w' = s' * b' / b'_p */ + inuse = DIV64_U64_ROUND_UP( + (u64)parent->child_adjusted_sum * iocg->hweight_inuse, + parent->hweight_inuse); + + /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ + st = DIV64_U64_ROUND_UP( + iocg->child_active_sum * iocg->hweight_donating, + iocg->hweight_active); + sf = iocg->child_active_sum - st; + wpt = DIV64_U64_ROUND_UP( + (u64)iocg->active * iocg->hweight_donating, + iocg->hweight_active); + wptp = DIV64_U64_ROUND_UP( + (u64)inuse * iocg->hweight_after_donation, + iocg->hweight_inuse); + + iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); + } + + /* + * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and + * we can finally determine leaf adjustments. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + u32 inuse; + + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ + inuse = DIV64_U64_ROUND_UP( + parent->child_adjusted_sum * iocg->hweight_after_donation, + parent->hweight_inuse); + __propagate_weights(iocg, iocg->active, inuse); + } + + /* walk list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) + list_del_init(&iocg->walk_list); } static void ioc_timer_fn(struct timer_list *timer) @@ -1705,16 +1939,18 @@ static void ioc_timer_fn(struct timer_list *timer) if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.max))) { - u32 hwm, new_hwi; + u32 hwa, hwm, new_hwi; /* * Already donating or accumulated enough to start. * Determine the donation amount. */ + current_hweight(iocg, &hwa, NULL); hwm = current_hweight_max(iocg); new_hwi = hweight_after_donation(iocg, hwm, usage, &now); if (new_hwi < hwm) { + iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else { From f1de2439ec43b74764f2a26e3a310b24407e3bde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:49 -0400 Subject: [PATCH 6/8] blk-iocost: revamp donation amount determination iocost has various safety nets to combat inuse adjustment calculation inaccuracies. With Andy's method implemented in transfer_surpluses(), inuse adjustment calculations are now accurate and we can make donation amount determinations accurate too. * Stop keeping track of past usage history and using the maximum. Act on the immediate usage information. * Remove donation constraints defined by SURPLUS_* constants. Donate whatever isn't used. * Determine the donation amount so that the iocg will end up with MARGIN_TARGET_PCT budget at the end of the coming period assuming the same usage as the previous period. TARGET is set at 50% of period, which is the previous maximum. This provides smooth convergence for most repetitive IO patterns. * Apply donation logic early at 20% budget. There's no risk in doing so as the calculation is based on the delta between the current budget and the target budget at the end of the coming period. * Remove preemptive iocg activation for zero cost IOs. As donation can reach near zero now, the mere activation doesn't provide any protection anymore. In the unlikely case that this becomes a problem, the right solution is assigning appropriate costs for such IOs. This significantly improves the donation determination logic while also simplifying it. Now all donations are immediate, exact and smooth. Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe --- block/blk-iocost.c | 133 +++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 82 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ecc23b827e5d..694f1487208a 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -217,12 +217,14 @@ enum { MAX_PERIOD = USEC_PER_SEC, /* - * A cgroup's vtime can run 50% behind the device vtime, which + * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ MARGIN_MIN_PCT = 10, - MARGIN_MAX_PCT = 50, + MARGIN_LOW_PCT = 20, + MARGIN_TARGET_PCT = 50, + MARGIN_MAX_PCT = 100, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, @@ -234,17 +236,6 @@ enum { */ VTIME_VALID_DUR = 300 * USEC_PER_SEC, - /* - * Remember the past three non-zero usages and use the max for - * surplus calculation. Three slots guarantee that we remember one - * full period usage from the last active stretch even after - * partial deactivation and re-activation periods. Don't start - * giving away weight before collecting two data points to prevent - * hweight adjustments based on one partial activation period. - */ - NR_USAGE_SLOTS = 3, - MIN_VALID_USAGES = 2, - /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, @@ -280,14 +271,6 @@ enum { /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, - /* - * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, - * donate the surplus. - */ - SURPLUS_SCALE_PCT = 125, /* * 125% */ - SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */ - SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */ - /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, @@ -376,6 +359,8 @@ struct ioc_params { struct ioc_margins { s64 min; + s64 low; + s64 target; s64 max; }; @@ -514,11 +499,7 @@ struct ioc_gq { struct iocg_stat desc_stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; - - /* usage is recorded as fractions of WEIGHT_ONE */ - u32 usage_delta_us; - int usage_idx; - u32 usages[NR_USAGE_SLOTS]; + u64 usage_delta_us; /* this iocg's depth in the hierarchy and ancestors including self */ int level; @@ -737,6 +718,8 @@ static void ioc_refresh_margins(struct ioc *ioc) u64 vrate = atomic64_read(&ioc->vtime_rate); margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; + margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate; } @@ -1228,7 +1211,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return false; } if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + ioc->margins.max)) + time_before_eq64(vtime, now->vnow + ioc->margins.target)) return false; /* use delay */ @@ -1527,7 +1510,7 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); - s64 excess; + s64 excess, delta, target, new_hwi; /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || @@ -1542,15 +1525,28 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, vtime += excess; } - /* add margin */ - usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); - usage += SURPLUS_SCALE_ABS; + /* + * Let's say the distance between iocg's and device's vtimes as a + * fraction of period duration is delta. Assuming that the iocg will + * consume the usage determined above, we want to determine new_hwi so + * that delta equals MARGIN_TARGET at the end of the next period. + * + * We need to execute usage worth of IOs while spending the sum of the + * new budget (1 - MARGIN_TARGET) and the leftover from the last period + * (delta): + * + * usage = (1 - MARGIN_TARGET + delta) * new_hwi + * + * Therefore, the new_hwi is: + * + * new_hwi = usage / (1 - MARGIN_TARGET + delta) + */ + delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), + now->vnow - ioc->period_at_vtime); + target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; + new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); - /* don't bother if the surplus is too small */ - if (usage + SURPLUS_MIN_ADJ_DELTA > hwm) - return hwm; - - return usage; + return clamp_t(s64, new_hwi, 1, hwm); } /* @@ -1812,7 +1808,7 @@ static void ioc_timer_fn(struct timer_list *timer) u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; - int prev_busy_level, i; + int prev_busy_level; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); @@ -1857,11 +1853,10 @@ static void ioc_timer_fn(struct timer_list *timer) } commit_weights(ioc); - /* calc usages and see whether some weights need to be moved around */ + /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us; - u32 hw_active, hw_inuse, usage; - int uidx, nr_valid; + u64 vdone, vtime, usage_us, usage_dur; + u32 usage, hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1886,15 +1881,11 @@ static void ioc_timer_fn(struct timer_list *timer) nr_lagging++; /* - * Determine absolute usage factoring in pending and in-flight - * IOs to avoid stalls and high-latency completions appearing as - * idle. + * Determine absolute usage factoring in in-flight IOs to avoid + * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; - if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow)) - usage_us += DIV64_U64_ROUND_UP( - cost_to_abs_cost(now.vnow - vtime, hw_inuse), - now.vrate); + if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), @@ -1902,43 +1893,22 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = max(usage_us, inflight_us); } - /* convert to hweight based usage ratio and record */ - uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); - if (time_after64(vtime, now.vnow - ioc->margins.min)) { - iocg->usage_idx = uidx; - iocg->usages[uidx] = WEIGHT_ONE; - } else if (usage_us) { - u64 started_at, dur; - - if (time_after64(iocg->activated_at, ioc->period_at)) - started_at = iocg->activated_at; - else - started_at = ioc->period_at; - - dur = max_t(u64, now.now - started_at, 1); - - iocg->usage_idx = uidx; - iocg->usages[uidx] = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), 1, WEIGHT_ONE); - } - - /* base the decision on max historical usage */ - for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) { - if (iocg->usages[i]) { - usage = max(usage, iocg->usages[i]); - nr_valid++; - } - } - if (nr_valid < MIN_VALID_USAGES) - usage = WEIGHT_ONE; /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && - time_before64(vtime, now.vnow - ioc->margins.max))) { + time_before64(vtime, now.vnow - ioc->margins.low))) { u32 hwa, hwm, new_hwi; /* @@ -2175,15 +2145,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) if (!ioc->enabled || !iocg->level) return; - /* always activate so that even 0 cost IOs get protected to some level */ - if (!iocg_activate(iocg, &now)) - return; - /* calculate the absolute vtime cost */ abs_cost = calc_vtime_cost(bio, iocg, false); if (!abs_cost) return; + if (!iocg_activate(iocg, &now)) + return; + iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); From b0853ab4a238c54b8f97ca7dde1ae156e2bbd5e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:50 -0400 Subject: [PATCH 7/8] blk-iocost: revamp in-period donation snapbacks When the margin drops below the minimum on a donating iocg, donation is immediately canceled in full. There are a couple shortcomings with the current behavior. * It's abrupt. A small temporary budget deficit can lead to a wide swing in weight allocation and a large surplus. * It's open coded in the issue path but not implemented for the merge path. A series of merges at a low inuse can make the iocg incur debts and stall incorrectly. This patch reimplements in-period donation snapbacks so that * inuse adjustment and cost calculations are factored into adjust_inuse_and_calc_cost() which is called from both the issue and merge paths. * Snapbacks are more gradual. It occurs in quarter steps. * A snapback triggers if the margin goes below the low threshold and is lower than the budget at the time of the last adjustment. * For the above, __propagate_weights() stores the margin in iocg->saved_margin. Move iocg->last_inuse storing together into __propagate_weights() for consistency. * Full snapback is guaranteed when there are waiters. * With precise donation and gradual snapbacks, inuse adjustments are now a lot more effective and the value of scaling inuse on weight changes isn't clear. Removed inuse scaling from weight_update(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 133 ++++++++++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 37 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 694f1487208a..d09b4011449c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -226,6 +226,8 @@ enum { MARGIN_TARGET_PCT = 50, MARGIN_MAX_PCT = 100, + INUSE_ADJ_STEP_PCT = 25, + /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, @@ -443,12 +445,17 @@ struct ioc_gq { * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. + * + * `inuse` may be adjusted dynamically during period. `saved_*` are used + * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; + u32 last_inuse; + s64 saved_margin; sector_t cursor; /* to detect randio */ @@ -934,9 +941,11 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level - * weight sums and propagate upwards accordingly. + * weight sums and propagate upwards accordingly. If @save, the current margin + * is saved to be used as reference for later inuse in-period adjustments. */ -static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) +static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; @@ -945,6 +954,10 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) inuse = clamp_t(u32, inuse, 1, active); + iocg->last_inuse = iocg->inuse; + if (save) + iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); + if (active == iocg->active && inuse == iocg->inuse) return; @@ -996,9 +1009,10 @@ static void commit_weights(struct ioc *ioc) } } -static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) +static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { - __propagate_weights(iocg, active, inuse); + __propagate_weights(iocg, active, inuse, save, now); commit_weights(iocg->ioc); } @@ -1082,7 +1096,7 @@ static u32 current_hweight_max(struct ioc_gq *iocg) return max_t(u32, hwm, 1); } -static void weight_updated(struct ioc_gq *iocg) +static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); @@ -1093,9 +1107,7 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) - propagate_weights(iocg, weight, - DIV64_U64_ROUND_UP((u64)iocg->inuse * weight, - iocg->weight)); + propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } @@ -1165,8 +1177,9 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); + propagate_weights(iocg, iocg->weight, - iocg->last_inuse ?: iocg->weight); + iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); @@ -1789,7 +1802,7 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); - __propagate_weights(iocg, iocg->active, inuse); + __propagate_weights(iocg, iocg->active, inuse, true, now); } /* walk list should be dissolved after use */ @@ -1844,8 +1857,7 @@ static void ioc_timer_fn(struct timer_list *timer) iocg_kick_waitq(iocg, true, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ - iocg->last_inuse = iocg->inuse; - __propagate_weights(iocg, 0, 0); + __propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } @@ -1925,7 +1937,7 @@ static void ioc_timer_fn(struct timer_list *timer) list_add(&iocg->surplus_list, &surpluses); } else { __propagate_weights(iocg, iocg->active, - iocg->active); + iocg->active, true, &now); nr_shortages++; } } else { @@ -2055,6 +2067,50 @@ static void ioc_timer_fn(struct timer_list *timer) spin_unlock_irq(&ioc->lock); } +static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, + u64 abs_cost, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct ioc_margins *margins = &ioc->margins; + u32 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); + u32 hwi; + s64 margin; + u64 cost, new_inuse; + + current_hweight(iocg, NULL, &hwi); + cost = abs_cost_to_cost(abs_cost, hwi); + margin = now->vnow - vtime - cost; + + /* + * We only increase inuse during period and do so iff the margin has + * deteriorated since the previous adjustment. + */ + if (margin >= iocg->saved_margin || margin >= margins->low || + iocg->inuse == iocg->active) + return cost; + + spin_lock_irq(&ioc->lock); + + /* we own inuse only when @iocg is in the normal active state */ + if (list_empty(&iocg->active_list)) { + spin_unlock_irq(&ioc->lock); + return cost; + } + + /* bump up inuse till @abs_cost fits in the existing budget */ + new_inuse = iocg->inuse; + do { + new_inuse = new_inuse + adj_step; + propagate_weights(iocg, iocg->active, new_inuse, true, now); + current_hweight(iocg, NULL, &hwi); + cost = abs_cost_to_cost(abs_cost, hwi); + } while (time_after64(vtime + cost, now->vnow) && + iocg->inuse != iocg->active); + + spin_unlock_irq(&ioc->lock); + return cost; +} + static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, bool is_merge, u64 *costp) { @@ -2136,7 +2192,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; - u32 hw_active, hw_inuse; u64 abs_cost, cost, vtime; bool use_debt, ioc_locked; unsigned long flags; @@ -2154,21 +2209,8 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) return; iocg->cursor = bio_end_sector(bio); - vtime = atomic64_read(&iocg->vtime); - current_hweight(iocg, &hw_active, &hw_inuse); - - if (hw_inuse < hw_active && - time_after_eq64(vtime + ioc->margins.min, now.vnow)) { - TRACE_IOCG_PATH(inuse_reset, iocg, &now, - iocg->inuse, iocg->weight, hw_inuse, hw_active); - spin_lock_irq(&ioc->lock); - propagate_weights(iocg, iocg->weight, iocg->weight); - spin_unlock_irq(&ioc->lock); - current_hweight(iocg, &hw_active, &hw_inuse); - } - - cost = abs_cost_to_cost(abs_cost, hw_inuse); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The @@ -2190,7 +2232,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); - +retry_lock: iocg_lock(iocg, ioc_locked, &flags); /* @@ -2232,6 +2274,17 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) return; } + /* guarantee that iocgs w/ waiters have maximum inuse */ + if (iocg->inuse != iocg->active) { + if (!ioc_locked) { + iocg_unlock(iocg, false, &flags); + ioc_locked = true; + goto retry_lock; + } + propagate_weights(iocg, iocg->active, iocg->active, true, + &now); + } + /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. The timer duration is calculated based on the @@ -2274,8 +2327,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct ioc *ioc = iocg->ioc; sector_t bio_end = bio_end_sector(bio); struct ioc_now now; - u32 hw_inuse; - u64 abs_cost, cost; + u64 vtime, abs_cost, cost; unsigned long flags; /* bypass if disabled or for root cgroup */ @@ -2287,8 +2339,9 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, return; ioc_now(ioc, &now); - current_hweight(iocg, NULL, &hw_inuse); - cost = abs_cost_to_cost(abs_cost, hw_inuse); + + vtime = atomic64_read(&iocg->vtime); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && @@ -2530,7 +2583,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) } spin_lock_irqsave(&ioc->lock, flags); - weight_updated(iocg); + weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } @@ -2544,7 +2597,10 @@ static void ioc_pd_free(struct blkg_policy_data *pd) spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { - propagate_weights(iocg, 0, 0); + struct ioc_now now; + + ioc_now(ioc, &now); + propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } @@ -2612,6 +2668,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; + struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; @@ -2632,7 +2689,8 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, if (iocg) { spin_lock_irq(&iocg->ioc->lock); - weight_updated(iocg); + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); spin_unlock_irq(&iocg->ioc->lock); } } @@ -2658,7 +2716,8 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, spin_lock(&iocg->ioc->lock); iocg->cfg_weight = v * WEIGHT_ONE; - weight_updated(iocg); + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); blkg_conf_finish(&ctx); From c421a3eb2e27402c14131a14390551ae6cdb15fa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:51 -0400 Subject: [PATCH 8/8] blk-iocost: revamp debt handling Debt handling had several issues. * How much inuse a debtor carries wasn't clearly defined. inuse would be driven down over time from not issuing IOs but it'd be better to clamp it to minimum immediately once in debt. * How much can be paid off was determined by hweight_inuse. As inuse was driven down, the payment amount would fall together regardless of the debtor's active weight. This means that the debtors were punished harshly. * ioc_rqos_merge() wasn't calling blkcg_schedule_throttle() after iocg_kick_delay(). This patch revamps debt handling so that * Debt handling owns inuse for iocgs in debt and keeps them at zero. * Payment amount is determined by hweight_active. This is more deterministic and safer than hweight_inuse but still far from ideal in that it doesn't factor in possible donations from other iocgs for debt payments. This likely needs further improvements in the future. * iocg_rqos_merge() now calls blkcg_schedule_throttle() as necessary. Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe --- block/blk-iocost.c | 117 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 93 insertions(+), 24 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d09b4011449c..d2b69d87f3e7 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1206,13 +1206,13 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); u64 delta_ns, expires, oexpires; - u32 hw_inuse; + u32 hwa; lockdep_assert_held(&iocg->waitq.lock); /* debt-adjust vtime */ - current_hweight(iocg, NULL, &hw_inuse); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + current_hweight(iocg, &hwa, NULL); + vtime += abs_cost_to_cost(iocg->abs_vdebt, hwa); /* * Clear or maintain depending on the overage. Non-zero vdebt is what @@ -1258,6 +1258,47 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, + struct ioc_now *now) +{ + struct iocg_pcpu_stat *gcs; + + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + WARN_ON_ONCE(list_empty(&iocg->active_list)); + + /* + * Once in debt, debt handling owns inuse. @iocg stays at the minimum + * inuse donating all of it share to others until its debt is paid off. + */ + if (!iocg->abs_vdebt && abs_cost) + propagate_weights(iocg, iocg->active, 0, false, now); + + iocg->abs_vdebt += abs_cost; + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, + struct ioc_now *now) +{ + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + + /* make sure that nobody messed with @iocg */ + WARN_ON_ONCE(list_empty(&iocg->active_list)); + WARN_ON_ONCE(iocg->inuse > 1); + + iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); + + /* if debt is paid in full, restore inuse */ + if (!iocg->abs_vdebt) + propagate_weights(iocg, iocg->active, iocg->last_inuse, + false, now); +} + static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { @@ -1296,26 +1337,25 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 vshortage, expires, oexpires; s64 vbudget; - u32 hw_inuse; + u32 hwa; lockdep_assert_held(&iocg->waitq.lock); - current_hweight(iocg, NULL, &hw_inuse); + current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ if (pay_debt && iocg->abs_vdebt && vbudget > 0) { - u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - u64 delta = min_t(u64, vbudget, vdebt); - u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), - iocg->abs_vdebt); + u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); + u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); + u64 vpay = abs_cost_to_cost(abs_vpay, hwa); lockdep_assert_held(&ioc->lock); - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - iocg->abs_vdebt -= abs_delta; - vbudget -= vdebt; + atomic64_add(vpay, &iocg->vtime); + atomic64_add(vpay, &iocg->done_vtime); + iocg_pay_debt(iocg, abs_vpay, now); + vbudget -= vpay; iocg_kick_delay(iocg, now); } @@ -1327,17 +1367,20 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, * not positive. */ if (iocg->abs_vdebt) { - s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); vbudget = min_t(s64, 0, vbudget - vdebt); } /* - * Wake up the ones which are due and see how much vtime we'll need - * for the next one. + * Wake up the ones which are due and see how much vtime we'll need for + * the next one. As paying off debt restores hw_inuse, it must be read + * after the above debt payment. */ - ctx.hw_inuse = hw_inuse; ctx.vbudget = vbudget; + current_hweight(iocg, NULL, &ctx.hw_inuse); + __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); + if (!waitqueue_active(&iocg->waitq)) return; if (WARN_ON_ONCE(ctx.vbudget >= 0)) @@ -1525,6 +1568,10 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, u64 vtime = atomic64_read(&iocg->vtime); s64 excess, delta, target, new_hwi; + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return 1; + /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || time_after64(vtime, now->vnow - ioc->margins.min)) @@ -1798,6 +1845,18 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; u32 inuse; + /* + * In-debt iocgs participated in the donation calculation with + * the minimum target hweight_inuse. Configuring inuse + * accordingly would work fine but debt handling expects + * @iocg->inuse stay at the minimum and we don't wanna + * interfere. + */ + if (iocg->abs_vdebt) { + WARN_ON_ONCE(iocg->inuse > 1); + continue; + } + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, @@ -2081,6 +2140,10 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return cost; + /* * We only increase inuse during period and do so iff the margin has * deteriorated since the previous adjustment. @@ -2092,7 +2155,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, spin_lock_irq(&ioc->lock); /* we own inuse only when @iocg is in the normal active state */ - if (list_empty(&iocg->active_list)) { + if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { spin_unlock_irq(&ioc->lock); return cost; } @@ -2266,7 +2329,7 @@ retry_lock: * penalizing the cgroup and its descendants. */ if (use_debt) { - iocg->abs_vdebt += abs_cost; + iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); @@ -2275,7 +2338,7 @@ retry_lock: } /* guarantee that iocgs w/ waiters have maximum inuse */ - if (iocg->inuse != iocg->active) { + if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { if (!ioc_locked) { iocg_unlock(iocg, false, &flags); ioc_locked = true; @@ -2363,14 +2426,20 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. */ - spin_lock_irqsave(&iocg->waitq.lock, flags); + spin_lock_irqsave(&ioc->lock, flags); + spin_lock(&iocg->waitq.lock); + if (likely(!list_empty(&iocg->active_list))) { - iocg->abs_vdebt += abs_cost; - iocg_kick_delay(iocg, &now); + iocg_incur_debt(iocg, abs_cost, &now); + if (iocg_kick_delay(iocg, &now)) + blkcg_schedule_throttle(rqos->q, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); } - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)