ACK: [SRU][Xenial][PATCH] sched/fair: Fix O(nr_cgroups) in load balance path
Stefan Bader
stefan.bader at canonical.com
Wed Feb 28 10:16:38 UTC 2018
On 07.02.2018 13:06, Gavin Guo wrote:
> From: Tejun Heo <tj at kernel.org>
>
> BugLink: https://bugs.launchpad.net/bugs/1747896
>
> Currently, rq->leaf_cfs_rq_list is a traversal-ordered list of all
> live cfs_rqs which have ever been active on the CPU; unfortunately,
> this makes update_blocked_averages() O(# total cgroups), which isn't
> scalable at all.
>
> This shows up as a small increase in CPU consumption and scheduling
> latency in the load-balancing path on systems with the CPU controller
> enabled across most cgroups. In an edge case where temporary cgroups
> were leaking, this caused the kernel to spend several tens of percent
> of its CPU cycles running update_blocked_averages(), with each run
> taking multiple milliseconds.
>
> This patch fixes the issue by taking empty and fully decayed cfs_rqs
> off the rq->leaf_cfs_rq_list.
>
> Signed-off-by: Tejun Heo <tj at kernel.org>
> [ Added cfs_rq_is_decayed() ]
> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
> Acked-by: Vincent Guittot <vincent.guittot at linaro.org>
> Cc: Chris Mason <clm at fb.com>
> Cc: Linus Torvalds <torvalds at linux-foundation.org>
> Cc: Mike Galbraith <efault at gmx.de>
> Cc: Paul Turner <pjt at google.com>
> Cc: Peter Zijlstra <peterz at infradead.org>
> Cc: Thomas Gleixner <tglx at linutronix.de>
> Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
> Signed-off-by: Ingo Molnar <mingo at kernel.org>
>
> (backported from commit a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5)
> Signed-off-by: Gavin Guo <gavin.guo at canonical.com>
Acked-by: Stefan Bader <stefan.bader at canonical.com>
> ---
Positive testing.
> kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 39 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c1e78e9c8f05..0c711cb4fbe8 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -314,8 +314,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> }
>
> /* Iterate thr' all leaf cfs_rq's on a runqueue */
> -#define for_each_leaf_cfs_rq(rq, cfs_rq) \
> - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
> +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
> + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
> + leaf_cfs_rq_list)
>
> /* Do the two (enqueued) entities belong to the same group ? */
> static inline struct cfs_rq *
> @@ -408,8 +409,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> {
> }
>
> -#define for_each_leaf_cfs_rq(rq, cfs_rq) \
> - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
> +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
> + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
>
> static inline struct sched_entity *parent_entity(struct sched_entity *se)
> {
> @@ -4065,9 +4066,9 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>
> static void __maybe_unused update_runtime_enabled(struct rq *rq)
> {
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
>
> - for_each_leaf_cfs_rq(rq, cfs_rq) {
> + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
> struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
>
> raw_spin_lock(&cfs_b->lock);
> @@ -4078,9 +4079,9 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
>
> static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
> {
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
>
> - for_each_leaf_cfs_rq(rq, cfs_rq) {
> + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
> if (!cfs_rq->runtime_enabled)
> continue;
>
> @@ -5966,10 +5967,28 @@ static void attach_tasks(struct lb_env *env)
> }
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> +
> +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> +{
> + if (cfs_rq->load.weight)
> + return false;
> +
> + if (cfs_rq->avg.load_sum)
> + return false;
> +
> + if (cfs_rq->avg.util_sum)
> + return false;
> +
> + if (cfs_rq->runnable_load_sum)
> + return false;
> +
> + return true;
> +}
> +
> static void update_blocked_averages(int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
> unsigned long flags;
>
> raw_spin_lock_irqsave(&rq->lock, flags);
> @@ -5979,13 +5998,21 @@ static void update_blocked_averages(int cpu)
> * Iterates the task_group tree in a bottom up fashion, see
> * list_add_leaf_cfs_rq() for details.
> */
> - for_each_leaf_cfs_rq(rq, cfs_rq) {
> + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
> +
> /* throttled entities do not contribute to load */
> if (throttled_hierarchy(cfs_rq))
> continue;
>
> if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
> update_tg_load_avg(cfs_rq, 0);
> +
> + /*
> + * There can be a lot of idle CPU cgroups. Don't let fully
> + * decayed cfs_rqs linger on the list.
> + */
> + if (cfs_rq_is_decayed(cfs_rq))
> + list_del_leaf_cfs_rq(cfs_rq);
> }
> raw_spin_unlock_irqrestore(&rq->lock, flags);
> }
> @@ -8386,10 +8413,10 @@ const struct sched_class fair_sched_class = {
> #ifdef CONFIG_SCHED_DEBUG
> void print_cfs_stats(struct seq_file *m, int cpu)
> {
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
>
> rcu_read_lock();
> - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
> + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
> print_cfs_rq(m, cpu, cfs_rq);
> rcu_read_unlock();
> }
>
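For anyone reviewing the backport, the key mechanical change is that the
iteration macro becomes a list_for_each_entry_safe() variant, so that
entries can be unlinked while the list is being walked. Below is a
minimal, hypothetical userspace sketch (plain C, not kernel code; the
struct layout and names are invented for illustration) of that pattern:
cache the successor before deciding whether to drop the current entry,
so pruning fully decayed entries keeps later traversals short.

    /*
     * Hypothetical userspace sketch, not kernel code: the struct and
     * names are invented to illustrate the "safe" walk-and-unlink
     * pattern used by for_each_leaf_cfs_rq_safe() above.
     */
    #include <stdio.h>

    struct node {
        int fully_decayed;      /* stand-in for cfs_rq_is_decayed() */
        struct node *next;
    };

    int main(void)
    {
        /* three entries: the first and last are fully decayed */
        struct node c = { 1, NULL };
        struct node b = { 0, &c };
        struct node a = { 1, &b };
        struct node *head = &a, **prevp = &head;

        /* safe traversal: remember the successor before possibly unlinking */
        for (struct node *pos = head, *next; pos; pos = next) {
            next = pos->next;
            if (pos->fully_decayed) {
                *prevp = next;  /* drop the decayed entry from the list */
                continue;
            }
            prevp = &pos->next;
        }

        /* only the still-active entry is left for future walks */
        for (struct node *pos = head; pos; pos = pos->next)
            printf("remaining entry, fully_decayed=%d\n",
                   pos->fully_decayed);
        return 0;
    }

In the actual patch the same role is played by list_del_leaf_cfs_rq()
on any cfs_rq for which cfs_rq_is_decayed() returns true, which is what
bounds update_blocked_averages() by the number of active cgroups rather
than every cgroup that has ever run on the CPU.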