ACK: [SRU][Xenial][PATCH] sched/fair: Fix O(nr_cgroups) in load balance path
Stefan Bader
stefan.bader at canonical.com
Wed Feb 28 10:16:38 UTC 2018
On 07.02.2018 13:06, Gavin Guo wrote:
> From: Tejun Heo <tj at kernel.org>
>
> BugLink: https://bugs.launchpad.net/bugs/1747896
>
> Currently, rq->leaf_cfs_rq_list is a traversal-ordered list of all
> live cfs_rqs which have ever been active on the CPU; unfortunately,
> this makes update_blocked_averages() O(# total cgroups), which isn't
> scalable at all.
>
> This shows up as a small increase in CPU consumption and scheduling
> latency in the load-balancing path on systems with the CPU controller
> enabled across most cgroups. In an edge case where temporary cgroups
> were leaking, this caused the kernel to spend several tens of percent
> of its CPU cycles running update_blocked_averages(), with each run
> taking multiple milliseconds.
>
> This patch fixes the issue by taking empty and fully decayed cfs_rqs
> off the rq->leaf_cfs_rq_list.
>
> Signed-off-by: Tejun Heo <tj at kernel.org>
> [ Added cfs_rq_is_decayed() ]
> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
> Acked-by: Vincent Guittot <vincent.guittot at linaro.org>
> Cc: Chris Mason <clm at fb.com>
> Cc: Linus Torvalds <torvalds at linux-foundation.org>
> Cc: Mike Galbraith <efault at gmx.de>
> Cc: Paul Turner <pjt at google.com>
> Cc: Peter Zijlstra <peterz at infradead.org>
> Cc: Thomas Gleixner <tglx at linutronix.de>
> Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
> Signed-off-by: Ingo Molnar <mingo at kernel.org>
>
> (backported from commit a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5)
> Signed-off-by: Gavin Guo <gavin.guo at canonical.com>
Acked-by: Stefan Bader <stefan.bader at canonical.com>
> ---
Positive testing.
> kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 39 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c1e78e9c8f05..0c711cb4fbe8 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -314,8 +314,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> }
>
> /* Iterate thr' all leaf cfs_rq's on a runqueue */
> -#define for_each_leaf_cfs_rq(rq, cfs_rq) \
> - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
> +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
> + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
> + leaf_cfs_rq_list)
>
> /* Do the two (enqueued) entities belong to the same group ? */
> static inline struct cfs_rq *
> @@ -408,8 +409,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> {
> }
>
> -#define for_each_leaf_cfs_rq(rq, cfs_rq) \
> - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
> +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
> + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
>
> static inline struct sched_entity *parent_entity(struct sched_entity *se)
> {
> @@ -4065,9 +4066,9 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>
> static void __maybe_unused update_runtime_enabled(struct rq *rq)
> {
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
>
> - for_each_leaf_cfs_rq(rq, cfs_rq) {
> + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
> struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
>
> raw_spin_lock(&cfs_b->lock);
> @@ -4078,9 +4079,9 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
>
> static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
> {
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
>
> - for_each_leaf_cfs_rq(rq, cfs_rq) {
> + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
> if (!cfs_rq->runtime_enabled)
> continue;
>
> @@ -5966,10 +5967,28 @@ static void attach_tasks(struct lb_env *env)
> }
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> +
> +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> +{
> + if (cfs_rq->load.weight)
> + return false;
> +
> + if (cfs_rq->avg.load_sum)
> + return false;
> +
> + if (cfs_rq->avg.util_sum)
> + return false;
> +
> + if (cfs_rq->runnable_load_sum)
> + return false;
> +
> + return true;
> +}
> +
> static void update_blocked_averages(int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
> unsigned long flags;
>
> raw_spin_lock_irqsave(&rq->lock, flags);
> @@ -5979,13 +5998,21 @@ static void update_blocked_averages(int cpu)
> * Iterates the task_group tree in a bottom up fashion, see
> * list_add_leaf_cfs_rq() for details.
> */
> - for_each_leaf_cfs_rq(rq, cfs_rq) {
> + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
> +
> /* throttled entities do not contribute to load */
> if (throttled_hierarchy(cfs_rq))
> continue;
>
> if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
> update_tg_load_avg(cfs_rq, 0);
> +
> + /*
> + * There can be a lot of idle CPU cgroups. Don't let fully
> + * decayed cfs_rqs linger on the list.
> + */
> + if (cfs_rq_is_decayed(cfs_rq))
> + list_del_leaf_cfs_rq(cfs_rq);
> }
> raw_spin_unlock_irqrestore(&rq->lock, flags);
> }
> @@ -8386,10 +8413,10 @@ const struct sched_class fair_sched_class = {
> #ifdef CONFIG_SCHED_DEBUG
> void print_cfs_stats(struct seq_file *m, int cpu)
> {
> - struct cfs_rq *cfs_rq;
> + struct cfs_rq *cfs_rq, *pos;
>
> rcu_read_lock();
> - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
> + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
> print_cfs_rq(m, cpu, cfs_rq);
> rcu_read_unlock();
> }
>
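For anyone reviewing the backport, the key mechanical change is that the
iteration macro becomes a list_for_each_entry_safe() variant, so that
entries can be unlinked while the list is being walked. Below is a
minimal, hypothetical userspace sketch (plain C, not kernel code; the
struct layout and names are invented for illustration) of that pattern:
cache the successor before deciding whether to drop the current entry,
so pruning fully decayed entries keeps later traversals short.

    /*
     * Hypothetical userspace sketch, not kernel code: the struct and
     * names are invented to illustrate the "safe" walk-and-unlink
     * pattern used by for_each_leaf_cfs_rq_safe() above.
     */
    #include <stdio.h>

    struct node {
        int fully_decayed;      /* stand-in for cfs_rq_is_decayed() */
        struct node *next;
    };

    int main(void)
    {
        /* three entries: the first and last are fully decayed */
        struct node c = { 1, NULL };
        struct node b = { 0, &c };
        struct node a = { 1, &b };
        struct node *head = &a, **prevp = &head;

        /* safe traversal: remember the successor before possibly unlinking */
        for (struct node *pos = head, *next; pos; pos = next) {
            next = pos->next;
            if (pos->fully_decayed) {
                *prevp = next;  /* drop the decayed entry from the list */
                continue;
            }
            prevp = &pos->next;
        }

        /* only the still-active entry is left for future walks */
        for (struct node *pos = head; pos; pos = pos->next)
            printf("remaining entry, fully_decayed=%d\n",
                   pos->fully_decayed);
        return 0;
    }

In the actual patch the same role is played by list_del_leaf_cfs_rq()
on any cfs_rq for which cfs_rq_is_decayed() returns true, which is what
bounds update_blocked_averages() by the number of active cgroups rather
than every cgroup that has ever run on the CPU.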