Merge tag 'v3.14' into p/abusse/merge_upgrade
[projects/modsched/linux.git] / kernel / sched / cfs / fair.c
index 7c70201..9b4c4f3 100644 (file)
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
        update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST   (~0UL)
-#else
-# define WMULT_CONST   (1UL << 32)
-#endif
-
+#define WMULT_CONST    (~0U)
 #define WMULT_SHIFT    32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+       unsigned long w;
+
+       if (likely(lw->inv_weight))
+               return;
+
+       w = scale_load_down(lw->weight);
+
+       if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+               lw->inv_weight = 1;
+       else if (unlikely(!w))
+               lw->inv_weight = WMULT_CONST;
+       else
+               lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-               struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-       u64 tmp;
-
-       /*
-        * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-        * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-        * 2^SCHED_LOAD_RESOLUTION.
-        */
-       if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-               tmp = (u64)delta_exec * scale_load_down(weight);
-       else
-               tmp = (u64)delta_exec;
+       u64 fact = scale_load_down(weight);
+       int shift = WMULT_SHIFT;
 
-       if (!lw->inv_weight) {
-               unsigned long w = scale_load_down(lw->weight);
+       __update_inv_weight(lw);
 
-               if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-                       lw->inv_weight = 1;
-               else if (unlikely(!w))
-                       lw->inv_weight = WMULT_CONST;
-               else
-                       lw->inv_weight = WMULT_CONST / w;
+       if (unlikely(fact >> 32)) {
+               while (fact >> 32) {
+                       fact >>= 1;
+                       shift--;
+               }
        }
 
-       /*
-        * Check whether we'd overflow the 64-bit multiplication:
-        */
-       if (unlikely(tmp > WMULT_CONST))
-               tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                       WMULT_SHIFT/2);
-       else
-               tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+       /* hint to use a 32x32->64 mul */
+       fact = (u64)(u32)fact * lw->inv_weight;
+
+       while (fact >> 32) {
+               fact >>= 1;
+               shift--;
+       }
 
-       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+       return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
        if (unlikely(se->load.weight != NICE_0_LOAD))
-               delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+               delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
        return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        update_load_add(&lw, se->load.weight);
                        load = &lw;
                }
-               slice = calc_delta_mine(slice, se->load.weight, load);
+               slice = __calc_delta(slice, se->load.weight, load);
        }
        return slice;
 }
@@ -681,6 +682,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
 static inline void __update_task_entity_contrib(struct sched_entity *se);
 
 /* Give new task start runnable values to heavy its load in infant time */
@@ -701,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-             unsigned long delta_exec)
-{
-       unsigned long delta_exec_weighted;
-
-       schedstat_set(curr->statistics.exec_max,
-                     max((u64)delta_exec, curr->statistics.exec_max));
-
-       curr->sum_exec_runtime += delta_exec;
-       schedstat_add(cfs_rq, exec_clock, delta_exec);
-       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-       curr->vruntime += delta_exec_weighted;
-       update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq->curr;
        u64 now = rq_clock_task(rq_of(cfs_rq));
-       unsigned long delta_exec;
+       u64 delta_exec;
 
        if (unlikely(!curr))
                return;
 
-       /*
-        * Get the amount of time the current task was running
-        * since the last time we changed load (this cannot
-        * overflow on 32 bits):
-        */
-       delta_exec = (unsigned long)(now - curr->exec_start);
-       if (!delta_exec)
+       delta_exec = now - curr->exec_start;
+       if (unlikely((s64)delta_exec <= 0))
                return;
 
-       __update_curr(cfs_rq, curr, delta_exec);
        curr->exec_start = now;
 
+       schedstat_set(curr->statistics.exec_max,
+                     max(delta_exec, curr->statistics.exec_max));
+
+       curr->sum_exec_runtime += delta_exec;
+       schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+       curr->vruntime += calc_delta_fair(delta_exec, curr);
+       update_min_vruntime(cfs_rq);
+
        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);
 
@@ -818,11 +806,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_NUMA_BALANCING
 /*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the task's virtual memory size and
+ * numa_balancing_scan_size.
  */
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +819,830 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
-static void task_numa_placement(struct task_struct *p)
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
-       int seq;
+       unsigned long rss = 0;
+       unsigned long nr_scan_pages;
 
-       if (!p->mm)     /* for example, ksmd faulting in a user's mm */
+       /*
+        * Calculations based on RSS as non-present and empty pages are skipped
+        * by the PTE scanner and NUMA hinting faults should be trapped based
+        * on resident pages
+        */
+       nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+       rss = get_mm_rss(p->mm);
+       if (!rss)
+               rss = nr_scan_pages;
+
+       rss = round_up(rss, nr_scan_pages);
+       return rss / nr_scan_pages;
+}
+
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+       unsigned int scan, floor;
+       unsigned int windows = 1;
+
+       if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+               windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+       floor = 1000 / windows;
+
+       scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+       return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+       unsigned int smin = task_scan_min(p);
+       unsigned int smax;
+
+       /* Watch for min being higher than max due to floor calculations */
+       smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+       return max(smin, smax);
+}
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running += (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+       atomic_t refcount;
+
+       spinlock_t lock; /* nr_tasks, tasks */
+       int nr_tasks;
+       pid_t gid;
+       struct list_head task_list;
+
+       struct rcu_head rcu;
+       unsigned long total_faults;
+       unsigned long faults[0];
+};
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+       return p->numa_group ? p->numa_group->gid : 0;
+}
+
+static inline int task_faults_idx(int nid, int priv)
+{
+       return 2 * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+       if (!p->numa_faults)
+               return 0;
+
+       return p->numa_faults[task_faults_idx(nid, 0)] +
+               p->numa_faults[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+       if (!p->numa_group)
+               return 0;
+
+       return p->numa_group->faults[task_faults_idx(nid, 0)] +
+               p->numa_group->faults[task_faults_idx(nid, 1)];
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node.  The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+       unsigned long total_faults;
+
+       if (!p->numa_faults)
+               return 0;
+
+       total_faults = p->total_numa_faults;
+
+       if (!total_faults)
+               return 0;
+
+       return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+       if (!p->numa_group || !p->numa_group->total_faults)
+               return 0;
+
+       return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+       unsigned long nr_running;
+       unsigned long load;
+
+       /* Total compute capacity of CPUs on a node */
+       unsigned long power;
+
+       /* Approximate capacity in terms of runnable tasks on a node */
+       unsigned long capacity;
+       int has_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+       int cpu, cpus = 0;
+
+       memset(ns, 0, sizeof(*ns));
+       for_each_cpu(cpu, cpumask_of_node(nid)) {
+               struct rq *rq = cpu_rq(cpu);
+
+               ns->nr_running += rq->nr_running;
+               ns->load += weighted_cpuload(cpu);
+               ns->power += power_of(cpu);
+
+               cpus++;
+       }
+
+       /*
+        * If we raced with hotplug and there are no CPUs left in our mask
+        * the @ns structure is NULL'ed and task_numa_compare() will
+        * not find this node attractive.
+        *
+        * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+        * and bail there.
+        */
+       if (!cpus)
                return;
+
+       ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+       ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+       ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
+struct task_numa_env {
+       struct task_struct *p;
+
+       int src_cpu, src_nid;
+       int dst_cpu, dst_nid;
+
+       struct numa_stats src_stats, dst_stats;
+
+       int imbalance_pct;
+
+       struct task_struct *best_task;
+       long best_imp;
+       int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+                            struct task_struct *p, long imp)
+{
+       if (env->best_task)
+               put_task_struct(env->best_task);
+       if (p)
+               get_task_struct(p);
+
+       env->best_task = p;
+       env->best_imp = imp;
+       env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source task was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env,
+                             long taskimp, long groupimp)
+{
+       struct rq *src_rq = cpu_rq(env->src_cpu);
+       struct rq *dst_rq = cpu_rq(env->dst_cpu);
+       struct task_struct *cur;
+       long dst_load, src_load;
+       long load;
+       long imp = (groupimp > 0) ? groupimp : taskimp;
+
+       rcu_read_lock();
+       cur = ACCESS_ONCE(dst_rq->curr);
+       if (cur->pid == 0) /* idle */
+               cur = NULL;
+
+       /*
+        * "imp" is the fault differential for the source task between the
+        * source and destination node. Calculate the total differential for
+        * the source task and potential destination task. The more negative
+        * the value is, the more remote accesses that would be expected to
+        * be incurred if the tasks were swapped.
+        */
+       if (cur) {
+               /* Skip this swap candidate if it cannot move to the source cpu */
+               if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+                       goto unlock;
+
+               /*
+                * If dst and source tasks are in the same NUMA group, or not
+                * in any group then look only at task weights.
+                */
+               if (cur->numa_group == env->p->numa_group) {
+                       imp = taskimp + task_weight(cur, env->src_nid) -
+                             task_weight(cur, env->dst_nid);
+                       /*
+                        * Add some hysteresis to prevent swapping the
+                        * tasks within a group over tiny differences.
+                        */
+                       if (cur->numa_group)
+                               imp -= imp/16;
+               } else {
+                       /*
+                        * Compare the group weights. If a task is all by
+                        * itself (not part of a group), use the task weight
+                        * instead.
+                        */
+                       if (env->p->numa_group)
+                               imp = groupimp;
+                       else
+                               imp = taskimp;
+
+                       if (cur->numa_group)
+                               imp += group_weight(cur, env->src_nid) -
+                                      group_weight(cur, env->dst_nid);
+                       else
+                               imp += task_weight(cur, env->src_nid) -
+                                      task_weight(cur, env->dst_nid);
+               }
+       }
+
+       if (imp < env->best_imp)
+               goto unlock;
+
+       if (!cur) {
+               /* Is there capacity at our destination? */
+               if (env->src_stats.has_capacity &&
+                   !env->dst_stats.has_capacity)
+                       goto unlock;
+
+               goto balance;
+       }
+
+       /* Balance doesn't matter much if we're running a task per cpu */
+       if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+               goto assign;
+
+       /*
+        * In the overloaded case, try and keep the load balanced.
+        */
+balance:
+       dst_load = env->dst_stats.load;
+       src_load = env->src_stats.load;
+
+       /* XXX missing power terms */
+       load = task_h_load(env->p);
+       dst_load += load;
+       src_load -= load;
+
+       if (cur) {
+               load = task_h_load(cur);
+               dst_load -= load;
+               src_load += load;
+       }
+
+       /* make src_load the smaller */
+       if (dst_load < src_load)
+               swap(dst_load, src_load);
+
+       if (src_load * env->imbalance_pct < dst_load * 100)
+               goto unlock;
+
+assign:
+       task_numa_assign(env, cur, imp);
+unlock:
+       rcu_read_unlock();
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+                               long taskimp, long groupimp)
+{
+       int cpu;
+
+       for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+               /* Skip this CPU if the source task cannot migrate */
+               if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+                       continue;
+
+               env->dst_cpu = cpu;
+               task_numa_compare(env, taskimp, groupimp);
+       }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+       struct task_numa_env env = {
+               .p = p,
+
+               .src_cpu = task_cpu(p),
+               .src_nid = task_node(p),
+
+               .imbalance_pct = 112,
+
+               .best_task = NULL,
+               .best_imp = 0,
+               .best_cpu = -1
+       };
+       struct sched_domain *sd;
+       unsigned long taskweight, groupweight;
+       int nid, ret;
+       long taskimp, groupimp;
+
+       /*
+        * Pick the lowest SD_NUMA domain, as that would have the smallest
+        * imbalance and would be the first to start moving tasks about.
+        *
+        * And we want to avoid any moving of tasks about, as that would create
+        * random movement of tasks -- counter the numa conditions we're trying
+        * to satisfy here.
+        */
+       rcu_read_lock();
+       sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+       if (sd)
+               env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+       rcu_read_unlock();
+
+       /*
+        * Cpusets can break the scheduler domain tree into smaller
+        * balance domains, some of which do not cross NUMA boundaries.
+        * Tasks that are "trapped" in such domains cannot be migrated
+        * elsewhere, so there is no point in (re)trying.
+        */
+       if (unlikely(!sd)) {
+               p->numa_preferred_nid = task_node(p);
+               return -EINVAL;
+       }
+
+       taskweight = task_weight(p, env.src_nid);
+       groupweight = group_weight(p, env.src_nid);
+       update_numa_stats(&env.src_stats, env.src_nid);
+       env.dst_nid = p->numa_preferred_nid;
+       taskimp = task_weight(p, env.dst_nid) - taskweight;
+       groupimp = group_weight(p, env.dst_nid) - groupweight;
+       update_numa_stats(&env.dst_stats, env.dst_nid);
+
+       /* If the preferred nid has capacity, try to use it. */
+       if (env.dst_stats.has_capacity)
+               task_numa_find_cpu(&env, taskimp, groupimp);
+
+       /* No space available on the preferred nid. Look elsewhere. */
+       if (env.best_cpu == -1) {
+               for_each_online_node(nid) {
+                       if (nid == env.src_nid || nid == p->numa_preferred_nid)
+                               continue;
+
+                       /* Skip nodes where both the task and its group would lose */
+                       taskimp = task_weight(p, nid) - taskweight;
+                       groupimp = group_weight(p, nid) - groupweight;
+                       if (taskimp < 0 && groupimp < 0)
+                               continue;
+
+                       env.dst_nid = nid;
+                       update_numa_stats(&env.dst_stats, env.dst_nid);
+                       task_numa_find_cpu(&env, taskimp, groupimp);
+               }
+       }
+
+       /* No better CPU than the current one was found. */
+       if (env.best_cpu == -1)
+               return -EAGAIN;
+
+       sched_setnuma(p, env.dst_nid);
+
+       /*
+        * Reset the scan period if the task is being rescheduled on an
+        * alternative node to recheck if the task is now properly placed.
+        */
+       p->numa_scan_period = task_scan_min(p);
+
+       if (env.best_task == NULL) {
+               ret = migrate_task_to(p, env.best_cpu);
+               if (ret != 0)
+                       trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+               return ret;
+       }
+
+       ret = migrate_swap(p, env.best_task);
+       if (ret != 0)
+               trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+       put_task_struct(env.best_task);
+       return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+       /* This task has no NUMA fault statistics yet */
+       if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+               return;
+
+       /* Periodically retry migrating the task to the preferred node */
+       p->numa_migrate_retry = jiffies + HZ;
+
+       /* Success if task is already running on preferred node */
+       if (task_node(p) == p->numa_preferred_nid)
+               return;
+
+       /* Otherwise, try migrate to a CPU on the preferred node */
+       task_numa_migrate(p);
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+                       unsigned long shared, unsigned long private)
+{
+       unsigned int period_slot;
+       int ratio;
+       int diff;
+
+       unsigned long remote = p->numa_faults_locality[0];
+       unsigned long local = p->numa_faults_locality[1];
+
+       /*
+        * If there were no recorded hinting faults then either the task is
+        * completely idle or all activity is in areas that are not of interest
+        * to automatic numa balancing. Scan slower.
+        */
+       if (local + shared == 0) {
+               p->numa_scan_period = min(p->numa_scan_period_max,
+                       p->numa_scan_period << 1);
+
+               p->mm->numa_next_scan = jiffies +
+                       msecs_to_jiffies(p->numa_scan_period);
+
+               return;
+       }
+
+       /*
+        * Prepare to scale scan period relative to the current period.
+        *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+        *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower),
+        *          by at least one slot even when ratio == NUMA_PERIOD_THRESHOLD
+        */
+       period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+       ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+       if (ratio >= NUMA_PERIOD_THRESHOLD) {
+               int slot = ratio - NUMA_PERIOD_THRESHOLD;
+               if (!slot)
+                       slot = 1;
+               diff = slot * period_slot;
+       } else {
+               diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+               /*
+                * Scale scan rate increases based on sharing. There is an
+                * inverse relationship between the degree of sharing and
+                * the adjustment made to the scanning period. Broadly
+                * speaking the intent is that there is little point
+                * scanning faster if shared accesses dominate as it may
+                * simply bounce migrations uselessly
+                */
+               ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+               diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+       }
+
+       p->numa_scan_period = clamp(p->numa_scan_period + diff,
+                       task_scan_min(p), task_scan_max(p));
+       memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq, nid, max_nid = -1, max_group_nid = -1;
+       unsigned long max_faults = 0, max_group_faults = 0;
+       unsigned long fault_types[2] = { 0, 0 };
+       spinlock_t *group_lock = NULL;
+
        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;
+       p->numa_scan_period_max = task_scan_max(p);
+
+       /* If the task is part of a group prevent parallel updates to group stats */
+       if (p->numa_group) {
+               group_lock = &p->numa_group->lock;
+               spin_lock(group_lock);
+       }
+
+       /* Find the node with the highest number of faults */
+       for_each_online_node(nid) {
+               unsigned long faults = 0, group_faults = 0;
+               int priv, i;
+
+               for (priv = 0; priv < 2; priv++) {
+                       long diff;
+
+                       i = task_faults_idx(nid, priv);
+                       diff = -p->numa_faults[i];
+
+                       /* Decay existing window, copy faults since last scan */
+                       p->numa_faults[i] >>= 1;
+                       p->numa_faults[i] += p->numa_faults_buffer[i];
+                       fault_types[priv] += p->numa_faults_buffer[i];
+                       p->numa_faults_buffer[i] = 0;
+
+                       faults += p->numa_faults[i];
+                       diff += p->numa_faults[i];
+                       p->total_numa_faults += diff;
+                       if (p->numa_group) {
+                               /* safe because we can only change our own group */
+                               p->numa_group->faults[i] += diff;
+                               p->numa_group->total_faults += diff;
+                               group_faults += p->numa_group->faults[i];
+                       }
+               }
+
+               if (faults > max_faults) {
+                       max_faults = faults;
+                       max_nid = nid;
+               }
+
+               if (group_faults > max_group_faults) {
+                       max_group_faults = group_faults;
+                       max_group_nid = nid;
+               }
+       }
+
+       update_task_scan_period(p, fault_types[0], fault_types[1]);
+
+       if (p->numa_group) {
+               /*
+                * If the preferred task and group nids are different,
+                * iterate over the nodes again to find the best place.
+                */
+               if (max_nid != max_group_nid) {
+                       unsigned long weight, max_weight = 0;
+
+                       for_each_online_node(nid) {
+                               weight = task_weight(p, nid) + group_weight(p, nid);
+                               if (weight > max_weight) {
+                                       max_weight = weight;
+                                       max_nid = nid;
+                               }
+                       }
+               }
+
+               spin_unlock(group_lock);
+       }
+
+       /* Preferred node as the node with the most faults */
+       if (max_faults && max_nid != p->numa_preferred_nid) {
+               /* Update the preferred nid and migrate task if possible */
+               sched_setnuma(p, max_nid);
+               numa_migrate_preferred(p);
+       }
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+       return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+       if (atomic_dec_and_test(&grp->refcount))
+               kfree_rcu(grp, rcu);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+                       int *priv)
+{
+       struct numa_group *grp, *my_grp;
+       struct task_struct *tsk;
+       bool join = false;
+       int cpu = cpupid_to_cpu(cpupid);
+       int i;
+
+       if (unlikely(!p->numa_group)) {
+               unsigned int size = sizeof(struct numa_group) +
+                                   2*nr_node_ids*sizeof(unsigned long);
+
+               grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+               if (!grp)
+                       return;
+
+               atomic_set(&grp->refcount, 1);
+               spin_lock_init(&grp->lock);
+               INIT_LIST_HEAD(&grp->task_list);
+               grp->gid = p->pid;
+
+               for (i = 0; i < 2*nr_node_ids; i++)
+                       grp->faults[i] = p->numa_faults[i];
+
+               grp->total_faults = p->total_numa_faults;
+
+               list_add(&p->numa_entry, &grp->task_list);
+               grp->nr_tasks++;
+               rcu_assign_pointer(p->numa_group, grp);
+       }
+
+       rcu_read_lock();
+       tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+       if (!cpupid_match_pid(tsk, cpupid))
+               goto no_join;
+
+       grp = rcu_dereference(tsk->numa_group);
+       if (!grp)
+               goto no_join;
+
+       my_grp = p->numa_group;
+       if (grp == my_grp)
+               goto no_join;
+
+       /*
+        * Only join the other group if it's bigger; if we're the bigger group,
+        * the other task will join us.
+        */
+       if (my_grp->nr_tasks > grp->nr_tasks)
+               goto no_join;
+
+       /*
+        * Tie-break on the grp address.
+        */
+       if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+               goto no_join;
+
+       /* Always join threads in the same process. */
+       if (tsk->mm == current->mm)
+               join = true;
+
+       /* Simple filter to avoid false positives due to PID collisions */
+       if (flags & TNF_SHARED)
+               join = true;
+
+       /* Update priv based on whether false sharing was detected */
+       *priv = !join;
+
+       if (join && !get_numa_group(grp))
+               goto no_join;
+
+       rcu_read_unlock();
+
+       if (!join)
+               return;
+
+       double_lock(&my_grp->lock, &grp->lock);
+
+       for (i = 0; i < 2*nr_node_ids; i++) {
+               my_grp->faults[i] -= p->numa_faults[i];
+               grp->faults[i] += p->numa_faults[i];
+       }
+       my_grp->total_faults -= p->total_numa_faults;
+       grp->total_faults += p->total_numa_faults;
+
+       list_move(&p->numa_entry, &grp->task_list);
+       my_grp->nr_tasks--;
+       grp->nr_tasks++;
+
+       spin_unlock(&my_grp->lock);
+       spin_unlock(&grp->lock);
+
+       rcu_assign_pointer(p->numa_group, grp);
+
+       put_numa_group(my_grp);
+       return;
+
+no_join:
+       rcu_read_unlock();
+       return;
+}
+
+void task_numa_free(struct task_struct *p)
+{
+       struct numa_group *grp = p->numa_group;
+       int i;
+       void *numa_faults = p->numa_faults; /* kept so it can be freed after the fields are cleared */
+
+       if (grp) {
+               spin_lock(&grp->lock);
+               for (i = 0; i < 2*nr_node_ids; i++) /* remove p's share from the group totals */
+                       grp->faults[i] -= p->numa_faults[i];
+               grp->total_faults -= p->total_numa_faults;
+
+               list_del(&p->numa_entry);
+               grp->nr_tasks--;
+               spin_unlock(&grp->lock);
+               rcu_assign_pointer(p->numa_group, NULL);
+               put_numa_group(grp); /* drop p's reference on the group */
+       }
 
-       /* FIXME: Scheduling placement policy hints go here */
+       p->numa_faults = NULL;
+       p->numa_faults_buffer = NULL; /* shares the numa_faults allocation, so one kfree() suffices */
+       kfree(numa_faults);
 }
 
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int node, int pages, bool migrated)
+void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 {
        struct task_struct *p = current;
+       bool migrated = flags & TNF_MIGRATED;
+       int priv;
 
        if (!numabalancing_enabled)
                return;
 
-       /* FIXME: Allocate task-specific structure for placement policy here */
+       /* for example, ksmd faulting in a user's mm */
+       if (!p->mm)
+               return;
+
+       /* Do not worry about placement if exiting */
+       if (p->state == TASK_DEAD)
+               return;
+
+       /* Allocate buffer to track faults on a per-node basis */
+       if (unlikely(!p->numa_faults)) {
+               int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+
+               /* numa_faults and numa_faults_buffer share the allocation */
+               p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
+               if (!p->numa_faults)
+                       return;
+
+               BUG_ON(p->numa_faults_buffer);
+               p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+               p->total_numa_faults = 0;
+               memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+       }
 
        /*
-        * If pages are properly placed (did not migrate) then scan slower.
-        * This is reset periodically in case of phase changes
+        * First accesses are treated as private, otherwise consider accesses
+        * to be private if the accessing pid has not changed
         */
-        if (!migrated)
-               p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-                       p->numa_scan_period + jiffies_to_msecs(10));
+       if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+               priv = 1;
+       } else {
+               priv = cpupid_match_pid(p, last_cpupid);
+               if (!priv && !(flags & TNF_NO_GROUP))
+                       task_numa_group(p, last_cpupid, flags, &priv);
+       }
 
        task_numa_placement(p);
+
+       /*
+        * Retry task to preferred node migration periodically, in case it
+        * previously failed, or the scheduler moved us.
+        */
+       if (time_after(jiffies, p->numa_migrate_retry))
+               numa_migrate_preferred(p);
+
+       if (migrated)
+               p->numa_pages_migrated += pages;
+
+       p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+       p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1662,7 @@ void task_numa_work(struct callback_head *work)
        struct mm_struct *mm = p->mm;
        struct vm_area_struct *vma;
        unsigned long start, end;
+       unsigned long nr_pte_updates = 0;
        long pages;
 
        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1679,9 @@ void task_numa_work(struct callback_head *work)
        if (p->flags & PF_EXITING)
                return;
 
-       /*
-        * We do not care about task placement until a task runs on a node
-        * other than the first one used by the address space. This is
-        * largely because migrations are driven by what CPU the task
-        * is running on. If it's never scheduled on another node, it'll
-        * not migrate so why bother trapping the fault.
-        */
-       if (mm->first_nid == NUMA_PTE_SCAN_INIT)
-               mm->first_nid = numa_node_id();
-       if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
-               /* Are we running on a new node yet? */
-               if (numa_node_id() == mm->first_nid &&
-                   !sched_feat_numa(NUMA_FORCE))
-                       return;
-
-               mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
-       }
-
-       /*
-        * Reset the scan period if enough time has gone by. Objective is that
-        * scanning will be reduced if pages are properly placed. As tasks
-        * can enter different phases this needs to be re-examined. Lacking
-        * proper tracking of reference behaviour, this blunt hammer is used.
-        */
-       migrate = mm->numa_next_reset;
-       if (time_after(now, migrate)) {
-               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
-               next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
-               xchg(&mm->numa_next_reset, next_scan);
+       if (!mm->numa_next_scan) {
+               mm->numa_next_scan = now +
+                       msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
        }
 
        /*
@@ -938,20 +1691,20 @@ void task_numa_work(struct callback_head *work)
        if (time_before(now, migrate))
                return;
 
-       if (p->numa_scan_period == 0)
-               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+       if (p->numa_scan_period == 0) {
+               p->numa_scan_period_max = task_scan_max(p);
+               p->numa_scan_period = task_scan_min(p);
+       }
 
        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
                return;
 
        /*
-        * Do not set pte_numa if the current running node is rate-limited.
-        * This loses statistics on the fault but if we are unwilling to
-        * migrate to this node, it is less likely we can do useful work
+        * Delay this task enough that another task of this mm will likely win
+        * the next time around.
         */
-       if (migrate_ratelimited(numa_node_id()))
-               return;
+       p->node_stamp += 2 * TICK_NSEC;
 
        start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
@@ -967,31 +1720,54 @@ void task_numa_work(struct callback_head *work)
                vma = mm->mmap;
        }
        for (; vma; vma = vma->vm_next) {
-               if (!vma_migratable(vma))
+               if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+                       continue;
+
+               /*
+                * Shared library pages mapped by multiple processes are not
+                * migrated as it is expected they are cache replicated. Avoid
+                * hinting faults in read-only file-backed mappings or the vdso
+                * as migrating the pages will be of marginal benefit.
+                */
+               if (!vma->vm_mm ||
+                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                        continue;
 
-               /* Skip small VMAs. They are not likely to be of relevance */
-               if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+               /*
+                * Skip inaccessible VMAs to avoid any confusion between
+                * PROT_NONE and NUMA hinting ptes
+                */
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        continue;
 
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                        end = min(end, vma->vm_end);
-                       pages -= change_prot_numa(vma, start, end);
+                       nr_pte_updates += change_prot_numa(vma, start, end);
+
+                       /*
+                        * Scan sysctl_numa_balancing_scan_size but ensure that
+                        * at least one PTE is updated so that unused virtual
+                        * address space is quickly skipped.
+                        */
+                       if (nr_pte_updates)
+                               pages -= (end - start) >> PAGE_SHIFT;
 
                        start = end;
                        if (pages <= 0)
                                goto out;
+
+                       cond_resched();
                } while (end != vma->vm_end);
        }
 
 out:
        /*
-        * It is possible to reach the end of the VMA list but the last few VMAs are
-        * not guaranteed to the vma_migratable. If they are not, we would find the
-        * !migratable VMA on the next scan but not reset the scanner to the start
-        * so check it now.
+        * It is possible to reach the end of the VMA list but the last few
+        * VMAs are not guaranteed to be vma_migratable. If they are not, we
+        * would find the !migratable VMA on the next scan but not reset the
+        * scanner to the start so check it now.
         */
        if (vma)
                mm->numa_scan_offset = start;
@@ -1025,8 +1801,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
        if (now - curr->node_stamp > period) {
                if (!curr->node_stamp)
-                       curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
-               curr->node_stamp = now;
+                       curr->numa_scan_period = task_scan_min(curr);
+               curr->node_stamp += period;
 
                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1814,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{ /* empty stub: this variant does no accounting on enqueue */
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{ /* empty stub: this variant does no accounting on dequeue */
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1047,8 +1831,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        if (!parent_entity(se))
                update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-       if (entity_is_task(se))
-               list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+       if (entity_is_task(se)) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               account_numa_enqueue(rq, task_of(se));
+               list_add(&se->group_node, &rq->cfs_tasks);
+       }
 #endif
        cfs_rq->nr_running++;
 }
@@ -1059,8 +1847,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        update_load_sub(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-       if (entity_is_task(se))
+       if (entity_is_task(se)) {
+               account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                list_del_init(&se->group_node);
+       }
        cfs_rq->nr_running--;
 }
 
@@ -1378,7 +2168,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
        long contrib;
 
        /* The fraction of a cpu used by this cfs_rq */
-       contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+       contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
                          sa->runnable_avg_period + 1);
        contrib -= cfs_rq->tg_runnable_contrib;
 
@@ -1572,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
                }
                wakeup = 0;
        } else {
-               /*
-                * Task re-woke on same cpu (or else migrate_task_rq_fair()
-                * would have made count negative); we must be careful to avoid
-                * double-accounting blocked time after synchronizing decays.
-                */
-               se->avg.last_runnable_update += __synchronize_entity_decay(se)
-                                                       << 20;
+               __synchronize_entity_decay(se);
        }
 
        /* migrated tasks did not contribute to our blocked load */
@@ -2070,13 +2854,14 @@ static inline bool cfs_bandwidth_used(void)
        return static_key_false(&__cfs_bandwidth_used);
 }
 
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_inc(void)
+{
+       static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
 {
-       /* only need to count groups transitioning between enabled/!enabled */
-       if (enabled && !was_enabled)
-               static_key_slow_inc(&__cfs_bandwidth_used);
-       else if (!enabled && was_enabled)
-               static_key_slow_dec(&__cfs_bandwidth_used);
+       static_key_slow_dec(&__cfs_bandwidth_used);
 }
 #else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
@@ -2084,7 +2869,8 @@ static bool cfs_bandwidth_used(void)
        return true;
 }
 
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
 #endif /* HAVE_JUMP_LABEL */
 
 /*
@@ -2213,8 +2999,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        }
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        /* dock delta_exec before expiring quota (as it could span periods) */
        cfs_rq->runtime_remaining -= delta_exec;
@@ -2232,7 +3017,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                return;
@@ -2335,6 +3120,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        cfs_rq->throttled_clock = rq_clock(rq);
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       if (!cfs_b->timer_active)
+               __start_cfs_bandwidth(cfs_b);
        raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -2448,6 +3235,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
        if (idle)
                goto out_unlock;
 
+       /*
+        * if we have relooped after returning idle once, we need to update our
+        * status as actually running, so that other cpus doing
+        * __start_cfs_bandwidth will stop trying to cancel us.
+        */
+       cfs_b->timer_active = 1;
+
        __refill_cfs_bandwidth_runtime(cfs_b);
 
        if (!throttled) {
@@ -2508,7 +3302,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
 /* how long we wait to gather additional slack before distributing */
 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
 
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 {
        struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3384,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        u64 expires;
 
        /* confirm we're still not at a refresh boundary */
-       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+       raw_spin_lock(&cfs_b->lock);
+       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+               raw_spin_unlock(&cfs_b->lock);
                return;
+       }
 
-       raw_spin_lock(&cfs_b->lock);
        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
                runtime = cfs_b->runtime;
                cfs_b->runtime = 0;
@@ -2708,11 +3510,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         * (timer_active==0 becomes visible before the hrtimer call-back
         * terminates).  In either case we ensure that it's re-programmed
         */
-       while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+       while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+              hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+               /* bounce the lock to allow do_sched_cfs_period_timer to run */
                raw_spin_unlock(&cfs_b->lock);
-               /* ensure cfs_b->lock is available while we wait */
-               hrtimer_cancel(&cfs_b->period_timer);
-
+               cpu_relax();
                raw_spin_lock(&cfs_b->lock);
                /* if someone else restarted the timer then we're done */
                if (cfs_b->timer_active)
@@ -2755,8 +3557,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
        return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -3166,8 +3967,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-               unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        return wl;
 }
@@ -3292,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int load_idx)
+                 int this_cpu, int sd_flag)
 {
        struct sched_group *idlest = NULL, *group = sd->groups;
        unsigned long min_load = ULONG_MAX, this_load = 0;
+       int load_idx = sd->forkexec_idx;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
+       if (sd_flag & SD_BALANCE_WAKE)
+               load_idx = sd->wake_idx;
+
        do {
                unsigned long load, avg_load;
                int local_group;
@@ -3420,11 +4224,10 @@ done:
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
-       int prev_cpu = task_cpu(p);
        int new_cpu = cpu;
        int want_affine = 0;
        int sync = wake_flags & WF_SYNC;
@@ -3466,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
        }
 
        while (sd) {
-               int load_idx = sd->forkexec_idx;
                struct sched_group *group;
                int weight;
 
@@ -3475,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
                        continue;
                }
 
-               if (sd_flag & SD_BALANCE_WAKE)
-                       load_idx = sd->wake_idx;
-
-               group = find_idlest_group(sd, p, cpu, load_idx);
+               group = find_idlest_group(sd, p, cpu, sd_flag);
                if (!group) {
                        sd = sd->child;
                        continue;
@@ -3904,9 +4703,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+enum fbq_type { regular, remote, all };
+
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
-#define LBF_SOME_PINNED 0x04
+#define LBF_DST_PINNED  0x04
+#define LBF_SOME_PINNED        0x08
 
 struct lb_env {
        struct sched_domain     *sd;
@@ -3929,6 +4731,8 @@ struct lb_env {
        unsigned int            loop;
        unsigned int            loop_break;
        unsigned int            loop_max;
+
+       enum fbq_type           fbq_type;
 };
 
 /*
@@ -3975,6 +4779,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        return delta < (s64)sysctl_sched_migration_cost;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if moving p from env->src_cpu to env->dst_cpu is expected to
+ * improve its NUMA locality: the two CPUs are on different nodes and either
+ * the destination is p's preferred node, or both task_weight() and
+ * group_weight() are higher for the destination node than for the source.
+ *
+ * Always returns false when the NUMA_FAVOUR_HIGHER feature is off, when p
+ * has no numa_faults data, or when the domain being balanced lacks SD_NUMA.
+ */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+           !(env->sd->flags & SD_NUMA)) {
+               return false;
+       }
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid)
+               return false;
+
+       /* Always encourage migration to the preferred node. */
+       if (dst_nid == p->numa_preferred_nid)
+               return true;
+
+       /* If both task and group weight improve, this move is a winner. */
+       if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+           group_weight(p, dst_nid) > group_weight(p, src_nid))
+               return true;
+
+       return false;
+}
+
+
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+               return false; /* either feature off: never report a degradation */
+
+       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+               return false; /* no fault data for p, or not balancing a NUMA domain */
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid)
+               return false; /* same node: locality is unchanged */
+
+       /* Migrating away from the preferred node is always bad. */
+       if (src_nid == p->numa_preferred_nid)
+               return true;
+
+       /*
+        * If either task or group weight get worse, don't do it: a lower
+        * task_weight() or group_weight() on the destination node than on
+        * the source node counts as a degradation.
+        */
+       if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+           group_weight(p, dst_nid) < group_weight(p, src_nid))
+               return true;
+
+       return false;
+}
+
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false; /* !CONFIG_NUMA_BALANCING stub: never reports an improvement */
+}
+
+static inline bool migrate_degrades_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false; /* !CONFIG_NUMA_BALANCING stub: never reports a degradation */
+}
+#endif
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -3997,6 +4873,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 
+               env->flags |= LBF_SOME_PINNED;
+
                /*
                 * Remember if this task can be migrated to any other cpu in
                 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4883,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                 * Also avoid computing new_dst_cpu if we have already computed
                 * one in current iteration.
                 */
-               if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+               if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
                        return 0;
 
                 /* Prevent re-selecting dst_cpu via env's cpus */
                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
                        if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
-                               env->flags |= LBF_SOME_PINNED;
+                               env->flags |= LBF_DST_PINNED;
                                env->new_dst_cpu = cpu;
                                break;
                        }
@@ -4030,11 +4908,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
        /*
         * Aggressive migration if:
-        * 1) task is cache cold, or
-        * 2) too many balance attempts have failed.
+        * 1) destination numa is preferred
+        * 2) task is cache cold, or
+        * 3) too many balance attempts have failed.
         */
-
        tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+       if (!tsk_cache_hot)
+               tsk_cache_hot = migrate_degrades_locality(p, env);
+
+       if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+               if (tsk_cache_hot) {
+                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
        if (!tsk_cache_hot ||
                env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 
@@ -4077,8 +4968,6 @@ static int move_one_task(struct lb_env *env)
        return 0;
 }
 
-static unsigned long task_h_load(struct task_struct *p);
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
@@ -4291,6 +5180,10 @@ struct sg_lb_stats {
        unsigned int group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
        int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int nr_numa_running;
+       unsigned int nr_preferred_running;
+#endif
 };
 
 /*
@@ -4330,7 +5223,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
  *
  * Return: The load index.
  */
@@ -4447,7 +5340,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long power;
+       unsigned long power, power_orig;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5352,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
                return;
        }
 
-       power = 0;
+       power_orig = power = 0;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -4467,8 +5360,33 @@ void update_group_power(struct sched_domain *sd, int cpu)
                 * span the current group.
                 */
 
-               for_each_cpu(cpu, sched_group_cpus(sdg))
-                       power += power_of(cpu);
+               for_each_cpu(cpu, sched_group_cpus(sdg)) {
+                       struct sched_group_power *sgp;
+                       struct rq *rq = cpu_rq(cpu);
+
+                       /*
+                        * build_sched_domains() -> init_sched_groups_power()
+                        * gets here before we've attached the domains to the
+                        * runqueues.
+                        *
+                        * Use power_of(), which is set irrespective of domains
+                        * in update_cpu_power().
+                        *
+                        * This avoids power/power_orig from being 0 and
+                        * causing divide-by-zero issues on boot.
+                        *
+                        * Runtime updates will correct power_orig.
+                        */
+                       if (unlikely(!rq->sd)) {
+                               power_orig += power_of(cpu);
+                               power += power_of(cpu);
+                               continue;
+                       }
+
+                       sgp = rq->sd->groups->sgp;
+                       power_orig += sgp->power_orig;
+                       power += sgp->power;
+               }
        } else  {
                /*
                 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5395,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
 
                group = child->groups;
                do {
+                       power_orig += group->sgp->power_orig;
                        power += group->sgp->power;
                        group = group->next;
                } while (group != child->groups);
        }
 
-       sdg->sgp->power_orig = sdg->sgp->power = power;
+       sdg->sgp->power_orig = power_orig;
+       sdg->sgp->power = power;
 }
 
 /*
@@ -4526,13 +5446,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * cpu 3 and leave one of the cpus in the second group unused.
  *
  * The current solution to this issue is detecting the skew in the first group
- * by noticing it has a cpu that is overloaded while the remaining cpus are
- * idle -- or rather, there's a distinct imbalance in the cpus; see
- * sg_imbalanced().
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
  *
  * When this is so detected; this group becomes a candidate for busiest; see
- * update_sd_pick_busiest(). And calculcate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
  * to create an effective group imbalance.
  *
  * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5459,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * subtle and fragile situation.
  */
 
-struct sg_imb_stats {
-       unsigned long max_nr_running, min_nr_running;
-       unsigned long max_cpu_load, min_cpu_load;
-};
-
-static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+static inline int sg_imbalanced(struct sched_group *group)
 {
-       sgi->max_cpu_load = sgi->max_nr_running = 0UL;
-       sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+       return group->sgp->imbalance;
 }
 
-static inline void
-update_sg_imb_stats(struct sg_imb_stats *sgi,
-                   unsigned long load, unsigned long nr_running)
+/*
+ * Compute the group capacity.
+ *
+ * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
+ * first dividing out the smt factor and computing the actual number of cores
+ * and limiting power unit capacity to that.
+ */
+static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
 {
-       if (load > sgi->max_cpu_load)
-               sgi->max_cpu_load = load;
-       if (sgi->min_cpu_load > load)
-               sgi->min_cpu_load = load;
+       unsigned int capacity, smt, cpus;
+       unsigned int power, power_orig;
 
-       if (nr_running > sgi->max_nr_running)
-               sgi->max_nr_running = nr_running;
-       if (sgi->min_nr_running > nr_running)
-               sgi->min_nr_running = nr_running;
-}
+       power = group->sgp->power;
+       power_orig = group->sgp->power_orig;
+       cpus = group->group_weight;
 
-static inline int
-sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
-{
-       /*
-        * Consider the group unbalanced when the imbalance is larger
-        * than the average weight of a task.
-        *
-        * APZ: with cgroup the avg task weight can vary wildly and
-        *      might not be a suitable number - should we keep a
-        *      normalized nr_running number somewhere that negates
-        *      the hierarchy?
-        */
-       if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
-           (sgi->max_nr_running - sgi->min_nr_running) > 1)
-               return 1;
+       /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
+       smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
+       capacity = cpus / smt; /* cores */
 
-       return 0;
+       capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
+       if (!capacity)
+               capacity = fix_small_capacity(env->sd, group);
+
+       return capacity;
 }
 
 /**
@@ -4597,37 +5503,31 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        struct sched_group *group, int load_idx,
                        int local_group, struct sg_lb_stats *sgs)
 {
-       struct sg_imb_stats sgi;
-       unsigned long nr_running;
        unsigned long load;
        int i;
 
-       init_sg_imb_stats(&sgi);
+       memset(sgs, 0, sizeof(*sgs));
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
 
-               nr_running = rq->nr_running;
-
                /* Bias balancing toward cpus of our domain */
-               if (local_group) {
+               if (local_group)
                        load = target_load(i, load_idx);
-               } else {
+               else
                        load = source_load(i, load_idx);
-                       update_sg_imb_stats(&sgi, load, nr_running);
-               }
 
                sgs->group_load += load;
-               sgs->sum_nr_running += nr_running;
+               sgs->sum_nr_running += rq->nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+               sgs->nr_numa_running += rq->nr_numa_running;
+               sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
                sgs->sum_weighted_load += weighted_cpuload(i);
                if (idle_cpu(i))
                        sgs->idle_cpus++;
        }
 
-       if (local_group && (env->idle != CPU_NEWLY_IDLE ||
-                       time_after_eq(jiffies, group->sgp->next_update)))
-               update_group_power(env->sd, env->dst_cpu);
-
        /* Adjust by relative CPU power of the group */
        sgs->group_power = group->sgp->power;
        sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5535,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
        if (sgs->sum_nr_running)
                sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-       sgs->group_imb = sg_imbalanced(sgs, &sgi);
-
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
-
-       if (!sgs->group_capacity)
-               sgs->group_capacity = fix_small_capacity(env->sd, group);
-
        sgs->group_weight = group->group_weight;
 
+       sgs->group_imb = sg_imbalanced(group);
+       sgs->group_capacity = sg_capacity(env, group);
+
        if (sgs->group_capacity > sgs->sum_nr_running)
                sgs->group_has_capacity = 1;
 }
@@ -4693,14 +5588,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        return false;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       if (sgs->sum_nr_running > sgs->nr_numa_running)
+               return regular;
+       if (sgs->sum_nr_running > sgs->nr_preferred_running)
+               return remote;
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       if (rq->nr_running > rq->nr_numa_running)
+               return regular;
+       if (rq->nr_running > rq->nr_preferred_running)
+               return remote;
+       return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
- * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct lb_env *env,
-                                       struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5643,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                if (local_group) {
                        sds->local = sg;
                        sgs = &sds->local_stat;
+
+                       if (env->idle != CPU_NEWLY_IDLE ||
+                           time_after_eq(jiffies, sg->sgp->next_update))
+                               update_group_power(env->sd, env->dst_cpu);
                }
 
-               memset(sgs, 0, sizeof(*sgs));
                update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 
+               if (local_group)
+                       goto next_group;
+
                /*
                 * In case the child domain prefers tasks go to siblings
                 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5664,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                 * heaviest group when it is already under-utilized (possible
                 * with a large weight task outweighs the tasks on the system).
                 */
-               if (prefer_sibling && !local_group &&
-                               sds->local && sds->local_stat.group_has_capacity)
+               if (prefer_sibling && sds->local &&
+                   sds->local_stat.group_has_capacity)
                        sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-               /* Now, start updating sd_lb_stats */
-               sds->total_load += sgs->group_load;
-               sds->total_pwr += sgs->group_power;
-
-               if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
+               if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                        sds->busiest = sg;
                        sds->busiest_stat = *sgs;
                }
 
+next_group:
+               /* Now, start updating sd_lb_stats */
+               sds->total_load += sgs->group_load;
+               sds->total_pwr += sgs->group_power;
+
                sg = sg->next;
        } while (sg != env->sd->groups);
+
+       if (env->sd->flags & SD_NUMA)
+               env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
@@ -5053,15 +5986,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
        int i;
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-               unsigned long power = power_of(i);
-               unsigned long capacity = DIV_ROUND_CLOSEST(power,
-                                                          SCHED_POWER_SCALE);
-               unsigned long wl;
+               unsigned long power, capacity, wl;
+               enum fbq_type rt;
+
+               rq = cpu_rq(i);
+               rt = fbq_classify_rq(rq);
+
+               /*
+                * We classify groups/runqueues into three groups:
+                *  - regular: there are !numa tasks
+                *  - remote:  there are numa tasks that run on the 'wrong' node
+                *  - all:     there is no distinction
+                *
+                * In order to avoid migrating ideally placed numa tasks,
+                * ignore those when there are better options.
+                *
+                * If we ignore the actual busiest queue to migrate another
+                * task, the next balance pass can still reduce the busiest
+                * queue by moving tasks around inside the node.
+                *
+                * If we cannot move enough load due to this classification
+                * the next pass will adjust the group classification and
+                * allow migration of more tasks.
+                *
+                * Both cases only affect the total convergence complexity.
+                */
+               if (rt > env->fbq_type)
+                       continue;
 
+               power = power_of(i);
+               capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
                if (!capacity)
                        capacity = fix_small_capacity(env->sd, group);
 
-               rq = cpu_rq(i);
                wl = weighted_cpuload(i);
 
                /*
@@ -5164,6 +6121,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        int *continue_balancing)
 {
        int ld_moved, cur_ld_moved, active_balance = 0;
+       struct sched_domain *sd_parent = sd->parent;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
@@ -5177,6 +6135,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
+               .fbq_type       = all,
        };
 
        /*
@@ -5268,17 +6227,17 @@ more_balance:
                 * moreover subsequent load balance cycles should correct the
                 * excess load moved.
                 */
-               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+               if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+                       /* Prevent re-selecting dst_cpu via env's cpus */
+                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
-                       env.flags       &= ~LBF_SOME_PINNED;
+                       env.flags       &= ~LBF_DST_PINNED;
                        env.loop         = 0;
                        env.loop_break   = sched_nr_migrate_break;
 
-                       /* Prevent to re-select dst_cpu via env's cpus */
-                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
-
                        /*
                         * Go back to "more_balance" rather than "redo" since we
                         * need to continue with same src_cpu.
@@ -5286,6 +6245,18 @@ more_balance:
                        goto more_balance;
                }
 
+               /*
+                * We failed to reach balance because of affinity.
+                */
+               if (sd_parent) {
+                       int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+                               *group_imbalance = 1;
+                       } else if (*group_imbalance)
+                               *group_imbalance = 0;
+               }
+
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
                        cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5393,6 +6364,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
+       u64 curr_cost = 0;
 
        this_rq->idle_stamp = rq_clock(this_rq);
 
@@ -5409,15 +6381,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                int continue_balancing = 1;
+               u64 t0, domain_cost;
 
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+                       break;
+
                if (sd->flags & SD_BALANCE_NEWIDLE) {
+                       t0 = sched_clock_cpu(this_cpu);
+
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance(this_cpu, this_rq,
                                                   sd, CPU_NEWLY_IDLE,
                                                   &continue_balancing);
+
+                       domain_cost = sched_clock_cpu(this_cpu) - t0;
+                       if (domain_cost > sd->max_newidle_lb_cost)
+                               sd->max_newidle_lb_cost = domain_cost;
+
+                       curr_cost += domain_cost;
                }
 
                interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6423,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
                 */
                this_rq->next_balance = next_balance;
        }
+
+       if (curr_cost > this_rq->max_idle_balance_cost)
+               this_rq->max_idle_balance_cost = curr_cost;
 }
 
 /*
@@ -5522,7 +6509,7 @@ static struct {
        unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 
-static inline int find_new_ilb(int call_cpu)
+static inline int find_new_ilb(void)
 {
        int ilb = cpumask_first(nohz.idle_cpus_mask);
 
@@ -5537,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(int cpu)
+static void nohz_balancer_kick(void)
 {
        int ilb_cpu;
 
        nohz.next_balance++;
 
-       ilb_cpu = find_new_ilb(cpu);
+       ilb_cpu = find_new_ilb();
 
        if (ilb_cpu >= nr_cpu_ids)
                return;
@@ -5572,16 +6559,16 @@ static inline void nohz_balance_exit_idle(int cpu)
 static inline void set_cpu_sd_state_busy(void)
 {
        struct sched_domain *sd;
+       int cpu = smp_processor_id();
 
        rcu_read_lock();
-       sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+       sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
        if (!sd || !sd->nohz_idle)
                goto unlock;
        sd->nohz_idle = 0;
 
-       for (; sd; sd = sd->parent)
-               atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+       atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 unlock:
        rcu_read_unlock();
 }
@@ -5589,16 +6576,16 @@ unlock:
 void set_cpu_sd_state_idle(void)
 {
        struct sched_domain *sd;
+       int cpu = smp_processor_id();
 
        rcu_read_lock();
-       sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+       sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
        if (!sd || sd->nohz_idle)
                goto unlock;
        sd->nohz_idle = 1;
 
-       for (; sd; sd = sd->parent)
-               atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+       atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 unlock:
        rcu_read_unlock();
 }
@@ -5653,24 +6640,48 @@ void update_max_interval(void)
  *
  * Balancing parameters are set up in init_sched_domains.
  */
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
        int continue_balancing = 1;
-       struct rq *rq = cpu_rq(cpu);
+       int cpu = rq->cpu;
        unsigned long interval;
        struct sched_domain *sd;
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
-       int need_serialize;
+       int need_serialize, need_decay = 0;
+       u64 max_cost = 0;
 
        update_blocked_averages(cpu);
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
+               /*
+                * Decay the newidle max times here because this is a regular
+                * visit to all the domains. Decay ~1% per second.
+                */
+               if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+                       sd->max_newidle_lb_cost =
+                               (sd->max_newidle_lb_cost * 253) / 256;
+                       sd->next_decay_max_lb_cost = jiffies + HZ;
+                       need_decay = 1;
+               }
+               max_cost += sd->max_newidle_lb_cost;
+
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!continue_balancing) {
+                       if (need_decay)
+                               continue;
+                       break;
+               }
+
                interval = sd->balance_interval;
                if (idle != CPU_IDLE)
                        interval *= sd->busy_factor;
@@ -5689,7 +6700,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
                        if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
                                /*
-                                * The LBF_SOME_PINNED logic could have changed
+                                * The LBF_DST_PINNED logic could have changed
                                 * env->dst_cpu, so we can't know our idle
                                 * state even if we migrated tasks. Update it.
                                 */
@@ -5704,14 +6715,14 @@ out:
                        next_balance = sd->last_balance + interval;
                        update_next_balance = 1;
                }
-
+       }
+       if (need_decay) {
                /*
-                * Stop the load balance at this level. There is another
-                * CPU in our sched group which is doing load balancing more
-                * actively.
+                * Ensure the rq-wide value also decays but keep it at a
+                * reasonable floor to avoid funnies with rq->avg_idle.
                 */
-               if (!continue_balancing)
-                       break;
+               rq->max_idle_balance_cost =
+                       max((u64)sysctl_sched_migration_cost, max_cost);
        }
        rcu_read_unlock();
 
@@ -5729,9 +6740,9 @@ out:
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
-       struct rq *this_rq = cpu_rq(this_cpu);
+       int this_cpu = this_rq->cpu;
        struct rq *rq;
        int balance_cpu;
 
@@ -5758,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
                update_idle_cpu_load(rq);
                raw_spin_unlock_irq(&rq->lock);
 
-               rebalance_domains(balance_cpu, CPU_IDLE);
+               rebalance_domains(rq, CPU_IDLE);
 
                if (time_after(this_rq->next_balance, rq->next_balance))
                        this_rq->next_balance = rq->next_balance;
@@ -5777,12 +6788,14 @@ end:
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
-static inline int nohz_kick_needed(struct rq *rq, int cpu)
+static inline int nohz_kick_needed(struct rq *rq)
 {
        unsigned long now = jiffies;
        struct sched_domain *sd;
+       struct sched_group_power *sgp;
+       int nr_busy, cpu = rq->cpu;
 
-       if (unlikely(idle_cpu(cpu)))
+       if (unlikely(rq->idle_balance))
                return 0;
 
        /*
@@ -5806,22 +6819,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
                goto need_kick;
 
        rcu_read_lock();
-       for_each_domain(cpu, sd) {
-               struct sched_group *sg = sd->groups;
-               struct sched_group_power *sgp = sg->sgp;
-               int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+       sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
-               if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
-                       goto need_kick_unlock;
+       if (sd) {
+               sgp = sd->groups->sgp;
+               nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-               if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
-                   && (cpumask_first_and(nohz.idle_cpus_mask,
-                                         sched_domain_span(sd)) < cpu))
+               if (nr_busy > 1)
                        goto need_kick_unlock;
-
-               if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
-                       break;
        }
+
+       sd = rcu_dereference(per_cpu(sd_asym, cpu));
+
+       if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+                                 sched_domain_span(sd)) < cpu))
+               goto need_kick_unlock;
+
        rcu_read_unlock();
        return 0;
 
@@ -5831,7 +6844,7 @@ need_kick:
        return 1;
 }
 #else
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
 #endif
 
 /*
@@ -5840,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
-       int this_cpu = smp_processor_id();
-       struct rq *this_rq = cpu_rq(this_cpu);
+       struct rq *this_rq = this_rq();
        enum cpu_idle_type idle = this_rq->idle_balance ?
                                                CPU_IDLE : CPU_NOT_IDLE;
 
-       rebalance_domains(this_cpu, idle);
+       rebalance_domains(this_rq, idle);
 
        /*
         * If this cpu has a pending nohz_balance_kick, then do the
         * balancing on behalf of the other idle cpus whose ticks are
         * stopped.
         */
-       nohz_idle_balance(this_cpu, idle);
+       nohz_idle_balance(this_rq, idle);
 }
 
-static inline int on_null_domain(int cpu)
+static inline int on_null_domain(struct rq *rq)
 {
-       return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+       return !rcu_dereference_sched(rq->sd);
 }
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  */
-void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq)
 {
        /* Don't need to rebalance while attached to NULL domain */
-       if (time_after_eq(jiffies, rq->next_balance) &&
-           likely(!on_null_domain(cpu)))
+       if (unlikely(on_null_domain(rq)))
+               return;
+
+       if (time_after_eq(jiffies, rq->next_balance))
                raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ_COMMON
-       if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
-               nohz_balancer_kick(cpu);
+       if (nohz_kick_needed(rq))
+               nohz_balancer_kick();
 #endif
 }
 
@@ -5987,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
        /*
-        * Ensure the task's vruntime is normalized, so that when its
+        * Ensure the task's vruntime is normalized, so that when it's
         * switched back to the fair class the enqueue_entity(.flags=0) will
         * do the right thing.
         *
-        * If it was on_rq, then the dequeue_entity(.flags=0) will already
-        * have normalized the vruntime, if it was !on_rq, then only when
+        * If it's on_rq, then the dequeue_entity(.flags=0) will already
+        * have normalized the vruntime, if it's !on_rq, then only when
         * the task is sleeping will it still have non-normalized vruntime.
         */
-       if (!se->on_rq && p->state != TASK_RUNNING) {
+       if (!p->on_rq && p->state != TASK_RUNNING) {
                /*
                 * Fix up our vruntime so that the current sleep doesn't
                 * cause 'unlimited' sleep bonus.
@@ -6214,7 +7228,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                se->cfs_rq = parent->my_q;
 
        se->my_q = cfs_rq;
-       update_load_set(&se->load, 0);
+       /* guarantee group entities always have weight */
+       update_load_set(&se->load, NICE_0_LOAD);
        se->parent = parent;
 }