Merge tag 'v3.18' into p/abusse/merge_upgrade
[projects/modsched/linux.git] kernel/sched/cfs/core.c
index 0899b6b..c4b2636 100644
@@ -73,6 +73,7 @@
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
+#include <linux/compiler.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -122,6 +123,8 @@ void update_rq_clock(struct rq *rq)
                return;
 
        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+       if (delta < 0)
+               return;
        rq->clock += delta;
        update_rq_clock_task(rq, delta);
 }
@@ -226,6 +229,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        char buf[64];
        char *cmp;
        int i;
+       struct inode *inode;
 
        if (cnt > 63)
                cnt = 63;
@@ -236,7 +240,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        buf[cnt] = 0;
        cmp = strstrip(buf);
 
+       /* Ensure the static_key remains in a consistent state */
+       inode = file_inode(filp);
+       mutex_lock(&inode->i_mutex);
        i = sched_feat_set(cmp);
+       mutex_unlock(&inode->i_mutex);
        if (i == __SCHED_FEAT_NR)
                return -EINVAL;
 
@@ -296,8 +304,6 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -311,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
        for (;;) {
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
        }
 }
 
@@ -330,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
                raw_spin_lock_irqsave(&p->pi_lock, *flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
                raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
        }
 }
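
The new task_on_rq_migrating() checks turn both helpers into a lock/revalidate/back-off loop: snapshot the task's runqueue, take its lock, confirm the task has neither moved nor is in the middle of a migration, and otherwise drop the lock and spin until the migration settles before retrying. A stand-alone sketch of that control flow, using hypothetical stand-in types (in the kernel the lock is rq->lock and the migrating state is encoded in p->on_rq):

#include <stdbool.h>
#include <stdio.h>

struct rq { int cpu; };                          /* hypothetical stand-ins */
struct task { struct rq *rq; bool migrating; };

static void rq_lock(struct rq *rq)   { (void)rq; /* raw_spin_lock() in the kernel */ }
static void rq_unlock(struct rq *rq) { (void)rq; }
static void cpu_relax_stub(void)     { /* cpu_relax() busy-wait hint */ }

static struct rq *lock_task_rq(struct task *p)
{
        for (;;) {
                struct rq *rq = p->rq;           /* snapshot the task's rq */

                rq_lock(rq);
                /* Revalidate under the lock: same rq, and not mid-migration? */
                if (rq == p->rq && !p->migrating)
                        return rq;               /* locked the right runqueue */
                rq_unlock(rq);

                /* Back off until a concurrent migration has finished. */
                while (p->migrating)
                        cpu_relax_stub();
        }
}

int main(void)
{
        struct rq rq0 = { .cpu = 0 };
        struct task p = { .rq = &rq0, .migrating = false };

        printf("locked rq of cpu %d\n", lock_task_rq(&p)->cpu);
        return 0;
}
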
 
@@ -370,13 +382,6 @@ static struct rq *this_rq_lock(void)
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
- *
- * Its all a bit involved since we cannot program an hrt while holding the
- * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
- * reschedule event.
- *
- * When we get rescheduled we reprogram the hrtick_timer outside of the
- * rq->lock.
  */
 
 static void hrtick_clear(struct rq *rq)
@@ -404,6 +409,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 }
 
 #ifdef CONFIG_SMP
+
+static int __hrtick_restart(struct rq *rq)
+{
+       struct hrtimer *timer = &rq->hrtick_timer;
+       ktime_t time = hrtimer_get_softexpires(timer);
+
+       return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+}
+
 /*
  * called from hardirq (IPI) context
  */
@@ -412,7 +426,7 @@ static void __hrtick_start(void *arg)
        struct rq *rq = arg;
 
        raw_spin_lock(&rq->lock);
-       hrtimer_restart(&rq->hrtick_timer);
+       __hrtick_restart(rq);
        rq->hrtick_csd_pending = 0;
        raw_spin_unlock(&rq->lock);
 }
@@ -425,14 +439,22 @@ static void __hrtick_start(void *arg)
 void hrtick_start(struct rq *rq, u64 delay)
 {
        struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
+       ktime_t time;
+       s64 delta;
+
+       /*
+        * Don't schedule slices shorter than 10000ns, that just
+        * doesn't make sense and can cause timer DoS.
+        */
+       delta = max_t(s64, delay, 10000LL);
+       time = ktime_add_ns(timer->base->get_time(), delta);
 
        hrtimer_set_expires(timer, time);
 
        if (rq == this_rq()) {
-               hrtimer_restart(timer);
+               __hrtick_restart(rq);
        } else if (!rq->hrtick_csd_pending) {
-               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+               smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
                rq->hrtick_csd_pending = 1;
        }
 }
@@ -505,37 +527,99 @@ static inline void init_hrtick(void)
 #endif /* CONFIG_SCHED_HRTICK */
 
 /*
- * resched_task - mark a task 'to be rescheduled now'.
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val)                                             \
+({     typeof(*(ptr)) __old, __val = *(ptr);                           \
+       for (;;) {                                                      \
+               __old = cmpxchg((ptr), __val, __val | (val));           \
+               if (__old == __val)                                     \
+                       break;                                          \
+               __val = __old;                                          \
+       }                                                               \
+       __old;                                                          \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+       struct thread_info *ti = task_thread_info(p);
+       return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
  *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
  */
-#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+       struct thread_info *ti = task_thread_info(p);
+       typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+       for (;;) {
+               if (!(val & _TIF_POLLING_NRFLAG))
+                       return false;
+               if (val & _TIF_NEED_RESCHED)
+                       return true;
+               old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+               if (old == val)
+                       break;
+               val = old;
+       }
+       return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+       set_tsk_need_resched(p);
+       return true;
+}
 
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) 0
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+       return false;
+}
+#endif
 #endif
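
fetch_or() atomically ORs a value into *ptr and hands back the previous contents, which is what lets set_nr_and_not_polling() both set TIF_NEED_RESCHED and learn in one step whether the target was polling (and therefore needs no IPI). A minimal user-space sketch of the same retry loop, with C11 atomics and made-up flag values standing in for cmpxchg() and the real TIF_* bits:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

#define FLAG_NEED_RESCHED  0x1UL   /* hypothetical stand-ins for the TIF_* bits */
#define FLAG_POLLING       0x2UL

/* cmpxchg-based fetch_or: OR in 'val', return the value seen beforehand. */
static unsigned long fetch_or_demo(_Atomic unsigned long *ptr, unsigned long val)
{
        unsigned long old = atomic_load(ptr);

        /* Retry until the compare-and-swap sees an unchanged word; on failure
         * 'old' is refreshed with the latest contents. */
        while (!atomic_compare_exchange_weak(ptr, &old, old | val))
                ;
        return old;
}

/* Set NEED_RESCHED and report whether an IPI is required (target not polling). */
static bool set_resched_and_not_polling(_Atomic unsigned long *flags)
{
        return !(fetch_or_demo(flags, FLAG_NEED_RESCHED) & FLAG_POLLING);
}

int main(void)
{
        _Atomic unsigned long flags = FLAG_POLLING;

        assert(!set_resched_and_not_polling(&flags));   /* polling: skip the IPI */
        assert(atomic_load(&flags) == (FLAG_POLLING | FLAG_NEED_RESCHED));
        return 0;
}
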
 
-void resched_task(struct task_struct *p)
+/*
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_curr(struct rq *rq)
 {
+       struct task_struct *curr = rq->curr;
        int cpu;
 
-       assert_raw_spin_locked(&task_rq(p)->lock);
+       lockdep_assert_held(&rq->lock);
 
-       if (test_tsk_need_resched(p))
+       if (test_tsk_need_resched(curr))
                return;
 
-       set_tsk_need_resched(p);
+       cpu = cpu_of(rq);
 
-       cpu = task_cpu(p);
-       if (cpu == smp_processor_id())
+       if (cpu == smp_processor_id()) {
+               set_tsk_need_resched(curr);
+               set_preempt_need_resched();
                return;
+       }
 
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(p))
+       if (set_nr_and_not_polling(curr))
                smp_send_reschedule(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
 }
 
 void resched_cpu(int cpu)
@@ -545,11 +629,12 @@ void resched_cpu(int cpu)
 
        if (!raw_spin_trylock_irqsave(&rq->lock, flags))
                return;
-       resched_task(cpu_curr(cpu));
+       resched_curr(rq);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy cpu for migrating timers
  * from an idle cpu.  This is good for power-savings.
@@ -558,12 +643,15 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
 {
        int cpu = smp_processor_id();
        int i;
        struct sched_domain *sd;
 
+       if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+               return cpu;
+
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
@@ -587,56 +675,90 @@ unlock:
  * account when the CPU goes back to idle and evaluates the timer
  * wheel for the next timer event.
  */
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
 
        if (cpu == smp_processor_id())
                return;
 
-       /*
-        * This is safe, as this function is called with the timer
-        * wheel base lock of (cpu) held. When the CPU is on the way
-        * to idle and has not yet set rq->curr to idle then it will
-        * be serialized on the timer wheel base lock and take the new
-        * timer into account automatically.
-        */
-       if (rq->curr != rq->idle)
-               return;
+       if (set_nr_and_not_polling(rq->idle))
+               smp_send_reschedule(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
+}
 
+static bool wake_up_full_nohz_cpu(int cpu)
+{
        /*
-        * We can set TIF_RESCHED on the idle task of the other CPU
-        * lockless. The worst case is that the other CPU runs the
-        * idle task through an additional NOOP schedule()
+        * We just need the target to call irq_exit() and re-evaluate
+        * the next tick. The nohz full kick at least implies that.
+        * If needed we can still optimize that later with an
+        * empty IRQ.
         */
-       set_tsk_need_resched(rq->idle);
+       if (tick_nohz_full_cpu(cpu)) {
+               if (cpu != smp_processor_id() ||
+                   tick_nohz_tick_stopped())
+                       tick_nohz_full_kick_cpu(cpu);
+               return true;
+       }
 
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(rq->idle))
-               smp_send_reschedule(cpu);
+       return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+       if (!wake_up_full_nohz_cpu(cpu))
+               wake_up_idle_cpu(cpu);
 }
 
 static inline bool got_nohz_idle_kick(void)
 {
        int cpu = smp_processor_id();
-       return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+
+       if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+               return false;
+
+       if (idle_cpu(cpu) && !need_resched())
+               return true;
+
+       /*
+        * We can't run Idle Load Balance on this CPU for this time so we
+        * cancel it and clear NOHZ_BALANCE_KICK
+        */
+       clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+       return false;
 }
 
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
 
 static inline bool got_nohz_idle_kick(void)
 {
        return false;
 }
 
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+       /*
+        * More than one running task needs preemption.
+        * nr_running update is assumed to be visible
+        * after IPI is sent from wakers.
+        */
+       if (this_rq()->nr_running > 1)
+               return false;
+
+       return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
 
 void sched_avg_update(struct rq *rq)
 {
        s64 period = sched_avg_period();
 
-       while ((s64)(rq->clock - rq->age_stamp) > period) {
+       while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
                /*
                 * Inline assembly required to prevent the compiler
                 * optimising this loop into a divmod call.
@@ -648,12 +770,6 @@ void sched_avg_update(struct rq *rq)
        }
 }
 
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
-{
-       assert_raw_spin_locked(&task_rq(p)->lock);
-       set_tsk_need_resched(p);
-}
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -722,14 +838,14 @@ static void set_load_weight(struct task_struct *p)
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_queued(p);
+       sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_dequeued(p);
+       sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -784,19 +900,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
        if (static_key_false((&paravirt_steal_rq_enabled))) {
-               u64 st;
-
                steal = paravirt_steal_clock(cpu_of(rq));
                steal -= rq->prev_steal_time_rq;
 
                if (unlikely(steal > delta))
                        steal = delta;
 
-               st = steal_ticks(steal);
-               steal = st * TICK_NSEC;
-
                rq->prev_steal_time_rq += steal;
-
                delta -= steal;
        }
 #endif
@@ -804,7 +914,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
        rq->clock_task += delta;
 
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+       if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
                sched_rt_avg_update(rq, irq_delta + steal);
 #endif
 }
@@ -858,7 +968,9 @@ static inline int normal_prio(struct task_struct *p)
 {
        int prio;
 
-       if (task_has_rt_policy(p))
+       if (task_has_dl_policy(p))
+               prio = MAX_DL_PRIO-1;
+       else if (task_has_rt_policy(p))
                prio = MAX_RT_PRIO-1 - p->rt_priority;
        else
                prio = __normal_prio(p);
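
With the -deadline class added, normal_prio() now maps the three policy families onto one numeric scale where a lower value means a more important task. A small illustrative program, with the priority constants assumed to match the kernel headers of this era (MAX_DL_PRIO = 0, MAX_RT_PRIO = 100):

#include <assert.h>

/* Priority constants assumed to match the kernel headers of this era. */
#define MAX_DL_PRIO   0
#define MAX_RT_PRIO   100
#define NICE_TO_PRIO(nice)  (MAX_RT_PRIO + (nice) + 20)

enum policy { POLICY_DEADLINE, POLICY_RT, POLICY_NORMAL };

/* Mirror of the mapping above: lower numbers mean higher priority. */
static int normal_prio_demo(enum policy pol, int rt_priority, int static_prio)
{
        if (pol == POLICY_DEADLINE)
                return MAX_DL_PRIO - 1;                 /* -1: above every RT task */
        if (pol == POLICY_RT)
                return MAX_RT_PRIO - 1 - rt_priority;   /* 0..99, inverted */
        return static_prio;                             /* 100..139 for CFS tasks */
}

int main(void)
{
        assert(normal_prio_demo(POLICY_DEADLINE, 0, 0) == -1);
        assert(normal_prio_demo(POLICY_RT, 99, 0) == 0);              /* highest RT */
        assert(normal_prio_demo(POLICY_NORMAL, 0, NICE_TO_PRIO(0)) == 120);
        return 0;
}
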
@@ -888,6 +1000,8 @@ static int effective_prio(struct task_struct *p)
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
  */
 inline int task_curr(const struct task_struct *p)
 {
@@ -902,7 +1016,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                if (prev_class->switched_from)
                        prev_class->switched_from(rq, p);
                p->sched_class->switched_to(rq, p);
-       } else if (oldprio != p->prio)
+       } else if (oldprio != p->prio || dl_task(p))
                p->sched_class->prio_changed(rq, p, oldprio);
 }
 
@@ -917,7 +1031,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                        if (class == rq->curr->sched_class)
                                break;
                        if (class == p->sched_class) {
-                               resched_task(rq->curr);
+                               resched_curr(rq);
                                break;
                        }
                }
@@ -927,17 +1041,10 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
         * A queue event has occurred, and we're going to schedule.  In
         * this case, we can save a useless back to back clock update.
         */
-       if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+       if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
                rq->skip_clock_update = 1;
 }
 
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-       atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -947,7 +1054,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * ttwu() will sort out the placement.
         */
        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                       !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+                       !(task_preempt_count(p) & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
        /*
@@ -968,21 +1075,115 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        trace_sched_migrate_task(p, new_cpu);
 
        if (task_cpu(p) != new_cpu) {
-               struct task_migration_notifier tmn;
-
                if (p->sched_class->migrate_task_rq)
                        p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+       }
+
+       __set_task_cpu(p, new_cpu);
+}
 
-               tmn.task = p;
-               tmn.from_cpu = task_cpu(p);
-               tmn.to_cpu = new_cpu;
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+       if (task_on_rq_queued(p)) {
+               struct rq *src_rq, *dst_rq;
+
+               src_rq = task_rq(p);
+               dst_rq = cpu_rq(cpu);
 
-               atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
+               deactivate_task(src_rq, p, 0);
+               set_task_cpu(p, cpu);
+               activate_task(dst_rq, p, 0);
+               check_preempt_curr(dst_rq, p, 0);
+       } else {
+               /*
+                * Task isn't running anymore; make it appear like we migrated
+                * it before it went to sleep. This means on wakeup we make the
+                * previous cpu our target instead of where it really is.
+                */
+               p->wake_cpu = cpu;
        }
+}
 
-       __set_task_cpu(p, new_cpu);
+struct migration_swap_arg {
+       struct task_struct *src_task, *dst_task;
+       int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+       struct migration_swap_arg *arg = data;
+       struct rq *src_rq, *dst_rq;
+       int ret = -EAGAIN;
+
+       src_rq = cpu_rq(arg->src_cpu);
+       dst_rq = cpu_rq(arg->dst_cpu);
+
+       double_raw_lock(&arg->src_task->pi_lock,
+                       &arg->dst_task->pi_lock);
+       double_rq_lock(src_rq, dst_rq);
+       if (task_cpu(arg->dst_task) != arg->dst_cpu)
+               goto unlock;
+
+       if (task_cpu(arg->src_task) != arg->src_cpu)
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+               goto unlock;
+
+       __migrate_swap_task(arg->src_task, arg->dst_cpu);
+       __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+       ret = 0;
+
+unlock:
+       double_rq_unlock(src_rq, dst_rq);
+       raw_spin_unlock(&arg->dst_task->pi_lock);
+       raw_spin_unlock(&arg->src_task->pi_lock);
+
+       return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+       struct migration_swap_arg arg;
+       int ret = -EINVAL;
+
+       arg = (struct migration_swap_arg){
+               .src_task = cur,
+               .src_cpu = task_cpu(cur),
+               .dst_task = p,
+               .dst_cpu = task_cpu(p),
+       };
+
+       if (arg.src_cpu == arg.dst_cpu)
+               goto out;
+
+       /*
+        * These three tests are all lockless; this is OK since all of them
+        * will be re-checked with proper locks held further down the line.
+        */
+       if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+               goto out;
+
+       trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
+       ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+       return ret;
 }
 
 struct migration_arg {
@@ -1011,7 +1212,7 @@ static int migration_cpu_stop(void *data);
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
        unsigned long flags;
-       int running, on_rq;
+       int running, queued;
        unsigned long ncsw;
        struct rq *rq;
 
@@ -1049,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                rq = task_rq_lock(p, &flags);
                trace_sched_wait_task(p);
                running = task_running(rq, p);
-               on_rq = p->on_rq;
+               queued = task_on_rq_queued(p);
                ncsw = 0;
                if (!match_state || p->state == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1081,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
-               if (unlikely(on_rq)) {
+               if (unlikely(queued)) {
                        ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
 
                        set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1192,7 +1393,7 @@ out:
                 * leave kernel.
                 */
                if (p->mm && printk_ratelimit()) {
-                       printk_sched("process %d (%s) no longer affine to cpu%d\n",
+                       printk_deferred("process %d (%s) no longer affine to cpu%d\n",
                                        task_pid_nr(p), p->comm, cpu);
                }
        }
@@ -1204,9 +1405,9 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+       cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -1275,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
        activate_task(rq, p, en_flags);
-       p->on_rq = 1;
+       p->on_rq = TASK_ON_RQ_QUEUED;
 
        /* if a worker is waking up, notify workqueue */
        if (p->flags & PF_WQ_WORKER)
@@ -1288,8 +1489,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
-       trace_sched_wakeup(p, true);
        check_preempt_curr(rq, p, wake_flags);
+       trace_sched_wakeup(p, true);
 
        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -1297,13 +1498,14 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
                p->sched_class->task_woken(rq, p);
 
        if (rq->idle_stamp) {
-               u64 delta = rq->clock - rq->idle_stamp;
-               u64 max = 2*sysctl_sched_migration_cost;
+               u64 delta = rq_clock(rq) - rq->idle_stamp;
+               u64 max = 2*rq->max_idle_balance_cost;
+
+               update_avg(&rq->avg_idle, delta);
 
-               if (delta > max)
+               if (rq->avg_idle > max)
                        rq->avg_idle = max;
-               else
-                       update_avg(&rq->avg_idle, delta);
+
                rq->idle_stamp = 0;
        }
 #endif
@@ -1333,7 +1535,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
        int ret = 0;
 
        rq = __task_rq_lock(p);
-       if (p->on_rq) {
+       if (task_on_rq_queued(p)) {
+               /* check_preempt_curr() may use rq clock */
+               update_rq_clock(rq);
                ttwu_do_wakeup(rq, p, wake_flags);
                ret = 1;
        }
@@ -1343,13 +1547,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
 {
        struct rq *rq = this_rq();
        struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p;
+       unsigned long flags;
 
-       raw_spin_lock(&rq->lock);
+       if (!llist)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
 
        while (llist) {
                p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1357,11 +1565,18 @@ static void sched_ttwu_pending(void)
                ttwu_do_activate(rq, p, 0);
        }
 
-       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 void scheduler_ipi(void)
 {
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       preempt_fold_need_resched();
+
        if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
                return;
 
@@ -1384,7 +1599,7 @@ void scheduler_ipi(void)
        /*
         * Check if someone kicked us for doing the nohz idle load balance.
         */
-       if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+       if (unlikely(got_nohz_idle_kick())) {
                this_rq()->idle_balance = 1;
                raise_softirq_irqoff(SCHED_SOFTIRQ);
        }
@@ -1393,8 +1608,33 @@ void scheduler_ipi(void)
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
-               smp_send_reschedule(cpu);
+       struct rq *rq = cpu_rq(cpu);
+
+       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+               if (!set_nr_if_polling(rq->idle))
+                       smp_send_reschedule(cpu);
+               else
+                       trace_sched_wake_idle_without_ipi(cpu);
+       }
+}
+
+void wake_up_if_idle(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       if (!is_idle_task(rq->curr))
+               return;
+
+       if (set_nr_if_polling(rq->idle)) {
+               trace_sched_wake_idle_without_ipi(cpu);
+       } else {
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               if (is_idle_task(rq->curr))
+                       smp_send_reschedule(cpu);
+               /* Else cpu is not in idle, do nothing here */
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1432,7 +1672,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
  * the simpler "current->state = TASK_RUNNING" to mark yourself
  * runnable without the overhead of this.
  *
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running.
  * or @state didn't match @p's state.
  */
 static int
@@ -1441,7 +1681,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        unsigned long flags;
        int cpu, success = 0;
 
-       smp_wmb();
+       /*
+        * If we are going to wake up a thread waiting for CONDITION we
+        * need to ensure that CONDITION=1 done by the caller can not be
+        * reordered with p->state check below. This pairs with mb() in
+        * set_current_state() the waiting thread does.
+        */
+       smp_mb__before_spinlock();
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        if (!(p->state & state))
                goto out;
@@ -1470,7 +1716,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        if (p->sched_class->task_waking)
                p->sched_class->task_waking(p);
 
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
@@ -1517,7 +1763,7 @@ static void try_to_wake_up_local(struct task_struct *p)
        if (!(p->state & TASK_NORMAL))
                goto out;
 
-       if (!p->on_rq)
+       if (!task_on_rq_queued(p))
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
        ttwu_do_wakeup(rq, p, 0);
@@ -1531,8 +1777,9 @@ out:
  * @p: The process to be woken up.
  *
  * Attempt to wake up the nominated process and move it to the set of runnable
- * processes.  Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
  *
  * It may be assumed that this function implies a write memory barrier before
  * changing the task state if and only if any tasks are woken up.
@@ -1551,13 +1798,27 @@ int wake_up_state(struct task_struct *p, unsigned int state)
 }
 EXPORT_SYMBOL(wake_up_process);
 
+/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+       struct sched_dl_entity *dl_se = &p->dl;
+
+       dl_se->dl_runtime = 0;
+       dl_se->dl_deadline = 0;
+       dl_se->dl_period = 0;
+       dl_se->flags = 0;
+       dl_se->dl_bw = 0;
+}
+
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  *
  * __sched_fork() is basic setup used by init_idle() too:
  */
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        p->on_rq                        = 0;
 
@@ -1569,19 +1830,14 @@ static void __sched_fork(struct task_struct *p)
        p->se.vruntime                  = 0;
        INIT_LIST_HEAD(&p->se.group_node);
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
-       p->se.avg.runnable_avg_period = 0;
-       p->se.avg.runnable_avg_sum = 0;
-#endif
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
+       RB_CLEAR_NODE(&p->dl.rb_node);
+       hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       __dl_clear_params(p);
+
        INIT_LIST_HEAD(&p->rt.run_list);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1590,16 +1846,26 @@ static void __sched_fork(struct task_struct *p)
 
 #ifdef CONFIG_NUMA_BALANCING
        if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-               p->mm->numa_next_scan = jiffies;
-               p->mm->numa_next_reset = jiffies;
+               p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
                p->mm->numa_scan_seq = 0;
        }
 
+       if (clone_flags & CLONE_VM)
+               p->numa_preferred_nid = current->numa_preferred_nid;
+       else
+               p->numa_preferred_nid = -1;
+
        p->node_stamp = 0ULL;
        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
        p->numa_work.next = &p->numa_work;
+       p->numa_faults_memory = NULL;
+       p->numa_faults_buffer_memory = NULL;
+       p->last_task_numa_placement = 0;
+       p->last_sum_exec_runtime = 0;
+
+       INIT_LIST_HEAD(&p->numa_entry);
+       p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
@@ -1620,17 +1886,39 @@ void set_numabalancing_state(bool enabled)
        numabalancing_enabled = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
-#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int err;
+       int state = numabalancing_enabled;
+
+       if (write && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       t = *table;
+       t.data = &state;
+       err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+       if (err < 0)
+               return err;
+       if (write)
+               set_numabalancing_state(state);
+       return err;
+}
+#endif
+#endif
 
 /*
  * fork()/clone()-time setup:
  */
-void sched_fork(struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        unsigned long flags;
        int cpu = get_cpu();
 
-       __sched_fork(p);
+       __sched_fork(clone_flags, p);
        /*
         * We mark the process as running here. This guarantees that
         * nobody will actually run it, and a signal or other external
@@ -1647,7 +1935,7 @@ void sched_fork(struct task_struct *p)
         * Revert to default priority/policy on fork if requested.
         */
        if (unlikely(p->sched_reset_on_fork)) {
-               if (task_has_rt_policy(p)) {
+               if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
                        p->policy = SCHED_NORMAL;
                        p->static_prio = NICE_TO_PRIO(0);
                        p->rt_priority = 0;
@@ -1664,8 +1952,14 @@ void sched_fork(struct task_struct *p)
                p->sched_reset_on_fork = 0;
        }
 
-       if (!rt_prio(p->prio))
+       if (dl_prio(p->prio)) {
+               put_cpu();
+               return -EAGAIN;
+       } else if (rt_prio(p->prio)) {
+               p->sched_class = &rt_sched_class;
+       } else {
                p->sched_class = &fair_sched_class;
+       }
 
        if (p->sched_class->task_fork)
                p->sched_class->task_fork(p);
@@ -1688,52 +1982,168 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
        p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT_COUNT
-       /* Want to start with kernel preemption disabled. */
-       task_thread_info(p)->preempt_count = 1;
-#endif
+       init_task_preempt_count(p);
 #ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
+       RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
 
        put_cpu();
+       return 0;
 }
 
-/*
- * wake_up_new_task - wake up a newly created task for the first time.
- *
- * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created context, then puts the task
- * on the runqueue and wakes it.
- */
-void wake_up_new_task(struct task_struct *p)
+unsigned long to_ratio(u64 period, u64 runtime)
 {
-       unsigned long flags;
-       struct rq *rq;
+       if (runtime == RUNTIME_INF)
+               return 1ULL << 20;
 
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-#ifdef CONFIG_SMP
        /*
-        * Fork balancing, do it here and not earlier because:
-        *  - cpus_allowed can change in the fork path
-        *  - any previously selected cpu might disappear through hotplug
+        * Doing this here saves a lot of checks in all
+        * the calling paths, and returning zero seems
+        * safe for them anyway.
         */
-       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
-#endif
+       if (period == 0)
+               return 0;
+
+       return div64_u64(runtime << 20, period);
+}
 
-       rq = __task_rq_lock(p);
-       activate_task(rq, p, 0);
-       p->on_rq = 1;
-       trace_sched_wakeup_new(p, true);
-       check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
-       if (p->sched_class->task_woken)
-               p->sched_class->task_woken(rq, p);
-#endif
-       task_rq_unlock(rq, p, &flags);
+inline struct dl_bw *dl_bw_of(int i)
+{
+       rcu_lockdep_assert(rcu_read_lock_sched_held(),
+                          "sched RCU must be held");
+       return &cpu_rq(i)->rd->dl_bw;
 }
 
-#ifdef CONFIG_PREEMPT_NOTIFIERS
+static inline int dl_bw_cpus(int i)
+{
+       struct root_domain *rd = cpu_rq(i)->rd;
+       int cpus = 0;
+
+       rcu_lockdep_assert(rcu_read_lock_sched_held(),
+                          "sched RCU must be held");
+       for_each_cpu_and(i, rd->span, cpu_active_mask)
+               cpus++;
+
+       return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+       return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+       return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+       return dl_b->bw != -1 &&
+              dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+                      const struct sched_attr *attr)
+{
+
+       struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+       u64 period = attr->sched_period ?: attr->sched_deadline;
+       u64 runtime = attr->sched_runtime;
+       u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+       int cpus, err = -1;
+
+       if (new_bw == p->dl.dl_bw)
+               return 0;
+
+       /*
+        * Whether a task enters, leaves, or stays -deadline but changes
+        * its parameters, we may need to update the total allocated
+        * bandwidth of the container accordingly.
+        */
+       raw_spin_lock(&dl_b->lock);
+       cpus = dl_bw_cpus(task_cpu(p));
+       if (dl_policy(policy) && !task_has_dl_policy(p) &&
+           !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+               __dl_add(dl_b, new_bw);
+               err = 0;
+       } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+                  !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+               __dl_clear(dl_b, p->dl.dl_bw);
+               __dl_add(dl_b, new_bw);
+               err = 0;
+       } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+               __dl_clear(dl_b, p->dl.dl_bw);
+               err = 0;
+       }
+       raw_spin_unlock(&dl_b->lock);
+
+       return err;
+}
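
The admission control above works on 20-bit fixed-point bandwidths: to_ratio() computes runtime << 20 / period, and __dl_overflow() rejects the change when the summed bandwidths would exceed dl_b->bw per CPU across the root domain. A quick self-contained check of the arithmetic, using invented numbers rather than anything from this patch:

#include <assert.h>
#include <stdint.h>

/* 20-bit fixed-point bandwidth, mirroring to_ratio(): runtime * 2^20 / period. */
static uint64_t ratio(uint64_t period_ns, uint64_t runtime_ns)
{
        return (runtime_ns << 20) / period_ns;
}

int main(void)
{
        /* Hypothetical task: 25 ms of runtime every 100 ms -> 25% of one CPU. */
        uint64_t new_bw = ratio(100 * 1000 * 1000ULL, 25 * 1000 * 1000ULL);

        assert(new_bw == (1ULL << 20) / 4);             /* 0.25 fixed point = 262144 */

        /* Admission on a 2-CPU root domain already carrying 150% in total:
         * overflow iff bw_limit * cpus < total_bw - old_bw + new_bw. */
        uint64_t bw_limit = 1ULL << 20;                 /* 100% per CPU */
        uint64_t total_bw = 3 * (1ULL << 20) / 2;       /* 150% */
        int overflow = bw_limit * 2 < total_bw + new_bw;

        assert(!overflow);                              /* 175% still fits in 200% */
        return 0;
}
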
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+       unsigned long flags;
+       struct rq *rq;
+
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_SMP
+       /*
+        * Fork balancing, do it here and not earlier because:
+        *  - cpus_allowed can change in the fork path
+        *  - any previously selected cpu might disappear through hotplug
+        */
+       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+#endif
+
+       /* Initialize new task's runnable average */
+       init_task_runnable_average(p);
+       rq = __task_rq_lock(p);
+       activate_task(rq, p, 0);
+       p->on_rq = TASK_ON_RQ_QUEUED;
+       trace_sched_wakeup_new(p, true);
+       check_preempt_curr(rq, p, WF_FORK);
+#ifdef CONFIG_SMP
+       if (p->sched_class->task_woken)
+               p->sched_class->task_woken(rq, p);
+#endif
+       task_rq_unlock(rq, p, &flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
 
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
@@ -1807,7 +2217,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
        trace_sched_switch(prev, next);
-       sched_info_switch(prev, next);
+       sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
@@ -1859,6 +2269,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               if (prev->sched_class->task_dead)
+                       prev->sched_class->task_dead(prev);
+
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -1866,17 +2279,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
                kprobe_flush_task(prev);
                put_task_struct(prev);
        }
+
+       tick_nohz_task_switch(current);
 }
 
 #ifdef CONFIG_SMP
 
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
-       if (prev->sched_class->pre_schedule)
-               prev->sched_class->pre_schedule(rq, prev);
-}
-
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
@@ -1894,10 +2302,6 @@ static inline void post_schedule(struct rq *rq)
 
 #else
 
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
 static inline void post_schedule(struct rq *rq)
 {
 }
@@ -1908,7 +2312,7 @@ static inline void post_schedule(struct rq *rq)
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
  */
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
        __releases(rq->lock)
 {
        struct rq *rq = this_rq();
@@ -1921,10 +2325,6 @@ asmlinkage void schedule_tail(struct task_struct *prev)
         */
        post_schedule(rq);
 
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
-       /* In this case, finish_task_switch does not reenable preemption */
-       preempt_enable();
-#endif
        if (current->set_child_tid)
                put_user(task_pid_vnr(current), current->set_child_tid);
 }
@@ -1967,9 +2367,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-#endif
 
        context_tracking_task_switch(prev, next);
        /* Here we just switch the register state and the stack. */
@@ -2000,6 +2398,18 @@ unsigned long nr_running(void)
        return sum;
 }
 
+/*
+ * Check if only the current task is running on the cpu.
+ */
+bool single_task_running(void)
+{
+       if (cpu_rq(smp_processor_id())->nr_running == 1)
+               return true;
+       else
+               return false;
+}
+EXPORT_SYMBOL(single_task_running);
+
 unsigned long long nr_context_switches(void)
 {
        int i;
@@ -2027,762 +2437,235 @@ unsigned long nr_iowait_cpu(int cpu)
        return atomic_read(&this->nr_iowait);
 }
 
-unsigned long this_cpu_load(void)
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
        struct rq *this = this_rq();
-       return this->cpu_load[0];
+       *nr_waiters = atomic_read(&this->nr_iowait);
+       *load = this->cpu_load[0];
 }
 
+#ifdef CONFIG_SMP
 
 /*
- * Global load-average calculations
- *
- * We take a distributed and async approach to calculating the global load-avg
- * in order to minimize overhead.
- *
- * The global load average is an exponentially decaying average of nr_running +
- * nr_uninterruptible.
- *
- * Once every LOAD_FREQ:
- *
- *   nr_active = 0;
- *   for_each_possible_cpu(cpu)
- *     nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
- *
- *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
- *
- * Due to a number of reasons the above turns in the mess below:
- *
- *  - for_each_possible_cpu() is prohibitively expensive on machines with
- *    serious number of cpus, therefore we need to take a distributed approach
- *    to calculating nr_active.
- *
- *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
- *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
- *
- *    So assuming nr_active := 0 when we start out -- true per definition, we
- *    can simply take per-cpu deltas and fold those into a global accumulate
- *    to obtain the same result. See calc_load_fold_active().
- *
- *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
- *    across the machine, we assume 10 ticks is sufficient time for every
- *    cpu to have completed this task.
- *
- *    This places an upper-bound on the IRQ-off latency of the machine. Then
- *    again, being late doesn't loose the delta, just wrecks the sample.
- *
- *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- *    this would add another cross-cpu cacheline miss and atomic operation
- *    to the wakeup path. Instead we increment on whatever cpu the task ran
- *    when it went into uninterruptible state and decrement on whatever cpu
- *    did the wakeup. This means that only the sum of nr_uninterruptible over
- *    all cpus yields the correct result.
- *
- *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
  */
+void sched_exec(void)
+{
+       struct task_struct *p = current;
+       unsigned long flags;
+       int dest_cpu;
 
-/* Variables and functions for calc_load */
-static atomic_long_t calc_load_tasks;
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun); /* should be removed */
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+       if (dest_cpu == smp_processor_id())
+               goto unlock;
 
-/**
- * get_avenrun - get the load average array
- * @loads:     pointer to dest load array
- * @offset:    offset to add
- * @shift:     shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-       loads[0] = (avenrun[0] + offset) << shift;
-       loads[1] = (avenrun[1] + offset) << shift;
-       loads[2] = (avenrun[2] + offset) << shift;
+       if (likely(cpu_active(dest_cpu))) {
+               struct migration_arg arg = { p, dest_cpu };
+
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+               return;
+       }
+unlock:
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
-static long calc_load_fold_active(struct rq *this_rq)
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
 {
-       long nr_active, delta = 0;
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns;
 
-       nr_active = this_rq->nr_running;
-       nr_active += (long) this_rq->nr_uninterruptible;
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+       /*
+        * 64-bit doesn't need locks to atomically read a 64bit value.
+        * So we have an optimization chance when the task's delta_exec is 0.
+        * Reading ->on_cpu is racy, but this is ok.
+        *
+        * If we race with it leaving cpu, we'll take a lock. So we're correct.
+        * If we race with it entering cpu, unaccounted time is 0. This is
+        * indistinguishable from the read occurring a few cycles earlier.
+        * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+        * been accounted, so we're correct here as well.
+        */
+       if (!p->on_cpu || !task_on_rq_queued(p))
+               return p->se.sum_exec_runtime;
+#endif
 
-       if (nr_active != this_rq->calc_load_active) {
-               delta = nr_active - this_rq->calc_load_active;
-               this_rq->calc_load_active = nr_active;
+       rq = task_rq_lock(p, &flags);
+       /*
+        * Must be ->curr _and_ ->on_rq.  If dequeued, we would
+        * project cycles that may never be accounted to this
+        * thread, breaking clock_gettime().
+        */
+       if (task_current(rq, p) && task_on_rq_queued(p)) {
+               update_rq_clock(rq);
+               p->sched_class->update_curr(rq);
        }
+       ns = p->se.sum_exec_runtime;
+       task_rq_unlock(rq, p, &flags);
 
-       return delta;
+       return ns;
 }
 
 /*
- * a1 = a0 * e + a * (1 - e)
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
  */
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
+void scheduler_tick(void)
 {
-       load *= exp;
-       load += active * (FIXED_1 - exp);
-       load += 1UL << (FSHIFT - 1);
-       return load >> FSHIFT;
+       int cpu = smp_processor_id();
+       struct rq *rq = cpu_rq(cpu);
+       struct task_struct *curr = rq->curr;
+
+       sched_clock_tick();
+
+       raw_spin_lock(&rq->lock);
+       update_rq_clock(rq);
+       curr->sched_class->task_tick(rq, curr, 0);
+       update_cpu_load_active(rq);
+       raw_spin_unlock(&rq->lock);
+
+       perf_event_task_tick();
+
+#ifdef CONFIG_SMP
+       rq->idle_balance = idle_cpu(cpu);
+       trigger_load_balance(rq);
+#endif
+       rq_last_tick_reset(rq);
 }
 
-#ifdef CONFIG_NO_HZ
-/*
- * Handle NO_HZ for the global load-average.
- *
- * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
- * NO_HZ.
- *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
- * when we read the global state.
- *
- * Obviously reality has to ruin such a delightfully simple scheme:
- *
- *  - When we go NO_HZ idle during the window, we can negate our sample
- *    contribution, causing under-accounting.
- *
- *    We avoid this by keeping two idle-delta counters and flipping them
- *    when the window starts, thus separating old and new NO_HZ load.
- *
- *    The only trick is the slight shift in index flip for read vs write.
- *
- *        0s            5s            10s           15s
- *          +10           +10           +10           +10
- *        |-|-----------|-|-----------|-|-----------|-|
- *    r:0 0 1           1 0           0 1           1 0
- *    w:0 1 1           0 0           1 1           0 0
- *
- *    This ensures we'll fold the old idle contribution in this window while
- *    accumlating the new one.
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
  *
- *  - When we wake up from NO_HZ idle during the window, we push up our
- *    contribution, since we effectively move our sample point to a known
- *    busy state.
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support a full dynticks environment.
  *
- *    This is solved by pushing the window forward, and thus skipping the
- *    sample, for this cpu (effectively using the idle-delta for this cpu which
- *    was in effect at the time the window opened). This also solves the issue
- *    of having to deal with a cpu having been in NOHZ idle for multiple
- *    LOAD_FREQ intervals.
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
  *
- * When making the ILB scale, we should try to pull this in as well.
+ * Return: Maximum deferment in nanoseconds.
  */
-static atomic_long_t calc_load_idle[2];
-static int calc_load_idx;
-
-static inline int calc_load_write_idx(void)
+u64 scheduler_tick_max_deferment(void)
 {
-       int idx = calc_load_idx;
+       struct rq *rq = this_rq();
+       unsigned long next, now = ACCESS_ONCE(jiffies);
 
-       /*
-        * See calc_global_nohz(), if we observe the new index, we also
-        * need to observe the new update time.
-        */
-       smp_rmb();
+       next = rq->last_sched_tick + HZ;
 
-       /*
-        * If the folding window started, make sure we start writing in the
-        * next idle-delta.
-        */
-       if (!time_before(jiffies, calc_load_update))
-               idx++;
+       if (time_before_eq(next, now))
+               return 0;
 
-       return idx & 1;
+       return jiffies_to_nsecs(next - now);
 }
+#endif
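
scheduler_tick_max_deferment() caps how long a NO_HZ_FULL cpu may go without a tick: at most until one second (HZ jiffies) after the last scheduler tick. A tiny stand-alone version of the calculation, assuming HZ = 250 and ignoring jiffies wraparound (the kernel uses time_before_eq() and jiffies_to_nsecs()):

#include <assert.h>
#include <stdint.h>

#define HZ 250                                  /* assumed tick rate for the example */
#define NSEC_PER_JIFFY (1000000000ULL / HZ)

/* Defer the tick at most until one second after the last scheduler tick. */
static uint64_t max_deferment_ns(uint64_t last_sched_tick, uint64_t now)
{
        uint64_t next = last_sched_tick + HZ;   /* one second later, in jiffies */

        if (next <= now)
                return 0;                       /* already overdue: tick right away */
        return (next - now) * NSEC_PER_JIFFY;
}

int main(void)
{
        /* Last tick 100 jiffies ago -> 150 jiffies (600 ms) of deferment left. */
        assert(max_deferment_ns(1000, 1100) == 150 * NSEC_PER_JIFFY);
        assert(max_deferment_ns(1000, 1300) == 0);
        return 0;
}
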
 
-static inline int calc_load_read_idx(void)
+notrace unsigned long get_parent_ip(unsigned long addr)
 {
-       return calc_load_idx & 1;
+       if (in_lock_functions(addr)) {
+               addr = CALLER_ADDR2;
+               if (in_lock_functions(addr))
+                       addr = CALLER_ADDR3;
+       }
+       return addr;
 }
 
-void calc_load_enter_idle(void)
-{
-       struct rq *this_rq = this_rq();
-       long delta;
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
 
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+       /*
+        * Underflow?
+        */
+       if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+               return;
+#endif
+       __preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
        /*
-        * We're going into NOHZ mode, if there's any pending delta, fold it
-        * into the pending idle delta.
+        * Spinlock count overflowing soon?
         */
-       delta = calc_load_fold_active(this_rq);
-       if (delta) {
-               int idx = calc_load_write_idx();
-               atomic_long_add(delta, &calc_load_idle[idx]);
+       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+                               PREEMPT_MASK - 10);
+#endif
+       if (preempt_count() == val) {
+               unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+               current->preempt_disable_ip = ip;
+#endif
+               trace_preempt_off(CALLER_ADDR0, ip);
        }
 }
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
 
-void calc_load_exit_idle(void)
+void preempt_count_sub(int val)
 {
-       struct rq *this_rq = this_rq();
-
+#ifdef CONFIG_DEBUG_PREEMPT
        /*
-        * If we're still before the sample window, we're done.
+        * Underflow?
         */
-       if (time_before(jiffies, this_rq->calc_load_update))
+       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
                return;
-
        /*
-        * We woke inside or after the sample window, this means we're already
-        * accounted through the nohz accounting, so skip the entire deal and
-        * sync up for the next window.
+        * Is the spinlock portion underflowing?
         */
-       this_rq->calc_load_update = calc_load_update;
-       if (time_before(jiffies, this_rq->calc_load_update + 10))
-               this_rq->calc_load_update += LOAD_FREQ;
-}
-
-static long calc_load_fold_idle(void)
-{
-       int idx = calc_load_read_idx();
-       long delta = 0;
-
-       if (atomic_long_read(&calc_load_idle[idx]))
-               delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+       if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+                       !(preempt_count() & PREEMPT_MASK)))
+               return;
+#endif
 
-       return delta;
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+       __preempt_count_sub(val);
 }
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
 
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x:         base of the power
- * @frac_bits: fractional bits of @x
- * @n:         power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
-       unsigned long result = 1UL << frac_bits;
-
-       if (n) for (;;) {
-               if (n & 1) {
-                       result *= x;
-                       result += 1UL << (frac_bits - 1);
-                       result >>= frac_bits;
-               }
-               n >>= 1;
-               if (!n)
-                       break;
-               x *= x;
-               x += 1UL << (frac_bits - 1);
-               x >>= frac_bits;
-       }
-
-       return result;
-}
+#endif
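
The preempt_count() == val tests above mean the preempt-off/on tracepoints fire only on the outermost disable/enable of a nested sequence. A standalone sketch of that invariant (not part of this patch), with a plain counter standing in for the per-task preempt count and printf standing in for the tracepoints:

#include <stdio.h>

static int count;       /* stand-in for the preempt count */

static void count_add(int val)
{
	count += val;
	if (count == val)               /* was 0 before: preemption just went off */
		printf("preempt off\n");
}

static void count_sub(int val)
{
	if (count == val)               /* will be 0 after: preemption about to come back on */
		printf("preempt on\n");
	count -= val;
}

int main(void)
{
	count_add(1);           /* prints "preempt off" */
	count_add(1);           /* nested: silent */
	count_sub(1);           /* still nested: silent */
	count_sub(1);           /* prints "preempt on" */
	return 0;
}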
 
 /*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- *    = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- *  ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- *    = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- *              n         1 - x^(n+1)
- *     S_n := \Sum x^i = -------------
- *             i=0          1 - x
+ * Print scheduling while atomic bug:
  */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
-           unsigned long active, unsigned int n)
+static noinline void __schedule_bug(struct task_struct *prev)
 {
+       if (oops_in_progress)
+               return;
 
-       return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
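
The recurrence above collapses n missed 5-second samples into one update, a_n = a_0*e^n + a*(1 - e^n), with e^n computed by the O(log n) fixed-point power routine being removed here. A userspace sketch (not part of this patch), using local copies of the fixed-point constants for illustration, showing that one catch-up step agrees with n single steps to within a few counts of rounding error:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884                    /* 1/exp(5s/1min), fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);    /* round to nearest */
	return load >> FSHIFT;
}

/* x^n by binary exponentiation, mirroring the deleted fixed_power_int() */
static unsigned long fixed_power(unsigned long x, unsigned int n)
{
	unsigned long r = FIXED_1;

	for (; n; n >>= 1) {
		if (n & 1)
			r = (r * x + (FIXED_1 >> 1)) >> FSHIFT;
		x = (x * x + (FIXED_1 >> 1)) >> FSHIFT;
	}
	return r;
}

int main(void)
{
	unsigned long a0 = 2 * FIXED_1, active = 5 * FIXED_1, n = 16;
	unsigned long iter = a0;

	for (unsigned long i = 0; i < n; i++)   /* n single decay steps ... */
		iter = calc_load(iter, EXP_1, active);

	/* ... versus one catch-up step using the closed form */
	printf("%lu vs %lu\n", iter, calc_load(a0, fixed_power(EXP_1, n), active));
	return 0;
}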
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(void)
-{
-       long delta, active, n;
-
-       if (!time_before(jiffies, calc_load_update + 10)) {
-               /*
-                * Catch-up, fold however many we are behind still
-                */
-               delta = jiffies - calc_load_update - 10;
-               n = 1 + (delta / LOAD_FREQ);
-
-               active = atomic_long_read(&calc_load_tasks);
-               active = active > 0 ? active * FIXED_1 : 0;
-
-               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
-               calc_load_update += n * LOAD_FREQ;
-       }
-
-       /*
-        * Flip the idle index...
-        *
-        * Make sure we first write the new time then flip the index, so that
-        * calc_load_write_idx() will see the new time when it reads the new
-        * index, this avoids a double flip messing things up.
-        */
-       smp_wmb();
-       calc_load_idx++;
-}
-#else /* !CONFIG_NO_HZ */
-
-static inline long calc_load_fold_idle(void) { return 0; }
-static inline void calc_global_nohz(void) { }
-
-#endif /* CONFIG_NO_HZ */
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
-       long active, delta;
-
-       if (time_before(jiffies, calc_load_update + 10))
-               return;
-
-       /*
-        * Fold the 'old' idle-delta to include all NO_HZ cpus.
-        */
-       delta = calc_load_fold_idle();
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks);
-
-       active = atomic_long_read(&calc_load_tasks);
-       active = active > 0 ? active * FIXED_1 : 0;
-
-       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
-       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
-       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
-       calc_load_update += LOAD_FREQ;
-
-       /*
-        * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
-        */
-       calc_global_nohz();
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
-       long delta;
-
-       if (time_before(jiffies, this_rq->calc_load_update))
-               return;
-
-       delta  = calc_load_fold_active(this_rq);
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks);
-
-       this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT          7
-static const unsigned char
-               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-                                       {0, 0, 0, 0, 0, 0, 0, 0},
-                                       {64, 32, 8, 0, 0, 0, 0, 0},
-                                       {96, 72, 40, 12, 1, 0, 0},
-                                       {112, 98, 75, 43, 15, 1, 0},
-                                       {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-       int j = 0;
-
-       if (!missed_updates)
-               return load;
-
-       if (missed_updates >= degrade_zero_ticks[idx])
-               return 0;
-
-       if (idx == 1)
-               return load >> missed_updates;
-
-       while (missed_updates) {
-               if (missed_updates % 2)
-                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-               missed_updates >>= 1;
-               j++;
-       }
-       return load;
-}
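
A quick check of the worked example in the comment above (degrade_factor row 2, column 3 = 12): decaying a load of 1024 across 8 missed ticks at idx 2, once exactly and once via the 128-point table. Standalone sketch, not part of the patch:

#include <stdio.h>

int main(void)
{
	unsigned long load = 1024, exact = 1024;

	for (int i = 0; i < 8; i++)             /* exact: multiply by 3/4 eight times */
		exact = exact * 3 / 4;

	load = (load * 12) >> 7;                /* table: one multiply/shift, 12/128 */

	printf("exact=%lu table=%lu\n", exact, load);   /* 102 vs 96 */
	return 0;
}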
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-                             unsigned long pending_updates)
-{
-       int i, scale;
-
-       this_rq->nr_load_updates++;
-
-       /* Update our load: */
-       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-               unsigned long old_load, new_load;
-
-               /* scale is effectively 1 << i now, and >> i divides by scale */
-
-               old_load = this_rq->cpu_load[i];
-               old_load = decay_load_missed(old_load, pending_updates - 1, i);
-               new_load = this_load;
-               /*
-                * Round up the averaging division if load is increasing. This
-                * prevents us from getting stuck on 9 if the load is 10, for
-                * example.
-                */
-               if (new_load > old_load)
-                       new_load += scale - 1;
-
-               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-       }
-
-       sched_avg_update(this_rq);
-}
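
The round-up noted in the comment ("prevents us from getting stuck on 9 if the load is 10") can be seen with idx 1 (scale 2), old load 9 and current load 10. A standalone sketch, not part of the patch:

#include <stdio.h>

int main(void)
{
	unsigned long old = 9, cur = 10, scale = 2, idx = 1;

	unsigned long stuck = (old * (scale - 1) + cur) >> idx;                 /* 9: never reaches 10 */
	unsigned long moves = (old * (scale - 1) + cur + scale - 1) >> idx;     /* 10 */

	printf("without round-up: %lu, with round-up: %lu\n", stuck, moves);
	return 0;
}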
-
-#ifdef CONFIG_NO_HZ
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-       unsigned long load = this_rq->load.weight;
-       unsigned long pending_updates;
-
-       /*
-        * bail if there's load or we're actually up-to-date.
-        */
-       if (load || curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
-       __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-       struct rq *this_rq = this_rq();
-       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-       unsigned long pending_updates;
-
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       raw_spin_lock(&this_rq->lock);
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       if (pending_updates) {
-               this_rq->last_load_update_tick = curr_jiffies;
-               /*
-                * We were idle, this means load 0, the current load might be
-                * !0 due to remote wakeups and the sort.
-                */
-               __update_cpu_load(this_rq, 0, pending_updates);
-       }
-       raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-static void update_cpu_load_active(struct rq *this_rq)
-{
-       /*
-        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-        */
-       this_rq->last_load_update_tick = jiffies;
-       __update_cpu_load(this_rq, this_rq->load.weight, 1);
-
-       calc_load_account_active(this_rq);
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * sched_exec - execve() is a valuable balancing opportunity, because at
- * this point the task has the smallest effective memory and cache footprint.
- */
-void sched_exec(void)
-{
-       struct task_struct *p = current;
-       unsigned long flags;
-       int dest_cpu;
-
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
-       if (dest_cpu == smp_processor_id())
-               goto unlock;
-
-       if (likely(cpu_active(dest_cpu))) {
-               struct migration_arg arg = { p, dest_cpu };
-
-               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-               stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
-               return;
-       }
-unlock:
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-}
-
-#endif
-
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
-
-EXPORT_PER_CPU_SYMBOL(kstat);
-EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
-
-/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
-       u64 ns = 0;
-
-       if (task_current(rq, p)) {
-               update_rq_clock(rq);
-               ns = rq->clock_task - p->se.exec_start;
-               if ((s64)ns < 0)
-                       ns = 0;
-       }
-
-       return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
-{
-       unsigned long flags;
-       struct rq *rq;
-       u64 ns = 0;
-
-       rq = task_rq_lock(p, &flags);
-       ns = do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, p, &flags);
-
-       return ns;
-}
-
-/*
- * Return accounted runtime for the task.
- * In case the task is currently running, return the runtime plus current's
- * pending runtime that have not been accounted yet.
- */
-unsigned long long task_sched_runtime(struct task_struct *p)
-{
-       unsigned long flags;
-       struct rq *rq;
-       u64 ns = 0;
-
-       rq = task_rq_lock(p, &flags);
-       ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, p, &flags);
-
-       return ns;
-}
-
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
- */
-void scheduler_tick(void)
-{
-       int cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(cpu);
-       struct task_struct *curr = rq->curr;
-
-       sched_clock_tick();
-
-       raw_spin_lock(&rq->lock);
-       update_rq_clock(rq);
-       update_cpu_load_active(rq);
-       curr->sched_class->task_tick(rq, curr, 0);
-       raw_spin_unlock(&rq->lock);
-
-       perf_event_task_tick();
-
-#ifdef CONFIG_SMP
-       rq->idle_balance = idle_cpu(cpu);
-       trigger_load_balance(rq, cpu);
-#endif
-}
-
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
-       if (in_lock_functions(addr)) {
-               addr = CALLER_ADDR2;
-               if (in_lock_functions(addr))
-                       addr = CALLER_ADDR3;
-       }
-       return addr;
-}
-
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-                               defined(CONFIG_PREEMPT_TRACER))
-
-void __kprobes add_preempt_count(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
-       /*
-        * Underflow?
-        */
-       if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
-               return;
-#endif
-       preempt_count() += val;
-#ifdef CONFIG_DEBUG_PREEMPT
-       /*
-        * Spinlock count overflowing soon?
-        */
-       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
-                               PREEMPT_MASK - 10);
-#endif
-       if (preempt_count() == val)
-               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-}
-EXPORT_SYMBOL(add_preempt_count);
-
-void __kprobes sub_preempt_count(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
-       /*
-        * Underflow?
-        */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
-               return;
-       /*
-        * Is the spinlock portion underflowing?
-        */
-       if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
-                       !(preempt_count() & PREEMPT_MASK)))
-               return;
-#endif
-
-       if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-       preempt_count() -= val;
-}
-EXPORT_SYMBOL(sub_preempt_count);
-
-#endif
-
-/*
- * Print scheduling while atomic bug:
- */
-static noinline void __schedule_bug(struct task_struct *prev)
-{
-       if (oops_in_progress)
-               return;
-
-       printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
-               prev->comm, prev->pid, preempt_count());
+       printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+               prev->comm, prev->pid, preempt_count());
 
        debug_show_held_locks(prev);
        print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+       if (in_atomic_preempt_off()) {
+               pr_err("Preemption disabled at:");
+               print_ip_sym(current->preempt_disable_ip);
+               pr_cont("\n");
+       }
+#endif
        dump_stack();
        add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
@@ -2792,12 +2675,15 @@ static noinline void __schedule_bug(struct task_struct *prev)
  */
 static inline void schedule_debug(struct task_struct *prev)
 {
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+       BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+#endif
        /*
         * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path for now.
-        * Otherwise, whine if we are scheduling when we should not be.
+        * schedule() atomically, we ignore that path. Otherwise whine
+        * if we are scheduling when we should not.
         */
-       if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+       if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
                __schedule_bug(prev);
        rcu_sleep_check();
 
@@ -2806,36 +2692,40 @@ static inline void schedule_debug(struct task_struct *prev)
        schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-       if (prev->on_rq || rq->skip_clock_update < 0)
-               update_rq_clock(rq);
-       prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
-       const struct sched_class *class;
+       const struct sched_class *class = &fair_sched_class;
        struct task_struct *p;
 
        /*
         * Optimization: we know that if all tasks are in
         * the fair class we can call that function directly:
         */
-       if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-               p = fair_sched_class.pick_next_task(rq);
-               if (likely(p))
-                       return p;
+       if (likely(prev->sched_class == class &&
+                  rq->nr_running == rq->cfs.h_nr_running)) {
+               p = fair_sched_class.pick_next_task(rq, prev);
+               if (unlikely(p == RETRY_TASK))
+                       goto again;
+
+               /* assumes fair_sched_class->next == idle_sched_class */
+               if (unlikely(!p))
+                       p = idle_sched_class.pick_next_task(rq, prev);
+
+               return p;
        }
 
+again:
        for_each_class(class) {
-               p = class->pick_next_task(rq);
-               if (p)
+               p = class->pick_next_task(rq, prev);
+               if (p) {
+                       if (unlikely(p == RETRY_TASK))
+                               goto again;
                        return p;
+               }
        }
 
        BUG(); /* the idle class will always have a runnable task */
@@ -2910,6 +2800,12 @@ need_resched:
        if (sched_feat(HRTICK))
                hrtick_clear(rq);
 
+       /*
+        * Make sure that signal_pending_state()->signal_pending() below
+        * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+        * done by the caller to avoid the race with signal_wake_up().
+        */
+       smp_mb__before_spinlock();
        raw_spin_lock_irq(&rq->lock);
 
        switch_count = &prev->nivcsw;
@@ -2936,14 +2832,12 @@ need_resched:
                switch_count = &prev->nvcsw;
        }
 
-       pre_schedule(rq, prev);
-
-       if (unlikely(!rq->nr_running))
-               idle_balance(cpu, rq);
+       if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+               update_rq_clock(rq);
 
-       put_prev_task(rq, prev);
-       next = pick_next_task(rq);
+       next = pick_next_task(rq, prev);
        clear_tsk_need_resched(prev);
+       clear_preempt_need_resched();
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
@@ -2982,7 +2876,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
                blk_schedule_flush_plug(tsk);
 }
 
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
 {
        struct task_struct *tsk = current;
 
@@ -2992,17 +2886,21 @@ asmlinkage void __sched schedule(void)
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
 {
        /*
         * If we come here after a random call to set_need_resched(),
         * or we have been woken up remotely but the IPI has not yet arrived,
         * we haven't yet exited the RCU idle mode. Do it here manually until
         * we find a better solution.
+        *
+        * NB: There are buggy callers of this function.  Ideally we
+        * should warn if prev_state != IN_USER, but that will trigger
+        * too frequently to make sense yet.
         */
-       user_exit();
+       enum ctx_state prev_state = exception_enter();
        schedule();
-       user_enter();
+       exception_exit(prev_state);
 }
 #endif
 
@@ -3018,72 +2916,25 @@ void __sched schedule_preempt_disabled(void)
        preempt_disable();
 }
 
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-       if (lock->owner != owner)
-               return false;
-
-       /*
-        * Ensure we emit the owner->on_cpu, dereference _after_ checking
-        * lock->owner still matches owner, if that fails, owner might
-        * point to free()d memory, if it still matches, the rcu_read_lock()
-        * ensures the memory stays valid.
-        */
-       barrier();
-
-       return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
-       if (!sched_feat(OWNER_SPIN))
-               return 0;
-
-       rcu_read_lock();
-       while (owner_running(lock, owner)) {
-               if (need_resched())
-                       break;
-
-               arch_mutex_cpu_relax();
-       }
-       rcu_read_unlock();
-
-       /*
-        * We break out the loop above on need_resched() and when the
-        * owner changed, which is a sign for heavy contention. Return
-        * success only when lock->owner is NULL.
-        */
-       return lock->owner == NULL;
-}
-#endif
-
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
-       struct thread_info *ti = current_thread_info();
-
        /*
         * If there is a non-zero preempt_count or interrupts are disabled,
         * we do not want to preempt the current task. Just return..
         */
-       if (likely(ti->preempt_count || irqs_disabled()))
+       if (likely(!preemptible()))
                return;
 
        do {
-               add_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_add(PREEMPT_ACTIVE);
                __schedule();
-               sub_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -3092,478 +2943,89 @@ asmlinkage void __sched notrace preempt_schedule(void)
                barrier();
        } while (need_resched());
 }
+NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
-/*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and to avoid tracing the preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
  */
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 {
-       struct thread_info *ti = current_thread_info();
+       enum ctx_state prev_ctx;
 
-       /* Catch callers which need to be fixed */
-       BUG_ON(ti->preempt_count || !irqs_disabled());
+       if (likely(!preemptible()))
+               return;
 
-       user_exit();
        do {
-               add_preempt_count(PREEMPT_ACTIVE);
-               local_irq_enable();
-               __schedule();
-               local_irq_disable();
-               sub_preempt_count(PREEMPT_ACTIVE);
-
+               __preempt_count_add(PREEMPT_ACTIVE);
                /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
+                * Needs preempt disabled in case user_exit() is traced
+                * and the tracer calls preempt_enable_notrace() causing
+                * an infinite recursion.
                 */
+               prev_ctx = exception_enter();
+               __schedule();
+               exception_exit(prev_ctx);
+
+               __preempt_count_sub(PREEMPT_ACTIVE);
                barrier();
        } while (need_resched());
 }
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
-                         void *key)
-{
-       return try_to_wake_up(curr->private, mode, wake_flags);
-}
-EXPORT_SYMBOL(default_wake_function);
-
-/*
- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
- *
- * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
- * zero in this (rare) case, and we handle it by continuing to scan the queue.
- */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, int wake_flags, void *key)
-{
-       wait_queue_t *curr, *next;
-
-       list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
-               unsigned flags = curr->flags;
-
-               if (curr->func(curr, mode, wake_flags, key) &&
-                               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
-                       break;
-       }
-}
-
-/**
- * __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: is directly passed to the wakeup function
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, void *key)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, 0, key);
-       spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(__wake_up);
-
-/*
- * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
- */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
-{
-       __wake_up_common(q, mode, nr, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked);
-
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
-{
-       __wake_up_common(q, mode, 1, 0, key);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-
-/**
- * __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: opaque value to be passed to wakeup targets
- *
- * The sync wakeup differs that the waker knows that it will schedule
- * away soon, so while the target thread will be woken up, it will not
- * be migrated to another CPU - ie. the two threads are 'synchronized'
- * with each other. This can prevent needless bouncing between CPUs.
- *
- * On UP it can prevent extra preemption.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, void *key)
-{
-       unsigned long flags;
-       int wake_flags = WF_SYNC;
-
-       if (unlikely(!q))
-               return;
-
-       if (unlikely(!nr_exclusive))
-               wake_flags = 0;
-
-       spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
-       spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync_key);
-
 /*
- * __wake_up_sync - see __wake_up_sync_key()
- */
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
-{
-       __wake_up_sync_key(q, mode, nr_exclusive, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync);     /* For internal use only */
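
For reference, a typical use of the waitqueue API documented above, written as a kernel-module-style fragment (not part of this patch); my_event and my_cond are made-up names, and wake_up() here is the ordinary non-exclusive wakeup:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_event);
static int my_cond;

static int consumer(void)
{
	/* sleeps in TASK_INTERRUPTIBLE until my_cond is set or a signal arrives */
	return wait_event_interruptible(my_event, my_cond != 0);
}

static void producer(void)
{
	my_cond = 1;
	wake_up(&my_event);             /* non-exclusive: wakes every waiter */
}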
-
-/**
- * complete: - signals a single thread waiting on this completion
- * @x:  holds the state of this particular completion
- *
- * This will wake up a single thread waiting on this completion. Threads will be
- * awakened in the same order in which they were queued.
- *
- * See also complete_all(), wait_for_completion() and related routines.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete(struct completion *x)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       x->done++;
-       __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete);
-
-/**
- * complete_all: - signals all threads waiting on this completion
- * @x:  holds the state of this particular completion
- *
- * This will wake up all threads waiting on this particular completion event.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete_all(struct completion *x)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       x->done += UINT_MAX/2;
-       __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete_all);
-
-static inline long __sched
-do_wait_for_common(struct completion *x,
-                  long (*action)(long), long timeout, int state)
-{
-       if (!x->done) {
-               DECLARE_WAITQUEUE(wait, current);
-
-               __add_wait_queue_tail_exclusive(&x->wait, &wait);
-               do {
-                       if (signal_pending_state(state, current)) {
-                               timeout = -ERESTARTSYS;
-                               break;
-                       }
-                       __set_current_state(state);
-                       spin_unlock_irq(&x->wait.lock);
-                       timeout = action(timeout);
-                       spin_lock_irq(&x->wait.lock);
-               } while (!x->done && timeout);
-               __remove_wait_queue(&x->wait, &wait);
-               if (!x->done)
-                       return timeout;
-       }
-       x->done--;
-       return timeout ?: 1;
-}
-
-static inline long __sched
-__wait_for_common(struct completion *x,
-                 long (*action)(long), long timeout, int state)
-{
-       might_sleep();
-
-       spin_lock_irq(&x->wait.lock);
-       timeout = do_wait_for_common(x, action, timeout, state);
-       spin_unlock_irq(&x->wait.lock);
-       return timeout;
-}
-
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
-{
-       return __wait_for_common(x, schedule_timeout, timeout, state);
-}
-
-static long __sched
-wait_for_common_io(struct completion *x, long timeout, int state)
-{
-       return __wait_for_common(x, io_schedule_timeout, timeout, state);
-}
-
-/**
- * wait_for_completion: - waits for completion of a task
- * @x:  holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout.
- *
- * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
- * and interrupt capability. Also see complete().
- */
-void __sched wait_for_completion(struct completion *x)
-{
-       wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion);
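
Likewise, a minimal kernel-style fragment using the completion API documented above (not part of this patch); setup_done and the worker/waiter pair are hypothetical names:

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

static void worker(void)
{
	/* ... perform the setup work ... */
	complete(&setup_done);                  /* wake exactly one waiter */
}

static void waiter(void)
{
	wait_for_completion(&setup_done);       /* uninterruptible, no timeout */
}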
-
-/**
- * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
- * @x:  holds the state of this particular completion
- * @timeout:  timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible.
- *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_timeout(struct completion *x, unsigned long timeout)
-{
-       return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_timeout);
-
-/**
- * wait_for_completion_io: - waits for completion of a task
- * @x:  holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
- */
-void __sched wait_for_completion_io(struct completion *x)
-{
-       wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_io);
-
-/**
- * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
- * @x:  holds the state of this particular completion
- * @timeout:  timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
- *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
-{
-       return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_io_timeout);
-
-/**
- * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
- * @x:  holds the state of this particular completion
- *
- * This waits for completion of a specific task to be signaled. It is
- * interruptible.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_interruptible(struct completion *x)
-{
-       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
-       if (t == -ERESTARTSYS)
-               return t;
-       return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible);
-
-/**
- * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
- * @x:  holds the state of this particular completion
- * @timeout:  timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. It is interruptible. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_interruptible_timeout(struct completion *x,
-                                         unsigned long timeout)
-{
-       return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
-
-/**
- * wait_for_completion_killable: - waits for completion of a task (killable)
- * @x:  holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It can be
- * interrupted by a kill signal.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_killable(struct completion *x)
-{
-       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
-       if (t == -ERESTARTSYS)
-               return t;
-       return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_killable);
-
-/**
- * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
- * @x:  holds the state of this particular completion
- * @timeout:  timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be
- * signaled or for a specified timeout to expire. It can be
- * interrupted by a kill signal. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_killable_timeout(struct completion *x,
-                                    unsigned long timeout)
-{
-       return wait_for_common(x, timeout, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(wait_for_completion_killable_timeout);
-
-/**
- *     try_wait_for_completion - try to decrement a completion without blocking
- *     @x:     completion structure
- *
- *     Returns: 0 if a decrement cannot be done without blocking
- *              1 if a decrement succeeded.
- *
- *     If a completion is being used as a counting completion,
- *     attempt to decrement the counter without blocking. This
- *     enables us to avoid waiting if the resource the completion
- *     is protecting is not available.
- */
-bool try_wait_for_completion(struct completion *x)
-{
-       unsigned long flags;
-       int ret = 1;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       if (!x->done)
-               ret = 0;
-       else
-               x->done--;
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-       return ret;
-}
-EXPORT_SYMBOL(try_wait_for_completion);
-
-/**
- *     completion_done - Test to see if a completion has any waiters
- *     @x:     completion structure
- *
- *     Returns: 0 if there are waiters (wait_for_completion() in progress)
- *              1 if there are no waiters.
- *
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note that this is called and returns with irqs disabled. This will
+ * protect us against recursive calling from irq.
  */
-bool completion_done(struct completion *x)
-{
-       unsigned long flags;
-       int ret = 1;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       if (!x->done)
-               ret = 0;
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-       return ret;
-}
-EXPORT_SYMBOL(completion_done);
-
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
-       unsigned long flags;
-       wait_queue_t wait;
-
-       init_waitqueue_entry(&wait, current);
+       enum ctx_state prev_state;
 
-       __set_current_state(state);
-
-       spin_lock_irqsave(&q->lock, flags);
-       __add_wait_queue(q, &wait);
-       spin_unlock(&q->lock);
-       timeout = schedule_timeout(timeout);
-       spin_lock_irq(&q->lock);
-       __remove_wait_queue(q, &wait);
-       spin_unlock_irqrestore(&q->lock, flags);
+       /* Catch callers which need to be fixed */
+       BUG_ON(preempt_count() || !irqs_disabled());
 
-       return timeout;
-}
+       prev_state = exception_enter();
 
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
-       sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
+       do {
+               __preempt_count_add(PREEMPT_ACTIVE);
+               local_irq_enable();
+               __schedule();
+               local_irq_disable();
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-       return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+               /*
+                * Check again in case we missed a preemption opportunity
+                * between schedule and now.
+                */
+               barrier();
+       } while (need_resched());
 
-void __sched sleep_on(wait_queue_head_t *q)
-{
-       sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+       exception_exit(prev_state);
 }
-EXPORT_SYMBOL(sleep_on);
 
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+                         void *key)
 {
-       return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
+       return try_to_wake_up(curr->private, mode, wake_flags);
 }
-EXPORT_SYMBOL(sleep_on_timeout);
+EXPORT_SYMBOL(default_wake_function);
 
 #ifdef CONFIG_RT_MUTEXES
 
@@ -3575,15 +3037,16 @@ EXPORT_SYMBOL(sleep_on_timeout);
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
  *
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. The call site only calls this if the priority of the task changed.
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, on_rq, running;
+       int oldprio, queued, running, enqueue_flag = 0;
        struct rq *rq;
        const struct sched_class *prev_class;
 
-       BUG_ON(prio < 0 || prio > MAX_PRIO);
+       BUG_ON(prio > MAX_PRIO);
 
        rq = __task_rq_lock(p);
 
@@ -3608,37 +3071,64 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
        prev_class = p->sched_class;
-       on_rq = p->on_rq;
+       queued = task_on_rq_queued(p);
        running = task_current(rq, p);
-       if (on_rq)
+       if (queued)
                dequeue_task(rq, p, 0);
        if (running)
-               p->sched_class->put_prev_task(rq, p);
+               put_prev_task(rq, p);
 
-       if (rt_prio(prio))
+       /*
+        * Boosting conditions are:
+        * 1. -rt task is running and holds mutex A
+        *      --> -dl task blocks on mutex A
+        *
+        * 2. -dl task is running and holds mutex A
+        *      --> -dl task blocks on mutex A and could preempt the
+        *          running task
+        */
+       if (dl_prio(prio)) {
+               struct task_struct *pi_task = rt_mutex_get_top_task(p);
+               if (!dl_prio(p->normal_prio) ||
+                   (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+                       p->dl.dl_boosted = 1;
+                       p->dl.dl_throttled = 0;
+                       enqueue_flag = ENQUEUE_REPLENISH;
+               } else
+                       p->dl.dl_boosted = 0;
+               p->sched_class = &dl_sched_class;
+       } else if (rt_prio(prio)) {
+               if (dl_prio(oldprio))
+                       p->dl.dl_boosted = 0;
+               if (oldprio < prio)
+                       enqueue_flag = ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
-       else
+       } else {
+               if (dl_prio(oldprio))
+                       p->dl.dl_boosted = 0;
                p->sched_class = &fair_sched_class;
+       }
 
        p->prio = prio;
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq)
-               enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+       if (queued)
+               enqueue_task(rq, p, enqueue_flag);
 
        check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
        __task_rq_unlock(rq);
 }
 #endif
+
 void set_user_nice(struct task_struct *p, long nice)
 {
-       int old_prio, delta, on_rq;
+       int old_prio, delta, queued;
        unsigned long flags;
        struct rq *rq;
 
-       if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+       if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
                return;
        /*
         * We have to be careful, if called from sys_setpriority(),
@@ -3649,14 +3139,14 @@ void set_user_nice(struct task_struct *p, long nice)
         * The RT priorities are set via sched_setscheduler(), but we still
         * allow the 'normal' nice value to be set - but as expected
         * it wont have any effect on scheduling until the task is
-        * SCHED_FIFO/SCHED_RR:
+        * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
         */
-       if (task_has_rt_policy(p)) {
+       if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
                p->static_prio = NICE_TO_PRIO(nice);
                goto out_unlock;
        }
-       on_rq = p->on_rq;
-       if (on_rq)
+       queued = task_on_rq_queued(p);
+       if (queued)
                dequeue_task(rq, p, 0);
 
        p->static_prio = NICE_TO_PRIO(nice);
@@ -3665,14 +3155,14 @@ void set_user_nice(struct task_struct *p, long nice)
        p->prio = effective_prio(p);
        delta = p->prio - old_prio;
 
-       if (on_rq) {
+       if (queued) {
                enqueue_task(rq, p, 0);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
                 */
                if (delta < 0 || (delta > 0 && task_running(rq, p)))
-                       resched_task(rq->curr);
+                       resched_curr(rq);
        }
 out_unlock:
        task_rq_unlock(rq, p, &flags);
@@ -3687,7 +3177,7 @@ EXPORT_SYMBOL(set_user_nice);
 int can_nice(const struct task_struct *p, const int nice)
 {
        /* convert nice value [19,-20] to rlimit style value [1,40] */
-       int nice_rlim = 20 - nice;
+       int nice_rlim = nice_to_rlimit(nice);
 
        return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                capable(CAP_SYS_NICE));
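
The mapping in the comment above ("nice [19,-20] to rlimit style [1,40]") is simply 20 - nice, which is what nice_to_rlimit() computes. A trivial standalone check, not part of the patch:

#include <stdio.h>

int main(void)
{
	for (int nice = -20; nice <= 19; nice += 39)    /* the two extremes */
		printf("nice %3d -> rlimit %2d\n", nice, 20 - nice);
	return 0;
}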
@@ -3711,17 +3201,10 @@ SYSCALL_DEFINE1(nice, int, increment)
         * We don't have to worry. Conceptually one call occurs first
         * and we have a single winner.
         */
-       if (increment < -40)
-               increment = -40;
-       if (increment > 40)
-               increment = 40;
-
-       nice = TASK_NICE(current) + increment;
-       if (nice < -20)
-               nice = -20;
-       if (nice > 19)
-               nice = 19;
+       increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+       nice = task_nice(current) + increment;
 
+       nice = clamp_val(nice, MIN_NICE, MAX_NICE);
        if (increment < 0 && !can_nice(current, nice))
                return -EPERM;
 
@@ -3739,7 +3222,7 @@ SYSCALL_DEFINE1(nice, int, increment)
  * task_prio - return the priority value of a given task.
  * @p: the task in question.
  *
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
  * RT tasks are offset by -200. Normal tasks are centered
  * around 0, value goes from -16 to +15.
  */
@@ -3748,19 +3231,11 @@ int task_prio(const struct task_struct *p)
        return p->prio - MAX_RT_PRIO;
 }
 
-/**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- */
-int task_nice(const struct task_struct *p)
-{
-       return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
 /**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
  */
 int idle_cpu(int cpu)
 {
@@ -3783,6 +3258,8 @@ int idle_cpu(int cpu)
 /**
  * idle_task - return the idle task for a given cpu.
  * @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
  */
 struct task_struct *idle_task(int cpu)
 {
@@ -3792,26 +3269,140 @@ struct task_struct *idle_task(int cpu)
 /**
  * find_process_by_pid - find a process with a matching PID value.
  * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
  */
 static struct task_struct *find_process_by_pid(pid_t pid)
 {
        return pid ? find_task_by_vpid(pid) : current;
 }
 
-/* Actually do priority change: must hold rq lock. */
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
 static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+       struct sched_dl_entity *dl_se = &p->dl;
+
+       init_dl_task_timer(dl_se);
+       dl_se->dl_runtime = attr->sched_runtime;
+       dl_se->dl_deadline = attr->sched_deadline;
+       dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+       dl_se->flags = attr->sched_flags;
+       dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+       dl_se->dl_throttled = 0;
+       dl_se->dl_new = 1;
+       dl_se->dl_yielded = 0;
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY        -1
+
+static void __setscheduler_params(struct task_struct *p,
+               const struct sched_attr *attr)
 {
+       int policy = attr->sched_policy;
+
+       if (policy == SETPARAM_POLICY)
+               policy = p->policy;
+
        p->policy = policy;
-       p->rt_priority = prio;
+
+       if (dl_policy(policy))
+               __setparam_dl(p, attr);
+       else if (fair_policy(policy))
+               p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+       /*
+        * __sched_setscheduler() ensures attr->sched_priority == 0 when
+        * !rt_policy. Always setting this ensures that things like
+        * getparam()/getattr() don't report silly values for !rt tasks.
+        */
+       p->rt_priority = attr->sched_priority;
        p->normal_prio = normal_prio(p);
-       /* we are holding p->pi_lock already */
-       p->prio = rt_mutex_getprio(p);
-       if (rt_prio(p->prio))
+       set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+                          const struct sched_attr *attr)
+{
+       __setscheduler_params(p, attr);
+
+       /*
+        * If we get here, there was no pi waiters boosting the
+        * task. It is safe to use the normal prio.
+        */
+       p->prio = normal_prio(p);
+
+       if (dl_prio(p->prio))
+               p->sched_class = &dl_sched_class;
+       else if (rt_prio(p->prio))
                p->sched_class = &rt_sched_class;
        else
                p->sched_class = &fair_sched_class;
-       set_load_weight(p);
+}
+
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+       struct sched_dl_entity *dl_se = &p->dl;
+
+       attr->sched_priority = p->rt_priority;
+       attr->sched_runtime = dl_se->dl_runtime;
+       attr->sched_deadline = dl_se->dl_deadline;
+       attr->sched_period = dl_se->dl_period;
+       attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We require the deadline to be non-zero and greater than or equal
+ * to the runtime, and the period to be either zero or greater than
+ * or equal to the deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+       /* deadline != 0 */
+       if (attr->sched_deadline == 0)
+               return false;
+
+       /*
+        * Since we truncate DL_SCALE bits, make sure we're at least
+        * that big.
+        */
+       if (attr->sched_runtime < (1ULL << DL_SCALE))
+               return false;
+
+       /*
+        * Since we use the MSB for wrap-around and sign issues, make
+        * sure it's not set (mind that period can be equal to zero).
+        */
+       if (attr->sched_deadline & (1ULL << 63) ||
+           attr->sched_period & (1ULL << 63))
+               return false;
+
+       /* runtime <= deadline <= period (if period != 0) */
+       if ((attr->sched_period != 0 &&
+            attr->sched_period < attr->sched_deadline) ||
+           attr->sched_deadline < attr->sched_runtime)
+               return false;
+
+       return true;
 }
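
Parameters that satisfy __checkparam_dl() can be set from userspace with the sched_setattr() syscall introduced alongside SCHED_DEADLINE. A standalone sketch under some assumptions: there is no glibc wrapper, so the attribute struct is declared by hand here, SYS_sched_setattr needs headers new enough to define it, and the policy value 6 is assumed for SCHED_DEADLINE:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_sketch {              /* hand-rolled copy of the uapi layout */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;         /* ns */
	uint64_t sched_deadline;        /* ns */
	uint64_t sched_period;          /* ns */
};

#define SCHED_DEADLINE_SKETCH   6       /* assumed policy value */

int main(void)
{
	struct sched_attr_sketch attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE_SKETCH;
	attr.sched_runtime  =  10 * 1000 * 1000;        /* 10ms of runtime ... */
	attr.sched_deadline =  30 * 1000 * 1000;        /* ... within 30ms of release ... */
	attr.sched_period   = 100 * 1000 * 1000;        /* ... every 100ms */

	/* runtime <= deadline <= period, all well above the 1us resolution */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0)
		perror("sched_setattr");                /* typically needs root */
	return 0;
}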
 
 /*
@@ -3830,10 +3421,14 @@ static bool check_same_owner(struct task_struct *p)
        return match;
 }
 
-static int __sched_setscheduler(struct task_struct *p, int policy,
-                               const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+                               const struct sched_attr *attr,
+                               bool user)
 {
-       int retval, oldprio, oldpolicy = -1, on_rq, running;
+       int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+                     MAX_RT_PRIO - 1 - attr->sched_priority;
+       int retval, oldprio, oldpolicy = -1, queued, running;
+       int policy = attr->sched_policy;
        unsigned long flags;
        const struct sched_class *prev_class;
        struct rq *rq;
@@ -3847,31 +3442,40 @@ recheck:
                reset_on_fork = p->sched_reset_on_fork;
                policy = oldpolicy = p->policy;
        } else {
-               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
-               policy &= ~SCHED_RESET_ON_FORK;
+               reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+               if (policy != SCHED_DEADLINE &&
+                               policy != SCHED_FIFO && policy != SCHED_RR &&
                                policy != SCHED_NORMAL && policy != SCHED_BATCH &&
                                policy != SCHED_IDLE)
                        return -EINVAL;
        }
 
+       if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+               return -EINVAL;
+
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
         * SCHED_BATCH and SCHED_IDLE is 0.
         */
-       if (param->sched_priority < 0 ||
-           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+       if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+           (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
                return -EINVAL;
-       if (rt_policy(policy) != (param->sched_priority != 0))
+       if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+           (rt_policy(policy) != (attr->sched_priority != 0)))
                return -EINVAL;
 
        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
        if (user && !capable(CAP_SYS_NICE)) {
+               if (fair_policy(policy)) {
+                       if (attr->sched_nice < task_nice(p) &&
+                           !can_nice(p, attr->sched_nice))
+                               return -EPERM;
+               }
+
                if (rt_policy(policy)) {
                        unsigned long rlim_rtprio =
                                        task_rlimit(p, RLIMIT_RTPRIO);
@@ -3881,17 +3485,26 @@ recheck:
                                return -EPERM;
 
                        /* can't increase priority */
-                       if (param->sched_priority > p->rt_priority &&
-                           param->sched_priority > rlim_rtprio)
+                       if (attr->sched_priority > p->rt_priority &&
+                           attr->sched_priority > rlim_rtprio)
                                return -EPERM;
                }
 
+               /*
+                * Can't set/change SCHED_DEADLINE policy at all for now
+                * (safest behavior); in the future we would like to allow
+                * unprivileged DL tasks to increase their relative deadline
+                * or reduce their runtime (both ways reducing utilization)
+                */
+               if (dl_policy(policy))
+                       return -EPERM;
+
                /*
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
                if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
-                       if (!can_nice(p, TASK_NICE(p)))
+                       if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
 
@@ -3928,16 +3541,25 @@ recheck:
        }
 
        /*
-        * If not changing anything there's no need to proceed further:
+        * If not changing anything there's no need to proceed further,
+        * but store a possible modification of reset_on_fork.
         */
-       if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-                       param->sched_priority == p->rt_priority))) {
+       if (unlikely(policy == p->policy)) {
+               if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+                       goto change;
+               if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+                       goto change;
+               if (dl_policy(policy))
+                       goto change;
+
+               p->sched_reset_on_fork = reset_on_fork;
                task_rq_unlock(rq, p, &flags);
                return 0;
        }
+change:
 
-#ifdef CONFIG_RT_GROUP_SCHED
        if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
                /*
                 * Do not allow realtime tasks into groups that have no runtime
                 * assigned.
@@ -3948,8 +3570,24 @@ recheck:
                        task_rq_unlock(rq, p, &flags);
                        return -EPERM;
                }
-       }
 #endif
+#ifdef CONFIG_SMP
+               if (dl_bandwidth_enabled() && dl_policy(policy)) {
+                       cpumask_t *span = rq->rd->span;
+
+                       /*
+                        * Don't allow tasks with an affinity mask smaller than
+                        * the entire root_domain to become SCHED_DEADLINE. We
+                        * will also fail if there's no bandwidth available.
+                        */
+                       if (!cpumask_subset(span, &p->cpus_allowed) ||
+                           rq->rd->dl_bw.bw == 0) {
+                               task_rq_unlock(rq, p, &flags);
+                               return -EPERM;
+                       }
+               }
+#endif
+       }
 
        /* recheck policy now with rq lock held */
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3957,23 +3595,54 @@ recheck:
                task_rq_unlock(rq, p, &flags);
                goto recheck;
        }
-       on_rq = p->on_rq;
+
+       /*
+        * If setscheduling to SCHED_DEADLINE (or changing the parameters
+        * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+        * is available.
+        */
+       if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+               task_rq_unlock(rq, p, &flags);
+               return -EBUSY;
+       }
+
+       p->sched_reset_on_fork = reset_on_fork;
+       oldprio = p->prio;
+
+       /*
+        * Special case for priority boosted tasks.
+        *
+        * If the new priority is lower than or equal to the current
+        * (boosted) priority (user space view), we just store the new
+        * normal parameters and do not touch the scheduler class and
+        * the runqueue. This will be done when the task deboosts
+        * itself.
+        */
+       if (rt_mutex_check_prio(p, newprio)) {
+               __setscheduler_params(p, attr);
+               task_rq_unlock(rq, p, &flags);
+               return 0;
+       }
+
+       queued = task_on_rq_queued(p);
        running = task_current(rq, p);
-       if (on_rq)
+       if (queued)
                dequeue_task(rq, p, 0);
        if (running)
-               p->sched_class->put_prev_task(rq, p);
+               put_prev_task(rq, p);
 
-       p->sched_reset_on_fork = reset_on_fork;
-
-       oldprio = p->prio;
        prev_class = p->sched_class;
-       __setscheduler(rq, p, policy, param->sched_priority);
+       __setscheduler(rq, p, attr);
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq)
-               enqueue_task(rq, p, 0);
+       if (queued) {
+               /*
+                * We enqueue to tail when the priority of a task is
+                * increased (user space view).
+                */
+               enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+       }
 
        check_class_changed(rq, p, prev_class, oldprio);
        task_rq_unlock(rq, p, &flags);
@@ -3983,21 +3652,47 @@ recheck:
        return 0;
 }
 
+static int _sched_setscheduler(struct task_struct *p, int policy,
+                              const struct sched_param *param, bool check)
+{
+       struct sched_attr attr = {
+               .sched_policy   = policy,
+               .sched_priority = param->sched_priority,
+               .sched_nice     = PRIO_TO_NICE(p->static_prio),
+       };
+
+       /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+       if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
+               attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+               policy &= ~SCHED_RESET_ON_FORK;
+               attr.sched_policy = policy;
+       }
+
+       return __sched_setscheduler(p, &attr, check);
+}
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
+ * Return: 0 on success. An error code otherwise.
+ *
  * NOTE that the task may be already dead.
  */
 int sched_setscheduler(struct task_struct *p, int policy,
                       const struct sched_param *param)
 {
-       return __sched_setscheduler(p, policy, param, true);
+       return _sched_setscheduler(p, policy, param, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+       return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
@@ -4008,11 +3703,13 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
  * current context has permission.  For example, this is needed in
  * stop_machine(): we create temporary high priority worker threads,
  * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
  */
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                               const struct sched_param *param)
 {
-       return __sched_setscheduler(p, policy, param, false);
+       return _sched_setscheduler(p, policy, param, false);
 }
 
 static int
@@ -4037,11 +3734,84 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
        return retval;
 }
 
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+                          struct sched_attr *attr)
+{
+       u32 size;
+       int ret;
+
+       if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+               return -EFAULT;
+
+       /*
+        * Zero the full structure, so that a short copy leaves the rest zeroed.
+        */
+       memset(attr, 0, sizeof(*attr));
+
+       ret = get_user(size, &uattr->size);
+       if (ret)
+               return ret;
+
+       if (size > PAGE_SIZE)   /* silly large */
+               goto err_size;
+
+       if (!size)              /* abi compat */
+               size = SCHED_ATTR_SIZE_VER0;
+
+       if (size < SCHED_ATTR_SIZE_VER0)
+               goto err_size;
+
+       /*
+        * If we're handed a bigger struct than we know of,
+        * ensure all the unknown bits are 0 - i.e. new
+        * user-space does not rely on any kernel feature
+        * extensions we don't know about yet.
+        */
+       if (size > sizeof(*attr)) {
+               unsigned char __user *addr;
+               unsigned char __user *end;
+               unsigned char val;
+
+               addr = (void __user *)uattr + sizeof(*attr);
+               end  = (void __user *)uattr + size;
+
+               for (; addr < end; addr++) {
+                       ret = get_user(val, addr);
+                       if (ret)
+                               return ret;
+                       if (val)
+                               goto err_size;
+               }
+               size = sizeof(*attr);
+       }
+
+       ret = copy_from_user(attr, uattr, size);
+       if (ret)
+               return -EFAULT;
+
+       /*
+        * XXX: do we want to be lenient like existing syscalls; or do we want
+        * to be strict and return an error on out-of-bounds values?
+        */
+       attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+       return 0;
+
+err_size:
+       put_user(sizeof(*attr), &uattr->size);
+       return -E2BIG;
+}
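
The forward-compat rule enforced above is: a user struct larger than the kernel's is accepted only if every byte the kernel does not understand reads as zero; otherwise -E2BIG is returned and uattr->size is rewritten so user space can shrink its copy. A minimal, hedged sketch of the same probe, detached from the get_user()-based user-space accessors:

	#include <stdbool.h>
	#include <stddef.h>

	/*
	 * Illustration only: mirrors the tail probe in sched_copy_attr()
	 * on a plain buffer instead of a __user pointer.
	 */
	static bool tail_is_zero(const unsigned char *buf, size_t known, size_t total)
	{
		size_t i;

		for (i = known; i < total; i++) {
			if (buf[i])
				return false;	/* unknown extension bytes must be 0 */
		}
		return true;
	}

A user-space caller that memset()s its sched_attr to zero before filling the fields it knows about will always pass this probe, even when built against a newer, larger struct.
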
+
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
  */
 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
                struct sched_param __user *, param)
@@ -4057,15 +3827,53 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
  * sys_sched_setparam - set/change the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
  */
 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
-       return do_sched_setscheduler(pid, -1, param);
+       return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+                              unsigned int, flags)
+{
+       struct sched_attr attr;
+       struct task_struct *p;
+       int retval;
+
+       if (!uattr || pid < 0 || flags)
+               return -EINVAL;
+
+       retval = sched_copy_attr(uattr, &attr);
+       if (retval)
+               return retval;
+
+       if ((int)attr.sched_policy < 0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       retval = -ESRCH;
+       p = find_process_by_pid(pid);
+       if (p != NULL)
+               retval = sched_setattr(p, &attr);
+       rcu_read_unlock();
+
+       return retval;
 }
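
glibc does not wrap the new syscall, so callers typically go through syscall(2). A hedged user-space sketch that puts the calling thread into SCHED_DEADLINE (CAP_SYS_NICE is required, per the -EPERM check above); the struct mirrors the uapi sched_attr layout, SCHED_DEADLINE is the value from the uapi sched.h, and __NR_sched_setattr is assumed to come from the installed kernel headers:

	#define _GNU_SOURCE
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE	6	/* from include/uapi/linux/sched.h */
	#endif

	struct sched_attr_user {	/* mirrors the uapi struct sched_attr */
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr_user attr;

		memset(&attr, 0, sizeof(attr));	/* keep any unknown tail zeroed */
		attr.size           = sizeof(attr);
		attr.sched_policy   = SCHED_DEADLINE;
		attr.sched_runtime  =  10 * 1000 * 1000;	/* 10 ms */
		attr.sched_deadline =  30 * 1000 * 1000;	/* 30 ms */
		attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

		/* pid 0 means the calling thread; flags must be 0. */
		if (syscall(__NR_sched_setattr, 0, &attr, 0))
			perror("sched_setattr");

		return 0;
	}
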
 
 /**
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
  */
 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
@@ -4092,14 +3900,99 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
  */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
-       struct sched_param lp;
+       struct sched_param lp = { .sched_priority = 0 };
+       struct task_struct *p;
+       int retval;
+
+       if (!param || pid < 0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       p = find_process_by_pid(pid);
+       retval = -ESRCH;
+       if (!p)
+               goto out_unlock;
+
+       retval = security_task_getscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       if (task_has_rt_policy(p))
+               lp.sched_priority = p->rt_priority;
+       rcu_read_unlock();
+
+       /*
+        * This one might sleep; we cannot do it with a spinlock held ...
+        */
+       retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+       return retval;
+
+out_unlock:
+       rcu_read_unlock();
+       return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+                          struct sched_attr *attr,
+                          unsigned int usize)
+{
+       int ret;
+
+       if (!access_ok(VERIFY_WRITE, uattr, usize))
+               return -EFAULT;
+
+       /*
+        * If we're handed a smaller struct than we know of,
+        * ensure all the unknown bits are 0 - i.e. old
+        * user-space does not get incomplete information.
+        */
+       if (usize < sizeof(*attr)) {
+               unsigned char *addr;
+               unsigned char *end;
+
+               addr = (void *)attr + usize;
+               end  = (void *)attr + sizeof(*attr);
+
+               for (; addr < end; addr++) {
+                       if (*addr)
+                               return -EFBIG;
+               }
+
+               attr->size = usize;
+       }
+
+       ret = copy_to_user(uattr, attr, attr->size);
+       if (ret)
+               return -EFAULT;
+
+       return 0;
+}
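
sched_read_attr() handles the opposite direction: older user space passing a smaller struct gets a truncated copy, and -EFBIG is returned only when the truncated tail would actually have carried non-zero data. A hedged read-back sketch via sched_getattr, using the same assumed struct layout as above (repeated so the example stands alone) and __NR_sched_getattr from the installed kernel headers:

	#define _GNU_SOURCE
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	struct sched_attr_user {	/* mirrors the uapi struct sched_attr */
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr_user attr;

		/* pid 0: calling thread; third argument is the size we can accept. */
		if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0))
			perror("sched_getattr");
		else
			printf("policy=%u runtime=%llu\n", attr.sched_policy,
			       (unsigned long long)attr.sched_runtime);

		return 0;
	}
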
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+               unsigned int, size, unsigned int, flags)
+{
+       struct sched_attr attr = {
+               .size = sizeof(struct sched_attr),
+       };
        struct task_struct *p;
        int retval;
 
-       if (!param || pid < 0)
+       if (!uattr || pid < 0 || size > PAGE_SIZE ||
+           size < SCHED_ATTR_SIZE_VER0 || flags)
                return -EINVAL;
 
        rcu_read_lock();
@@ -4112,14 +4005,19 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
        if (retval)
                goto out_unlock;
 
-       lp.sched_priority = p->rt_priority;
-       rcu_read_unlock();
+       attr.sched_policy = p->policy;
+       if (p->sched_reset_on_fork)
+               attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+       if (task_has_dl_policy(p))
+               __getparam_dl(p, &attr);
+       else if (task_has_rt_policy(p))
+               attr.sched_priority = p->rt_priority;
+       else
+               attr.sched_nice = task_nice(p);
 
-       /*
-        * This one might sleep, we cannot do it with a spinlock held ...
-        */
-       retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+       rcu_read_unlock();
 
+       retval = sched_read_attr(uattr, &attr, size);
        return retval;
 
 out_unlock:
@@ -4133,13 +4031,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        struct task_struct *p;
        int retval;
 
-       get_online_cpus();
        rcu_read_lock();
 
        p = find_process_by_pid(pid);
        if (!p) {
                rcu_read_unlock();
-               put_online_cpus();
                return -ESRCH;
        }
 
@@ -4147,6 +4043,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        get_task_struct(p);
        rcu_read_unlock();
 
+       if (p->flags & PF_NO_SETAFFINITY) {
+               retval = -EINVAL;
+               goto out_put_task;
+       }
        if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
                retval = -ENOMEM;
                goto out_put_task;
@@ -4160,17 +4060,36 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                rcu_read_lock();
                if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                        rcu_read_unlock();
-                       goto out_unlock;
+                       goto out_free_new_mask;
                }
                rcu_read_unlock();
        }
 
        retval = security_task_setscheduler(p);
        if (retval)
-               goto out_unlock;
+               goto out_free_new_mask;
+
 
        cpuset_cpus_allowed(p, cpus_allowed);
        cpumask_and(new_mask, in_mask, cpus_allowed);
+
+       /*
+        * Since bandwidth control happens on a per-root_domain basis,
+        * if admission control is enabled, we only admit -deadline
+        * tasks that are allowed to run on all the CPUs in their
+        * root_domain.
+        */
+#ifdef CONFIG_SMP
+       if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+               rcu_read_lock();
+               if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
+                       retval = -EBUSY;
+                       rcu_read_unlock();
+                       goto out_free_new_mask;
+               }
+               rcu_read_unlock();
+       }
+#endif
 again:
        retval = set_cpus_allowed_ptr(p, new_mask);
 
@@ -4186,13 +4105,12 @@ again:
                        goto again;
                }
        }
-out_unlock:
+out_free_new_mask:
        free_cpumask_var(new_mask);
 out_free_cpus_allowed:
        free_cpumask_var(cpus_allowed);
 out_put_task:
        put_task_struct(p);
-       put_online_cpus();
        return retval;
 }
 
@@ -4212,6 +4130,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
  * @pid: pid of the process
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
  */
 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
                unsigned long __user *, user_mask_ptr)
@@ -4235,7 +4155,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        unsigned long flags;
        int retval;
 
-       get_online_cpus();
        rcu_read_lock();
 
        retval = -ESRCH;
@@ -4248,12 +4167,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
                goto out_unlock;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+       cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
        rcu_read_unlock();
-       put_online_cpus();
 
        return retval;
 }
@@ -4263,6 +4181,8 @@ out_unlock:
  * @pid: pid of the process
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
  */
 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
                unsigned long __user *, user_mask_ptr)
@@ -4297,6 +4217,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
  *
  * This function yields the current CPU to other tasks. If there are no
  * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
  */
 SYSCALL_DEFINE0(sched_yield)
 {
@@ -4319,16 +4241,11 @@ SYSCALL_DEFINE0(sched_yield)
        return 0;
 }
 
-static inline int should_resched(void)
-{
-       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
 static void __cond_resched(void)
 {
-       add_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_add(PREEMPT_ACTIVE);
        __schedule();
-       sub_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_sub(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -4422,12 +4339,12 @@ EXPORT_SYMBOL(yield);
  * It's the caller's job to ensure that the target task struct
  * can't go away on us before we can do any checks.
  *
- * Returns:
+ * Return:
  *     true (>0) if we indeed boosted the target task.
  *     false (0) if we failed to boost the target.
  *     -ESRCH if there's no task to yield to.
  */
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
 {
        struct task_struct *curr = current;
        struct rq *rq, *p_rq;
@@ -4449,7 +4366,7 @@ again:
        }
 
        double_rq_lock(rq, p_rq);
-       while (task_rq(p) != p_rq) {
+       if (task_rq(p) != p_rq) {
                double_rq_unlock(rq, p_rq);
                goto again;
        }
@@ -4471,7 +4388,7 @@ again:
                 * fairness.
                 */
                if (preempt && rq != p_rq)
-                       resched_task(p_rq->curr);
+                       resched_curr(p_rq);
        }
 
 out_unlock:
@@ -4525,8 +4442,9 @@ long __sched io_schedule_timeout(long timeout)
  * sys_sched_get_priority_max - return maximum RT priority.
  * @policy: scheduling class.
  *
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
  */
 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
@@ -4537,6 +4455,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
        case SCHED_RR:
                ret = MAX_USER_RT_PRIO-1;
                break;
+       case SCHED_DEADLINE:
        case SCHED_NORMAL:
        case SCHED_BATCH:
        case SCHED_IDLE:
@@ -4550,8 +4469,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
  * sys_sched_get_priority_min - return minimum RT priority.
  * @policy: scheduling class.
  *
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
  */
 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
@@ -4562,6 +4482,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
        case SCHED_RR:
                ret = 1;
                break;
+       case SCHED_DEADLINE:
        case SCHED_NORMAL:
        case SCHED_BATCH:
        case SCHED_IDLE:
@@ -4577,6 +4498,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  *
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
  */
 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                struct timespec __user *, interval)
@@ -4602,7 +4526,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                goto out_unlock;
 
        rq = task_rq_lock(p, &flags);
-       time_slice = p->sched_class->get_rr_interval(rq, p);
+       time_slice = 0;
+       if (p->sched_class->get_rr_interval)
+               time_slice = p->sched_class->get_rr_interval(rq, p);
        task_rq_unlock(rq, p, &flags);
 
        rcu_read_unlock();
@@ -4647,6 +4573,7 @@ void sched_show_task(struct task_struct *p)
                task_pid_nr(p), ppid,
                (unsigned long)task_thread_info(p)->flags);
 
+       print_worker_info(KERN_INFO, p);
        show_stack(p, NULL);
 }
 
@@ -4662,7 +4589,7 @@ void show_state_filter(unsigned long state_filter)
                "  task                        PC stack   pid father\n");
 #endif
        rcu_read_lock();
-       do_each_thread(g, p) {
+       for_each_process_thread(g, p) {
                /*
                 * reset the NMI-timeout, listing all tasks on a slow
                 * console might take a lot of time:
@@ -4670,7 +4597,7 @@ void show_state_filter(unsigned long state_filter)
                touch_nmi_watchdog();
                if (!state_filter || (p->state & state_filter))
                        sched_show_task(p);
-       } while_each_thread(g, p);
+       }
 
        touch_all_softlockup_watchdogs();
 
@@ -4685,7 +4612,7 @@ void show_state_filter(unsigned long state_filter)
                debug_show_all_locks();
 }
 
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+void init_idle_bootup_task(struct task_struct *idle)
 {
        idle->sched_class = &idle_sched_class;
 }
@@ -4698,14 +4625,14 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
 
        raw_spin_lock_irqsave(&rq->lock, flags);
 
-       __sched_fork(idle);
+       __sched_fork(0, idle);
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
@@ -4725,26 +4652,54 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        rcu_read_unlock();
 
        rq->curr = rq->idle = idle;
+       idle->on_rq = TASK_ON_RQ_QUEUED;
 #if defined(CONFIG_SMP)
        idle->on_cpu = 1;
 #endif
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
-       task_thread_info(idle)->preempt_count = 0;
+       init_idle_preempt_count(idle, cpu);
 
        /*
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
-       vtime_init_idle(idle);
+       vtime_init_idle(idle, cpu);
 #if defined(CONFIG_SMP)
        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
 
 #ifdef CONFIG_SMP
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+       struct rq *rq = task_rq(p);
+
+       lockdep_assert_held(&rq->lock);
+
+       dequeue_task(rq, p, 0);
+       p->on_rq = TASK_ON_RQ_MIGRATING;
+       set_task_cpu(p, new_cpu);
+       raw_spin_unlock(&rq->lock);
+
+       rq = cpu_rq(new_cpu);
+
+       raw_spin_lock(&rq->lock);
+       BUG_ON(task_cpu(p) != new_cpu);
+       p->on_rq = TASK_ON_RQ_QUEUED;
+       enqueue_task(rq, p, 0);
+       check_preempt_curr(rq, p, 0);
+
+       return rq;
+}
+
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
        if (p->sched_class && p->sched_class->set_cpus_allowed)
@@ -4794,11 +4749,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
                goto out;
        }
 
-       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
        do_set_cpus_allowed(p, new_mask);
 
        /* Can the task run on the task's current CPU? If so, we're done */
@@ -4806,14 +4756,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
                goto out;
 
        dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (p->on_rq) {
+       if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, p, &flags);
                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                tlb_migrate_finish(p->mm);
                return 0;
-       }
+       } else if (task_on_rq_queued(p))
+               rq = move_queued_task(p, dest_cpu);
 out:
        task_rq_unlock(rq, p, &flags);
 
@@ -4834,20 +4785,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  */
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
-       struct rq *rq_dest, *rq_src;
+       struct rq *rq;
        int ret = 0;
 
        if (unlikely(!cpu_active(dest_cpu)))
                return ret;
 
-       rq_src = cpu_rq(src_cpu);
-       rq_dest = cpu_rq(dest_cpu);
+       rq = cpu_rq(src_cpu);
 
        raw_spin_lock(&p->pi_lock);
-       double_rq_lock(rq_src, rq_dest);
+       raw_spin_lock(&rq->lock);
        /* Already moved. */
        if (task_cpu(p) != src_cpu)
                goto done;
+
        /* Affinity changed (again). */
        if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                goto fail;
@@ -4856,20 +4807,64 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         * If we're not on a rq, the next wake-up will ensure we're
         * placed properly.
         */
-       if (p->on_rq) {
-               dequeue_task(rq_src, p, 0);
-               set_task_cpu(p, dest_cpu);
-               enqueue_task(rq_dest, p, 0);
-               check_preempt_curr(rq_dest, p, 0);
-       }
+       if (task_on_rq_queued(p))
+               rq = move_queued_task(p, dest_cpu);
 done:
        ret = 1;
 fail:
-       double_rq_unlock(rq_src, rq_dest);
+       raw_spin_unlock(&rq->lock);
        raw_spin_unlock(&p->pi_lock);
        return ret;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+       struct migration_arg arg = { p, target_cpu };
+       int curr_cpu = task_cpu(p);
+
+       if (curr_cpu == target_cpu)
+               return 0;
+
+       if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+               return -EINVAL;
+
+       /* TODO: This is not properly updating schedstats */
+
+       trace_sched_move_numa(p, curr_cpu, target_cpu);
+       return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+       struct rq *rq;
+       unsigned long flags;
+       bool queued, running;
+
+       rq = task_rq_lock(p, &flags);
+       queued = task_on_rq_queued(p);
+       running = task_current(rq, p);
+
+       if (queued)
+               dequeue_task(rq, p, 0);
+       if (running)
+               put_prev_task(rq, p);
+
+       p->numa_preferred_nid = nid;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (queued)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+#endif
+
 /*
  * migration_cpu_stop - this will be executed by a highprio stopper thread
  * and performs thread migration by bumping thread off CPU then
@@ -4884,6 +4879,12 @@ static int migration_cpu_stop(void *data)
         * be on another cpu but it doesn't matter.
         */
        local_irq_disable();
+       /*
+        * We need to explicitly wake pending tasks before running
+        * __migrate_task() such that we will not miss enforcing cpus_allowed
+        * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+        */
+       sched_ttwu_pending();
        __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
        local_irq_enable();
        return 0;
@@ -4901,8 +4902,10 @@ void idle_task_exit(void)
 
        BUG_ON(cpu_online(smp_processor_id()));
 
-       if (mm != &init_mm)
+       if (mm != &init_mm) {
                switch_mm(mm, &init_mm, current);
+               finish_arch_post_lock_switch();
+       }
        mmdrop(mm);
 }
 
@@ -4920,6 +4923,22 @@ static void calc_load_migrate(struct rq *rq)
                atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+       .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+       /*
+        * Avoid pull_{rt,dl}_task()
+        */
+       .prio = MAX_PRIO + 1,
+       .sched_class = &fake_sched_class,
+};
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
@@ -4945,6 +4964,13 @@ static void migrate_tasks(unsigned int dead_cpu)
         */
        rq->stop = NULL;
 
+       /*
+        * The put_prev_task() and pick_next_task() sched class
+        * methods both need an up-to-date value of
+        * rq->clock[_task].
+        */
+       update_rq_clock(rq);
+
        for ( ; ; ) {
                /*
                 * There's this thread running, bail when that's the only
@@ -4953,7 +4979,7 @@ static void migrate_tasks(unsigned int dead_cpu)
                if (rq->nr_running == 1)
                        break;
 
-               next = pick_next_task(rq);
+               next = pick_next_task(rq, &fake_task);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
@@ -5043,7 +5069,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-       struct ctl_table *table = sd_alloc_ctl_entry(13);
+       struct ctl_table *table = sd_alloc_ctl_entry(14);
 
        if (table == NULL)
                return NULL;
@@ -5071,14 +5097,17 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                sizeof(int), 0644, proc_dointvec_minmax, false);
        set_table_entry(&table[10], "flags", &sd->flags,
                sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[11], "name", sd->name,
+       set_table_entry(&table[11], "max_newidle_lb_cost",
+               &sd->max_newidle_lb_cost,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[12], "name", sd->name,
                CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-       /* &table[12] is terminator */
+       /* &table[13] is terminator */
 
        return table;
 }
 
-static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 {
        struct ctl_table *entry, *table;
        struct sched_domain *sd;
@@ -5180,7 +5209,7 @@ static void set_rq_offline(struct rq *rq)
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  */
-static int __cpuinit
+static int
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
        int cpu = (long)hcpu;
@@ -5234,16 +5263,25 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  * happens before everything else.  This has to be lower priority than
  * the notifier in the perf_event subsystem, though.
  */
-static struct notifier_block __cpuinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
        .notifier_call = migration_call,
        .priority = CPU_PRI_MIGRATION,
 };
 
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static void set_cpu_rq_start_time(void)
+{
+       int cpu = smp_processor_id();
+       struct rq *rq = cpu_rq(cpu);
+       rq->age_stamp = sched_clock_cpu(cpu);
+}
+
+static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
+               set_cpu_rq_start_time();
+               return NOTIFY_OK;
        case CPU_DOWN_FAILED:
                set_cpu_active((long)hcpu, true);
                return NOTIFY_OK;
@@ -5252,16 +5290,39 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
        }
 }
 
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+static int sched_cpu_inactive(struct notifier_block *nfb,
                                        unsigned long action, void *hcpu)
 {
+       unsigned long flags;
+       long cpu = (long)hcpu;
+       struct dl_bw *dl_b;
+
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
-               set_cpu_active((long)hcpu, false);
+               set_cpu_active(cpu, false);
+
+               /* explicitly allow suspend */
+               if (!(action & CPU_TASKS_FROZEN)) {
+                       bool overflow;
+                       int cpus;
+
+                       rcu_read_lock_sched();
+                       dl_b = dl_bw_of(cpu);
+
+                       raw_spin_lock_irqsave(&dl_b->lock, flags);
+                       cpus = dl_bw_cpus(cpu);
+                       overflow = __dl_overflow(dl_b, cpus, 0, 0);
+                       raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+                       rcu_read_unlock_sched();
+
+                       if (overflow)
+                               return notifier_from_errno(-EBUSY);
+               }
                return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
        }
+
+       return NOTIFY_DONE;
 }
 
 static int __init migration_init(void)
@@ -5344,14 +5405,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                }
 
                /*
-                * Even though we initialize ->power to something semi-sane,
-                * we leave power_orig unset. This allows us to detect if
+                * Even though we initialize ->capacity to something semi-sane,
+                * we leave capacity_orig unset. This allows us to detect if
                 * domain iteration is still funny without causing /0 traps.
                 */
-               if (!group->sgp->power_orig) {
+               if (!group->sgc->capacity_orig) {
                        printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: domain->cpu_power not "
-                                       "set\n");
+                       printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
                        break;
                }
 
@@ -5373,9 +5433,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
                printk(KERN_CONT " %s", str);
-               if (group->sgp->power != SCHED_POWER_SCALE) {
-                       printk(KERN_CONT " (cpu_power = %d)",
-                               group->sgp->power);
+               if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+                       printk(KERN_CONT " (cpu_capacity = %d)",
+                               group->sgc->capacity);
                }
 
                group = group->next;
@@ -5433,8 +5493,9 @@ static int sd_degenerate(struct sched_domain *sd)
                         SD_BALANCE_NEWIDLE |
                         SD_BALANCE_FORK |
                         SD_BALANCE_EXEC |
-                        SD_SHARE_CPUPOWER |
-                        SD_SHARE_PKG_RESOURCES)) {
+                        SD_SHARE_CPUCAPACITY |
+                        SD_SHARE_PKG_RESOURCES |
+                        SD_SHARE_POWERDOMAIN)) {
                if (sd->groups != sd->groups->next)
                        return 0;
        }
@@ -5463,8 +5524,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_NEWIDLE |
                                SD_BALANCE_FORK |
                                SD_BALANCE_EXEC |
-                               SD_SHARE_CPUPOWER |
-                               SD_SHARE_PKG_RESOURCES);
+                               SD_SHARE_CPUCAPACITY |
+                               SD_SHARE_PKG_RESOURCES |
+                               SD_PREFER_SIBLING |
+                               SD_SHARE_POWERDOMAIN);
                if (nr_node_ids == 1)
                        pflags &= ~SD_SERIALIZE;
        }
@@ -5479,6 +5542,8 @@ static void free_rootdomain(struct rcu_head *rcu)
        struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 
        cpupri_cleanup(&rd->cpupri);
+       cpudl_cleanup(&rd->cpudl);
+       free_cpumask_var(rd->dlo_mask);
        free_cpumask_var(rd->rto_mask);
        free_cpumask_var(rd->online);
        free_cpumask_var(rd->span);
@@ -5501,7 +5566,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                cpumask_clear_cpu(rq->cpu, old_rd->span);
 
                /*
-                * If we dont want to free the old_rt yet then
+                * If we don't want to free the old_rd yet then
                 * set old_rd to NULL to skip the freeing later
                 * in this function:
                 */
@@ -5530,8 +5595,14 @@ static int init_rootdomain(struct root_domain *rd)
                goto out;
        if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                goto free_span;
-       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
                goto free_online;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_dlo_mask;
+
+       init_dl_bw(&rd->dl_bw);
+       if (cpudl_init(&rd->cpudl) != 0)
+               goto free_dlo_mask;
 
        if (cpupri_init(&rd->cpupri) != 0)
                goto free_rto_mask;
@@ -5539,6 +5610,8 @@ static int init_rootdomain(struct root_domain *rd)
 
 free_rto_mask:
        free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+       free_cpumask_var(rd->dlo_mask);
 free_online:
        free_cpumask_var(rd->online);
 free_span:
@@ -5576,7 +5649,7 @@ static struct root_domain *alloc_rootdomain(void)
        return rd;
 }
 
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
 {
        struct sched_group *tmp, *first;
 
@@ -5587,8 +5660,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
        do {
                tmp = sg->next;
 
-               if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
-                       kfree(sg->sgp);
+               if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+                       kfree(sg->sgc);
 
                kfree(sg);
                sg = tmp;
@@ -5606,7 +5679,7 @@ static void free_sched_domain(struct rcu_head *rcu)
        if (sd->flags & SD_OVERLAP) {
                free_sched_groups(sd->groups, 1);
        } else if (atomic_dec_and_test(&sd->groups->ref)) {
-               kfree(sd->groups->sgp);
+               kfree(sd->groups->sgc);
                kfree(sd->groups);
        }
        kfree(sd);
@@ -5633,19 +5706,36 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * two cpus are in the same cache domain, see cpus_share_cache().
  */
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 
 static void update_top_cache_domain(int cpu)
 {
        struct sched_domain *sd;
+       struct sched_domain *busy_sd = NULL;
        int id = cpu;
+       int size = 1;
 
        sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-       if (sd)
+       if (sd) {
                id = cpumask_first(sched_domain_span(sd));
+               size = cpumask_weight(sched_domain_span(sd));
+               busy_sd = sd->parent; /* sd_busy */
+       }
+       rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+       per_cpu(sd_llc_size, cpu) = size;
        per_cpu(sd_llc_id, cpu) = id;
+
+       sd = lowest_flag_domain(cpu, SD_NUMA);
+       rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+       sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+       rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
 }
 
 /*
@@ -5668,6 +5758,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
                        tmp->parent = parent->parent;
                        if (parent->parent)
                                parent->parent->child = tmp;
+                       /*
+                        * Transfer SD_PREFER_SIBLING down in case of a
+                        * degenerate parent; the spans match for this
+                        * so the property transfers.
+                        */
+                       if (parent->flags & SD_PREFER_SIBLING)
+                               tmp->flags |= SD_PREFER_SIBLING;
                        destroy_sched_domain(parent, cpu);
                } else
                        tmp = tmp->parent;
@@ -5704,17 +5801,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-       return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
-       struct sched_domain **__percpu sd;
-       struct sched_group **__percpu sg;
-       struct sched_group_power **__percpu sgp;
-};
-
 struct s_data {
        struct sched_domain ** __percpu sd;
        struct root_domain      *rd;
@@ -5727,21 +5813,6 @@ enum s_alloc {
        sa_none,
 };
 
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP   0x01
-
-struct sched_domain_topology_level {
-       sched_domain_init_f init;
-       sched_domain_mask_f mask;
-       int                 flags;
-       int                 numa_level;
-       struct sd_data      data;
-};
-
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
@@ -5787,7 +5858,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
        const struct cpumask *span = sched_domain_span(sd);
        struct cpumask *covered = sched_domains_tmpmask;
        struct sd_data *sdd = sd->private;
-       struct sched_domain *child;
+       struct sched_domain *sibling;
        int i;
 
        cpumask_clear(covered);
@@ -5798,10 +5869,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                if (cpumask_test_cpu(i, covered))
                        continue;
 
-               child = *per_cpu_ptr(sdd->sd, i);
+               sibling = *per_cpu_ptr(sdd->sd, i);
 
                /* See the comment near build_group_mask(). */
-               if (!cpumask_test_cpu(i, sched_domain_span(child)))
+               if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
                        continue;
 
                sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5811,24 +5882,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                        goto fail;
 
                sg_span = sched_group_cpus(sg);
-               if (child->child) {
-                       child = child->child;
-                       cpumask_copy(sg_span, sched_domain_span(child));
-               } else
+               if (sibling->child)
+                       cpumask_copy(sg_span, sched_domain_span(sibling->child));
+               else
                        cpumask_set_cpu(i, sg_span);
 
                cpumask_or(covered, covered, sg_span);
 
-               sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-               if (atomic_inc_return(&sg->sgp->ref) == 1)
+               sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+               if (atomic_inc_return(&sg->sgc->ref) == 1)
                        build_group_mask(sd, sg);
 
                /*
-                * Initialize sgp->power such that even if we mess up the
+                * Initialize sgc->capacity such that even if we mess up the
                 * domains and no possible iteration will get us here, we won't
                 * die on a /0 trap.
                 */
-               sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+               sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+               sg->sgc->capacity_orig = sg->sgc->capacity;
 
                /*
                 * Make sure the first group of this domain contains the
@@ -5866,8 +5937,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 
        if (sg) {
                *sg = *per_cpu_ptr(sdd->sg, cpu);
-               (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
-               atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+               (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+               atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
        }
 
        return cpu;
@@ -5876,7 +5947,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 /*
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
  *
  * Assumes the sched_domain tree is fully constructed
  */
@@ -5892,7 +5963,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
        get_group(cpu, sdd, &sd->groups);
        atomic_inc(&sd->groups->ref);
 
-       if (cpu != cpumask_first(sched_domain_span(sd)))
+       if (cpu != cpumask_first(span))
                return 0;
 
        lockdep_assert_held(&sched_domains_mutex);
@@ -5902,14 +5973,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
        for_each_cpu(i, span) {
                struct sched_group *sg;
-               int group = get_group(i, sdd, &sg);
-               int j;
+               int group, j;
 
                if (cpumask_test_cpu(i, covered))
                        continue;
 
-               cpumask_clear(sched_group_cpus(sg));
-               sg->sgp->power = 0;
+               group = get_group(i, sdd, &sg);
                cpumask_setall(sched_group_mask(sg));
 
                for_each_cpu(j, span) {
@@ -5932,20 +6001,20 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 }
 
 /*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
  *
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
  * distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be the same
+ * unless there are asymmetries in the topology. If there are asymmetries, the
+ * group having more cpu_capacity will pick up more load compared to the
+ * group having less cpu_capacity.
  */
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 {
        struct sched_group *sg = sd->groups;
 
-       WARN_ON(!sd || !sg);
+       WARN_ON(!sg);
 
        do {
                sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -5955,13 +6024,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
        if (cpu != group_balance_cpu(sg))
                return;
 
-       update_group_power(sd, cpu);
-       atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
-}
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
-       return 0*SD_ASYM_PACKING;
+       update_group_capacity(sd, cpu);
+       atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
 }
 
 /*
@@ -5969,34 +6033,6 @@ int __weak arch_sd_sibling_asym_packing(void)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)                sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)                do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)                                             \
-static noinline struct sched_domain *                                  \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu)        \
-{                                                                      \
-       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
-       *sd = SD_##type##_INIT;                                         \
-       SD_INIT_NAME(sd, type);                                         \
-       sd->private = &tl->data;                                        \
-       return sd;                                                      \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
 static int default_relax_domain_level = -1;
 int sched_domain_level_max;
 
@@ -6080,97 +6116,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
        if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
                *per_cpu_ptr(sdd->sg, cpu) = NULL;
 
-       if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
-               *per_cpu_ptr(sdd->sgp, cpu) = NULL;
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-       return topology_thread_cpumask(cpu);
+       if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+               *per_cpu_ptr(sdd->sgc, cpu) = NULL;
 }
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-       { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-       { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-       { sd_init_BOOK, cpu_book_mask, },
-#endif
-       { sd_init_CPU, cpu_cpu_mask, },
-       { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
 #ifdef CONFIG_NUMA
-
 static int sched_domains_numa_levels;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
+#endif
 
-static inline int sd_local_flags(int level)
-{
-       if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
-               return 0;
-
-       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS              \
+       (SD_SHARE_CPUCAPACITY |         \
+        SD_SHARE_PKG_RESOURCES |       \
+        SD_NUMA |                      \
+        SD_ASYM_PACKING |              \
+        SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
 {
        struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-       int level = tl->numa_level;
-       int sd_weight = cpumask_weight(
-                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+       int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+#endif
+
+       sd_weight = cpumask_weight(tl->mask(cpu));
+
+       if (tl->sd_flags)
+               sd_flags = (*tl->sd_flags)();
+       if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+                       "wrong sd_flags in topology description\n"))
+               sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
        *sd = (struct sched_domain){
                .min_interval           = sd_weight,
                .max_interval           = 2*sd_weight,
                .busy_factor            = 32,
                .imbalance_pct          = 125,
-               .cache_nice_tries       = 2,
-               .busy_idx               = 3,
-               .idle_idx               = 2,
+
+               .cache_nice_tries       = 0,
+               .busy_idx               = 0,
+               .idle_idx               = 0,
                .newidle_idx            = 0,
                .wake_idx               = 0,
                .forkexec_idx           = 0,
 
                .flags                  = 1*SD_LOAD_BALANCE
                                        | 1*SD_BALANCE_NEWIDLE
-                                       | 0*SD_BALANCE_EXEC
-                                       | 0*SD_BALANCE_FORK
+                                       | 1*SD_BALANCE_EXEC
+                                       | 1*SD_BALANCE_FORK
                                        | 0*SD_BALANCE_WAKE
-                                       | 0*SD_WAKE_AFFINE
-                                       | 0*SD_SHARE_CPUPOWER
+                                       | 1*SD_WAKE_AFFINE
+                                       | 0*SD_SHARE_CPUCAPACITY
                                        | 0*SD_SHARE_PKG_RESOURCES
-                                       | 1*SD_SERIALIZE
+                                       | 0*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
-                                       | sd_local_flags(level)
+                                       | 0*SD_NUMA
+                                       | sd_flags
                                        ,
+
                .last_balance           = jiffies,
                .balance_interval       = sd_weight,
+               .smt_gain               = 0,
+               .max_newidle_lb_cost    = 0,
+               .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+               .name                   = tl->name,
+#endif
        };
-       SD_INIT_NAME(sd, NUMA);
-       sd->private = &tl->data;
 
        /*
-        * Ugly hack to pass state to sd_numa_mask()...
+        * Convert topological properties into behaviour.
         */
-       sched_domains_curr_level = tl->numa_level;
+
+       if (sd->flags & SD_SHARE_CPUCAPACITY) {
+               sd->imbalance_pct = 110;
+               sd->smt_gain = 1178; /* ~15% */
+
+       } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+               sd->imbalance_pct = 117;
+               sd->cache_nice_tries = 1;
+               sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+       } else if (sd->flags & SD_NUMA) {
+               sd->cache_nice_tries = 2;
+               sd->busy_idx = 3;
+               sd->idle_idx = 2;
+
+               sd->flags |= SD_SERIALIZE;
+               if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+                       sd->flags &= ~(SD_BALANCE_EXEC |
+                                      SD_BALANCE_FORK |
+                                      SD_WAKE_AFFINE);
+               }
+
+#endif
+       } else {
+               sd->flags |= SD_PREFER_SIBLING;
+               sd->cache_nice_tries = 1;
+               sd->busy_idx = 2;
+               sd->idle_idx = 1;
+       }
+
+       sd->private = &tl->data;
 
        return sd;
 }
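
The constants chosen in sd_init() map onto percentages of the capacity scale: assuming SCHED_CAPACITY_SCALE is 1024 (1 << 10), smt_gain = 1178 is the "~15%" bonus noted in the comment, and imbalance_pct values of 110/117/125 correspond to tolerating roughly a 10%/17%/25% load imbalance before balancing kicks in. A standalone sketch of that arithmetic (illustrative only, not part of the patch):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed: 1 << 10 */

int main(void)
{
	unsigned int smt_gain = 1178;			/* SD_SHARE_CPUCAPACITY level */
	unsigned int imbalance_pct[] = { 110, 117, 125 };	/* SMT, shared-cache, default */

	printf("smt_gain bonus: +%.1f%%\n",
	       (smt_gain - SCHED_CAPACITY_SCALE) * 100.0 / SCHED_CAPACITY_SCALE);

	for (int i = 0; i < 3; i++)
		printf("imbalance_pct %u -> ~%u%% imbalance tolerated\n",
		       imbalance_pct[i], imbalance_pct[i] - 100);

	return 0;
}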
 
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+       { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+       { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl)                       \
+       for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+       sched_domain_topology = tl;
+}
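
set_sched_topology() is the hook that lets an architecture swap in its own table in place of default_topology; the table is walked bottom-up and is terminated by an entry with a NULL ->mask. The following userspace toy (all toy_* names are made up for illustration) models just that table/override/iteration shape:

#include <stdio.h>

struct toy_topology_level {
	const char *(*mask)(int cpu);	/* NULL ->mask terminates the table */
	const char *name;
};

static const char *toy_smt_mask(int cpu)  { (void)cpu; return "HW siblings"; }
static const char *toy_core_mask(int cpu) { (void)cpu; return "cores sharing a cache"; }
static const char *toy_die_mask(int cpu)  { (void)cpu; return "whole package"; }

static struct toy_topology_level toy_default_topology[] = {
	{ toy_smt_mask,  "SMT" },
	{ toy_core_mask, "MC"  },
	{ toy_die_mask,  "DIE" },
	{ NULL, },
};

static struct toy_topology_level *toy_topology = toy_default_topology;

/* analogous to set_sched_topology(): an arch installs its own table */
static void toy_set_topology(struct toy_topology_level *tl)
{
	toy_topology = tl;
}

/* analogous to for_each_sd_topology(): walk until the NULL sentinel */
#define for_each_toy_topology(tl) \
	for (tl = toy_topology; tl->mask; tl++)

int main(void)
{
	struct toy_topology_level *tl;

	toy_set_topology(toy_default_topology);
	for_each_toy_topology(tl)
		printf("%-3s spans: %s\n", tl->name, tl->mask(0));

	return 0;
}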
+
+#ifdef CONFIG_NUMA
+
 static const struct cpumask *sd_numa_mask(int cpu)
 {
        return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6265,11 +6362,15 @@ static void sched_init_numa(void)
                if (!sched_debug())
                        break;
        }
+
+       if (!level)
+               return;
+
        /*
         * 'level' contains the number of unique distances, excluding the
         * identity distance node_distance(i,i).
         *
-        * The sched_domains_nume_distance[] array includes the actual distance
+        * The sched_domains_numa_distance[] array includes the actual distance
         * numbers.
         */
 
@@ -6314,7 +6415,10 @@ static void sched_init_numa(void)
                }
        }
 
-       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+       /* Compute default topology size */
+       for (i = 0; sched_domain_topology[i].mask; i++);
+
+       tl = kzalloc((i + level + 1) *
                        sizeof(struct sched_domain_topology_level), GFP_KERNEL);
        if (!tl)
                return;
@@ -6322,18 +6426,19 @@ static void sched_init_numa(void)
        /*
         * Copy the default topology bits..
         */
-       for (i = 0; default_topology[i].init; i++)
-               tl[i] = default_topology[i];
+       for (i = 0; sched_domain_topology[i].mask; i++)
+               tl[i] = sched_domain_topology[i];
 
        /*
         * .. and append 'j' levels of NUMA goodness.
         */
        for (j = 0; j < level; i++, j++) {
                tl[i] = (struct sched_domain_topology_level){
-                       .init = sd_numa_init,
                        .mask = sd_numa_mask,
+                       .sd_flags = cpu_numa_flags,
                        .flags = SDTL_OVERLAP,
                        .numa_level = j,
+                       SD_INIT_NAME(NUMA)
                };
        }
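
The kzalloc() size above reserves room for the counted default entries, the appended NUMA levels, and one extra slot whose NULL ->mask terminates the new table. A small standalone rendition of the same count/copy/append step (plain C, illustrative values):

#include <stdio.h>
#include <stdlib.h>

struct level { const char *name; };	/* stand-in for sched_domain_topology_level */

int main(void)
{
	struct level defaults[] = { { "SMT" }, { "MC" }, { "DIE" }, { NULL } };
	int numa_levels = 2;		/* pretend two unique NUMA distances were found */
	int i, j;

	/* Compute default topology size: count up to the terminator */
	for (i = 0; defaults[i].name; i++)
		;

	/* i default entries + numa_levels NUMA entries + 1 terminator */
	struct level *tl = calloc(i + numa_levels + 1, sizeof(*tl));
	if (!tl)
		return 1;

	/* Copy the default topology bits.. */
	for (i = 0; defaults[i].name; i++)
		tl[i] = defaults[i];

	/* .. and append 'numa_levels' levels of NUMA goodness */
	for (j = 0; j < numa_levels; i++, j++)
		tl[i].name = "NUMA";

	for (i = 0; tl[i].name; i++)	/* calloc() zeroed the terminator slot */
		printf("%d: %s\n", i, tl[i].name);

	free(tl);
	return 0;
}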
 
@@ -6407,7 +6512,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
        struct sched_domain_topology_level *tl;
        int j;
 
-       for (tl = sched_domain_topology; tl->init; tl++) {
+       for_each_sd_topology(tl) {
                struct sd_data *sdd = &tl->data;
 
                sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6418,14 +6523,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                if (!sdd->sg)
                        return -ENOMEM;
 
-               sdd->sgp = alloc_percpu(struct sched_group_power *);
-               if (!sdd->sgp)
+               sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+               if (!sdd->sgc)
                        return -ENOMEM;
 
                for_each_cpu(j, cpu_map) {
                        struct sched_domain *sd;
                        struct sched_group *sg;
-                       struct sched_group_power *sgp;
+                       struct sched_group_capacity *sgc;
 
                        sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
                                        GFP_KERNEL, cpu_to_node(j));
@@ -6443,12 +6548,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                        *per_cpu_ptr(sdd->sg, j) = sg;
 
-                       sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
+                       sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
                                        GFP_KERNEL, cpu_to_node(j));
-                       if (!sgp)
+                       if (!sgc)
                                return -ENOMEM;
 
-                       *per_cpu_ptr(sdd->sgp, j) = sgp;
+                       *per_cpu_ptr(sdd->sgc, j) = sgc;
                }
        }
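
Note the error-handling contract here: __sdt_alloc() may bail out with -ENOMEM part-way through, leaving earlier allocations in place, and relies on the teardown path below tolerating NULL/partial state so the caller can clean up with a single call. A userspace sketch of that pattern (names invented for the example):

#include <stdio.h>
#include <stdlib.h>

#define NR 4

static void *slot[NR];

static int toy_alloc(void)
{
	for (int j = 0; j < NR; j++) {
		slot[j] = malloc(64);
		if (!slot[j])
			return -1;	/* earlier slots stay allocated for toy_free() */
	}
	return 0;
}

static void toy_free(void)
{
	for (int j = 0; j < NR; j++) {
		free(slot[j]);		/* free(NULL) is a no-op */
		slot[j] = NULL;
	}
}

int main(void)
{
	if (toy_alloc())
		fprintf(stderr, "partial allocation, unwinding\n");
	toy_free();			/* safe whether toy_alloc() failed or not */
	return 0;
}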
 
@@ -6460,7 +6565,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
        struct sched_domain_topology_level *tl;
        int j;
 
-       for (tl = sched_domain_topology; tl->init; tl++) {
+       for_each_sd_topology(tl) {
                struct sd_data *sdd = &tl->data;
 
                for_each_cpu(j, cpu_map) {
@@ -6475,24 +6580,23 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
                        if (sdd->sg)
                                kfree(*per_cpu_ptr(sdd->sg, j));
-                       if (sdd->sgp)
-                               kfree(*per_cpu_ptr(sdd->sgp, j));
+                       if (sdd->sgc)
+                               kfree(*per_cpu_ptr(sdd->sgc, j));
                }
                free_percpu(sdd->sd);
                sdd->sd = NULL;
                free_percpu(sdd->sg);
                sdd->sg = NULL;
-               free_percpu(sdd->sgp);
-               sdd->sgp = NULL;
+               free_percpu(sdd->sgc);
+               sdd->sgc = NULL;
        }
 }
 
 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
-               struct s_data *d, const struct cpumask *cpu_map,
-               struct sched_domain_attr *attr, struct sched_domain *child,
-               int cpu)
+               const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+               struct sched_domain *child, int cpu)
 {
-       struct sched_domain *sd = tl->init(tl, cpu);
+       struct sched_domain *sd = sd_init(tl, cpu);
        if (!sd)
                return child;
 
@@ -6501,8 +6605,22 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                sd->level = child->level + 1;
                sched_domain_level_max = max(sched_domain_level_max, sd->level);
                child->parent = sd;
+               sd->child = child;
+
+               if (!cpumask_subset(sched_domain_span(child),
+                                   sched_domain_span(sd))) {
+                       pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+                       pr_err("     the %s domain not a subset of the %s domain\n",
+                                       child->name, sd->name);
+#endif
+                       /* Fixup, ensure @sd has at least @child cpus. */
+                       cpumask_or(sched_domain_span(sd),
+                                  sched_domain_span(sd),
+                                  sched_domain_span(child));
+               }
+
        }
-       sd->child = child;
        set_domain_attribute(sd, attr);
 
        return sd;
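
The new check verifies that the child domain's CPU span is a subset of the parent's and, when an architecture reports an inconsistent topology, widens the parent span instead of crashing. The same logic with a toy 64-bit bitmap standing in for a cpumask (illustrative only):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* toy stand-ins for cpumask_subset()/cpumask_or() on systems with <= 64 CPUs */
static bool mask_subset(uint64_t child, uint64_t parent)
{
	return (child & ~parent) == 0;
}

int main(void)
{
	uint64_t child  = 0x0f;	/* CPUs 0-3 */
	uint64_t parent = 0x33;	/* CPUs 0,1,4,5 - inconsistent: misses CPUs 2,3 */

	if (!mask_subset(child, parent)) {
		fprintf(stderr, "child span is not a subset of the parent span\n");
		/* fixup: ensure the parent has at least the child's CPUs */
		parent |= child;
	}

	printf("parent span now 0x%llx\n", (unsigned long long)parent);
	return 0;
}
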
@@ -6515,7 +6633,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 static int build_sched_domains(const struct cpumask *cpu_map,
                               struct sched_domain_attr *attr)
 {
-       enum s_alloc alloc_state = sa_none;
+       enum s_alloc alloc_state;
        struct sched_domain *sd;
        struct s_data d;
        int i, ret = -ENOMEM;
@@ -6529,18 +6647,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
                struct sched_domain_topology_level *tl;
 
                sd = NULL;
-               for (tl = sched_domain_topology; tl->init; tl++) {
-                       sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+               for_each_sd_topology(tl) {
+                       sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+                       if (tl == sched_domain_topology)
+                               *per_cpu_ptr(d.sd, i) = sd;
                        if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
                                sd->flags |= SD_OVERLAP;
                        if (cpumask_equal(cpu_map, sched_domain_span(sd)))
                                break;
                }
-
-               while (sd->child)
-                       sd = sd->child;
-
-               *per_cpu_ptr(d.sd, i) = sd;
        }
 
        /* Build the groups for the domains */
@@ -6557,14 +6672,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
                }
        }
 
-       /* Calculate CPU power for physical packages and nodes */
+       /* Calculate CPU capacity for physical packages and nodes */
        for (i = nr_cpumask_bits-1; i >= 0; i--) {
                if (!cpumask_test_cpu(i, cpu_map))
                        continue;
 
                for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
                        claim_allocations(i, sd);
-                       init_sched_groups_power(i, sd);
+                       init_sched_groups_capacity(i, sd);
                }
        }
 
@@ -6599,7 +6714,7 @@ static cpumask_var_t fallback_doms;
  * cpu core maps. It is supposed to return 1 if the topology changed
  * or 0 if it stayed the same.
  */
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
 {
        return 0;
 }
@@ -6735,8 +6850,9 @@ match1:
                ;
        }
 
+       n = ndoms_cur;
        if (doms_new == NULL) {
-               ndoms_cur = 0;
+               n = 0;
                doms_new = &fallback_doms;
                cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
@@ -6744,7 +6860,7 @@ match1:
 
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur && !new_topology; j++) {
+               for (j = 0; j < n && !new_topology; j++) {
                        if (cpumask_equal(doms_new[i], doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
@@ -6839,22 +6955,22 @@ void __init sched_init_smp(void)
 
        sched_init_numa();
 
-       get_online_cpus();
+       /*
+        * There's no userspace yet to cause hotplug operations; hence all the
+        * cpu masks are stable and all blatant races in the below code cannot
+        * happen.
+        */
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
-       put_online_cpus();
 
        hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
        hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
        hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
-       /* RT runtime code needs to handle some hotplug events */
-       hotcpu_notifier(update_runtime, 0);
-
        init_hrtick();
 
        /* Move init over to a non-isolated CPU */
@@ -6864,6 +6980,7 @@ void __init sched_init_smp(void)
        free_cpumask_var(non_isolated_cpus);
 
        init_sched_rt_class();
+       init_sched_dl_class();
 }
 #else
 void __init sched_init_smp(void)
@@ -6882,11 +6999,15 @@ int in_sched_functions(unsigned long addr)
 }
 
 #ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in the system belongs to this group at bootup.
+ */
 struct task_group root_task_group;
 LIST_HEAD(task_groups);
 #endif
 
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 
 void __init sched_init(void)
 {
@@ -6923,19 +7044,21 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
                for_each_possible_cpu(i) {
-                       per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                       per_cpu(load_balance_mask, i) = (void *)ptr;
                        ptr += cpumask_size();
                }
 #endif /* CONFIG_CPUMASK_OFFSTACK */
        }
 
+       init_rt_bandwidth(&def_rt_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+       init_dl_bandwidth(&def_dl_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+
 #ifdef CONFIG_SMP
        init_defrootdomain();
 #endif
 
-       init_rt_bandwidth(&def_rt_bandwidth,
-                       global_rt_period(), global_rt_runtime());
-
 #ifdef CONFIG_RT_GROUP_SCHED
        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
@@ -6949,12 +7072,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_CGROUP_CPUACCT
-       root_cpuacct.cpustat = &kernel_cpustat;
-       root_cpuacct.cpuusage = alloc_percpu(u64);
-       /* Too early, not expected to fail */
-       BUG_ON(!root_cpuacct.cpuusage);
-#endif
        for_each_possible_cpu(i) {
                struct rq *rq;
 
@@ -6965,6 +7082,7 @@ void __init sched_init(void)
                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs);
                init_rt_rq(&rq->rt, rq);
+               init_dl_rq(&rq->dl, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6993,7 +7111,6 @@ void __init sched_init(void)
 
                rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
-               INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
@@ -7005,7 +7122,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
-               rq->cpu_power = SCHED_POWER_SCALE;
+               rq->cpu_capacity = SCHED_CAPACITY_SCALE;
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
@@ -7014,13 +7131,17 @@ void __init sched_init(void)
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
+               rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
                rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
                rq->nohz_flags = 0;
 #endif
+#ifdef CONFIG_NO_HZ_FULL
+               rq->last_sched_tick = 0;
+#endif
 #endif
                init_rq_hrtick(rq);
                atomic_set(&rq->nr_iowait, 0);
@@ -7032,10 +7153,6 @@ void __init sched_init(void)
        INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
 
-#ifdef CONFIG_RT_MUTEXES
-       plist_head_init(&init_task.pi_waiters);
-#endif
-
        /*
         * The boot idle thread does lazy MMU switching as well:
         */
@@ -7063,6 +7180,7 @@ void __init sched_init(void)
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
        idle_thread_set_boot_cpu();
+       set_cpu_rq_start_time();
 #endif
        init_sched_fair_class();
 
@@ -7082,7 +7200,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
        static unsigned long prev_jiffy;        /* ratelimiting */
 
        rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
-       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+       if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+            !is_idle_task(current)) ||
            system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -7100,6 +7219,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+       if (!preempt_count_equals(preempt_offset)) {
+               pr_err("Preemption disabled at:");
+               print_ip_sym(current->preempt_disable_ip);
+               pr_cont("\n");
+       }
+#endif
        dump_stack();
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -7109,16 +7235,19 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
        const struct sched_class *prev_class = p->sched_class;
+       struct sched_attr attr = {
+               .sched_policy = SCHED_NORMAL,
+       };
        int old_prio = p->prio;
-       int on_rq;
+       int queued;
 
-       on_rq = p->on_rq;
-       if (on_rq)
+       queued = task_on_rq_queued(p);
+       if (queued)
                dequeue_task(rq, p, 0);
-       __setscheduler(rq, p, SCHED_NORMAL, 0);
-       if (on_rq) {
+       __setscheduler(rq, p, &attr);
+       if (queued) {
                enqueue_task(rq, p, 0);
-               resched_task(rq->curr);
+               resched_curr(rq);
        }
 
        check_class_changed(rq, p, prev_class, old_prio);
@@ -7130,12 +7259,12 @@ void normalize_rt_tasks(void)
        unsigned long flags;
        struct rq *rq;
 
-       read_lock_irqsave(&tasklist_lock, flags);
-       do_each_thread(g, p) {
+       read_lock(&tasklist_lock);
+       for_each_process_thread(g, p) {
                /*
                 * Only normalize user tasks:
                 */
-               if (!p->mm)
+               if (p->flags & PF_KTHREAD)
                        continue;
 
                p->se.exec_start                = 0;
@@ -7145,26 +7274,21 @@ void normalize_rt_tasks(void)
                p->se.statistics.block_start    = 0;
 #endif
 
-               if (!rt_task(p)) {
+               if (!dl_task(p) && !rt_task(p)) {
                        /*
                         * Renice negative nice level userspace
                         * tasks back to 0:
                         */
-                       if (TASK_NICE(p) < 0 && p->mm)
+                       if (task_nice(p) < 0)
                                set_user_nice(p, 0);
                        continue;
                }
 
-               raw_spin_lock(&p->pi_lock);
-               rq = __task_rq_lock(p);
-
+               rq = task_rq_lock(p, &flags);
                normalize_task(rq, p);
-
-               __task_rq_unlock(rq);
-               raw_spin_unlock(&p->pi_lock);
-       } while_each_thread(g, p);
-
-       read_unlock_irqrestore(&tasklist_lock, flags);
+               task_rq_unlock(rq, p, &flags);
+       }
+       read_unlock(&tasklist_lock);
 }
 
 #endif /* CONFIG_MAGIC_SYSRQ */
@@ -7185,6 +7309,8 @@ void normalize_rt_tasks(void)
  * @cpu: the processor in question.
  *
  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
  */
 struct task_struct *curr_task(int cpu)
 {
@@ -7302,52 +7428,46 @@ void sched_offline_group(struct task_group *tg)
 void sched_move_task(struct task_struct *tsk)
 {
        struct task_group *tg;
-       int on_rq, running;
+       int queued, running;
        unsigned long flags;
        struct rq *rq;
 
        rq = task_rq_lock(tsk, &flags);
 
        running = task_current(rq, tsk);
-       on_rq = tsk->on_rq;
+       queued = task_on_rq_queued(tsk);
 
-       if (on_rq)
+       if (queued)
                dequeue_task(rq, tsk, 0);
        if (unlikely(running))
-               tsk->sched_class->put_prev_task(rq, tsk);
+               put_prev_task(rq, tsk);
 
-       tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
-                               lockdep_is_held(&tsk->sighand->siglock)),
+       /*
+        * All callers are synchronized by task_rq_lock(), so RCU protection
+        * would be pointless here. Thus, we pass "true" to task_css_check()
+        * to prevent lockdep warnings.
+        */
+       tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
                          struct task_group, css);
        tg = autogroup_task_group(tsk, tg);
        tsk->sched_task_group = tg;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk, on_rq);
+               tsk->sched_class->task_move_group(tsk, queued);
        else
 #endif
                set_task_rq(tsk, task_cpu(tsk));
 
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
-       if (on_rq)
+       if (queued)
                enqueue_task(rq, tsk, 0);
 
        task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-       if (runtime == RUNTIME_INF)
-               return 1ULL << 20;
-
-       return div64_u64(runtime << 20, period);
-}
-#endif
-
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
@@ -7359,10 +7479,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
        struct task_struct *g, *p;
 
-       do_each_thread(g, p) {
-               if (rt_task(p) && task_rq(p)->rt.tg == tg)
+       for_each_process_thread(g, p) {
+               if (rt_task(p) && task_group(p) == tg)
                        return 1;
-       } while_each_thread(g, p);
+       }
 
        return 0;
 }
@@ -7476,7 +7596,7 @@ unlock:
        return err;
 }
 
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
        u64 rt_runtime, rt_period;
 
@@ -7488,7 +7608,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
 {
        u64 rt_runtime_us;
 
@@ -7500,7 +7620,7 @@ long sched_group_rt_runtime(struct task_group *tg)
        return rt_runtime_us;
 }
 
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 {
        u64 rt_runtime, rt_period;
 
@@ -7513,7 +7633,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
 {
        u64 rt_period_us;
 
@@ -7521,24 +7641,13 @@ long sched_group_rt_period(struct task_group *tg)
        do_div(rt_period_us, NSEC_PER_USEC);
        return rt_period_us;
 }
+#endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_RT_GROUP_SCHED
 static int sched_rt_global_constraints(void)
 {
-       u64 runtime, period;
        int ret = 0;
 
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       runtime = global_rt_runtime();
-       period = global_rt_period();
-
-       /*
-        * Sanity check on the sysctl variables.
-        */
-       if (runtime > period && runtime != RUNTIME_INF)
-               return -EINVAL;
-
        mutex_lock(&rt_constraints_mutex);
        read_lock(&tasklist_lock);
        ret = __rt_schedulable(NULL, 0, 0);
@@ -7548,7 +7657,7 @@ static int sched_rt_global_constraints(void)
        return ret;
 }
 
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
        /* Don't accept realtime tasks when there is no way for them to run */
        if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7561,17 +7670,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
        unsigned long flags;
-       int i;
-
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       /*
-        * There's always some RT tasks in the root group
-        * -- migration, kstopmachine etc..
-        */
-       if (sysctl_sched_rt_runtime == 0)
-               return -EBUSY;
+       int i, ret = 0;
 
        raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
        for_each_possible_cpu(i) {
@@ -7583,36 +7682,99 @@ static int sched_rt_global_constraints(void)
        }
        raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-       return 0;
+       return ret;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-int sched_rr_handler(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp,
-               loff_t *ppos)
+static int sched_dl_global_constraints(void)
 {
-       int ret;
-       static DEFINE_MUTEX(mutex);
+       u64 runtime = global_rt_runtime();
+       u64 period = global_rt_period();
+       u64 new_bw = to_ratio(period, runtime);
+       struct dl_bw *dl_b;
+       int cpu, ret = 0;
+       unsigned long flags;
 
-       mutex_lock(&mutex);
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
-       /* make sure that internally we keep jiffies */
-       /* also, writing zero resets timeslice to default */
-       if (!ret && write) {
-               sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-                       RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+       /*
+        * Here we want to check that the bandwidth is not being set to a
+        * value smaller than the currently allocated bandwidth in any of
+        * the root_domains.
+        *
+        * FIXME: Cycling over all the CPUs is overkill, but simpler than
+        * cycling over the root_domains... Discussion of different/better
+        * solutions is welcome!
+        */
+       for_each_possible_cpu(cpu) {
+               rcu_read_lock_sched();
+               dl_b = dl_bw_of(cpu);
+
+               raw_spin_lock_irqsave(&dl_b->lock, flags);
+               if (new_bw < dl_b->total_bw)
+                       ret = -EBUSY;
+               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+               rcu_read_unlock_sched();
+
+               if (ret)
+                       break;
        }
-       mutex_unlock(&mutex);
+
        return ret;
 }
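
The admission decision rests on the to_ratio() helper whose old copy is removed higher up in this patch: bandwidth is expressed as runtime << 20 / period, i.e. a Q20 fixed-point fraction of one CPU. A quick standalone check of that arithmetic and of the new_bw < total_bw comparison (the input values are illustrative):

#include <stdio.h>
#include <stdint.h>

/* same formula as the to_ratio() helper deleted earlier in this patch */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t period  = 1000000000ULL;	/* 1 s, in ns */
	uint64_t runtime =  950000000ULL;	/* 950 ms, in ns */
	uint64_t new_bw  = to_ratio(period, runtime);

	printf("new_bw = %llu (%.3f of one CPU)\n",
	       (unsigned long long)new_bw, new_bw / (double)(1 << 20));

	/* the per-root-domain check: refuse to shrink below what is reserved */
	uint64_t total_bw = 900000;		/* hypothetical dl_b->total_bw */
	printf("%s\n", new_bw < total_bw ? "-EBUSY" : "ok");

	return 0;
}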
 
+static void sched_dl_do_global(void)
+{
+       u64 new_bw = -1;
+       struct dl_bw *dl_b;
+       int cpu;
+       unsigned long flags;
+
+       def_dl_bandwidth.dl_period = global_rt_period();
+       def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+       if (global_rt_runtime() != RUNTIME_INF)
+               new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+       /*
+        * FIXME: As above...
+        */
+       for_each_possible_cpu(cpu) {
+               rcu_read_lock_sched();
+               dl_b = dl_bw_of(cpu);
+
+               raw_spin_lock_irqsave(&dl_b->lock, flags);
+               dl_b->bw = new_bw;
+               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+               rcu_read_unlock_sched();
+       }
+}
+
+static int sched_rt_global_validate(void)
+{
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
+
+       if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+               (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+               return -EINVAL;
+
+       return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+       def_rt_bandwidth.rt_runtime = global_rt_runtime();
+       def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
 int sched_rt_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int ret;
        int old_period, old_runtime;
        static DEFINE_MUTEX(mutex);
+       int ret;
 
        mutex_lock(&mutex);
        old_period = sysctl_sched_rt_period;
@@ -7621,40 +7783,68 @@ int sched_rt_handler(struct ctl_table *table, int write,
        ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
        if (!ret && write) {
+               ret = sched_rt_global_validate();
+               if (ret)
+                       goto undo;
+
                ret = sched_rt_global_constraints();
-               if (ret) {
-                       sysctl_sched_rt_period = old_period;
-                       sysctl_sched_rt_runtime = old_runtime;
-               } else {
-                       def_rt_bandwidth.rt_runtime = global_rt_runtime();
-                       def_rt_bandwidth.rt_period =
-                               ns_to_ktime(global_rt_period());
-               }
+               if (ret)
+                       goto undo;
+
+               ret = sched_dl_global_constraints();
+               if (ret)
+                       goto undo;
+
+               sched_rt_do_global();
+               sched_dl_do_global();
+       }
+       if (0) {
+undo:
+               sysctl_sched_rt_period = old_period;
+               sysctl_sched_rt_runtime = old_runtime;
        }
        mutex_unlock(&mutex);
 
        return ret;
 }
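
The `if (0) { undo: ... }` construct above is a compact rollback idiom: the block is dead code on the success path and is only entered through the goto when one of the validation steps fails after the sysctl values have already been written. A standalone illustration of the control flow (all names here are invented):

#include <stdio.h>

static int old_value, value = 42;

static int validate(int v)
{
	return v < 0 ? -1 : 0;		/* pretend negative means -EINVAL */
}

static int handler(int new_value)
{
	int ret;

	old_value = value;
	value = new_value;		/* the "write" has already happened */

	ret = validate(value);
	if (ret)
		goto undo;

	printf("committed %d\n", value);

	if (0) {
undo:					/* reachable only via the goto above */
		value = old_value;
		printf("rolled back to %d\n", value);
	}
	return ret;
}

int main(void)
{
	handler(7);			/* succeeds */
	handler(-3);			/* fails validation, rolls back to 7 */
	return 0;
}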
 
+int sched_rr_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret;
+       static DEFINE_MUTEX(mutex);
+
+       mutex_lock(&mutex);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       /*
+        * Make sure that internally we keep jiffies.
+        * Also, writing zero resets the timeslice to default.
+        */
+       if (!ret && write) {
+               sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+                       RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+       }
+       mutex_unlock(&mutex);
+       return ret;
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
-       return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
-                           struct task_group, css);
+       return css ? container_of(css, struct task_group, css) : NULL;
 }
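
css_tg() is the container_of() pattern: from a pointer to the cgroup_subsys_state embedded in a task_group, recover the task_group itself. A minimal userspace rendition of the same trick, with simplified stand-in types and a simplified container_of():

#include <stdio.h>
#include <stddef.h>

/* simplified container_of(): step back from the member to the enclosing struct */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css { int id; };			/* stand-in for cgroup_subsys_state */

struct task_group {			/* stand-in for the real task_group */
	long shares;
	struct css css;			/* embedded, as in the kernel struct */
};

static struct task_group *css_tg(struct css *css)
{
	return css ? container_of(css, struct task_group, css) : NULL;
}

int main(void)
{
	struct task_group tg = { .shares = 1024, .css = { .id = 1 } };

	printf("recovered shares = %ld\n", css_tg(&tg.css)->shares);
	return 0;
}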
 
-static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
-       struct task_group *tg, *parent;
+       struct task_group *parent = css_tg(parent_css);
+       struct task_group *tg;
 
-       if (!cgrp->parent) {
+       if (!parent) {
                /* This is early initialization for the top cgroup */
                return &root_task_group.css;
        }
 
-       parent = cgroup_tg(cgrp->parent);
        tg = sched_create_group(parent);
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
@@ -7662,41 +7852,43 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
        return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
-       struct task_group *tg = cgroup_tg(cgrp);
-       struct task_group *parent;
+       struct task_group *tg = css_tg(css);
+       struct task_group *parent = css_tg(css->parent);
 
-       if (!cgrp->parent)
-               return 0;
-
-       parent = cgroup_tg(cgrp->parent);
-       sched_online_group(tg, parent);
+       if (parent)
+               sched_online_group(tg, parent);
        return 0;
 }
 
-static void cpu_cgroup_css_free(struct cgroup *cgrp)
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
-       struct task_group *tg = cgroup_tg(cgrp);
+       struct task_group *tg = css_tg(css);
 
        sched_destroy_group(tg);
 }
 
-static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
-       struct task_group *tg = cgroup_tg(cgrp);
+       struct task_group *tg = css_tg(css);
 
        sched_offline_group(tg);
 }
 
-static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+       sched_move_task(task);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
 {
        struct task_struct *task;
 
-       cgroup_taskset_for_each(task, cgrp, tset) {
+       cgroup_taskset_for_each(task, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
-               if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+               if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
 #else
                /* We don't support RT-tasks being in separate groups */
@@ -7707,18 +7899,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
        return 0;
 }
 
-static void cpu_cgroup_attach(struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
                              struct cgroup_taskset *tset)
 {
        struct task_struct *task;
 
-       cgroup_taskset_for_each(task, cgrp, tset)
+       cgroup_taskset_for_each(task, tset)
                sched_move_task(task);
 }
 
-static void
-cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
-               struct task_struct *task)
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+                           struct cgroup_subsys_state *old_css,
+                           struct task_struct *task)
 {
        /*
         * cgroup_exit() is called in the copy_process() failure path.
@@ -7732,15 +7924,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
-                               u64 shareval)
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+                               struct cftype *cftype, u64 shareval)
 {
-       return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+       return sched_group_set_shares(css_tg(css), scale_load(shareval));
 }
 
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
 {
-       struct task_group *tg = cgroup_tg(cgrp);
+       struct task_group *tg = css_tg(css);
 
        return (u64) scale_load_down(tg->shares);
 }
@@ -7777,6 +7970,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        if (period > max_cfs_quota_period)
                return -EINVAL;
 
+       /*
+        * Prevent race between setting of cfs_rq->runtime_enabled and
+        * unthrottle_offline_cfs_rqs().
+        */
+       get_online_cpus();
        mutex_lock(&cfs_constraints_mutex);
        ret = __cfs_schedulable(tg, period, quota);
        if (ret)
@@ -7784,7 +7982,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
        runtime_enabled = quota != RUNTIME_INF;
        runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
-       account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+       /*
+        * If we need to toggle cfs_bandwidth_used, off->on must occur
+        * before making related changes, and on->off must occur afterwards
+        */
+       if (runtime_enabled && !runtime_was_enabled)
+               cfs_bandwidth_usage_inc();
        raw_spin_lock_irq(&cfs_b->lock);
        cfs_b->period = ns_to_ktime(period);
        cfs_b->quota = quota;
@@ -7793,12 +7996,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        /* restart the period timer (if active) to handle new period expiry */
        if (runtime_enabled && cfs_b->timer_active) {
                /* force a reprogram */
-               cfs_b->timer_active = 0;
-               __start_cfs_bandwidth(cfs_b);
+               __start_cfs_bandwidth(cfs_b, true);
        }
        raw_spin_unlock_irq(&cfs_b->lock);
 
-       for_each_possible_cpu(i) {
+       for_each_online_cpu(i) {
                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                struct rq *rq = cfs_rq->rq;
 
@@ -7810,8 +8012,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
                        unthrottle_cfs_rq(cfs_rq);
                raw_spin_unlock_irq(&rq->lock);
        }
+       if (runtime_was_enabled && !runtime_enabled)
+               cfs_bandwidth_usage_dec();
 out_unlock:
        mutex_unlock(&cfs_constraints_mutex);
+       put_online_cpus();
 
        return ret;
 }
@@ -7862,26 +8067,28 @@ long tg_get_cfs_period(struct task_group *tg)
        return cfs_period_us;
 }
 
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+                                 struct cftype *cft)
 {
-       return tg_get_cfs_quota(cgroup_tg(cgrp));
+       return tg_get_cfs_quota(css_tg(css));
 }
 
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
-                               s64 cfs_quota_us)
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+                                  struct cftype *cftype, s64 cfs_quota_us)
 {
-       return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+       return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
 }
 
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+                                  struct cftype *cft)
 {
-       return tg_get_cfs_period(cgroup_tg(cgrp));
+       return tg_get_cfs_period(css_tg(css));
 }
 
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
-                               u64 cfs_period_us)
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+                                   struct cftype *cftype, u64 cfs_period_us)
 {
-       return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+       return tg_set_cfs_period(css_tg(css), cfs_period_us);
 }
 
 struct cfs_schedulable_data {
@@ -7925,7 +8132,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
                struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
 
                quota = normalize_cfs_quota(tg, d);
-               parent_quota = parent_b->hierarchal_quota;
+               parent_quota = parent_b->hierarchical_quota;
 
                /*
                 * ensure max(child_quota) <= parent_quota, inherit when no
@@ -7936,7 +8143,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
                else if (parent_quota != RUNTIME_INF && quota > parent_quota)
                        return -EINVAL;
        }
-       cfs_b->hierarchal_quota = quota;
+       cfs_b->hierarchical_quota = quota;
 
        return 0;
 }
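
tg_cfs_schedulable_down() walks the group hierarchy enforcing that a child's effective quota never exceeds its parent's, and that an unlimited child simply inherits the parent's value. A standalone version of that check (the Q20 normalization here mirrors the to_ratio()-style fixed point used elsewhere in this file; normalize_cfs_quota() itself is outside this hunk, and the numbers are illustrative):

#include <stdio.h>
#include <stdint.h>

#define RUNTIME_INF	((uint64_t)~0ULL)

/* reduce quota/period to a Q20 ratio so groups with different periods compare */
static uint64_t normalize(uint64_t period_us, uint64_t quota_us)
{
	if (quota_us == RUNTIME_INF)
		return RUNTIME_INF;
	return (quota_us << 20) / period_us;
}

/* the invariant from tg_cfs_schedulable_down(): max(child) <= parent */
static int check_child(uint64_t parent_ratio, uint64_t *child_ratio)
{
	if (*child_ratio == RUNTIME_INF)
		*child_ratio = parent_ratio;	/* inherit when no limit is set */
	else if (parent_ratio != RUNTIME_INF && *child_ratio > parent_ratio)
		return -1;			/* -EINVAL in the kernel */
	return 0;
}

int main(void)
{
	uint64_t parent = normalize(100000,  50000);	/* 50% of one CPU */
	uint64_t child  = normalize(200000, 150000);	/* 75% - must be rejected */

	printf("check: %s\n", check_child(parent, &child) ? "-EINVAL" : "ok");
	return 0;
}
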
@@ -7962,15 +8169,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
        return ret;
 }
 
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
-               struct cgroup_map_cb *cb)
+static int cpu_stats_show(struct seq_file *sf, void *v)
 {
-       struct task_group *tg = cgroup_tg(cgrp);
+       struct task_group *tg = css_tg(seq_css(sf));
        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 
-       cb->fill(cb, "nr_periods", cfs_b->nr_periods);
-       cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
-       cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+       seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+       seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+       seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
 
        return 0;
 }
@@ -7978,26 +8184,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
-                               s64 val)
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+                               struct cftype *cft, s64 val)
 {
-       return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+       return sched_group_set_rt_runtime(css_tg(css), val);
 }
 
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
 {
-       return sched_group_rt_runtime(cgroup_tg(cgrp));
+       return sched_group_rt_runtime(css_tg(css));
 }
 
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
-               u64 rt_period_us)
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+                                   struct cftype *cftype, u64 rt_period_us)
 {
-       return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+       return sched_group_set_rt_period(css_tg(css), rt_period_us);
 }
 
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+                                  struct cftype *cft)
 {
-       return sched_group_rt_period(cgroup_tg(cgrp));
+       return sched_group_rt_period(css_tg(css));
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
@@ -8022,7 +8230,7 @@ static struct cftype cpu_files[] = {
        },
        {
                .name = "stat",
-               .read_map = cpu_stats_show,
+               .seq_show = cpu_stats_show,
        },
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8040,242 +8248,21 @@ static struct cftype cpu_files[] = {
        { }     /* terminate */
 };
 
-struct cgroup_subsys cpu_cgroup_subsys = {
-       .name           = "cpu",
+struct cgroup_subsys cpu_cgrp_subsys = {
        .css_alloc      = cpu_cgroup_css_alloc,
        .css_free       = cpu_cgroup_css_free,
        .css_online     = cpu_cgroup_css_online,
        .css_offline    = cpu_cgroup_css_offline,
+       .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
-       .subsys_id      = cpu_cgroup_subsys_id,
-       .base_cftypes   = cpu_files,
+       .legacy_cftypes = cpu_files,
        .early_init     = 1,
 };
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-struct cpuacct root_cpuacct;
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
-{
-       struct cpuacct *ca;
-
-       if (!cgrp->parent)
-               return &root_cpuacct.css;
-
-       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-       if (!ca)
-               goto out;
-
-       ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage)
-               goto out_free_ca;
-
-       ca->cpustat = alloc_percpu(struct kernel_cpustat);
-       if (!ca->cpustat)
-               goto out_free_cpuusage;
-
-       return &ca->css;
-
-out_free_cpuusage:
-       free_percpu(ca->cpuusage);
-out_free_ca:
-       kfree(ca);
-out:
-       return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-
-       free_percpu(ca->cpustat);
-       free_percpu(ca->cpuusage);
-       kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-       u64 data;
-
-#ifndef CONFIG_64BIT
-       /*
-        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
-        */
-       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       data = *cpuusage;
-       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-       data = *cpuusage;
-#endif
-
-       return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
-       /*
-        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
-        */
-       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       *cpuusage = val;
-       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-       *cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       u64 totalcpuusage = 0;
-       int i;
-
-       for_each_present_cpu(i)
-               totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
-       return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
-                                                               u64 reset)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       int err = 0;
-       int i;
-
-       if (reset) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       for_each_present_cpu(i)
-               cpuacct_cpuusage_write(ca, i, 0);
-
-out:
-       return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
-                                  struct seq_file *m)
-{
-       struct cpuacct *ca = cgroup_ca(cgroup);
-       u64 percpu;
-       int i;
-
-       for_each_present_cpu(i) {
-               percpu = cpuacct_cpuusage_read(ca, i);
-               seq_printf(m, "%llu ", (unsigned long long) percpu);
-       }
-       seq_printf(m, "\n");
-       return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
-       [CPUACCT_STAT_USER] = "user",
-       [CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-                             struct cgroup_map_cb *cb)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       int cpu;
-       s64 val = 0;
-
-       for_each_online_cpu(cpu) {
-               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-               val += kcpustat->cpustat[CPUTIME_USER];
-               val += kcpustat->cpustat[CPUTIME_NICE];
-       }
-       val = cputime64_to_clock_t(val);
-       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
-       val = 0;
-       for_each_online_cpu(cpu) {
-               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-               val += kcpustat->cpustat[CPUTIME_SYSTEM];
-               val += kcpustat->cpustat[CPUTIME_IRQ];
-               val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
-       }
-
-       val = cputime64_to_clock_t(val);
-       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
-       return 0;
-}
-
-static struct cftype files[] = {
-       {
-               .name = "usage",
-               .read_u64 = cpuusage_read,
-               .write_u64 = cpuusage_write,
-       },
-       {
-               .name = "usage_percpu",
-               .read_seq_string = cpuacct_percpu_seq_read,
-       },
-       {
-               .name = "stat",
-               .read_map = cpuacct_stats_show,
-       },
-       { }     /* terminate */
-};
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-       struct cpuacct *ca;
-       int cpu;
-
-       if (unlikely(!cpuacct_subsys.active))
-               return;
-
-       cpu = task_cpu(tsk);
-
-       rcu_read_lock();
-
-       ca = task_ca(tsk);
-
-       for (; ca; ca = parent_ca(ca)) {
-               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-               *cpuusage += cputime;
-       }
-
-       rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
-       .name = "cpuacct",
-       .css_alloc = cpuacct_css_alloc,
-       .css_free = cpuacct_css_free,
-       .subsys_id = cpuacct_subsys_id,
-       .base_cftypes = files,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
-
 void dump_cpu_task(int cpu)
 {
        pr_info("Task dump for CPU %d:\n", cpu);