Merge tag 'v4.1' into p/abusse/merge_upgrade
[projects/modsched/linux.git] / kernel/sched/cfs/rt.c
index 127a2c4..575da76 100644
@@ -6,6 +6,7 @@
 #include "sched.h"
 
 #include <linux/slab.h>
+#include <linux/irq_work.h>
 
 int sched_rr_timeslice = RR_TIMESLICE;
 
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
        raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
-void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+#ifdef CONFIG_SMP
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
 {
        struct rt_prio_array *array;
        int i;
@@ -78,7 +83,16 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
        plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+       rt_rq->push_flags = 0;
+       rt_rq->push_cpu = nr_cpu_ids;
+       raw_spin_lock_init(&rt_rq->push_lock);
+       init_irq_work(&rt_rq->push_work, push_irq_work_func);
 #endif
+#endif /* CONFIG_SMP */
+       /* We start in dequeued state, because no RT tasks are queued */
+       rt_rq->rt_queued = 0;
 
        rt_rq->rt_time = 0;
        rt_rq->rt_throttled = 0;
@@ -112,6 +126,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
        return rt_se->rt_rq;
 }
 
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_se->rt_rq;
+
+       return rt_rq->rq;
+}
+
 void free_rt_sched_group(struct task_group *tg)
 {
        int i;
@@ -184,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                if (!rt_se)
                        goto err_free_rq;
 
-               init_rt_rq(rt_rq, cpu_rq(i));
+               init_rt_rq(rt_rq);
                rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
        }
@@ -211,10 +232,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
        return container_of(rt_rq, struct rq, rt);
 }
 
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 {
        struct task_struct *p = rt_task_of(rt_se);
-       struct rq *rq = task_rq(p);
+
+       return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+       struct rq *rq = rq_of_rt_se(rt_se);
 
        return &rq->rt;
 }
@@ -229,6 +256,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #ifdef CONFIG_SMP
 
+static int pull_rt_task(struct rq *this_rq);
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+       /* Try to pull RT tasks here if we lower this rq's prio */
+       return rq->rt.highest_prio.curr > prev->prio;
+}
+
 static inline int rt_overloaded(struct rq *rq)
 {
        return atomic_read(&rq->rd->rto_count);
@@ -246,8 +281,10 @@ static inline void rt_set_overload(struct rq *rq)
         * if we should look at the mask. It would be a shame
         * if we looked at the mask, but the mask was not
         * updated yet.
+        *
+        * Matched by the barrier in pull_rt_task().
         */
-       wmb();
+       smp_wmb();
        atomic_inc(&rq->rd->rto_count);
 }
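
The wmb/inc pair above, together with the matching rmb in pull_rt_task() further down, forms a publish/consume protocol on the overload state: the bit in rd->rto_mask is made visible before rd->rto_count is bumped, so any reader that sees a non-zero count is guaranteed to also see the mask bit. A minimal userspace model of that contract, using C11 fences as a conservative stand-in for smp_wmb()/smp_rmb() (toy names, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_ulong toy_rto_mask;	/* stands in for rd->rto_mask  */
static atomic_int   toy_rto_count;	/* stands in for rd->rto_count */

static void toy_set_overload(int cpu)
{
	atomic_fetch_or_explicit(&toy_rto_mask, 1UL << cpu,
				 memory_order_relaxed);
	/* publish the mask bit before the count (models smp_wmb()) */
	atomic_thread_fence(memory_order_release);
	atomic_fetch_add_explicit(&toy_rto_count, 1, memory_order_relaxed);
}

static bool toy_sees_overload(int cpu)
{
	if (!atomic_load_explicit(&toy_rto_count, memory_order_relaxed))
		return false;
	/* count seen => the mask bit must be visible too (models smp_rmb()) */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&toy_rto_mask, memory_order_relaxed)
		& (1UL << cpu);
}
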
 
@@ -313,6 +350,15 @@ static inline int has_pushable_tasks(struct rq *rq)
        return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
+static inline void set_post_schedule(struct rq *rq)
+{
+       /*
+        * We detect this state here so that we can avoid taking the RQ
+        * lock again later if there is no need to push
+        */
+       rq->post_schedule = has_pushable_tasks(rq);
+}
+
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 {
        plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
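
The post_schedule flag set above is consumed after the context switch; when nothing is pushable the rq lock is never retaken. A simplified sketch of that consumer side, roughly what core.c's post_schedule() does in this kernel generation (an approximation for illustration, not a verbatim copy):

static inline void post_schedule(struct rq *rq)
{
	if (rq->post_schedule) {
		unsigned long flags;

		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->curr->sched_class->post_schedule)
			rq->curr->sched_class->post_schedule(rq);
		raw_spin_unlock_irqrestore(&rq->lock, flags);

		rq->post_schedule = 0;
	}
}
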
@@ -357,8 +403,24 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 }
 
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+       return false;
+}
+
+static inline int pull_rt_task(struct rq *this_rq)
+{
+       return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
        return !list_empty(&rt_se->run_list);
@@ -399,20 +461,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
                (iter = next_task_group(iter)) &&                       \
                (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-       list_add_rcu(&rt_rq->leaf_rt_rq_list,
-                       &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-       list_del_rcu(&rt_rq->leaf_rt_rq_list);
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
-       list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-
 #define for_each_sched_rt_entity(rt_se) \
        for (; rt_se; rt_se = rt_se->parent)
 
@@ -427,17 +475,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+       struct rq *rq = rq_of_rt_rq(rt_rq);
        struct sched_rt_entity *rt_se;
 
-       int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+       int cpu = cpu_of(rq);
 
        rt_se = rt_rq->tg->rt_se[cpu];
 
        if (rt_rq->rt_nr_running) {
-               if (rt_se && !on_rt_rq(rt_se))
+               if (!rt_se)
+                       enqueue_top_rt_rq(rt_rq);
+               else if (!on_rt_rq(rt_se))
                        enqueue_rt_entity(rt_se, false);
+
                if (rt_rq->highest_prio.curr < curr->prio)
-                       resched_task(curr);
+                       resched_curr(rq);
        }
 }
 
@@ -448,7 +500,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
        rt_se = rt_rq->tg->rt_se[cpu];
 
-       if (rt_se && on_rt_rq(rt_se))
+       if (!rt_se)
+               dequeue_top_rt_rq(rt_rq);
+       else if (on_rt_rq(rt_se))
                dequeue_rt_entity(rt_se);
 }
 
@@ -472,7 +526,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
 #ifdef CONFIG_SMP
 static inline const struct cpumask *sched_rt_period_mask(void)
 {
-       return cpu_rq(smp_processor_id())->rd->span;
+       return this_rq()->rd->span;
 }
 #else
 static inline const struct cpumask *sched_rt_period_mask(void)
@@ -509,17 +563,6 @@ typedef struct rt_rq *rt_rq_iter_t;
 #define for_each_rt_rq(rt_rq, iter, rq) \
        for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
-       for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
 #define for_each_sched_rt_entity(rt_se) \
        for (; rt_se; rt_se = NULL)
 
@@ -530,12 +573,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 
 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
-       if (rt_rq->rt_nr_running)
-               resched_task(rq_of_rt_rq(rt_rq)->curr);
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       if (!rt_rq->rt_nr_running)
+               return;
+
+       enqueue_top_rt_rq(rt_rq);
+       resched_curr(rq);
 }
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
+       dequeue_top_rt_rq(rt_rq);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -561,6 +610,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+       struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+       return (hrtimer_active(&rt_b->rt_period_timer) ||
+               rt_rq->rt_time < rt_b->rt_runtime);
+}
+
 #ifdef CONFIG_SMP
 /*
  * We ran out of runtime, see if we can borrow some from our neighbours.
@@ -696,16 +753,10 @@ balanced:
                rt_rq->rt_throttled = 0;
                raw_spin_unlock(&rt_rq->rt_runtime_lock);
                raw_spin_unlock(&rt_b->rt_runtime_lock);
-       }
-}
 
-static void disable_runtime(struct rq *rq)
-{
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __disable_runtime(rq);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+               /* Make rt_rq available for pick_next_task() */
+               sched_rt_rq_enqueue(rt_rq);
+       }
 }
 
 static void __enable_runtime(struct rq *rq)
@@ -732,37 +783,6 @@ static void __enable_runtime(struct rq *rq)
        }
 }
 
-static void enable_runtime(struct rq *rq)
-{
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __enable_runtime(rq);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-       int cpu = (int)(long)hcpu;
-
-       switch (action) {
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               disable_runtime(cpu_rq(cpu));
-               return NOTIFY_OK;
-
-       case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               enable_runtime(cpu_rq(cpu));
-               return NOTIFY_OK;
-
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
 static int balance_runtime(struct rt_rq *rt_rq)
 {
        int more = 0;
@@ -823,11 +843,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                                enqueue = 1;
 
                                /*
-                                * Force a clock update if the CPU was idle,
-                                * lest wakeup -> unthrottle time accumulate.
+                                * When we're idle and a woken (rt) task is
+                                * throttled, check_preempt_curr() will set
+                                * skip_update and the time between the wakeup
+                                * and this unthrottle will get accounted as
+                                * 'runtime'.
                                 */
                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-                                       rq->skip_clock_update = -1;
+                                       rq_clock_skip_update(rq, false);
                        }
                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
                                idle = 0;
@@ -886,14 +909,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
                 * but accrue some time due to boosting.
                 */
                if (likely(rt_b->rt_runtime)) {
-                       static bool once = false;
-
                        rt_rq->rt_throttled = 1;
-
-                       if (!once) {
-                               once = true;
-                               printk_sched("sched: RT throttling activated\n");
-                       }
+                       printk_deferred_once("sched: RT throttling activated\n");
                } else {
                        /*
                         * In case we did anyway, make it go away,
@@ -920,13 +937,12 @@ static void update_curr_rt(struct rq *rq)
 {
        struct task_struct *curr = rq->curr;
        struct sched_rt_entity *rt_se = &curr->rt;
-       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        u64 delta_exec;
 
        if (curr->sched_class != &rt_sched_class)
                return;
 
-       delta_exec = rq->clock_task - curr->se.exec_start;
+       delta_exec = rq_clock_task(rq) - curr->se.exec_start;
        if (unlikely((s64)delta_exec <= 0))
                return;
 
@@ -936,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
        curr->se.sum_exec_runtime += delta_exec;
        account_group_exec_runtime(curr, delta_exec);
 
-       curr->se.exec_start = rq->clock_task;
+       curr->se.exec_start = rq_clock_task(rq);
        cpuacct_charge(curr, delta_exec);
 
        sched_rt_avg_update(rq, delta_exec);
@@ -945,18 +961,50 @@ static void update_curr_rt(struct rq *rq)
                return;
 
        for_each_sched_rt_entity(rt_se) {
-               rt_rq = rt_rq_of_se(rt_se);
+               struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 
                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
                        raw_spin_lock(&rt_rq->rt_runtime_lock);
                        rt_rq->rt_time += delta_exec;
                        if (sched_rt_runtime_exceeded(rt_rq))
-                               resched_task(curr);
+                               resched_curr(rq);
                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
                }
        }
 }
 
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       BUG_ON(&rq->rt != rt_rq);
+
+       if (!rt_rq->rt_queued)
+               return;
+
+       BUG_ON(!rq->nr_running);
+
+       sub_nr_running(rq, rt_rq->rt_nr_running);
+       rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       BUG_ON(&rq->rt != rt_rq);
+
+       if (rt_rq->rt_queued)
+               return;
+       if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+               return;
+
+       add_nr_running(rq, rt_rq->rt_nr_running);
+       rt_rq->rt_queued = 1;
+}
+
 #if defined CONFIG_SMP
 
 static void
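
dequeue_top_rt_rq()/enqueue_top_rt_rq() above keep rq->nr_running in sync with the top-level rt_rq across throttle/unthrottle; the rt_queued flag guarantees the RT task count is added and subtracted exactly once. A self-contained toy model of that bookkeeping (toy types and names, not kernel code):

#include <assert.h>

struct toy_rt_rq { unsigned int rt_nr_running; int rt_queued; int throttled; };
struct toy_rq    { unsigned int nr_running; struct toy_rt_rq rt; };

static void toy_dequeue_top(struct toy_rq *rq)
{
	if (!rq->rt.rt_queued)
		return;
	assert(rq->nr_running >= rq->rt.rt_nr_running);
	rq->nr_running -= rq->rt.rt_nr_running;
	rq->rt.rt_queued = 0;
}

static void toy_enqueue_top(struct toy_rq *rq)
{
	if (rq->rt.rt_queued)
		return;
	if (rq->rt.throttled || !rq->rt.rt_nr_running)
		return;
	rq->nr_running += rq->rt.rt_nr_running;
	rq->rt.rt_queued = 1;
}

int main(void)
{
	struct toy_rq rq = { .rt = { .rt_nr_running = 3 } };

	toy_enqueue_top(&rq);		/* 3 RT tasks become runnable       */
	rq.rt.throttled = 1;
	toy_dequeue_top(&rq);		/* throttled: stop counting them    */
	assert(rq.nr_running == 0);
	rq.rt.throttled = 0;
	toy_enqueue_top(&rq);		/* unthrottled: counted again, once */
	assert(rq.nr_running == 3);
	return 0;
}
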
@@ -964,6 +1012,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
        struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+       /*
+        * Change rq's cpupri only if rt_rq is the top queue.
+        */
+       if (&rq->rt != rt_rq)
+               return;
+#endif
        if (rq->online && prio < prev_prio)
                cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 }
@@ -973,6 +1028,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
        struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+       /*
+        * Change rq's cpupri only if rt_rq is the top queue.
+        */
+       if (&rq->rt != rt_rq)
+               return;
+#endif
        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
                cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
@@ -1065,13 +1127,24 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+       if (group_rq)
+               return group_rq->rt_nr_running;
+       else
+               return 1;
+}
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
        int prio = rt_se_prio(rt_se);
 
        WARN_ON(!rt_prio(prio));
-       rt_rq->rt_nr_running++;
+       rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
 
        inc_rt_prio(rt_rq, prio);
        inc_rt_migration(rt_se, rt_rq);
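
rt_se_nr_running() exists because, with RT group scheduling, one sched_rt_entity can represent an entire child runqueue; when that entity is enqueued it must contribute the child's whole task count, not 1, so that the root rt_nr_running (and, through enqueue_top_rt_rq(), rq->nr_running) reflects the real number of tasks. A toy illustration (hypothetical types, not kernel code):

struct toy_group  { unsigned int nr_running; };
struct toy_entity { struct toy_group *group; };	/* group == NULL => plain task */

static unsigned int toy_entity_nr_running(const struct toy_entity *se)
{
	/* a group entity counts as all tasks queued below it */
	return se->group ? se->group->nr_running : 1;
}
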
@@ -1083,7 +1156,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
-       rt_rq->rt_nr_running--;
+       rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 
        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
        dec_rt_migration(rt_se, rt_rq);
@@ -1106,9 +1179,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
 
-       if (!rt_rq->rt_nr_running)
-               list_add_leaf_rt_rq(rt_rq);
-
        if (head)
                list_add(&rt_se->run_list, queue);
        else
@@ -1128,8 +1198,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 
        dec_rt_tasks(rt_se, rt_rq);
-       if (!rt_rq->rt_nr_running)
-               list_del_leaf_rt_rq(rt_rq);
 }
 
 /*
@@ -1145,6 +1213,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
                back = rt_se;
        }
 
+       dequeue_top_rt_rq(rt_rq_of_se(back));
+
        for (rt_se = back; rt_se; rt_se = rt_se->back) {
                if (on_rt_rq(rt_se))
                        __dequeue_rt_entity(rt_se);
@@ -1153,13 +1223,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 {
+       struct rq *rq = rq_of_rt_se(rt_se);
+
        dequeue_rt_stack(rt_se);
        for_each_sched_rt_entity(rt_se)
                __enqueue_rt_entity(rt_se, head);
+       enqueue_top_rt_rq(&rq->rt);
 }
 
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 {
+       struct rq *rq = rq_of_rt_se(rt_se);
+
        dequeue_rt_stack(rt_se);
 
        for_each_sched_rt_entity(rt_se) {
@@ -1168,6 +1243,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
                if (rt_rq && rt_rq->rt_nr_running)
                        __enqueue_rt_entity(rt_se, false);
        }
+       enqueue_top_rt_rq(&rq->rt);
 }
 
 /*
@@ -1185,8 +1261,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
-
-       inc_nr_running(rq);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1197,8 +1271,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        dequeue_rt_entity(rt_se);
 
        dequeue_pushable_task(rq, p);
-
-       dec_nr_running(rq);
 }
 
 /*
@@ -1239,16 +1311,10 @@ static void yield_task_rt(struct rq *rq)
 static int find_lowest_rq(struct task_struct *task);
 
 static int
-select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
        struct task_struct *curr;
        struct rq *rq;
-       int cpu;
-
-       cpu = task_cpu(p);
-
-       if (p->nr_cpus_allowed == 1)
-               goto out;
 
        /* For anything but wake ups, just return the task_cpu */
        if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1283,11 +1349,15 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
         */
        if (curr && unlikely(rt_task(curr)) &&
            (curr->nr_cpus_allowed < 2 ||
-            curr->prio <= p->prio) &&
-           (p->nr_cpus_allowed > 1)) {
+            curr->prio <= p->prio)) {
                int target = find_lowest_rq(p);
 
-               if (target != -1)
+               /*
+                * Don't bother moving it if the destination CPU is
+                * not running a lower priority task.
+                */
+               if (target != -1 &&
+                   p->prio < cpu_rq(target)->rt.highest_prio.curr)
                        cpu = target;
        }
        rcu_read_unlock();
@@ -1298,23 +1368,29 @@ out:
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-       if (rq->curr->nr_cpus_allowed == 1)
+       /*
+        * Current can't be migrated, useless to reschedule,
+        * let's hope p can move out.
+        */
+       if (rq->curr->nr_cpus_allowed == 1 ||
+           !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
                return;
 
+       /*
+        * p is migratable, so let's not schedule it and
+        * see if it is pushed or pulled somewhere else.
+        */
        if (p->nr_cpus_allowed != 1
            && cpupri_find(&rq->rd->cpupri, p, NULL))
                return;
 
-       if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
-               return;
-
        /*
         * There appears to be other cpus that can accept
         * current and none to run 'p', so lets reschedule
         * to try and push current away:
         */
        requeue_task_rt(rq, p, 1);
-       resched_task(rq->curr);
+       resched_curr(rq);
 }
 
 #endif /* CONFIG_SMP */
@@ -1325,7 +1401,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
        if (p->prio < rq->curr->prio) {
-               resched_task(rq->curr);
+               resched_curr(rq);
                return;
        }
 
@@ -1368,15 +1444,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
        struct sched_rt_entity *rt_se;
        struct task_struct *p;
-       struct rt_rq *rt_rq;
-
-       rt_rq = &rq->rt;
-
-       if (!rt_rq->rt_nr_running)
-               return NULL;
-
-       if (rt_rq_throttled(rt_rq))
-               return NULL;
+       struct rt_rq *rt_rq  = &rq->rt;
 
        do {
                rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1385,26 +1453,47 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
        } while (rt_rq);
 
        p = rt_task_of(rt_se);
-       p->se.exec_start = rq->clock_task;
+       p->se.exec_start = rq_clock_task(rq);
 
        return p;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
-       struct task_struct *p = _pick_next_task_rt(rq);
+       struct task_struct *p;
+       struct rt_rq *rt_rq = &rq->rt;
 
-       /* The running task is never eligible for pushing */
-       if (p)
-               dequeue_pushable_task(rq, p);
+       if (need_pull_rt_task(rq, prev)) {
+               pull_rt_task(rq);
+               /*
+                * pull_rt_task() can drop (and re-acquire) rq->lock; this
+                * means a dl or stop task can slip in, in which case we need
+                * to re-start task selection.
+                */
+               if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
+                            rq->dl.dl_nr_running))
+                       return RETRY_TASK;
+       }
 
-#ifdef CONFIG_SMP
        /*
-        * We detect this state here so that we can avoid taking the RQ
-        * lock again later if there is no need to push
+        * We may dequeue prev's rt_rq in put_prev_task().
+        * So, we update the time before the rt_nr_running check.
         */
-       rq->post_schedule = has_pushable_tasks(rq);
-#endif
+       if (prev->sched_class == &rt_sched_class)
+               update_curr_rt(rq);
+
+       if (!rt_rq->rt_queued)
+               return NULL;
+
+       put_prev_task(rq, prev);
+
+       p = _pick_next_task_rt(rq);
+
+       /* The running task is never eligible for pushing */
+       dequeue_pushable_task(rq, p);
+
+       set_post_schedule(rq);
 
        return p;
 }
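
RETRY_TASK signals the caller that pull_rt_task() dropped and reacquired rq->lock, so a higher class (stop or deadline) may have become runnable and class iteration must restart from the top. A toy model of that contract (the real loop lives in core.c's pick_next_task(); types and names here are illustrative only):

#include <stddef.h>

struct toy_task  { int dummy; };
struct toy_class { struct toy_task *(*pick)(void); };

#define TOY_RETRY ((struct toy_task *)-1)

static struct toy_task *toy_pick_next(struct toy_class *classes, int nr)
{
again:
	for (int i = 0; i < nr; i++) {
		struct toy_task *t = classes[i].pick();

		if (t == TOY_RETRY)
			goto again;	/* restart from the highest class */
		if (t)
			return t;
	}
	return NULL;
}
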
@@ -1434,42 +1523,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
        return 0;
 }
 
-/* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
+/*
+ * Return the highest-priority pushable task on this rq that is suitable
+ * to run on the given cpu, or NULL if there is none.
+ */
+static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 {
-       struct task_struct *next = NULL;
-       struct sched_rt_entity *rt_se;
-       struct rt_prio_array *array;
-       struct rt_rq *rt_rq;
-       int idx;
-
-       for_each_leaf_rt_rq(rt_rq, rq) {
-               array = &rt_rq->active;
-               idx = sched_find_first_bit(array->bitmap);
-next_idx:
-               if (idx >= MAX_RT_PRIO)
-                       continue;
-               if (next && next->prio <= idx)
-                       continue;
-               list_for_each_entry(rt_se, array->queue + idx, run_list) {
-                       struct task_struct *p;
+       struct plist_head *head = &rq->rt.pushable_tasks;
+       struct task_struct *p;
 
-                       if (!rt_entity_is_task(rt_se))
-                               continue;
+       if (!has_pushable_tasks(rq))
+               return NULL;
 
-                       p = rt_task_of(rt_se);
-                       if (pick_rt_task(rq, p, cpu)) {
-                               next = p;
-                               break;
-                       }
-               }
-               if (!next) {
-                       idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-                       goto next_idx;
-               }
+       plist_for_each_entry(p, head, pushable_tasks) {
+               if (pick_rt_task(rq, p, cpu))
+                       return p;
        }
 
-       return next;
+       return NULL;
 }
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
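
pick_highest_pushable_task() can stop at the first match because pushable_tasks is a plist kept ordered by priority when tasks are (re)enqueued in enqueue_pushable_task(), so the first entry that passes pick_rt_task() is also the best-priority one that does. A toy model of that argument (illustrative types, not kernel code):

#include <stddef.h>

struct toy_task {
	int prio;			/* list is sorted by this, best first */
	int allowed_cpu;		/* stands in for the cpumask check    */
	struct toy_task *next;
};

static struct toy_task *toy_pick_pushable(struct toy_task *sorted, int cpu)
{
	for (struct toy_task *t = sorted; t; t = t->next)
		if (t->allowed_cpu == cpu)	/* stands in for pick_rt_task() */
			return t;		/* first hit == best-prio hit   */
	return NULL;
}
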
@@ -1477,7 +1548,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 static int find_lowest_rq(struct task_struct *task)
 {
        struct sched_domain *sd;
-       struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
+       struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
 
@@ -1563,6 +1634,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 
                lowest_rq = cpu_rq(cpu);
 
+               if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+                       /*
+                        * Target rq has tasks of equal or higher priority;
+                        * retrying does not release any lock and is unlikely
+                        * to yield a different result.
+                        */
+                       lowest_rq = NULL;
+                       break;
+               }
+
                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(rq, lowest_rq)) {
                        /*
@@ -1575,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
                                     !cpumask_test_cpu(lowest_rq->cpu,
                                                       tsk_cpus_allowed(task)) ||
                                     task_running(rq, task) ||
-                                    !task->on_rq)) {
+                                    !task_on_rq_queued(task))) {
 
                                double_unlock_balance(rq, lowest_rq);
                                lowest_rq = NULL;
@@ -1609,7 +1690,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
        BUG_ON(task_current(rq, p));
        BUG_ON(p->nr_cpus_allowed <= 1);
 
-       BUG_ON(!p->on_rq);
+       BUG_ON(!task_on_rq_queued(p));
        BUG_ON(!rt_task(p));
 
        return p;
@@ -1645,7 +1726,7 @@ retry:
         * just reschedule current.
         */
        if (unlikely(next_task->prio < rq->curr->prio)) {
-               resched_task(rq->curr);
+               resched_curr(rq);
                return 0;
        }
 
@@ -1692,7 +1773,7 @@ retry:
        activate_task(lowest_rq, next_task, 0);
        ret = 1;
 
-       resched_task(lowest_rq->curr);
+       resched_curr(lowest_rq);
 
        double_unlock_balance(rq, lowest_rq);
 
@@ -1709,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq)
                ;
 }
 
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
+ *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or, on the first iteration, it must hold rq->cpu.
+ */
+static int rto_next_cpu(struct rq *rq)
+{
+       int prev_cpu = rq->rt.push_cpu;
+       int cpu;
+
+       cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
+       /*
+        * If the previous cpu is less than the rq's CPU, then it already
+        * passed the end of the mask, and has started from the beginning.
+        * We end if the next CPU is greater than or equal to rq's CPU.
+        */
+       if (prev_cpu < rq->cpu) {
+               if (cpu >= rq->cpu)
+                       return nr_cpu_ids;
+
+       } else if (cpu >= nr_cpu_ids) {
+               /*
+                * We passed the end of the mask, start at the beginning.
+                * If the result is greater or equal to the rq's CPU, then
+                * the loop is finished.
+                */
+               cpu = cpumask_first(rq->rd->rto_mask);
+               if (cpu >= rq->cpu)
+                       return nr_cpu_ids;
+       }
+       rq->rt.push_cpu = cpu;
+
+       /* Return cpu to let the caller know if the loop is finished or not */
+       return cpu;
+}
+
+static int find_next_push_cpu(struct rq *rq)
+{
+       struct rq *next_rq;
+       int cpu;
+
+       while (1) {
+               cpu = rto_next_cpu(rq);
+               if (cpu >= nr_cpu_ids)
+                       break;
+               next_rq = cpu_rq(cpu);
+
+               /* Make sure the next rq can push to this rq */
+               if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+                       break;
+       }
+
+       return cpu;
+}
+
+#define RT_PUSH_IPI_EXECUTING          1
+#define RT_PUSH_IPI_RESTART            2
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+       int cpu;
+
+       if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+               raw_spin_lock(&rq->rt.push_lock);
+               /* Make sure it's still executing */
+               if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+                       /*
+                        * Tell the IPI to restart the loop as things have
+                        * changed since it started.
+                        */
+                       rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+                       raw_spin_unlock(&rq->rt.push_lock);
+                       return;
+               }
+               raw_spin_unlock(&rq->rt.push_lock);
+       }
+
+       /* When here, there's no IPI going around */
+
+       rq->rt.push_cpu = rq->cpu;
+       cpu = find_next_push_cpu(rq);
+       if (cpu >= nr_cpu_ids)
+               return;
+
+       rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+       irq_work_queue_on(&rq->rt.push_work, cpu);
+}
+
+/* Called from hardirq context */
+static void try_to_push_tasks(void *arg)
+{
+       struct rt_rq *rt_rq = arg;
+       struct rq *rq, *src_rq;
+       int this_cpu;
+       int cpu;
+
+       this_cpu = rt_rq->push_cpu;
+
+       /* Paranoid check */
+       BUG_ON(this_cpu != smp_processor_id());
+
+       rq = cpu_rq(this_cpu);
+       src_rq = rq_of_rt_rq(rt_rq);
+
+again:
+       if (has_pushable_tasks(rq)) {
+               raw_spin_lock(&rq->lock);
+               push_rt_task(rq);
+               raw_spin_unlock(&rq->lock);
+       }
+
+       /* Pass the IPI to the next rt overloaded queue */
+       raw_spin_lock(&rt_rq->push_lock);
+       /*
+        * If the source queue changed since the IPI went out,
+        * we need to restart the search from that CPU again.
+        */
+       if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+               rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+               rt_rq->push_cpu = src_rq->cpu;
+       }
+
+       cpu = find_next_push_cpu(src_rq);
+
+       if (cpu >= nr_cpu_ids)
+               rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+       raw_spin_unlock(&rt_rq->push_lock);
+
+       if (cpu >= nr_cpu_ids)
+               return;
+
+       /*
+        * It is possible that a restart caused this CPU to be
+        * chosen again. Don't bother with an IPI, just see if we
+        * have more to push.
+        */
+       if (unlikely(cpu == rq->cpu))
+               goto again;
+
+       /* Try the next RT overloaded CPU */
+       irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+       struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+       try_to_push_tasks(rt_rq);
+}
+#endif /* HAVE_RT_PUSH_IPI */
+
 static int pull_rt_task(struct rq *this_rq)
 {
        int this_cpu = this_rq->cpu, ret = 0, cpu;
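
The rto_next_cpu()/find_next_push_cpu() pair above implements the circular walk described in its comment: start just after rq->cpu (push_cpu initially holds rq->cpu), wrap once past the end of rd->rto_mask, and stop before coming back around to rq->cpu. A self-contained toy model of that walk over an 8-bit mask (toy names, not kernel code):

#include <assert.h>

#define TOY_NR_CPUS 8			/* plays the role of nr_cpu_ids */

static int toy_next_set_bit(unsigned int mask, int after)
{
	for (int cpu = after + 1; cpu < TOY_NR_CPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return TOY_NR_CPUS;
}

static int toy_rto_next_cpu(unsigned int mask, int self, int prev)
{
	int cpu = toy_next_set_bit(mask, prev);

	if (prev < self) {			/* already wrapped once */
		if (cpu >= self)
			return TOY_NR_CPUS;	/* full circle: done    */
	} else if (cpu >= TOY_NR_CPUS) {	/* hit the end: wrap    */
		cpu = toy_next_set_bit(mask, -1);
		if (cpu >= self)
			return TOY_NR_CPUS;
	}
	return cpu;
}

int main(void)
{
	/* self is CPU 3; overloaded CPUs are {1, 5}: visit 5, then 1, then stop */
	unsigned int mask = (1u << 1) | (1u << 5);
	int cpu = toy_rto_next_cpu(mask, 3, 3);	/* prev starts at self */

	assert(cpu == 5);
	cpu = toy_rto_next_cpu(mask, 3, cpu);
	assert(cpu == 1);
	cpu = toy_rto_next_cpu(mask, 3, cpu);
	assert(cpu == TOY_NR_CPUS);
	return 0;
}
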
@@ -1718,6 +1957,19 @@ static int pull_rt_task(struct rq *this_rq)
        if (likely(!rt_overloaded(this_rq)))
                return 0;
 
+       /*
+        * Match the barrier from rt_set_overload(); this guarantees that if we
+        * see overloaded we must also see the rto_mask bit.
+        */
+       smp_rmb();
+
+#ifdef HAVE_RT_PUSH_IPI
+       if (sched_feat(RT_PUSH_IPI)) {
+               tell_cpu_to_push(this_rq);
+               return 0;
+       }
+#endif
+
        for_each_cpu(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
@@ -1743,12 +1995,10 @@ static int pull_rt_task(struct rq *this_rq)
                double_lock_balance(this_rq, src_rq);
 
                /*
-                * Are there still pullable RT tasks?
+                * We can only pull a task that is pushable
+                * on its rq, and no others.
                 */
-               if (src_rq->rt.rt_nr_running <= 1)
-                       goto skip;
-
-               p = pick_next_highest_task_rt(src_rq, this_cpu);
+               p = pick_highest_pushable_task(src_rq, this_cpu);
 
                /*
                 * Do we have an RT task that preempts
@@ -1756,7 +2006,7 @@ static int pull_rt_task(struct rq *this_rq)
                 */
                if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
                        WARN_ON(p == src_rq->curr);
-                       WARN_ON(!p->on_rq);
+                       WARN_ON(!task_on_rq_queued(p));
 
                        /*
                         * There's a chance that p is higher in priority
@@ -1788,13 +2038,6 @@ skip:
        return ret;
 }
 
-static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
-{
-       /* Try to pull RT tasks here if we lower this rq's prio */
-       if (rq->rt.highest_prio.curr > prev->prio)
-               pull_rt_task(rq);
-}
-
 static void post_schedule_rt(struct rq *rq)
 {
        push_rt_tasks(rq);
@@ -1810,7 +2053,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
            !test_tsk_need_resched(rq->curr) &&
            has_pushable_tasks(rq) &&
            p->nr_cpus_allowed > 1 &&
-           rt_task(rq->curr) &&
+           (dl_task(rq->curr) || rt_task(rq->curr)) &&
            (rq->curr->nr_cpus_allowed < 2 ||
             rq->curr->prio <= p->prio))
                push_rt_tasks(rq);
@@ -1824,7 +2067,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 
        BUG_ON(!rt_task(p));
 
-       if (!p->on_rq)
+       if (!task_on_rq_queued(p))
                return;
 
        weight = cpumask_weight(new_mask);
@@ -1890,14 +2133,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
         * we may need to handle the pulling of RT tasks
         * now.
         */
-       if (!p->on_rq || rq->rt.rt_nr_running)
+       if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
                return;
 
        if (pull_rt_task(rq))
-               resched_task(rq->curr);
+               resched_curr(rq);
 }
 
-void init_sched_rt_class(void)
+void __init init_sched_rt_class(void)
 {
        unsigned int i;
 
@@ -1924,15 +2167,15 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
         * If that current running task is also an RT task
         * then see if we can move to another run queue.
         */
-       if (p->on_rq && rq->curr != p) {
+       if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-               if (rq->rt.overloaded && push_rt_task(rq) &&
+               if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
                    /* Don't resched if we changed runqueues */
-                   rq != task_rq(p))
+                   push_rt_task(rq) && rq != task_rq(p))
                        check_resched = 0;
 #endif /* CONFIG_SMP */
                if (check_resched && p->prio < rq->curr->prio)
-                       resched_task(rq->curr);
+                       resched_curr(rq);
        }
 }
 
@@ -1943,7 +2186,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 static void
 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
-       if (!p->on_rq)
+       if (!task_on_rq_queued(p))
                return;
 
        if (rq->curr == p) {
@@ -1961,11 +2204,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
                 * Only reschedule if p is still on the same runqueue.
                 */
                if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
-                       resched_task(p);
+                       resched_curr(rq);
 #else
                /* For UP simply resched on drop of prio */
                if (oldprio < p->prio)
-                       resched_task(p);
+                       resched_curr(rq);
 #endif /* CONFIG_SMP */
        } else {
                /*
@@ -1974,7 +2217,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
                 * then reschedule.
                 */
                if (p->prio < rq->curr->prio)
-                       resched_task(rq->curr);
+                       resched_curr(rq);
        }
 }
 
@@ -2021,13 +2264,13 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
        p->rt.time_slice = sched_rr_timeslice;
 
        /*
-        * Requeue to the end of queue if we (and all of our ancestors) are the
-        * only element on the queue
+        * Requeue to the end of queue if we (and all of our ancestors) are not
+        * the only element on the queue
         */
        for_each_sched_rt_entity(rt_se) {
                if (rt_se->run_list.prev != rt_se->run_list.next) {
                        requeue_task_rt(rq, p, 0);
-                       set_tsk_need_resched(p);
+                       resched_curr(rq);
                        return;
                }
        }
@@ -2037,7 +2280,7 @@ static void set_curr_task_rt(struct rq *rq)
 {
        struct task_struct *p = rq->curr;
 
-       p->se.exec_start = rq->clock_task;
+       p->se.exec_start = rq_clock_task(rq);
 
        /* The running task is never eligible for pushing */
        dequeue_pushable_task(rq, p);
@@ -2071,7 +2314,6 @@ const struct sched_class rt_sched_class = {
        .set_cpus_allowed       = set_cpus_allowed_rt,
        .rq_online              = rq_online_rt,
        .rq_offline             = rq_offline_rt,
-       .pre_schedule           = pre_schedule_rt,
        .post_schedule          = post_schedule_rt,
        .task_woken             = task_woken_rt,
        .switched_from          = switched_from_rt,
@@ -2084,6 +2326,8 @@ const struct sched_class rt_sched_class = {
 
        .prio_changed           = prio_changed_rt,
        .switched_to            = switched_to_rt,
+
+       .update_curr            = update_curr_rt,
 };
 
 #ifdef CONFIG_SCHED_DEBUG