Merge tag 'v4.3' into p/abusse/merge_upgrade
[projects/modsched/linux.git] kernel/sched/cfs/core.c
index 43d3380..0c301cf 100644
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 
 static void sched_feat_disable(int i)
 {
-       if (static_key_enabled(&sched_feat_keys[i]))
-               static_key_slow_dec(&sched_feat_keys[i]);
+       static_key_disable(&sched_feat_keys[i]);
 }
 
 static void sched_feat_enable(int i)
 {
-       if (!static_key_enabled(&sched_feat_keys[i]))
-               static_key_slow_inc(&sched_feat_keys[i]);
+       static_key_enable(&sched_feat_keys[i]);
 }
 #else
 static void sched_feat_disable(int i) { };
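
The guards around static_key_slow_inc()/static_key_slow_dec() can go away because static_key_enable()/static_key_disable() check the enabled state internally and are idempotent. A minimal sketch of the same pattern outside the scheduler, assuming a plain static_key initialized to false (my_feat and my_feat_set are hypothetical names):

	#include <linux/jump_label.h>

	static struct static_key my_feat = STATIC_KEY_INIT_FALSE;

	static void my_feat_set(bool on)
	{
		if (on)
			static_key_enable(&my_feat);	/* no-op if already enabled */
		else
			static_key_disable(&my_feat);	/* no-op if already disabled */
	}
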
@@ -623,18 +621,21 @@ int get_nohz_timer_target(void)
        int i, cpu = smp_processor_id();
        struct sched_domain *sd;
 
-       if (!idle_cpu(cpu))
+       if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
                return cpu;
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
-                       if (!idle_cpu(i)) {
+                       if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }
+
+       if (!is_housekeeping_cpu(cpu))
+               cpu = housekeeping_any_cpu();
 unlock:
        rcu_read_unlock();
        return cpu;
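
get_nohz_timer_target() now avoids handing unpinned timers to nohz_full CPUs: a CPU only keeps or receives the timer if it is a housekeeping CPU, and housekeeping_any_cpu() is the final fallback. A hedged sketch of that decision in isolation, using the helpers from <linux/tick.h> (pick_timer_cpu is a hypothetical caller):

	#include <linux/tick.h>

	static int pick_timer_cpu(int preferred)
	{
		/* keep the timer local only if this CPU takes housekeeping work */
		if (is_housekeeping_cpu(preferred))
			return preferred;

		/* otherwise push it onto some CPU outside the nohz_full set */
		return housekeeping_any_cpu();
	}
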
@@ -1151,15 +1152,45 @@ static int migration_cpu_stop(void *data)
        return 0;
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-       if (p->sched_class->set_cpus_allowed)
-               p->sched_class->set_cpus_allowed(p, new_mask);
-
        cpumask_copy(&p->cpus_allowed, new_mask);
        p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+       struct rq *rq = task_rq(p);
+       bool queued, running;
+
+       lockdep_assert_held(&p->pi_lock);
+
+       queued = task_on_rq_queued(p);
+       running = task_current(rq, p);
+
+       if (queued) {
+               /*
+                * Because __kthread_bind() calls this on blocked tasks without
+                * holding rq->lock.
+                */
+               lockdep_assert_held(&rq->lock);
+               dequeue_task(rq, p, 0);
+       }
+       if (running)
+               put_prev_task(rq, p);
+
+       p->sched_class->set_cpus_allowed(p, new_mask);
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (queued)
+               enqueue_task(rq, p, 0);
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1169,7 +1200,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+                                 const struct cpumask *new_mask, bool check)
 {
        unsigned long flags;
        struct rq *rq;
@@ -1178,6 +1210,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
        rq = task_rq_lock(p, &flags);
 
+       /*
+        * Must re-check here to close a race against __kthread_bind():
+        * sched_setaffinity() is not guaranteed to observe the flag otherwise.
+        */
+       if (check && (p->flags & PF_NO_SETAFFINITY)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
        if (cpumask_equal(&p->cpus_allowed, new_mask))
                goto out;
 
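
The PF_NO_SETAFFINITY re-check only closes the race if the binding side sets the flag under p->pi_lock, which the task_rq_lock() above also takes. A rough, illustrative sketch of the assumed counterpart (the real __kthread_bind() lives in kernel/kthread.c; the name and details below are not taken from this patch):

	static void bind_blocked_kthread(struct task_struct *p, const struct cpumask *mask)
	{
		unsigned long flags;

		/* the task is known to be blocked, so no rq->lock is needed */
		raw_spin_lock_irqsave(&p->pi_lock, flags);
		do_set_cpus_allowed(p, mask);
		p->flags |= PF_NO_SETAFFINITY;	/* observed by the re-check above */
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
	}
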
@@ -1214,6 +1255,11 @@ out:
 
        return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+       return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
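
The split keeps the exported interface unchanged: in-kernel users of set_cpus_allowed_ptr() still bypass the PF_NO_SETAFFINITY check (check == false), while the sched_setaffinity() path further down passes check == true. Usage sketch with a hypothetical helper:

	static int pin_to_cpu(struct task_struct *p, int cpu)
	{
		/* equivalent to __set_cpus_allowed_ptr(p, cpumask_of(cpu), false) */
		return set_cpus_allowed_ptr(p, cpumask_of(cpu));
	}
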
@@ -1595,6 +1641,15 @@ static void update_avg(u64 *avg, u64 sample)
        s64 diff = sample - *avg;
        *avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+                                        const struct cpumask *new_mask, bool check)
+{
+       return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */
 
 static void
@@ -1654,9 +1709,9 @@ static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
        check_preempt_curr(rq, p, wake_flags);
-       trace_sched_wakeup(p, true);
-
        p->state = TASK_RUNNING;
+       trace_sched_wakeup(p);
+
 #ifdef CONFIG_SMP
        if (p->sched_class->task_woken) {
                /*
@@ -1874,6 +1929,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        if (!(p->state & state))
                goto out;
 
+       trace_sched_waking(p);
+
        success = 1; /* we're going to change ->state */
        cpu = task_cpu(p);
 
@@ -1953,6 +2010,8 @@ static void try_to_wake_up_local(struct task_struct *p)
        if (!(p->state & TASK_NORMAL))
                goto out;
 
+       trace_sched_waking(p);
+
        if (!task_on_rq_queued(p))
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
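
Wakeups are now reported by two tracepoints: sched_waking fires when the wakeup is attempted (possibly on the waker's CPU, before any migration), and sched_wakeup fires once the task has actually become TASK_RUNNING, taking only the task pointer. A hedged module-side sketch of consuming the pair, assuming the usual generated register_trace_* helpers (probe_waking/probe_wakeup and the init function are hypothetical):

	#include <linux/module.h>
	#include <trace/events/sched.h>

	static void probe_waking(void *data, struct task_struct *p)
	{
		/* wakeup attempted; p may still be on its way to the target CPU */
	}

	static void probe_wakeup(void *data, struct task_struct *p)
	{
		/* p->state is TASK_RUNNING on its destination runqueue */
	}

	static int __init wake_probes_init(void)
	{
		register_trace_sched_waking(probe_waking, NULL);
		register_trace_sched_wakeup(probe_wakeup, NULL);
		return 0;
	}
	module_init(wake_probes_init);
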
@@ -2022,9 +2081,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
        p->se.vruntime                  = 0;
-#ifdef CONFIG_SMP
-       p->se.avg.decay_count           = 0;
-#endif
        INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -2206,8 +2262,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_SMP
 inline struct dl_bw *dl_bw_of(int i)
 {
-       rcu_lockdep_assert(rcu_read_lock_sched_held(),
-                          "sched RCU must be held");
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+                        "sched RCU must be held");
        return &cpu_rq(i)->rd->dl_bw;
 }
 
@@ -2216,8 +2272,8 @@ static inline int dl_bw_cpus(int i)
        struct root_domain *rd = cpu_rq(i)->rd;
        int cpus = 0;
 
-       rcu_lockdep_assert(rcu_read_lock_sched_held(),
-                          "sched RCU must be held");
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+                        "sched RCU must be held");
        for_each_cpu_and(i, rd->span, cpu_active_mask)
                cpus++;
 
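
Note the inverted predicate: rcu_lockdep_assert() took the condition that must hold, whereas RCU_LOCKDEP_WARN() takes the condition that indicates a problem. The equivalent assertion as a standalone helper (name is illustrative):

	static inline void assert_sched_rcu_held(void)
	{
		/* complain when the sched-RCU read lock is NOT held */
		RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
				 "sched RCU must be held");
	}
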
@@ -2309,15 +2365,22 @@ void wake_up_new_task(struct task_struct *p)
 #endif
 
        /* Initialize new task's runnable average */
-       init_task_runnable_average(p);
+       init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
-       trace_sched_wakeup_new(p, true);
+       trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
-       if (p->sched_class->task_woken)
+       if (p->sched_class->task_woken) {
+               /*
+                * Nothing relies on rq->lock after this, so it's fine to
+                * drop it.
+                */
+               lockdep_unpin_lock(&rq->lock);
                p->sched_class->task_woken(rq, p);
+               lockdep_pin_lock(&rq->lock);
+       }
 #endif
        task_rq_unlock(rq, p, &flags);
 }
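
The unpin/pin pair follows the rq->lock pinning convention used elsewhere in this file: the core pins rq->lock so that any unexpected unlock trips lockdep, and a callback that may legitimately drop the lock has to be bracketed explicitly. A generic sketch (call_unpinned is a hypothetical wrapper):

	static void call_unpinned(struct rq *rq, void (*cb)(struct rq *))
	{
		lockdep_unpin_lock(&rq->lock);	/* cb() may unlock/relock rq->lock */
		cb(rq);
		lockdep_pin_lock(&rq->lock);	/* re-arm the pinning check */
	}
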
@@ -2467,15 +2530,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
         * schedule one last time. The schedule call will never return, and
         * the scheduled task must drop that reference.
-        * The test for TASK_DEAD must occur while the runqueue locks are
-        * still held, otherwise prev could be scheduled on another cpu, die
-        * there before we look at prev->state, and then the reference would
-        * be dropped twice.
-        *              Manfred Spraul <manfred@colorfullife.com>
+        *
+        * We must observe prev->state before clearing prev->on_cpu (in
+        * finish_lock_switch), otherwise a concurrent wakeup can get prev
+        * running on another CPU and we could race with its RUNNING -> DEAD
+        * transition, resulting in a double drop.
         */
        prev_state = prev->state;
        vtime_task_switch(prev);
-       finish_arch_switch(prev);
        perf_event_task_sched_in(prev, current);
        finish_lock_switch(rq, prev);
        finish_arch_post_lock_switch();
@@ -2495,7 +2557,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                put_task_struct(prev);
        }
 
-       tick_nohz_task_switch(current);
+       tick_nohz_task_switch();
        return rq;
 }
 
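
The rewritten comment describes an ordering requirement rather than a locking one: the load of prev->state must be ordered before prev->on_cpu is cleared, because a remote try_to_wake_up() waits on ->on_cpu before it may run the task elsewhere. A hedged sketch of the mechanism this relies on, roughly what finish_lock_switch() does on SMP around this kernel version:

	static inline void release_prev_on_cpu(struct task_struct *prev)
	{
		/* every read of prev->state above is ordered before this store;
		 * pairs with the ordering on the try_to_wake_up() side */
		smp_store_release(&prev->on_cpu, 0);
	}
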
@@ -2620,13 +2682,20 @@ unsigned long nr_running(void)
 
 /*
  * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race.  The caller is responsible for using it correctly, for example:
+ *
+ * - from a non-preemptible section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
  */
 bool single_task_running(void)
 {
-       if (cpu_rq(smp_processor_id())->nr_running == 1)
-               return true;
-       else
-               return false;
+       return raw_rq()->nr_running == 1;
 }
 EXPORT_SYMBOL(single_task_running);
 
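
A usage sketch matching the "polling loop" case from the comment: the answer is only meaningful for the CPU the caller ran on, and it may be stale by the time it is used unless preemption stays disabled around both the check and the use (can_spin_briefly is hypothetical):

	static bool can_spin_briefly(void)
	{
		bool alone;

		preempt_disable();
		alone = single_task_running();
		preempt_enable();

		return alone;	/* may already be stale; treat it as a hint */
	}
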
@@ -4359,7 +4428,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        }
 #endif
 again:
-       retval = set_cpus_allowed_ptr(p, new_mask);
+       retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
        if (!retval) {
                cpuset_cpus_allowed(p, cpus_allowed);
@@ -4511,7 +4580,7 @@ SYSCALL_DEFINE0(sched_yield)
 
 int __sched _cond_resched(void)
 {
-       if (should_resched()) {
+       if (should_resched(0)) {
                preempt_schedule_common();
                return 1;
        }
@@ -4529,7 +4598,7 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-       int resched = should_resched();
+       int resched = should_resched(PREEMPT_LOCK_OFFSET);
        int ret = 0;
 
        lockdep_assert_held(lock);
@@ -4551,7 +4620,7 @@ int __sched __cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
-       if (should_resched()) {
+       if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
                local_bh_enable();
                preempt_schedule_common();
                local_bh_disable();
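
All three cond_resched variants now tell should_resched() which preempt_count offset they legitimately hold: 0 for plain _cond_resched(), PREEMPT_LOCK_OFFSET while a spinlock is held, SOFTIRQ_DISABLE_OFFSET with bottom halves disabled. The assumed semantics, roughly as in the generic preempt.h of this era (should_resched_sketch is an illustrative stand-in):

	static inline bool should_resched_sketch(int preempt_offset)
	{
		/* rescheduling is only safe when nothing beyond the declared
		 * offset is pinning the current context */
		return preempt_count() == preempt_offset && tif_need_resched();
	}
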
@@ -4884,13 +4953,22 @@ void init_idle(struct task_struct *idle, int cpu)
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       raw_spin_lock_irqsave(&idle->pi_lock, flags);
+       raw_spin_lock(&rq->lock);
 
        __sched_fork(0, idle);
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
-       do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+       /*
+        * It's possible that init_idle() gets called multiple times on a task;
+        * in that case do_set_cpus_allowed() will not do the right thing.
+        *
+        * And since this is boot, we can forgo the serialization.
+        */
+       set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
        /*
         * We're having a chicken and egg problem, even though we are
         * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4907,10 +4985,11 @@ void init_idle(struct task_struct *idle, int cpu)
 
        rq->curr = rq->idle = idle;
        idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
        idle->on_cpu = 1;
 #endif
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
        init_idle_preempt_count(idle, cpu);
@@ -4921,7 +5000,7 @@ void init_idle(struct task_struct *idle, int cpu)
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
        vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
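
init_idle() now takes idle->pi_lock before rq->lock, matching the ordering the affinity code above relies on: p->pi_lock nests outside rq->lock. A minimal sketch of that ordering for an arbitrary task/runqueue pair (helper names are hypothetical):

	static void lock_task_and_rq(struct task_struct *p, struct rq *rq,
				     unsigned long *flags)
	{
		raw_spin_lock_irqsave(&p->pi_lock, *flags);	/* outer lock */
		raw_spin_lock(&rq->lock);			/* inner lock */
	}

	static void unlock_task_and_rq(struct task_struct *p, struct rq *rq,
				       unsigned long flags)
	{
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
	}
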
@@ -5143,24 +5222,47 @@ static void migrate_tasks(struct rq *dead_rq)
                        break;
 
                /*
-                * Ensure rq->lock covers the entire task selection
-                * until the migration.
+                * pick_next_task() assumes a pinned rq->lock.
                 */
                lockdep_pin_lock(&rq->lock);
                next = pick_next_task(rq, &fake_task);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
+               /*
+                * The rules for changing task_struct::cpus_allowed are to hold
+                * both pi_lock and rq->lock, such that holding either
+                * stabilizes the mask.
+                *
+                * Dropping rq->lock is not quite as disastrous as it usually is
+                * because !cpu_active at this point, which means load-balance
+                * will not interfere. Also, we're inside stop-machine.
+                */
+               lockdep_unpin_lock(&rq->lock);
+               raw_spin_unlock(&rq->lock);
+               raw_spin_lock(&next->pi_lock);
+               raw_spin_lock(&rq->lock);
+
+               /*
+                * Since we're inside stop-machine, _nothing_ should have
+                * changed the task; WARN if weird stuff happened, because in
+                * that case the above rq->lock drop is a problem too.
+                */
+               if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+                       raw_spin_unlock(&next->pi_lock);
+                       continue;
+               }
+
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-               lockdep_unpin_lock(&rq->lock);
                rq = __migrate_task(rq, next, dest_cpu);
                if (rq != dead_rq) {
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
                }
+               raw_spin_unlock(&next->pi_lock);
        }
 
        rq->stop = stop;
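
The pi_lock dance above is the standard pattern for acquiring a lock that nests outside one already held: drop the inner lock, take the outer lock, re-take the inner lock, then re-validate everything that could have changed in the window. A generic hedged sketch of that shape (lock_pi_then_rq is a hypothetical helper; task_rq() and task_on_rq_queued() are the same checks used above):

	static bool lock_pi_then_rq(struct rq *rq, struct task_struct *p)
	{
		raw_spin_unlock(&rq->lock);	/* drop the inner lock */
		raw_spin_lock(&p->pi_lock);	/* take the outer lock */
		raw_spin_lock(&rq->lock);	/* re-take the inner lock */

		/* re-validate: the task may have moved while rq->lock was dropped */
		if (task_rq(p) != rq || !task_on_rq_queued(p)) {
			raw_spin_unlock(&p->pi_lock);
			return false;		/* caller must skip or retry */
		}

		return true;			/* both locks now held */
	}
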
@@ -5330,8 +5432,7 @@ static void register_sched_domain_sysctl(void)
 /* may be called multiple times per register */
 static void unregister_sched_domain_sysctl(void)
 {
-       if (sd_sysctl_header)
-               unregister_sysctl_table(sd_sysctl_header);
+       unregister_sysctl_table(sd_sysctl_header);
        sd_sysctl_header = NULL;
        if (sd_ctl_dir[0].child)
                sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -5452,6 +5553,14 @@ static int sched_cpu_active(struct notifier_block *nfb,
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
+       case CPU_ONLINE:
+               /*
+                * At this point a starting CPU has marked itself as online via
+                * set_cpu_online(). But it might not yet have marked itself
+                * as active, which is essential from here on.
+                *
+                * Thus, fall-through and help the starting CPU along.
+                */
        case CPU_DOWN_FAILED:
                set_cpu_active((long)hcpu, true);
                return NOTIFY_OK;
@@ -6464,8 +6573,10 @@ static void init_numa_topology_type(void)
 
        n = sched_max_numa_distance;
 
-       if (n <= 1)
+       if (sched_domains_numa_levels <= 1) {
                sched_numa_topology_type = NUMA_DIRECT;
+               return;
+       }
 
        for_each_online_node(a) {
                for_each_online_node(b) {
@@ -7153,9 +7264,6 @@ void __init sched_init_smp(void)
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
-       /* nohz_full won't take effect without isolating the cpus. */
-       tick_nohz_full_add_cpus_to(cpu_isolated_map);
-
        sched_init_numa();
 
        /*
@@ -8087,7 +8195,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
        sched_offline_group(tg);
 }
 
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
 {
        sched_move_task(task);
 }