Merge tag 'v4.3' into p/abusse/merge_upgrade
projects/modsched/linux.git: kernel/sched/cfs/cputime.c
index e93cca9..8cbc3db 100644
@@ -115,31 +115,15 @@ static int irqtime_account_si_update(void)
 static inline void task_group_account_field(struct task_struct *p, int index,
                                            u64 tmp)
 {
-#ifdef CONFIG_CGROUP_CPUACCT
-       struct kernel_cpustat *kcpustat;
-       struct cpuacct *ca;
-#endif
        /*
         * Since all updates are sure to touch the root cgroup, we
         * get ourselves ahead and touch it first. If the root cgroup
         * is the only cgroup, then nothing else should be necessary.
         *
         */
-       __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-
-#ifdef CONFIG_CGROUP_CPUACCT
-       if (unlikely(!cpuacct_subsys.active))
-               return;
+       __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
-       rcu_read_lock();
-       ca = task_ca(p);
-       while (ca && (ca != &root_cpuacct)) {
-               kcpustat = this_cpu_ptr(ca->cpustat);
-               kcpustat->cpustat[index] += tmp;
-               ca = parent_ca(ca);
-       }
-       rcu_read_unlock();
-#endif
+       cpuacct_account_field(p, index, tmp);
 }
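
A minimal user-space sketch of the strategy described in the comment above: charge the root group unconditionally first, then walk only the rest of the hierarchy. This mirrors what both the removed open-coded loop and the new cpuacct_account_field() helper do; demo_group and demo_account_field are illustrative stand-ins, not kernel APIs.

#include <stdint.h>
#include <stdio.h>

#define DEMO_NR_STATS 2                         /* e.g. 0 = USER, 1 = SYSTEM */

struct demo_group {
        uint64_t cpustat[DEMO_NR_STATS];
        struct demo_group *parent;              /* NULL for the root group */
};

static struct demo_group demo_root;             /* stands in for the root cgroup */

static void demo_account_field(struct demo_group *grp, int index, uint64_t tmp)
{
        /* The root is always charged, so charge it first and unconditionally. */
        demo_root.cpustat[index] += tmp;

        /* Walk the rest of the hierarchy, stopping before the root. */
        for (; grp && grp != &demo_root; grp = grp->parent)
                grp->cpustat[index] += tmp;
}

int main(void)
{
        struct demo_group parent = { .parent = &demo_root };
        struct demo_group child  = { .parent = &parent };

        demo_account_field(&child, 0, 10);      /* charges child, parent and root */
        printf("root=%llu parent=%llu child=%llu\n",
               (unsigned long long)demo_root.cpustat[0],
               (unsigned long long)parent.cpustat[0],
               (unsigned long long)child.cpustat[0]);
        return 0;
}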
 
 /*
@@ -158,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
        p->utimescaled += cputime_scaled;
        account_group_user_time(p, cputime);
 
-       index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+       index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
        /* Add user time to cpustat. */
        task_group_account_field(p, index, (__force u64) cputime);
@@ -185,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
        p->gtime += cputime;
 
        /* Add guest time to cpustat. */
-       if (TASK_NICE(p) > 0) {
+       if (task_nice(p) > 0) {
                cpustat[CPUTIME_NICE] += (__force u64) cputime;
                cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
        } else {
@@ -274,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
-               u64 steal, st = 0;
+               u64 steal;
+               cputime_t steal_ct;
 
                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
 
-               st = steal_ticks(steal);
-               this_rq()->prev_steal_time += st * TICK_NSEC;
+               /*
+                * cputime_t may be less precise than nsecs (e.g. if it's
+                * based on jiffies). Let's cast the result to cputime
+                * granularity and account the rest on the next rounds.
+                */
+               steal_ct = nsecs_to_cputime(steal);
+               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
 
-               account_steal_time(st);
-               return st;
+               account_steal_time(steal_ct);
+               return steal_ct;
        }
 #endif
        return false;
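
The new comment explains why prev_steal_time advances only by cputime_to_nsecs(steal_ct): whatever part of the steal time does not fill a whole cputime unit stays behind and is picked up on a later tick. A small user-space sketch of that remainder-carrying conversion; demo_rq, demo_account_steal and TICK_NSEC_DEMO are made-up stand-ins for the kernel types and constants.

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC_DEMO 1000000ULL               /* pretend one cputime unit = 1 ms */

struct demo_rq { uint64_t prev_steal_time; };   /* nanoseconds already accounted */

static uint64_t demo_account_steal(struct demo_rq *rq, uint64_t steal_clock_ns)
{
        uint64_t steal = steal_clock_ns - rq->prev_steal_time;
        uint64_t ticks = steal / TICK_NSEC_DEMO;        /* nsecs_to_cputime() analogue */

        /* Only whole ticks are consumed; the remainder is kept for later. */
        rq->prev_steal_time += ticks * TICK_NSEC_DEMO;  /* cputime_to_nsecs() analogue */
        return ticks;
}

int main(void)
{
        struct demo_rq rq = { 0 };

        /* 2.5 ms of steal time: 2 ticks accounted, 0.5 ms carried over. */
        printf("%llu ticks\n", (unsigned long long)demo_account_steal(&rq, 2500000));
        /* 1.1 ms more elapsed; with the 0.5 ms carry that is 1 tick plus 0.6 ms. */
        printf("%llu ticks\n", (unsigned long long)demo_account_steal(&rq, 3600000));
        return 0;
}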
@@ -298,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
        struct signal_struct *sig = tsk->signal;
        cputime_t utime, stime;
        struct task_struct *t;
-
-       times->utime = sig->utime;
-       times->stime = sig->stime;
-       times->sum_exec_runtime = sig->sum_sched_runtime;
+       unsigned int seq, nextseq;
+       unsigned long flags;
 
        rcu_read_lock();
-       /* make sure we can trust tsk->thread_group list */
-       if (!likely(pid_alive(tsk)))
-               goto out;
-
-       t = tsk;
+       /* Attempt a lockless read on the first round. */
+       nextseq = 0;
        do {
-               task_cputime(t, &utime, &stime);
-               times->utime += utime;
-               times->stime += stime;
-               times->sum_exec_runtime += task_sched_runtime(t);
-       } while_each_thread(tsk, t);
-out:
+               seq = nextseq;
+               flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+               times->utime = sig->utime;
+               times->stime = sig->stime;
+               times->sum_exec_runtime = sig->sum_sched_runtime;
+
+               for_each_thread(tsk, t) {
+                       task_cputime(t, &utime, &stime);
+                       times->utime += utime;
+                       times->stime += stime;
+                       times->sum_exec_runtime += task_sched_runtime(t);
+               }
+               /* If lockless access failed, take the lock. */
+               nextseq = 1;
+       } while (need_seqretry(&sig->stats_lock, seq));
+       done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
        rcu_read_unlock();
 }
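
The rewritten loop uses the read_seqbegin_or_lock_irqsave()/need_seqretry() pattern: the first pass reads sig->stats_lock as a plain seqcount, and only when that lockless pass raced with a writer does the retry take the lock. Below is a rough user-space analogue of that "lockless first, locked on retry" control flow, built from C11 atomics and a pthread mutex; demo_read() and demo_write() are illustrative, and the memory-ordering details of the real seqlock primitives are not reproduced.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int seqcount;   /* even = stable, odd = writer active */
static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic unsigned long long utime_ns, stime_ns;

static void demo_write(unsigned long long ut, unsigned long long st)
{
        pthread_mutex_lock(&stats_lock);
        atomic_fetch_add(&seqcount, 1);         /* odd: lockless readers retry */
        atomic_store(&utime_ns, ut);
        atomic_store(&stime_ns, st);
        atomic_fetch_add(&seqcount, 1);         /* even: snapshot is stable again */
        pthread_mutex_unlock(&stats_lock);
}

static void demo_read(unsigned long long *ut, unsigned long long *st)
{
        unsigned int seq = 0;
        int locked = 0;

        for (;;) {
                if (!locked)
                        seq = atomic_load(&seqcount);
                if (locked || !(seq & 1)) {
                        *ut = atomic_load(&utime_ns);
                        *st = atomic_load(&stime_ns);
                        /* The lockless pass is good if no writer slipped in. */
                        if (locked || atomic_load(&seqcount) == seq)
                                break;
                }
                /* First pass raced with a writer: retry once under the lock. */
                pthread_mutex_lock(&stats_lock);
                locked = 1;
        }
        if (locked)
                pthread_mutex_unlock(&stats_lock);
}

int main(void)
{
        unsigned long long ut, st;

        demo_write(100, 200);
        demo_read(&ut, &st);
        printf("utime=%llu stime=%llu\n", ut, st);
        return 0;
}

Compile with -pthread; the single-threaded main() only exercises the lockless fast path.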
 
@@ -342,53 +337,124 @@ out:
  * softirq as those do not count in task exec_runtime any more.
  */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq)
+                                        struct rq *rq, int ticks)
 {
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+       u64 cputime = (__force u64) cputime_one_jiffy;
        u64 *cpustat = kcpustat_this_cpu->cpustat;
 
        if (steal_account_process_tick())
                return;
 
+       cputime *= ticks;
+       scaled *= ticks;
+
        if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_IRQ] += cputime;
        } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_SOFTIRQ] += cputime;
        } else if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
                 * Also, p->stime needs to be updated for ksoftirqd.
                 */
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SOFTIRQ);
+               __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
        } else if (user_tick) {
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_user_time(p, cputime, scaled);
        } else if (p == rq->idle) {
-               account_idle_time(cputime_one_jiffy);
+               account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
-               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_guest_time(p, cputime, scaled);
        } else {
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SYSTEM);
+               __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
        }
 }
 
 static void irqtime_account_idle_ticks(int ticks)
 {
-       int i;
        struct rq *rq = this_rq();
 
-       for (i = 0; i < ticks; i++)
-               irqtime_account_process_tick(current, 0, rq);
+       irqtime_account_process_tick(current, 0, rq, ticks);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static inline void irqtime_account_idle_ticks(int ticks) {}
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq) {}
+                                               struct rq *rq, int nr_ticks) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+       if (is_idle_task(prev))
+               vtime_account_idle(prev);
+       else
+               vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+       vtime_account_user(prev);
+#endif
+       arch_vtime_task_switch(prev);
+}
+#endif
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * give idle time a different meaning (s390 only includes the
+ * time the CPU spends in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_common_account_irq_enter(struct task_struct *tsk)
+{
+       if (!in_interrupt()) {
+               /*
+                * If we interrupted userspace, context_tracking_in_user()
+                * is 1 because context tracking doesn't hook into
+                * irq entry/exit. This way we know whether we need to
+                * flush user time on kernel entry.
+                */
+               if (context_tracking_in_user()) {
+                       vtime_account_user(tsk);
+                       return;
+               }
+
+               if (is_idle_task(tsk)) {
+                       vtime_account_idle(tsk);
+                       return;
+               }
+       }
+       vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       *ut = p->utime;
+       *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       struct task_cputime cputime;
+
+       thread_group_cputime(p, &cputime);
+
+       *ut = cputime.utime;
+       *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -403,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
 
        if (sched_clock_irqtime) {
-               irqtime_account_process_tick(p, user_tick, rq);
+               irqtime_account_process_tick(p, user_tick, rq, 1);
                return;
        }
 
@@ -443,138 +509,138 @@ void account_idle_ticks(unsigned long ticks)
 
        account_idle_time(jiffies_to_cputime(ticks));
 }
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 /*
- * Use precise platform statistics if available:
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * losing precision when the numbers are big.
  */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 {
-       *ut = p->utime;
-       *st = p->stime;
-}
+       u64 scaled;
 
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       struct task_cputime cputime;
+       for (;;) {
+               /* Make sure "rtime" is the bigger of stime/rtime */
+               if (stime > rtime)
+                       swap(rtime, stime);
 
-       thread_group_cputime(p, &cputime);
+               /* Make sure 'total' fits in 32 bits */
+               if (total >> 32)
+                       goto drop_precision;
 
-       *ut = cputime.utime;
-       *st = cputime.stime;
-}
+               /* Does rtime (and thus stime) fit in 32 bits? */
+               if (!(rtime >> 32))
+                       break;
 
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
-       if (!vtime_accounting_enabled())
-               return;
-
-       if (is_idle_task(prev))
-               vtime_account_idle(prev);
-       else
-               vtime_account_system(prev);
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       vtime_account_user(prev);
-#endif
-       arch_vtime_task_switch(prev);
-}
-#endif
-
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
-{
-       if (!vtime_accounting_enabled())
-               return;
+               /* Can we just balance rtime/stime rather than dropping bits? */
+               if (stime >> 31)
+                       goto drop_precision;
 
-       if (!in_interrupt()) {
-               /*
-                * If we interrupted user, context_tracking_in_user()
-                * is 1 because the context tracking don't hook
-                * on irq entry/exit. This way we know if
-                * we need to flush user time on kernel entry.
-                */
-               if (context_tracking_in_user()) {
-                       vtime_account_user(tsk);
-                       return;
-               }
+               /* We can grow stime and shrink rtime and try to make them both fit */
+               stime <<= 1;
+               rtime >>= 1;
+               continue;
 
-               if (is_idle_task(tsk)) {
-                       vtime_account_idle(tsk);
-                       return;
-               }
+drop_precision:
+               /* We drop from rtime, it has more bits than stime */
+               rtime >>= 1;
+               total >>= 1;
        }
-       vtime_account_system(tsk);
-}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-
-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
-{
-       u64 temp = (__force u64) rtime;
-
-       temp *= (__force u64) stime;
 
-       if (sizeof(cputime_t) == 4)
-               temp = div_u64(temp, (__force u32) total);
-       else
-               temp = div64_u64(temp, (__force u64) total);
-
-       return (__force cputime_t) temp;
+       /*
+        * Make sure gcc understands that this is a 32x32->64 multiply,
+        * followed by a 64/32->64 divide.
+        */
+       scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+       return (__force cputime_t) scaled;
 }
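
scale_stime() computes stime * rtime / total without a 128-bit multiply: it rebalances bits between stime and rtime where it can, and otherwise shifts bits out of rtime and total together (preserving their ratio, only losing precision) until a 32x32->64 multiply followed by a 64/32 divide is safe. A standalone user-space transcription of the same loop, with an unsigned __int128 reference value for comparison; demo_scale_stime() is an illustrative name and the 128-bit check assumes GCC or Clang.

#include <stdint.h>
#include <stdio.h>

#define DEMO_SWAP(a, b) do { uint64_t _t = (a); (a) = (b); (b) = _t; } while (0)

static uint64_t demo_scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
        for (;;) {
                /* Keep rtime the larger of the two so stime stays small. */
                if (stime > rtime)
                        DEMO_SWAP(rtime, stime);

                /* total must fit in 32 bits for the final 64/32 divide. */
                if (total >> 32)
                        goto drop_precision;

                /* If rtime (and therefore stime) fits in 32 bits, we are done. */
                if (!(rtime >> 32))
                        break;

                /* Shrink rtime losslessly only if stime still has headroom. */
                if (stime >> 31)
                        goto drop_precision;

                /* Grow stime, shrink rtime: the product stays exactly the same. */
                stime <<= 1;
                rtime >>= 1;
                continue;

drop_precision:
                /* Shift rtime and total together: their ratio is preserved. */
                rtime >>= 1;
                total >>= 1;
        }

        /* 32x32->64 multiply, then 64/32 divide: no overflow possible. */
        return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
        uint64_t stime = 3ULL << 40, rtime = 9ULL << 40, total = 4ULL << 40;

        /* Exact reference, using 128-bit arithmetic (GCC/Clang extension). */
        unsigned __int128 exact = (unsigned __int128)stime * rtime / total;

        printf("scaled = %llu, exact = %llu\n",
               (unsigned long long)demo_scale_stime(stime, rtime, total),
               (unsigned long long)exact);
        return 0;
}

For these power-of-two-friendly inputs every shift discards only zero bits, so the two printed values should coincide; with arbitrary inputs the loop's answer is merely a close approximation of the exact quotient.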
 
 /*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
+ *
+ * Tick based cputime accounting depends on whether the timer happens to
+ * interrupt a task's random scheduling timeslices. Depending on these
+ * circumstances, the number of these interrupts may over- or under-estimate
+ * the real user and system cputime, matching them only with variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-                          struct cputime *prev,
+                          struct prev_cputime *prev,
                           cputime_t *ut, cputime_t *st)
 {
-       cputime_t rtime, stime, total;
+       cputime_t rtime, stime, utime;
+       unsigned long flags;
 
-       stime = curr->stime;
-       total = stime + curr->utime;
+       /* Serialize concurrent callers such that we can honour our guarantees */
+       raw_spin_lock_irqsave(&prev->lock, flags);
+       rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
        /*
-        * Tick based cputime accounting depend on random scheduling
-        * timeslices of a task to be interrupted or not by the timer.
-        * Depending on these circumstances, the number of these interrupts
-        * may be over or under-optimistic, matching the real user and system
-        * cputime with a variable precision.
+        * This is possible under two circumstances:
+        *  - rtime isn't monotonic after all (a bug);
+        *  - we got reordered by the lock.
         *
-        * Fix this by scaling these tick based values against the total
-        * runtime accounted by the CFS scheduler.
+        * In both cases this acts as a filter such that the rest of the code
+        * can assume it is monotonic regardless of anything else.
         */
-       rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+       if (prev->stime + prev->utime >= rtime)
+               goto out;
 
-       if (total)
-               stime = scale_stime(stime, rtime, total);
-       else
+       stime = curr->stime;
+       utime = curr->utime;
+
+       if (utime == 0) {
                stime = rtime;
+               goto update;
+       }
+
+       if (stime == 0) {
+               utime = rtime;
+               goto update;
+       }
+
+       stime = scale_stime((__force u64)stime, (__force u64)rtime,
+                           (__force u64)(stime + utime));
+
+       /*
+        * Make sure stime doesn't go backwards; this preserves monotonicity
+        * for utime because rtime is monotonic.
+        *
+        *  utime_i+1 = rtime_i+1 - stime_i
+        *            = rtime_i+1 - (rtime_i - utime_i)
+        *            = (rtime_i+1 - rtime_i) + utime_i
+        *            >= utime_i
+        */
+       if (stime < prev->stime)
+               stime = prev->stime;
+       utime = rtime - stime;
 
        /*
-        * If the tick based count grows faster than the scheduler one,
-        * the result of the scaling may go backward.
-        * Let's enforce monotonicity.
+        * Make sure utime doesn't go backwards; this still preserves
+        * monotonicity for stime, analogous argument to above.
         */
-       prev->stime = max(prev->stime, stime);
-       prev->utime = max(prev->utime, rtime - prev->stime);
+       if (utime < prev->utime) {
+               utime = prev->utime;
+               stime = rtime - utime;
+       }
 
+update:
+       prev->stime = stime;
+       prev->utime = utime;
+out:
        *ut = prev->utime;
        *st = prev->stime;
+       raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
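
The clamping above enforces the two guarantees stated in the comment block: stime + utime == rtime, and neither stime nor utime ever decreases from one sample to the next. A small user-space walk-through of those rules, with plain integer arithmetic standing in for scale_stime(); demo_adjust() and struct demo_prev are illustrative, not the kernel's structures, and the locking is omitted.

#include <stdint.h>
#include <stdio.h>

struct demo_prev { uint64_t utime, stime; };

static void demo_adjust(struct demo_prev *prev, uint64_t stime, uint64_t utime,
                        uint64_t rtime, uint64_t *ut, uint64_t *st)
{
        /* Filter: the rest of the code may assume rtime moved forward. */
        if (prev->stime + prev->utime >= rtime)
                goto out;

        if (utime == 0) {
                stime = rtime;
        } else if (stime == 0) {
                utime = rtime;
        } else {
                /* Plain 64-bit math stands in for scale_stime() here. */
                stime = stime * rtime / (stime + utime);
                if (stime < prev->stime)        /* never let stime go backwards */
                        stime = prev->stime;
                utime = rtime - stime;
                if (utime < prev->utime) {      /* nor utime */
                        utime = prev->utime;
                        stime = rtime - utime;
                }
        }

        prev->stime = stime;
        prev->utime = utime;
out:
        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct demo_prev prev = { 0, 0 };
        uint64_t ut, st;

        /* 6 system ticks, 2 user ticks, 10 units of precise runtime. */
        demo_adjust(&prev, 6, 2, 10, &ut, &st);
        printf("stime=%llu utime=%llu\n",
               (unsigned long long)st, (unsigned long long)ut);  /* 7 and 3 */

        /* Later sample: ticks now favour user time, runtime grew to 11. */
        demo_adjust(&prev, 6, 6, 11, &ut, &st);
        printf("stime=%llu utime=%llu\n",
               (unsigned long long)st, (unsigned long long)ut);  /* 7 and 4 */
        return 0;
}

The second call shows the clamp at work: naive rescaling would shrink stime from 7 to 5, but the previous value wins and the whole correction goes to utime instead.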
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
@@ -587,9 +653,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
-/*
- * Must be called with siglock held.
- */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
        struct task_cputime cputime;
@@ -597,7 +660,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
        thread_group_cputime(p, &cputime);
        cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 }
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 static unsigned long long vtime_delta(struct task_struct *tsk)
@@ -631,23 +694,17 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        write_seqlock(&tsk->vtime_seqlock);
        __vtime_account_system(tsk);
        write_sequnlock(&tsk->vtime_seqlock);
 }
 
-void vtime_account_irq_exit(struct task_struct *tsk)
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        write_seqlock(&tsk->vtime_seqlock);
+       __vtime_account_system(tsk);
        if (context_tracking_in_user())
                tsk->vtime_snap_whence = VTIME_USER;
-       __vtime_account_system(tsk);
        write_sequnlock(&tsk->vtime_seqlock);
 }
 
@@ -655,12 +712,8 @@ void vtime_account_user(struct task_struct *tsk)
 {
        cputime_t delta_cpu;
 
-       if (!vtime_accounting_enabled())
-               return;
-
-       delta_cpu = get_vtime_delta(tsk);
-
        write_seqlock(&tsk->vtime_seqlock);
+       delta_cpu = get_vtime_delta(tsk);
        tsk->vtime_snap_whence = VTIME_SYS;
        account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
        write_sequnlock(&tsk->vtime_seqlock);
@@ -668,22 +721,27 @@ void vtime_account_user(struct task_struct *tsk)
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        write_seqlock(&tsk->vtime_seqlock);
-       tsk->vtime_snap_whence = VTIME_USER;
        __vtime_account_system(tsk);
+       tsk->vtime_snap_whence = VTIME_USER;
        write_sequnlock(&tsk->vtime_seqlock);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+       /*
+        * The flags must be updated under the lock with
+        * the vtime_snap flush and update.
+        * That enforces a right ordering and update sequence
+        * synchronization against the reader (task_gtime())
+        * that can thus safely catch up with a tickless delta.
+        */
        write_seqlock(&tsk->vtime_seqlock);
        __vtime_account_system(tsk);
        current->flags |= PF_VCPU;
        write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
@@ -692,6 +750,7 @@ void vtime_guest_exit(struct task_struct *tsk)
        current->flags &= ~PF_VCPU;
        write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
@@ -700,11 +759,6 @@ void vtime_account_idle(struct task_struct *tsk)
        account_idle_time(delta_cpu);
 }
 
-bool vtime_accounting_enabled(void)
-{
-       return context_tracking_active();
-}
-
 void arch_vtime_task_switch(struct task_struct *prev)
 {
        write_seqlock(&prev->vtime_seqlock);
@@ -713,17 +767,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
        write_seqlock(&current->vtime_seqlock);
        current->vtime_snap_whence = VTIME_SYS;
-       current->vtime_snap = sched_clock();
+       current->vtime_snap = sched_clock_cpu(smp_processor_id());
        write_sequnlock(&current->vtime_seqlock);
 }
 
-void vtime_init_idle(struct task_struct *t)
+void vtime_init_idle(struct task_struct *t, int cpu)
 {
        unsigned long flags;
 
        write_seqlock_irqsave(&t->vtime_seqlock, flags);
        t->vtime_snap_whence = VTIME_SYS;
-       t->vtime_snap = sched_clock();
+       t->vtime_snap = sched_clock_cpu(cpu);
        write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
 }