Merge tag 'v3.18' into p/abusse/merge_upgrade
[projects/modsched/linux.git] / kernel / sched / cfs / cputime.c
index a7959e0..8394b1e 100644 (file)
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
         * is the only cgroup, then nothing else should be necessary.
         *
         */
-       __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+       __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
        cpuacct_account_field(p, index, tmp);
 }
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
        p->utimescaled += cputime_scaled;
        account_group_user_time(p, cputime);
 
-       index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+       index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
        /* Add user time to cpustat. */
        task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
        p->gtime += cputime;
 
        /* Add guest time to cpustat. */
-       if (TASK_NICE(p) > 0) {
+       if (task_nice(p) > 0) {
                cpustat[CPUTIME_NICE] += (__force u64) cputime;
                cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
        } else {
@@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
-               u64 steal, st = 0;
+               u64 steal;
+               cputime_t steal_ct;
 
                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
 
-               st = steal_ticks(steal);
-               this_rq()->prev_steal_time += st * TICK_NSEC;
+               /*
+                * cputime_t may be less precise than nsecs (eg: if it's
+                * based on jiffies). Lets cast the result to cputime
+                * granularity and account the rest on the next rounds.
+                */
+               steal_ct = nsecs_to_cputime(steal);
+               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
 
-               account_steal_time(st);
-               return st;
+               account_steal_time(steal_ct);
+               return steal_ct;
        }
 #endif
        return false;
@@ -282,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
        struct signal_struct *sig = tsk->signal;
        cputime_t utime, stime;
        struct task_struct *t;
-
-       times->utime = sig->utime;
-       times->stime = sig->stime;
-       times->sum_exec_runtime = sig->sum_sched_runtime;
+       unsigned int seq, nextseq;
+       unsigned long flags;
 
        rcu_read_lock();
-       /* make sure we can trust tsk->thread_group list */
-       if (!likely(pid_alive(tsk)))
-               goto out;
-
-       t = tsk;
+       /* Attempt a lockless read on the first round. */
+       nextseq = 0;
        do {
-               task_cputime(t, &utime, &stime);
-               times->utime += utime;
-               times->stime += stime;
-               times->sum_exec_runtime += task_sched_runtime(t);
-       } while_each_thread(tsk, t);
-out:
+               seq = nextseq;
+               flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+               times->utime = sig->utime;
+               times->stime = sig->stime;
+               times->sum_exec_runtime = sig->sum_sched_runtime;
+
+               for_each_thread(tsk, t) {
+                       task_cputime(t, &utime, &stime);
+                       times->utime += utime;
+                       times->stime += stime;
+                       times->sum_exec_runtime += task_sched_runtime(t);
+               }
+               /* If lockless access failed, take the lock. */
+               nextseq = 1;
+       } while (need_seqretry(&sig->stats_lock, seq));
+       done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
        rcu_read_unlock();
 }
 
@@ -326,50 +337,50 @@ out:
  * softirq as those do not count in task exec_runtime any more.
  */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq)
+                                        struct rq *rq, int ticks)
 {
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+       u64 cputime = (__force u64) cputime_one_jiffy;
        u64 *cpustat = kcpustat_this_cpu->cpustat;
 
        if (steal_account_process_tick())
                return;
 
+       cputime *= ticks;
+       scaled *= ticks;
+
        if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_IRQ] += cputime;
        } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_SOFTIRQ] += cputime;
        } else if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
                 * Also, p->stime needs to be updated for ksoftirqd.
                 */
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SOFTIRQ);
+               __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
        } else if (user_tick) {
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_user_time(p, cputimescaled);
        } else if (p == rq->idle) {
-               account_idle_time(cputime_one_jiffy);
+               account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
-               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_guest_time(p, cputimescaled);
        } else {
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SYSTEM);
+               __account_system_time(p, cputime, scaled,       CPUTIME_SYSTEM);
        }
 }
 
 static void irqtime_account_idle_ticks(int ticks)
 {
-       int i;
        struct rq *rq = this_rq();
 
-       for (i = 0; i < ticks; i++)
-               irqtime_account_process_tick(current, 0, rq);
+       irqtime_account_process_tick(current, 0, rq, ticks);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static inline void irqtime_account_idle_ticks(int ticks) {}
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq) {}
+                                               struct rq *rq, int nr_ticks) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 /*
@@ -378,11 +389,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
+void vtime_common_task_switch(struct task_struct *prev)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        if (is_idle_task(prev))
                vtime_account_idle(prev);
        else
@@ -404,11 +412,8 @@ void vtime_task_switch(struct task_struct *prev)
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_common_account_irq_enter(struct task_struct *tsk)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        if (!in_interrupt()) {
                /*
                 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +433,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
        }
        vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
@@ -464,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
 
        if (sched_clock_irqtime) {
-               irqtime_account_process_tick(p, user_tick, rq);
+               irqtime_account_process_tick(p, user_tick, rq, 1);
                return;
        }
 
@@ -549,6 +554,23 @@ drop_precision:
        return (__force cputime_t) scaled;
 }
 
+/*
+ * Atomically advance counter to the new value. Interrupts, vcpu
+ * scheduling, and scaling inaccuracies can cause cputime_advance
+ * to be occasionally called with a new value smaller than counter.
+ * Let's enforce atomicity.
+ *
+ * Normally a caller will only go through this loop once, or not
+ * at all in case a previous caller updated counter the same jiffy.
+ */
+static void cputime_advance(cputime_t *counter, cputime_t new)
+{
+       cputime_t old;
+
+       while (new > (old = ACCESS_ONCE(*counter)))
+               cmpxchg_cputime(counter, old, new);
+}
+
 /*
  * Adjust tick based cputime random precision against scheduler
  * runtime accounting.
@@ -557,16 +579,7 @@ static void cputime_adjust(struct task_cputime *curr,
                           struct cputime *prev,
                           cputime_t *ut, cputime_t *st)
 {
-       cputime_t rtime, stime, utime, total;
-
-       if (vtime_accounting_enabled()) {
-               *ut = curr->utime;
-               *st = curr->stime;
-               return;
-       }
-
-       stime = curr->stime;
-       total = stime + curr->utime;
+       cputime_t rtime, stime, utime;
 
        /*
         * Tick based cputime accounting depend on random scheduling
@@ -588,22 +601,23 @@ static void cputime_adjust(struct task_cputime *curr,
        if (prev->stime + prev->utime >= rtime)
                goto out;
 
-       if (total) {
+       stime = curr->stime;
+       utime = curr->utime;
+
+       if (utime == 0) {
+               stime = rtime;
+       } else if (stime == 0) {
+               utime = rtime;
+       } else {
+               cputime_t total = stime + utime;
+
                stime = scale_stime((__force u64)stime,
                                    (__force u64)rtime, (__force u64)total);
                utime = rtime - stime;
-       } else {
-               stime = rtime;
-               utime = 0;
        }
 
-       /*
-        * If the tick based count grows faster than the scheduler one,
-        * the result of the scaling may go backward.
-        * Let's enforce monotonicity.
-        */
-       prev->stime = max(prev->stime, stime);
-       prev->utime = max(prev->utime, utime);
+       cputime_advance(&prev->stime, stime);
+       cputime_advance(&prev->utime, utime);
 
 out:
        *ut = prev->utime;
@@ -620,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
-/*
- * Must be called with siglock held.
- */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
        struct task_cputime cputime;
@@ -664,23 +675,17 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        write_seqlock(&tsk->vtime_seqlock);
        __vtime_account_system(tsk);
        write_sequnlock(&tsk->vtime_seqlock);
 }
 
-void vtime_account_irq_exit(struct task_struct *tsk)
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        write_seqlock(&tsk->vtime_seqlock);
+       __vtime_account_system(tsk);
        if (context_tracking_in_user())
                tsk->vtime_snap_whence = VTIME_USER;
-       __vtime_account_system(tsk);
        write_sequnlock(&tsk->vtime_seqlock);
 }
 
@@ -688,12 +693,8 @@ void vtime_account_user(struct task_struct *tsk)
 {
        cputime_t delta_cpu;
 
-       if (!vtime_accounting_enabled())
-               return;
-
-       delta_cpu = get_vtime_delta(tsk);
-
        write_seqlock(&tsk->vtime_seqlock);
+       delta_cpu = get_vtime_delta(tsk);
        tsk->vtime_snap_whence = VTIME_SYS;
        account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
        write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +702,27 @@ void vtime_account_user(struct task_struct *tsk)
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-       if (!vtime_accounting_enabled())
-               return;
-
        write_seqlock(&tsk->vtime_seqlock);
-       tsk->vtime_snap_whence = VTIME_USER;
        __vtime_account_system(tsk);
+       tsk->vtime_snap_whence = VTIME_USER;
        write_sequnlock(&tsk->vtime_seqlock);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+       /*
+        * The flags must be updated under the lock with
+        * the vtime_snap flush and update.
+        * That enforces a right ordering and update sequence
+        * synchronization against the reader (task_gtime())
+        * that can thus safely catch up with a tickless delta.
+        */
        write_seqlock(&tsk->vtime_seqlock);
        __vtime_account_system(tsk);
        current->flags |= PF_VCPU;
        write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
@@ -725,6 +731,7 @@ void vtime_guest_exit(struct task_struct *tsk)
        current->flags &= ~PF_VCPU;
        write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
@@ -733,11 +740,6 @@ void vtime_account_idle(struct task_struct *tsk)
        account_idle_time(delta_cpu);
 }
 
-bool vtime_accounting_enabled(void)
-{
-       return context_tracking_active();
-}
-
 void arch_vtime_task_switch(struct task_struct *prev)
 {
        write_seqlock(&prev->vtime_seqlock);