Merge tag 'v4.3' into p/abusse/merge_upgrade
[projects/modsched/linux.git] / kernel / sched / cfs / cputime.c
index 9994791..8cbc3db 100644 (file)
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
        p->utimescaled += cputime_scaled;
        account_group_user_time(p, cputime);
 
-       index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+       index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
        /* Add user time to cpustat. */
        task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
        p->gtime += cputime;
 
        /* Add guest time to cpustat. */
-       if (TASK_NICE(p) > 0) {
+       if (task_nice(p) > 0) {
                cpustat[CPUTIME_NICE] += (__force u64) cputime;
                cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
        } else {
@@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
-               u64 steal, st = 0;
+               u64 steal;
+               cputime_t steal_ct;
 
                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
 
-               st = steal_ticks(steal);
-               this_rq()->prev_steal_time += st * TICK_NSEC;
+               /*
+                * cputime_t may be less precise than nsecs (eg: if it's
+                * based on jiffies). Lets cast the result to cputime
+                * granularity and account the rest on the next rounds.
+                */
+               steal_ct = nsecs_to_cputime(steal);
+               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
 
-               account_steal_time(st);
-               return st;
+               account_steal_time(steal_ct);
+               return steal_ct;
        }
 #endif
        return false;
@@ -282,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
        struct signal_struct *sig = tsk->signal;
        cputime_t utime, stime;
        struct task_struct *t;
-
-       times->utime = sig->utime;
-       times->stime = sig->stime;
-       times->sum_exec_runtime = sig->sum_sched_runtime;
+       unsigned int seq, nextseq;
+       unsigned long flags;
 
        rcu_read_lock();
-       /* make sure we can trust tsk->thread_group list */
-       if (!likely(pid_alive(tsk)))
-               goto out;
-
-       t = tsk;
+       /* Attempt a lockless read on the first round. */
+       nextseq = 0;
        do {
-               task_cputime(t, &utime, &stime);
-               times->utime += utime;
-               times->stime += stime;
-               times->sum_exec_runtime += task_sched_runtime(t);
-       } while_each_thread(tsk, t);
-out:
+               seq = nextseq;
+               flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+               times->utime = sig->utime;
+               times->stime = sig->stime;
+               times->sum_exec_runtime = sig->sum_sched_runtime;
+
+               for_each_thread(tsk, t) {
+                       task_cputime(t, &utime, &stime);
+                       times->utime += utime;
+                       times->stime += stime;
+                       times->sum_exec_runtime += task_sched_runtime(t);
+               }
+               /* If lockless access failed, take the lock. */
+               nextseq = 1;
+       } while (need_seqretry(&sig->stats_lock, seq));
+       done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
        rcu_read_unlock();
 }
 
@@ -326,50 +337,50 @@ out:
  * softirq as those do not count in task exec_runtime any more.
  */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq)
+                                        struct rq *rq, int ticks)
 {
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+       u64 cputime = (__force u64) cputime_one_jiffy;
        u64 *cpustat = kcpustat_this_cpu->cpustat;
 
        if (steal_account_process_tick())
                return;
 
+       cputime *= ticks;
+       scaled *= ticks;
+
        if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_IRQ] += cputime;
        } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_SOFTIRQ] += cputime;
        } else if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
                 * Also, p->stime needs to be updated for ksoftirqd.
                 */
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SOFTIRQ);
+               __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
        } else if (user_tick) {
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_user_time(p, cputimescaled);
        } else if (p == rq->idle) {
-               account_idle_time(cputime_one_jiffy);
+               account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
-               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_guest_time(p, cputimescaled);
        } else {
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SYSTEM);
+               __account_system_time(p, cputime, scaled,       CPUTIME_SYSTEM);
        }
 }
 
 static void irqtime_account_idle_ticks(int ticks)
 {
-       int i;
        struct rq *rq = this_rq();
 
-       for (i = 0; i < ticks; i++)
-               irqtime_account_process_tick(current, 0, rq);
+       irqtime_account_process_tick(current, 0, rq, ticks);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static inline void irqtime_account_idle_ticks(int ticks) {}
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq) {}
+                                               struct rq *rq, int nr_ticks) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 /*
@@ -458,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
 
        if (sched_clock_irqtime) {
-               irqtime_account_process_tick(p, user_tick, rq);
+               irqtime_account_process_tick(p, user_tick, rq, 1);
                return;
        }
 
@@ -544,31 +555,43 @@ drop_precision:
 }
 
 /*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
+ *
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer.  Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-                          struct cputime *prev,
+                          struct prev_cputime *prev,
                           cputime_t *ut, cputime_t *st)
 {
        cputime_t rtime, stime, utime;
+       unsigned long flags;
 
-       /*
-        * Tick based cputime accounting depend on random scheduling
-        * timeslices of a task to be interrupted or not by the timer.
-        * Depending on these circumstances, the number of these interrupts
-        * may be over or under-optimistic, matching the real user and system
-        * cputime with a variable precision.
-        *
-        * Fix this by scaling these tick based values against the total
-        * runtime accounted by the CFS scheduler.
-        */
+       /* Serialize concurrent callers such that we can honour our guarantees */
+       raw_spin_lock_irqsave(&prev->lock, flags);
        rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
        /*
-        * Update userspace visible utime/stime values only if actual execution
-        * time is bigger than already exported. Note that can happen, that we
-        * provided bigger values due to scaling inaccuracy on big numbers.
+        * This is possible under two circumstances:
+        *  - rtime isn't monotonic after all (a bug);
+        *  - we got reordered by the lock.
+        *
+        * In both cases this acts as a filter such that the rest of the code
+        * can assume it is monotonic regardless of anything else.
         */
        if (prev->stime + prev->utime >= rtime)
                goto out;
@@ -578,27 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
 
        if (utime == 0) {
                stime = rtime;
-       } else if (stime == 0) {
-               utime = rtime;
-       } else {
-               cputime_t total = stime + utime;
+               goto update;
+       }
 
-               stime = scale_stime((__force u64)stime,
-                                   (__force u64)rtime, (__force u64)total);
-               utime = rtime - stime;
+       if (stime == 0) {
+               utime = rtime;
+               goto update;
        }
 
+       stime = scale_stime((__force u64)stime, (__force u64)rtime,
+                           (__force u64)(stime + utime));
+
+       /*
+        * Make sure stime doesn't go backwards; this preserves monotonicity
+        * for utime because rtime is monotonic.
+        *
+        *  utime_i+1 = rtime_i+1 - stime_i
+        *            = rtime_i+1 - (rtime_i - utime_i)
+        *            = (rtime_i+1 - rtime_i) + utime_i
+        *            >= utime_i
+        */
+       if (stime < prev->stime)
+               stime = prev->stime;
+       utime = rtime - stime;
+
        /*
-        * If the tick based count grows faster than the scheduler one,
-        * the result of the scaling may go backward.
-        * Let's enforce monotonicity.
+        * Make sure utime doesn't go backwards; this still preserves
+        * monotonicity for stime, analogous argument to above.
         */
-       prev->stime = max(prev->stime, stime);
-       prev->utime = max(prev->utime, utime);
+       if (utime < prev->utime) {
+               utime = prev->utime;
+               stime = rtime - utime;
+       }
 
+update:
+       prev->stime = stime;
+       prev->utime = utime;
 out:
        *ut = prev->utime;
        *st = prev->stime;
+       raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
@@ -611,9 +653,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
-/*
- * Must be called with siglock held.
- */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
        struct task_cputime cputime;