root/drivers/cpuidle/governors/teo.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. teo_update
  2. teo_find_shallower_state
  3. teo_select
  4. teo_reflect
  5. teo_enable_device
  6. teo_governor_init

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Timer events oriented CPU idle governor
   4  *
   5  * Copyright (C) 2018 Intel Corporation
   6  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
   7  *
   8  * The idea of this governor is based on the observation that on many systems
   9  * timer events are two or more orders of magnitude more frequent than any
  10  * other interrupts, so they are likely to be the most significant source of CPU
  11  * wakeups from idle states.  Moreover, information about what happened in the
  12  * (relatively recent) past can be used to estimate whether or not the deepest
  13  * idle state with target residency within the time to the closest timer is
  14  * likely to be suitable for the upcoming idle time of the CPU and, if not, then
  15  * which of the shallower idle states to choose.
  16  *
  17  * Of course, non-timer wakeup sources are more important in some use cases and
  18  * they can be covered by taking a few most recent idle time intervals of the
  19  * CPU into account.  However, even in that case it is not necessary to consider
  20  * idle duration values greater than the time till the closest timer, as the
  21  * patterns that they may belong to produce average values close enough to
  22  * the time till the closest timer (sleep length) anyway.
  23  *
  24  * Thus this governor estimates whether or not the upcoming idle time of the CPU
  25  * is likely to be significantly shorter than the sleep length and selects an
  26  * idle state for it in accordance with that, as follows:
  27  *
  28  * - Find an idle state on the basis of the sleep length and state statistics
  29  *   collected over time:
  30  *
  31  *   o Find the deepest idle state whose target residency is less than or equal
  32  *     to the sleep length.
  33  *
  34  *   o Select it if it matched both the sleep length and the observed idle
  35  *     duration in the past more often than it matched the sleep length alone
  36  *     (i.e. the observed idle duration was significantly shorter than the sleep
  37  *     length matched by it).
  38  *
  39  *   o Otherwise, select the shallower state with the greatest matched "early"
  40  *     wakeups metric.
  41  *
  42  * - If the majority of the most recent idle duration values are below the
  43  *   target residency of the idle state selected so far, use those values to
  44  *   compute the new expected idle duration and find an idle state matching it
  45  *   (which has to be shallower than the one selected so far).
  46  */
  47 
  48 #include <linux/cpuidle.h>
  49 #include <linux/jiffies.h>
  50 #include <linux/kernel.h>
  51 #include <linux/sched/clock.h>
  52 #include <linux/tick.h>
  53 
  54 /*
  55  * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
  56  * is used for decreasing metrics on a regular basis.
  57  */
  58 #define PULSE           1024
  59 #define DECAY_SHIFT     3
  60 
  61 /*
  62  * Number of the most recent idle duration values to take into consideration for
  63  * the detection of wakeup patterns.
  64  */
  65 #define INTERVALS       8
  66 
  67 /**
  68  * struct teo_idle_state - Idle state data used by the TEO cpuidle governor.
  69  * @early_hits: "Early" CPU wakeups "matching" this state.
  70  * @hits: "On time" CPU wakeups "matching" this state.
  71  * @misses: CPU wakeups "missing" this state.
  72  *
  73  * A CPU wakeup is "matched" by a given idle state if the idle duration measured
  74  * after the wakeup is between the target residency of that state and the target
  75  * residency of the next one (or if this is the deepest available idle state, it
  76  * "matches" a CPU wakeup when the measured idle duration is at least equal to
  77  * its target residency).
  78  *
  79  * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if
  80  * it occurs significantly earlier than the closest expected timer event (that
  81  * is, early enough to match an idle state shallower than the one matching the
  82  * time till the closest timer event).  Otherwise, the wakeup is "on time", or
  83  * it is a "hit".
  84  *
  85  * A "miss" occurs when the given state doesn't match the wakeup, but it matches
  86  * the time till the closest timer event used for idle state selection.
  87  */
  88 struct teo_idle_state {
  89         unsigned int early_hits;
  90         unsigned int hits;
  91         unsigned int misses;
  92 };
  93 
  94 /**
  95  * struct teo_cpu - CPU data used by the TEO cpuidle governor.
  96  * @time_span_ns: Time between idle state selection and post-wakeup update.
  97  * @sleep_length_ns: Time till the closest timer event (at the selection time).
  98  * @states: Idle states data corresponding to this CPU.
  99  * @interval_idx: Index of the most recent saved idle interval.
 100  * @intervals: Saved idle duration values.
 101  */
 102 struct teo_cpu {
 103         u64 time_span_ns;
 104         u64 sleep_length_ns;
 105         struct teo_idle_state states[CPUIDLE_STATE_MAX];
 106         int interval_idx;
 107         unsigned int intervals[INTERVALS];
 108 };
 109 
 110 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
 111 
 112 /**
 113  * teo_update - Update CPU data after wakeup.
 114  * @drv: cpuidle driver containing state data.
 115  * @dev: Target CPU.
 116  */
 117 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 118 {
 119         struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 120         unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns);
 121         int i, idx_hit = -1, idx_timer = -1;
 122         unsigned int measured_us;
 123 
 124         if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
 125                 /*
 126                  * One of the safety nets has triggered or the wakeup was close
 127                  * enough to the closest timer event expected at the idle state
 128                  * selection time to be discarded.
 129                  */
 130                 measured_us = UINT_MAX;
 131         } else {
 132                 unsigned int lat;
 133 
 134                 lat = drv->states[dev->last_state_idx].exit_latency;
 135 
 136                 measured_us = ktime_to_us(cpu_data->time_span_ns);
 137                 /*
 138                  * The delay between the wakeup and the first instruction
 139                  * executed by the CPU is not likely to be worst-case every
 140                  * time, so take 1/2 of the exit latency as a very rough
 141                  * approximation of the average of it.
 142                  */
 143                 if (measured_us >= lat)
 144                         measured_us -= lat / 2;
 145                 else
 146                         measured_us /= 2;
 147         }
 148 
 149         /*
 150          * Decay the "early hits" metric for all of the states and find the
 151          * states matching the sleep length and the measured idle duration.
 152          */
 153         for (i = 0; i < drv->state_count; i++) {
 154                 unsigned int early_hits = cpu_data->states[i].early_hits;
 155 
 156                 cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT;
 157 
 158                 if (drv->states[i].target_residency <= sleep_length_us) {
 159                         idx_timer = i;
 160                         if (drv->states[i].target_residency <= measured_us)
 161                                 idx_hit = i;
 162                 }
 163         }
 164 
 165         /*
 166          * Update the "hits" and "misses" data for the state matching the sleep
 167          * length.  If it matches the measured idle duration too, this is a hit,
 168          * so increase the "hits" metric for it then.  Otherwise, this is a
 169          * miss, so increase the "misses" metric for it.  In the latter case
 170          * also increase the "early hits" metric for the state that actually
 171          * matches the measured idle duration.
 172          */
 173         if (idx_timer >= 0) {
 174                 unsigned int hits = cpu_data->states[idx_timer].hits;
 175                 unsigned int misses = cpu_data->states[idx_timer].misses;
 176 
 177                 hits -= hits >> DECAY_SHIFT;
 178                 misses -= misses >> DECAY_SHIFT;
 179 
 180                 if (idx_timer > idx_hit) {
 181                         misses += PULSE;
 182                         if (idx_hit >= 0)
 183                                 cpu_data->states[idx_hit].early_hits += PULSE;
 184                 } else {
 185                         hits += PULSE;
 186                 }
 187 
 188                 cpu_data->states[idx_timer].misses = misses;
 189                 cpu_data->states[idx_timer].hits = hits;
 190         }
 191 
 192         /*
 193          * Save idle duration values corresponding to non-timer wakeups for
 194          * pattern detection.
 195          */
 196         cpu_data->intervals[cpu_data->interval_idx++] = measured_us;
 197         if (cpu_data->interval_idx >= INTERVALS)
 198                 cpu_data->interval_idx = 0;
 199 }
 200 
 201 /**
 202  * teo_find_shallower_state - Find shallower idle state matching given duration.
 203  * @drv: cpuidle driver containing state data.
 204  * @dev: Target CPU.
 205  * @state_idx: Index of the capping idle state.
 206  * @duration_us: Idle duration value to match.
 207  */
 208 static int teo_find_shallower_state(struct cpuidle_driver *drv,
 209                                     struct cpuidle_device *dev, int state_idx,
 210                                     unsigned int duration_us)
 211 {
 212         int i;
 213 
 214         for (i = state_idx - 1; i >= 0; i--) {
 215                 if (drv->states[i].disabled || dev->states_usage[i].disable)
 216                         continue;
 217 
 218                 state_idx = i;
 219                 if (drv->states[i].target_residency <= duration_us)
 220                         break;
 221         }
 222         return state_idx;
 223 }
 224 
 225 /**
 226  * teo_select - Selects the next idle state to enter.
 227  * @drv: cpuidle driver containing state data.
 228  * @dev: Target CPU.
 229  * @stop_tick: Indication on whether or not to stop the scheduler tick.
 230  */
 231 static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 232                       bool *stop_tick)
 233 {
 234         struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 235         int latency_req = cpuidle_governor_latency_req(dev->cpu);
 236         unsigned int duration_us, hits, misses, early_hits;
 237         int max_early_idx, prev_max_early_idx, constraint_idx, idx, i;
 238         ktime_t delta_tick;
 239 
 240         if (dev->last_state_idx >= 0) {
 241                 teo_update(drv, dev);
 242                 dev->last_state_idx = -1;
 243         }
 244 
 245         cpu_data->time_span_ns = local_clock();
 246 
 247         cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick);
 248         duration_us = ktime_to_us(cpu_data->sleep_length_ns);
 249 
 250         hits = 0;
 251         misses = 0;
 252         early_hits = 0;
 253         max_early_idx = -1;
 254         prev_max_early_idx = -1;
 255         constraint_idx = drv->state_count;
 256         idx = -1;
 257 
 258         for (i = 0; i < drv->state_count; i++) {
 259                 struct cpuidle_state *s = &drv->states[i];
 260                 struct cpuidle_state_usage *su = &dev->states_usage[i];
 261 
 262                 if (s->disabled || su->disable) {
 263                         /*
 264                          * Ignore disabled states with target residencies beyond
 265                          * the anticipated idle duration.
 266                          */
 267                         if (s->target_residency > duration_us)
 268                                 continue;
 269 
 270                         /*
 271                          * This state is disabled, so the range of idle duration
 272                          * values corresponding to it is covered by the current
 273                          * candidate state, but still the "hits" and "misses"
 274                          * metrics of the disabled state need to be used to
 275                          * decide whether or not the state covering the range in
 276                          * question is good enough.
 277                          */
 278                         hits = cpu_data->states[i].hits;
 279                         misses = cpu_data->states[i].misses;
 280 
 281                         if (early_hits >= cpu_data->states[i].early_hits ||
 282                             idx < 0)
 283                                 continue;
 284 
 285                         /*
 286                          * If the current candidate state has been the one with
 287                          * the maximum "early hits" metric so far, the "early
 288                          * hits" metric of the disabled state replaces the
 289                          * current "early hits" count to avoid selecting a
 290                          * deeper state with lower "early hits" metric.
 291                          */
 292                         if (max_early_idx == idx) {
 293                                 early_hits = cpu_data->states[i].early_hits;
 294                                 continue;
 295                         }
 296 
 297                         /*
 298                          * The current candidate state is closer to the disabled
 299                          * one than the current maximum "early hits" state, so
 300                          * replace the latter with it, but in case the maximum
 301                          * "early hits" state index has not been set so far,
 302                          * check if the current candidate state is not too
 303                          * shallow for that role.
 304                          */
 305                         if (!(tick_nohz_tick_stopped() &&
 306                               drv->states[idx].target_residency < TICK_USEC)) {
 307                                 prev_max_early_idx = max_early_idx;
 308                                 early_hits = cpu_data->states[i].early_hits;
 309                                 max_early_idx = idx;
 310                         }
 311 
 312                         continue;
 313                 }
 314 
 315                 if (idx < 0) {
 316                         idx = i; /* first enabled state */
 317                         hits = cpu_data->states[i].hits;
 318                         misses = cpu_data->states[i].misses;
 319                 }
 320 
 321                 if (s->target_residency > duration_us)
 322                         break;
 323 
 324                 if (s->exit_latency > latency_req && constraint_idx > i)
 325                         constraint_idx = i;
 326 
 327                 idx = i;
 328                 hits = cpu_data->states[i].hits;
 329                 misses = cpu_data->states[i].misses;
 330 
 331                 if (early_hits < cpu_data->states[i].early_hits &&
 332                     !(tick_nohz_tick_stopped() &&
 333                       drv->states[i].target_residency < TICK_USEC)) {
 334                         prev_max_early_idx = max_early_idx;
 335                         early_hits = cpu_data->states[i].early_hits;
 336                         max_early_idx = i;
 337                 }
 338         }
 339 
 340         /*
 341          * If the "hits" metric of the idle state matching the sleep length is
 342          * greater than its "misses" metric, that is the one to use.  Otherwise,
 343          * it is more likely that one of the shallower states will match the
 344          * idle duration observed after wakeup, so take the one with the maximum
 345          * "early hits" metric, but if that cannot be determined, just use the
 346          * state selected so far.
 347          */
 348         if (hits <= misses) {
 349                 /*
 350                  * The current candidate state is not suitable, so take the one
 351                  * whose "early hits" metric is the maximum for the range of
 352                  * shallower states.
 353                  */
 354                 if (idx == max_early_idx)
 355                         max_early_idx = prev_max_early_idx;
 356 
 357                 if (max_early_idx >= 0) {
 358                         idx = max_early_idx;
 359                         duration_us = drv->states[idx].target_residency;
 360                 }
 361         }
 362 
 363         /*
 364          * If there is a latency constraint, it may be necessary to use a
 365          * shallower idle state than the one selected so far.
 366          */
 367         if (constraint_idx < idx)
 368                 idx = constraint_idx;
 369 
 370         if (idx < 0) {
 371                 idx = 0; /* No states enabled. Must use 0. */
 372         } else if (idx > 0) {
 373                 unsigned int count = 0;
 374                 u64 sum = 0;
 375 
 376                 /*
 377                  * Count and sum the most recent idle duration values less than
 378                  * the current expected idle duration value.
 379                  */
 380                 for (i = 0; i < INTERVALS; i++) {
 381                         unsigned int val = cpu_data->intervals[i];
 382 
 383                         if (val >= duration_us)
 384                                 continue;
 385 
 386                         count++;
 387                         sum += val;
 388                 }
 389 
 390                 /*
 391                  * Give up unless the majority of the most recent idle duration
 392                  * values are in the interesting range.
 393                  */
 394                 if (count > INTERVALS / 2) {
 395                         unsigned int avg_us = div64_u64(sum, count);
 396 
 397                         /*
 398                          * Avoid spending too much time in an idle state that
 399                          * would be too shallow.
 400                          */
 401                         if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) {
 402                                 duration_us = avg_us;
 403                                 if (drv->states[idx].target_residency > avg_us)
 404                                         idx = teo_find_shallower_state(drv, dev,
 405                                                                        idx, avg_us);
 406                         }
 407                 }
 408         }
 409 
 410         /*
 411          * Don't stop the tick if the selected state is a polling one or if the
 412          * expected idle duration is shorter than the tick period length.
 413          */
 414         if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
 415             duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
 416                 unsigned int delta_tick_us = ktime_to_us(delta_tick);
 417 
 418                 *stop_tick = false;
 419 
 420                 /*
 421                  * The tick is not going to be stopped, so if the target
 422                  * residency of the state to be returned is not within the time
 423                  * till the closest timer including the tick, try to correct
 424                  * that.
 425                  */
 426                 if (idx > 0 && drv->states[idx].target_residency > delta_tick_us)
 427                         idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us);
 428         }
 429 
 430         return idx;
 431 }
 432 
 433 /**
 434  * teo_reflect - Note that governor data for the CPU need to be updated.
 435  * @dev: Target CPU.
 436  * @state: Entered state.
 437  */
 438 static void teo_reflect(struct cpuidle_device *dev, int state)
 439 {
 440         struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 441 
 442         dev->last_state_idx = state;
 443         /*
 444          * If the wakeup was not "natural", but triggered by one of the safety
 445          * nets, assume that the CPU might have been idle for the entire sleep
 446          * length time.
 447          */
 448         if (dev->poll_time_limit ||
 449             (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
 450                 dev->poll_time_limit = false;
 451                 cpu_data->time_span_ns = cpu_data->sleep_length_ns;
 452         } else {
 453                 cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
 454         }
 455 }
 456 
 457 /**
 458  * teo_enable_device - Initialize the governor's data for the target CPU.
 459  * @drv: cpuidle driver (not used).
 460  * @dev: Target CPU.
 461  */
 462 static int teo_enable_device(struct cpuidle_driver *drv,
 463                              struct cpuidle_device *dev)
 464 {
 465         struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
 466         int i;
 467 
 468         memset(cpu_data, 0, sizeof(*cpu_data));
 469 
 470         for (i = 0; i < INTERVALS; i++)
 471                 cpu_data->intervals[i] = UINT_MAX;
 472 
 473         return 0;
 474 }
 475 
 476 static struct cpuidle_governor teo_governor = {
 477         .name =         "teo",
 478         .rating =       19,
 479         .enable =       teo_enable_device,
 480         .select =       teo_select,
 481         .reflect =      teo_reflect,
 482 };
 483 
 484 static int __init teo_governor_init(void)
 485 {
 486         return cpuidle_register_governor(&teo_governor);
 487 }
 488 
 489 postcore_initcall(teo_governor_init);

/* [<][>][^][v][top][bottom][index][help] */