root/drivers/cpufreq/powernv-cpufreq.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. extract_pstate
  2. idx_to_pstate
  3. pstate_to_idx
  4. reset_gpstates
  5. init_powernv_pstates
  6. pstate_id_to_freq
  7. cpuinfo_nominal_freq_show
  8. get_pmspr
  9. set_pmspr
  10. powernv_read_cpu_freq
  11. powernv_cpufreq_get
  12. set_pstate
  13. get_nominal_index
  14. powernv_cpufreq_throttle_check
  15. calc_global_pstate
  16. queue_gpstate_timer
  17. gpstate_timer_handler
  18. powernv_cpufreq_target_index
  19. powernv_cpufreq_cpu_init
  20. powernv_cpufreq_cpu_exit
  21. powernv_cpufreq_reboot_notifier
  22. powernv_cpufreq_work_fn
  23. powernv_cpufreq_occ_msg
  24. powernv_cpufreq_stop_cpu
  25. powernv_fast_switch
  26. init_chip_info
  27. clean_chip_info
  28. unregister_all_notifiers
  29. powernv_cpufreq_init
  30. powernv_cpufreq_exit

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * POWERNV cpufreq driver for the IBM POWER processors
   4  *
   5  * (C) Copyright IBM 2014
   6  *
   7  * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
   8  */
   9 
  10 #define pr_fmt(fmt)     "powernv-cpufreq: " fmt
  11 
  12 #include <linux/kernel.h>
  13 #include <linux/sysfs.h>
  14 #include <linux/cpumask.h>
  15 #include <linux/module.h>
  16 #include <linux/cpufreq.h>
  17 #include <linux/smp.h>
  18 #include <linux/of.h>
  19 #include <linux/reboot.h>
  20 #include <linux/slab.h>
  21 #include <linux/cpu.h>
  22 #include <linux/hashtable.h>
  23 #include <trace/events/power.h>
  24 
  25 #include <asm/cputhreads.h>
  26 #include <asm/firmware.h>
  27 #include <asm/reg.h>
  28 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
  29 #include <asm/opal.h>
  30 #include <linux/timer.h>
  31 
/* log2 of the pstate capacity; also the bit count used for pstate_revmap */
#define POWERNV_MAX_PSTATES_ORDER  8
/* Number of usable entries in powernv_freqs (one more holds the end marker) */
#define POWERNV_MAX_PSTATES     (1UL << (POWERNV_MAX_PSTATES_ORDER))
/* PMSR bit tested by the throttle check for "Psafe mode active" */
#define PMSR_PSAFE_ENABLE       (1UL << 30)
/* PMSR bit tested by the throttle check for "SPR_EM_DISABLE" */
#define PMSR_SPR_EM_DISABLE     (1UL << 31)
/* Bit offsets of the 8-bit max, local and global pstate fields */
#define MAX_PSTATE_SHIFT        32
#define LPSTATE_SHIFT           48
#define GPSTATE_SHIFT           56
  39 
  40 #define MAX_RAMP_DOWN_TIME                              5120
  41 /*
  42  * On an idle system we want the global pstate to ramp-down from max value to
  43  * min over a span of ~5 secs. Also we want it to initially ramp-down slowly and
  44  * then ramp-down rapidly later on.
  45  *
  46  * This gives a percentage rampdown for time elapsed in milliseconds.
  47  * ramp_down_percentage = ((ms * ms) >> 18)
  48  *                      ~= 3.8 * (sec * sec)
  49  *
  50  * At 0 ms      ramp_down_percent = 0
  51  * At 5120 ms   ramp_down_percent = 100
  52  */
  53 #define ramp_down_percent(time)         ((time * time) >> 18)
  54 
  55 /* Interval after which the timer is queued to bring down global pstate */
  56 #define GPSTATE_TIMER_INTERVAL                          2000
  57 
/**
 * struct global_pstate_info -  Per policy data structure to maintain history of
 *                              global pstates
 * @highest_lpstate_idx:        The local pstate index from which we are
 *                              ramping down
 * @elapsed_time:               Time in ms spent in ramping down from
 *                              highest_lpstate_idx
 * @last_sampled_time:          Time from boot in ms when global pstates were
 *                              last set
 * @last_lpstate_idx:           Last set value of local pstate in terms of
 *                              cpufreq table index
 * @last_gpstate_idx:           Last set value of global pstate in terms of
 *                              cpufreq table index
 * @gpstate_lock:               A spinlock to maintain synchronization between
 *                              routines called by the timer handler and
 *                              governor's target_index calls
 * @timer:                      Is used for ramping down if cpu goes idle for
 *                              a long time with global pstate held high
 * @policy:                     The cpufreq policy this data belongs to; the
 *                              timer handler reads it back from here
 */
struct global_pstate_info {
        int highest_lpstate_idx;
        unsigned int elapsed_time;
        unsigned int last_sampled_time;
        int last_lpstate_idx;
        int last_gpstate_idx;
        spinlock_t gpstate_lock;
        struct timer_list timer;
        struct cpufreq_policy *policy;
};
  85 
/*
 * cpufreq frequency table filled by init_powernv_pstates(); the extra slot
 * holds the CPUFREQ_TABLE_END marker. driver_data of each entry carries the
 * 8-bit pstate id.
 */
static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];

/* Reverse map from pstate id to powernv_freqs index, see pstate_to_idx() */
DEFINE_HASHTABLE(pstate_revmap, POWERNV_MAX_PSTATES_ORDER);
/**
 * struct pstate_idx_revmap_data: Entry in the hashmap pstate_revmap
 *                                indexed by a function of pstate id.
 *
 * @pstate_id: pstate id for this entry.
 *
 * @cpufreq_table_idx: Index into the powernv_freqs
 *                     cpufreq_frequency_table for frequency
 *                     corresponding to pstate_id.
 *
 * @hentry: hlist_node that hooks this entry into the pstate_revmap
 *          hashtable
 */
struct pstate_idx_revmap_data {
        u8 pstate_id;
        unsigned int cpufreq_table_idx;
        struct hlist_node hentry;
};
 107 
/*
 * Driver-wide flags.
 * rebooting: while set, target_index ignores requests for any index other
 *            than the nominal one.
 * throttled: set by the throttle check when PMSR reports Psafe mode or
 *            SPR_EM_DISABLE; once set, target_index stops re-checking.
 * occ_reset: not referenced in this part of the file.
 *            NOTE(review): presumably tracks an OCC reset - confirm.
 */
static bool rebooting, throttled, occ_reset;

/* Human readable names, indexed by enum throttle_reason_type */
static const char * const throttle_reason[] = {
        "No throttling",
        "Power Cap",
        "Processor Over Temperature",
        "Power Supply Failure",
        "Over Current",
        "OCC Reset"
};

/*
 * Throttle reasons in the same order as throttle_reason[].
 * OCC_MAX_REASON is the number of reasons and sizes chip->reason[].
 */
enum throttle_reason_type {
        NO_THROTTLE = 0,
        POWERCAP,
        CPU_OVERTEMP,
        POWER_SUPPLY_FAILURE,
        OVERCURRENT,
        OCC_RESET_THROTTLE,
        OCC_MAX_REASON
};
 128 
/*
 * Per-chip throttling state.
 *
 * id:                 chip id, printed in throttle warnings and trace events
 * throttled:          true while PMSR Pmax differs from the table maximum
 * restore:            not used in this part of the file
 *                     NOTE(review): presumably requests re-applying the
 *                     policy after unthrottle - confirm
 * throttle_reason:    index into throttle_reason[] used for trace events
 * mask:               NOTE(review): presumably the CPUs of this chip - confirm
 * throttle:           work item; its handler is outside this part of the file
 * throttle_turbo:     count of throttle events with Pmax at or above nominal
 * throttle_sub_turbo: count of throttle events with Pmax below nominal
 * reason:             per-reason counters exported through throttle_stats
 */
static struct chip {
        unsigned int id;
        bool throttled;
        bool restore;
        u8 throttle_reason;
        cpumask_t mask;
        struct work_struct throttle;
        int throttle_turbo;
        int throttle_sub_turbo;
        int reason[OCC_MAX_REASON];
} *chips;

/* NOTE(review): presumably the number of entries in chips[] - confirm */
static int nr_chips;
/* Per-CPU pointer to the struct chip the CPU belongs to */
static DEFINE_PER_CPU(struct chip *, chip_info);
 143 
/*
 * Note:
 * The set of pstates consists of contiguous integers.
 * powernv_pstate_info stores the index of the frequency table for
 * max, min and nominal frequencies. It also stores number of
 * available frequencies.
 *
 * powernv_pstate_info.nominal indicates the index to the highest
 * non-turbo frequency.
 *
 * All fields are filled in by init_powernv_pstates().
 */
static struct powernv_pstate_info {
        unsigned int min;               /* table index of ibm,pstate-min */
        unsigned int max;               /* table index of ibm,pstate-max */
        unsigned int nominal;           /* table index of ibm,pstate-nominal */
        unsigned int nr_pstates;        /* number of valid table entries */
        bool wof_enabled;               /* turbo and ultra-turbo pstates differ */
} powernv_pstate_info;
 161 
 162 static inline u8 extract_pstate(u64 pmsr_val, unsigned int shift)
 163 {
 164         return ((pmsr_val >> shift) & 0xFF);
 165 }
 166 
 167 #define extract_local_pstate(x) extract_pstate(x, LPSTATE_SHIFT)
 168 #define extract_global_pstate(x) extract_pstate(x, GPSTATE_SHIFT)
 169 #define extract_max_pstate(x)  extract_pstate(x, MAX_PSTATE_SHIFT)
 170 
 171 /* Use following functions for conversions between pstate_id and index */
 172 
 173 /**
 174  * idx_to_pstate : Returns the pstate id corresponding to the
 175  *                 frequency in the cpufreq frequency table
 176  *                 powernv_freqs indexed by @i.
 177  *
 178  *                 If @i is out of bound, this will return the pstate
 179  *                 corresponding to the nominal frequency.
 180  */
 181 static inline u8 idx_to_pstate(unsigned int i)
 182 {
 183         if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
 184                 pr_warn_once("idx_to_pstate: index %u is out of bound\n", i);
 185                 return powernv_freqs[powernv_pstate_info.nominal].driver_data;
 186         }
 187 
 188         return powernv_freqs[i].driver_data;
 189 }
 190 
 191 /**
 192  * pstate_to_idx : Returns the index in the cpufreq frequencytable
 193  *                 powernv_freqs for the frequency whose corresponding
 194  *                 pstate id is @pstate.
 195  *
 196  *                 If no frequency corresponding to @pstate is found,
 197  *                 this will return the index of the nominal
 198  *                 frequency.
 199  */
 200 static unsigned int pstate_to_idx(u8 pstate)
 201 {
 202         unsigned int key = pstate % POWERNV_MAX_PSTATES;
 203         struct pstate_idx_revmap_data *revmap_data;
 204 
 205         hash_for_each_possible(pstate_revmap, revmap_data, hentry, key) {
 206                 if (revmap_data->pstate_id == pstate)
 207                         return revmap_data->cpufreq_table_idx;
 208         }
 209 
 210         pr_warn_once("pstate_to_idx: pstate 0x%x not found\n", pstate);
 211         return powernv_pstate_info.nominal;
 212 }
 213 
 214 static inline void reset_gpstates(struct cpufreq_policy *policy)
 215 {
 216         struct global_pstate_info *gpstates = policy->driver_data;
 217 
 218         gpstates->highest_lpstate_idx = 0;
 219         gpstates->elapsed_time = 0;
 220         gpstates->last_sampled_time = 0;
 221         gpstates->last_lpstate_idx = 0;
 222         gpstates->last_gpstate_idx = 0;
 223 }
 224 
 225 /*
 226  * Initialize the freq table based on data obtained
 227  * from the firmware passed via device-tree
 228  */
 229 static int init_powernv_pstates(void)
 230 {
 231         struct device_node *power_mgt;
 232         int i, nr_pstates = 0;
 233         const __be32 *pstate_ids, *pstate_freqs;
 234         u32 len_ids, len_freqs;
 235         u32 pstate_min, pstate_max, pstate_nominal;
 236         u32 pstate_turbo, pstate_ultra_turbo;
 237         int rc = -ENODEV;
 238 
 239         power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
 240         if (!power_mgt) {
 241                 pr_warn("power-mgt node not found\n");
 242                 return -ENODEV;
 243         }
 244 
 245         if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
 246                 pr_warn("ibm,pstate-min node not found\n");
 247                 goto out;
 248         }
 249 
 250         if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
 251                 pr_warn("ibm,pstate-max node not found\n");
 252                 goto out;
 253         }
 254 
 255         if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
 256                                  &pstate_nominal)) {
 257                 pr_warn("ibm,pstate-nominal not found\n");
 258                 goto out;
 259         }
 260 
 261         if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
 262                                  &pstate_ultra_turbo)) {
 263                 powernv_pstate_info.wof_enabled = false;
 264                 goto next;
 265         }
 266 
 267         if (of_property_read_u32(power_mgt, "ibm,pstate-turbo",
 268                                  &pstate_turbo)) {
 269                 powernv_pstate_info.wof_enabled = false;
 270                 goto next;
 271         }
 272 
 273         if (pstate_turbo == pstate_ultra_turbo)
 274                 powernv_pstate_info.wof_enabled = false;
 275         else
 276                 powernv_pstate_info.wof_enabled = true;
 277 
 278 next:
 279         pr_info("cpufreq pstate min 0x%x nominal 0x%x max 0x%x\n", pstate_min,
 280                 pstate_nominal, pstate_max);
 281         pr_info("Workload Optimized Frequency is %s in the platform\n",
 282                 (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
 283 
 284         pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
 285         if (!pstate_ids) {
 286                 pr_warn("ibm,pstate-ids not found\n");
 287                 goto out;
 288         }
 289 
 290         pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
 291                                       &len_freqs);
 292         if (!pstate_freqs) {
 293                 pr_warn("ibm,pstate-frequencies-mhz not found\n");
 294                 goto out;
 295         }
 296 
 297         if (len_ids != len_freqs) {
 298                 pr_warn("Entries in ibm,pstate-ids and "
 299                         "ibm,pstate-frequencies-mhz does not match\n");
 300         }
 301 
 302         nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
 303         if (!nr_pstates) {
 304                 pr_warn("No PStates found\n");
 305                 goto out;
 306         }
 307 
 308         powernv_pstate_info.nr_pstates = nr_pstates;
 309         pr_debug("NR PStates %d\n", nr_pstates);
 310 
 311         for (i = 0; i < nr_pstates; i++) {
 312                 u32 id = be32_to_cpu(pstate_ids[i]);
 313                 u32 freq = be32_to_cpu(pstate_freqs[i]);
 314                 struct pstate_idx_revmap_data *revmap_data;
 315                 unsigned int key;
 316 
 317                 pr_debug("PState id %d freq %d MHz\n", id, freq);
 318                 powernv_freqs[i].frequency = freq * 1000; /* kHz */
 319                 powernv_freqs[i].driver_data = id & 0xFF;
 320 
 321                 revmap_data = kmalloc(sizeof(*revmap_data), GFP_KERNEL);
 322                 if (!revmap_data) {
 323                         rc = -ENOMEM;
 324                         goto out;
 325                 }
 326 
 327                 revmap_data->pstate_id = id & 0xFF;
 328                 revmap_data->cpufreq_table_idx = i;
 329                 key = (revmap_data->pstate_id) % POWERNV_MAX_PSTATES;
 330                 hash_add(pstate_revmap, &revmap_data->hentry, key);
 331 
 332                 if (id == pstate_max)
 333                         powernv_pstate_info.max = i;
 334                 if (id == pstate_nominal)
 335                         powernv_pstate_info.nominal = i;
 336                 if (id == pstate_min)
 337                         powernv_pstate_info.min = i;
 338 
 339                 if (powernv_pstate_info.wof_enabled && id == pstate_turbo) {
 340                         int j;
 341 
 342                         for (j = i - 1; j >= (int)powernv_pstate_info.max; j--)
 343                                 powernv_freqs[j].flags = CPUFREQ_BOOST_FREQ;
 344                 }
 345         }
 346 
 347         /* End of list marker entry */
 348         powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
 349 
 350         of_node_put(power_mgt);
 351         return 0;
 352 out:
 353         of_node_put(power_mgt);
 354         return rc;
 355 }
 356 
 357 /* Returns the CPU frequency corresponding to the pstate_id. */
 358 static unsigned int pstate_id_to_freq(u8 pstate_id)
 359 {
 360         int i;
 361 
 362         i = pstate_to_idx(pstate_id);
 363         if (i >= powernv_pstate_info.nr_pstates || i < 0) {
 364                 pr_warn("PState id 0x%x outside of PState table, reporting nominal id 0x%x instead\n",
 365                         pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
 366                 i = powernv_pstate_info.nominal;
 367         }
 368 
 369         return powernv_freqs[i].frequency;
 370 }
 371 
 372 /*
 373  * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
 374  * the firmware
 375  */
 376 static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
 377                                         char *buf)
 378 {
 379         return sprintf(buf, "%u\n",
 380                 powernv_freqs[powernv_pstate_info.nominal].frequency);
 381 }
 382 
 383 struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
 384         __ATTR_RO(cpuinfo_nominal_freq);
 385 
/* Position of the scaling_boost_freqs entry in powernv_cpu_freq_attr[] */
#define SCALING_BOOST_FREQS_ATTR_INDEX          2

/*
 * NULL-terminated list of the driver's freq attributes. The order matters:
 * SCALING_BOOST_FREQS_ATTR_INDEX must keep pointing at the boost entry.
 */
static struct freq_attr *powernv_cpu_freq_attr[] = {
        &cpufreq_freq_attr_scaling_available_freqs,
        &cpufreq_freq_attr_cpuinfo_nominal_freq,
        &cpufreq_freq_attr_scaling_boost_freqs,
        NULL,
};
 394 
 395 #define throttle_attr(name, member)                                     \
 396 static ssize_t name##_show(struct cpufreq_policy *policy, char *buf)    \
 397 {                                                                       \
 398         struct chip *chip = per_cpu(chip_info, policy->cpu);            \
 399                                                                         \
 400         return sprintf(buf, "%u\n", chip->member);                      \
 401 }                                                                       \
 402                                                                         \
 403 static struct freq_attr throttle_attr_##name = __ATTR_RO(name)          \
 404 
 405 throttle_attr(unthrottle, reason[NO_THROTTLE]);
 406 throttle_attr(powercap, reason[POWERCAP]);
 407 throttle_attr(overtemp, reason[CPU_OVERTEMP]);
 408 throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
 409 throttle_attr(overcurrent, reason[OVERCURRENT]);
 410 throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
 411 throttle_attr(turbo_stat, throttle_turbo);
 412 throttle_attr(sub_turbo_stat, throttle_sub_turbo);
 413 
/* NULL-terminated list of the attributes generated by throttle_attr() */
static struct attribute *throttle_attrs[] = {
        &throttle_attr_unthrottle.attr,
        &throttle_attr_powercap.attr,
        &throttle_attr_overtemp.attr,
        &throttle_attr_supply_fault.attr,
        &throttle_attr_overcurrent.attr,
        &throttle_attr_occ_reset.attr,
        &throttle_attr_turbo_stat.attr,
        &throttle_attr_sub_turbo_stat.attr,
        NULL,
};

/* Groups the throttle attributes under the name "throttle_stats" */
static const struct attribute_group throttle_attr_grp = {
        .name   = "throttle_stats",
        .attrs  = throttle_attrs,
};
 430 
 431 /* Helper routines */
 432 
 433 /* Access helpers to power mgt SPR */
 434 
 435 static inline unsigned long get_pmspr(unsigned long sprn)
 436 {
 437         switch (sprn) {
 438         case SPRN_PMCR:
 439                 return mfspr(SPRN_PMCR);
 440 
 441         case SPRN_PMICR:
 442                 return mfspr(SPRN_PMICR);
 443 
 444         case SPRN_PMSR:
 445                 return mfspr(SPRN_PMSR);
 446         }
 447         BUG();
 448 }
 449 
 450 static inline void set_pmspr(unsigned long sprn, unsigned long val)
 451 {
 452         switch (sprn) {
 453         case SPRN_PMCR:
 454                 mtspr(SPRN_PMCR, val);
 455                 return;
 456 
 457         case SPRN_PMICR:
 458                 mtspr(SPRN_PMICR, val);
 459                 return;
 460         }
 461         BUG();
 462 }
 463 
/*
 * Use objects of this type to query/update
 * pstates on a remote CPU via smp_call_function.
 *
 * freq:       frequency in kHz, filled in by powernv_read_cpu_freq()
 * pstate_id:  local pstate id read from PMSR or to be written to PMCR
 * gpstate_id: global pstate id to be written to PMCR by set_pstate()
 */
struct powernv_smp_call_data {
        unsigned int freq;
        u8 pstate_id;
        u8 gpstate_id;
};
 473 
 474 /*
 475  * powernv_read_cpu_freq: Reads the current frequency on this CPU.
 476  *
 477  * Called via smp_call_function.
 478  *
 479  * Note: The caller of the smp_call_function should pass an argument of
 480  * the type 'struct powernv_smp_call_data *' along with this function.
 481  *
 482  * The current frequency on this CPU will be returned via
 483  * ((struct powernv_smp_call_data *)arg)->freq;
 484  */
 485 static void powernv_read_cpu_freq(void *arg)
 486 {
 487         unsigned long pmspr_val;
 488         struct powernv_smp_call_data *freq_data = arg;
 489 
 490         pmspr_val = get_pmspr(SPRN_PMSR);
 491         freq_data->pstate_id = extract_local_pstate(pmspr_val);
 492         freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);
 493 
 494         pr_debug("cpu %d pmsr %016lX pstate_id 0x%x frequency %d kHz\n",
 495                  raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
 496                  freq_data->freq);
 497 }
 498 
 499 /*
 500  * powernv_cpufreq_get: Returns the CPU frequency as reported by the
 501  * firmware for CPU 'cpu'. This value is reported through the sysfs
 502  * file cpuinfo_cur_freq.
 503  */
 504 static unsigned int powernv_cpufreq_get(unsigned int cpu)
 505 {
 506         struct powernv_smp_call_data freq_data;
 507 
 508         smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq,
 509                         &freq_data, 1);
 510 
 511         return freq_data.freq;
 512 }
 513 
 514 /*
 515  * set_pstate: Sets the pstate on this CPU.
 516  *
 517  * This is called via an smp_call_function.
 518  *
 519  * The caller must ensure that freq_data is of the type
 520  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
 521  * on this CPU should be present in freq_data->pstate_id.
 522  */
 523 static void set_pstate(void *data)
 524 {
 525         unsigned long val;
 526         struct powernv_smp_call_data *freq_data = data;
 527         unsigned long pstate_ul = freq_data->pstate_id;
 528         unsigned long gpstate_ul = freq_data->gpstate_id;
 529 
 530         val = get_pmspr(SPRN_PMCR);
 531         val = val & 0x0000FFFFFFFFFFFFULL;
 532 
 533         pstate_ul = pstate_ul & 0xFF;
 534         gpstate_ul = gpstate_ul & 0xFF;
 535 
 536         /* Set both global(bits 56..63) and local(bits 48..55) PStates */
 537         val = val | (gpstate_ul << 56) | (pstate_ul << 48);
 538 
 539         pr_debug("Setting cpu %d pmcr to %016lX\n",
 540                         raw_smp_processor_id(), val);
 541         set_pmspr(SPRN_PMCR, val);
 542 }
 543 
 544 /*
 545  * get_nominal_index: Returns the index corresponding to the nominal
 546  * pstate in the cpufreq table
 547  */
 548 static inline unsigned int get_nominal_index(void)
 549 {
 550         return powernv_pstate_info.nominal;
 551 }
 552 
 553 static void powernv_cpufreq_throttle_check(void *data)
 554 {
 555         struct chip *chip;
 556         unsigned int cpu = smp_processor_id();
 557         unsigned long pmsr;
 558         u8 pmsr_pmax;
 559         unsigned int pmsr_pmax_idx;
 560 
 561         pmsr = get_pmspr(SPRN_PMSR);
 562         chip = this_cpu_read(chip_info);
 563 
 564         /* Check for Pmax Capping */
 565         pmsr_pmax = extract_max_pstate(pmsr);
 566         pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
 567         if (pmsr_pmax_idx != powernv_pstate_info.max) {
 568                 if (chip->throttled)
 569                         goto next;
 570                 chip->throttled = true;
 571                 if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
 572                         pr_warn_once("CPU %d on Chip %u has Pmax(0x%x) reduced below that of nominal frequency(0x%x)\n",
 573                                      cpu, chip->id, pmsr_pmax,
 574                                      idx_to_pstate(powernv_pstate_info.nominal));
 575                         chip->throttle_sub_turbo++;
 576                 } else {
 577                         chip->throttle_turbo++;
 578                 }
 579                 trace_powernv_throttle(chip->id,
 580                                       throttle_reason[chip->throttle_reason],
 581                                       pmsr_pmax);
 582         } else if (chip->throttled) {
 583                 chip->throttled = false;
 584                 trace_powernv_throttle(chip->id,
 585                                       throttle_reason[chip->throttle_reason],
 586                                       pmsr_pmax);
 587         }
 588 
 589         /* Check if Psafe_mode_active is set in PMSR. */
 590 next:
 591         if (pmsr & PMSR_PSAFE_ENABLE) {
 592                 throttled = true;
 593                 pr_info("Pstate set to safe frequency\n");
 594         }
 595 
 596         /* Check if SPR_EM_DISABLE is set in PMSR */
 597         if (pmsr & PMSR_SPR_EM_DISABLE) {
 598                 throttled = true;
 599                 pr_info("Frequency Control disabled from OS\n");
 600         }
 601 
 602         if (throttled) {
 603                 pr_info("PMSR = %16lx\n", pmsr);
 604                 pr_warn("CPU Frequency could be throttled\n");
 605         }
 606 }
 607 
 608 /**
 609  * calc_global_pstate - Calculate global pstate
 610  * @elapsed_time:               Elapsed time in milliseconds
 611  * @local_pstate_idx:           New local pstate
 612  * @highest_lpstate_idx:        pstate from which its ramping down
 613  *
 614  * Finds the appropriate global pstate based on the pstate from which its
 615  * ramping down and the time elapsed in ramping down. It follows a quadratic
 616  * equation which ensures that it reaches ramping down to pmin in 5sec.
 617  */
 618 static inline int calc_global_pstate(unsigned int elapsed_time,
 619                                      int highest_lpstate_idx,
 620                                      int local_pstate_idx)
 621 {
 622         int index_diff;
 623 
 624         /*
 625          * Using ramp_down_percent we get the percentage of rampdown
 626          * that we are expecting to be dropping. Difference between
 627          * highest_lpstate_idx and powernv_pstate_info.min will give a absolute
 628          * number of how many pstates we will drop eventually by the end of
 629          * 5 seconds, then just scale it get the number pstates to be dropped.
 630          */
 631         index_diff =  ((int)ramp_down_percent(elapsed_time) *
 632                         (powernv_pstate_info.min - highest_lpstate_idx)) / 100;
 633 
 634         /* Ensure that global pstate is >= to local pstate */
 635         if (highest_lpstate_idx + index_diff >= local_pstate_idx)
 636                 return local_pstate_idx;
 637         else
 638                 return highest_lpstate_idx + index_diff;
 639 }
 640 
 641 static inline void  queue_gpstate_timer(struct global_pstate_info *gpstates)
 642 {
 643         unsigned int timer_interval;
 644 
 645         /*
 646          * Setting up timer to fire after GPSTATE_TIMER_INTERVAL ms, But
 647          * if it exceeds MAX_RAMP_DOWN_TIME ms for ramp down time.
 648          * Set timer such that it fires exactly at MAX_RAMP_DOWN_TIME
 649          * seconds of ramp down time.
 650          */
 651         if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
 652              > MAX_RAMP_DOWN_TIME)
 653                 timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
 654         else
 655                 timer_interval = GPSTATE_TIMER_INTERVAL;
 656 
 657         mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
 658 }
 659 
 660 /**
 661  * gpstate_timer_handler
 662  *
 663  * @data: pointer to cpufreq_policy on which timer was queued
 664  *
 665  * This handler brings down the global pstate closer to the local pstate
 666  * according quadratic equation. Queues a new timer if it is still not equal
 667  * to local pstate
 668  */
 669 void gpstate_timer_handler(struct timer_list *t)
 670 {
 671         struct global_pstate_info *gpstates = from_timer(gpstates, t, timer);
 672         struct cpufreq_policy *policy = gpstates->policy;
 673         int gpstate_idx, lpstate_idx;
 674         unsigned long val;
 675         unsigned int time_diff = jiffies_to_msecs(jiffies)
 676                                         - gpstates->last_sampled_time;
 677         struct powernv_smp_call_data freq_data;
 678 
 679         if (!spin_trylock(&gpstates->gpstate_lock))
 680                 return;
 681         /*
 682          * If the timer has migrated to the different cpu then bring
 683          * it back to one of the policy->cpus
 684          */
 685         if (!cpumask_test_cpu(raw_smp_processor_id(), policy->cpus)) {
 686                 gpstates->timer.expires = jiffies + msecs_to_jiffies(1);
 687                 add_timer_on(&gpstates->timer, cpumask_first(policy->cpus));
 688                 spin_unlock(&gpstates->gpstate_lock);
 689                 return;
 690         }
 691 
 692         /*
 693          * If PMCR was last updated was using fast_swtich then
 694          * We may have wrong in gpstate->last_lpstate_idx
 695          * value. Hence, read from PMCR to get correct data.
 696          */
 697         val = get_pmspr(SPRN_PMCR);
 698         freq_data.gpstate_id = extract_global_pstate(val);
 699         freq_data.pstate_id = extract_local_pstate(val);
 700         if (freq_data.gpstate_id  == freq_data.pstate_id) {
 701                 reset_gpstates(policy);
 702                 spin_unlock(&gpstates->gpstate_lock);
 703                 return;
 704         }
 705 
 706         gpstates->last_sampled_time += time_diff;
 707         gpstates->elapsed_time += time_diff;
 708 
 709         if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
 710                 gpstate_idx = pstate_to_idx(freq_data.pstate_id);
 711                 lpstate_idx = gpstate_idx;
 712                 reset_gpstates(policy);
 713                 gpstates->highest_lpstate_idx = gpstate_idx;
 714         } else {
 715                 lpstate_idx = pstate_to_idx(freq_data.pstate_id);
 716                 gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 717                                                  gpstates->highest_lpstate_idx,
 718                                                  lpstate_idx);
 719         }
 720         freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
 721         gpstates->last_gpstate_idx = gpstate_idx;
 722         gpstates->last_lpstate_idx = lpstate_idx;
 723         /*
 724          * If local pstate is equal to global pstate, rampdown is over
 725          * So timer is not required to be queued.
 726          */
 727         if (gpstate_idx != gpstates->last_lpstate_idx)
 728                 queue_gpstate_timer(gpstates);
 729 
 730         set_pstate(&freq_data);
 731         spin_unlock(&gpstates->gpstate_lock);
 732 }
 733 
 734 /*
 735  * powernv_cpufreq_target_index: Sets the frequency corresponding to
 736  * the cpufreq table entry indexed by new_index on the cpus in the
 737  * mask policy->cpus
 738  */
 739 static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
 740                                         unsigned int new_index)
 741 {
 742         struct powernv_smp_call_data freq_data;
 743         unsigned int cur_msec, gpstate_idx;
 744         struct global_pstate_info *gpstates = policy->driver_data;
 745 
 746         if (unlikely(rebooting) && new_index != get_nominal_index())
 747                 return 0;
 748 
 749         if (!throttled) {
 750                 /* we don't want to be preempted while
 751                  * checking if the CPU frequency has been throttled
 752                  */
 753                 preempt_disable();
 754                 powernv_cpufreq_throttle_check(NULL);
 755                 preempt_enable();
 756         }
 757 
 758         cur_msec = jiffies_to_msecs(get_jiffies_64());
 759 
 760         freq_data.pstate_id = idx_to_pstate(new_index);
 761         if (!gpstates) {
 762                 freq_data.gpstate_id = freq_data.pstate_id;
 763                 goto no_gpstate;
 764         }
 765 
 766         spin_lock(&gpstates->gpstate_lock);
 767 
 768         if (!gpstates->last_sampled_time) {
 769                 gpstate_idx = new_index;
 770                 gpstates->highest_lpstate_idx = new_index;
 771                 goto gpstates_done;
 772         }
 773 
 774         if (gpstates->last_gpstate_idx < new_index) {
 775                 gpstates->elapsed_time += cur_msec -
 776                                                  gpstates->last_sampled_time;
 777 
 778                 /*
 779                  * If its has been ramping down for more than MAX_RAMP_DOWN_TIME
 780                  * we should be resetting all global pstate related data. Set it
 781                  * equal to local pstate to start fresh.
 782                  */
 783                 if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
 784                         reset_gpstates(policy);
 785                         gpstates->highest_lpstate_idx = new_index;
 786                         gpstate_idx = new_index;
 787                 } else {
 788                 /* Elaspsed_time is less than 5 seconds, continue to rampdown */
 789                         gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 790                                                          gpstates->highest_lpstate_idx,
 791                                                          new_index);
 792                 }
 793         } else {
 794                 reset_gpstates(policy);
 795                 gpstates->highest_lpstate_idx = new_index;
 796                 gpstate_idx = new_index;
 797         }
 798 
 799         /*
 800          * If local pstate is equal to global pstate, rampdown is over
 801          * So timer is not required to be queued.
 802          */
 803         if (gpstate_idx != new_index)
 804                 queue_gpstate_timer(gpstates);
 805         else
 806                 del_timer_sync(&gpstates->timer);
 807 
 808 gpstates_done:
 809         freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
 810         gpstates->last_sampled_time = cur_msec;
 811         gpstates->last_gpstate_idx = gpstate_idx;
 812         gpstates->last_lpstate_idx = new_index;
 813 
 814         spin_unlock(&gpstates->gpstate_lock);
 815 
 816 no_gpstate:
 817         /*
 818          * Use smp_call_function to send IPI and execute the
 819          * mtspr on target CPU.  We could do that without IPI
 820          * if current CPU is within policy->cpus (core)
 821          */
 822         smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
 823         return 0;
 824 }
 825 
 826 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 827 {
 828         int base, i;
 829         struct kernfs_node *kn;
 830         struct global_pstate_info *gpstates;
 831 
 832         base = cpu_first_thread_sibling(policy->cpu);
 833 
 834         for (i = 0; i < threads_per_core; i++)
 835                 cpumask_set_cpu(base + i, policy->cpus);
 836 
 837         kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
 838         if (!kn) {
 839                 int ret;
 840 
 841                 ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
 842                 if (ret) {
 843                         pr_info("Failed to create throttle stats directory for cpu %d\n",
 844                                 policy->cpu);
 845                         return ret;
 846                 }
 847         } else {
 848                 kernfs_put(kn);
 849         }
 850 
 851         policy->freq_table = powernv_freqs;
 852         policy->fast_switch_possible = true;
 853 
 854         if (pvr_version_is(PVR_POWER9))
 855                 return 0;
 856 
 857         /* Initialise Gpstate ramp-down timer only on POWER8 */
 858         gpstates =  kzalloc(sizeof(*gpstates), GFP_KERNEL);
 859         if (!gpstates)
 860                 return -ENOMEM;
 861 
 862         policy->driver_data = gpstates;
 863 
 864         /* initialize timer */
 865         gpstates->policy = policy;
 866         timer_setup(&gpstates->timer, gpstate_timer_handler,
 867                     TIMER_PINNED | TIMER_DEFERRABLE);
 868         gpstates->timer.expires = jiffies +
 869                                 msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
 870         spin_lock_init(&gpstates->gpstate_lock);
 871 
 872         return 0;
 873 }
 874 
 875 static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 876 {
 877         /* timer is deleted in cpufreq_cpu_stop() */
 878         kfree(policy->driver_data);
 879 
 880         return 0;
 881 }
 882 
 883 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
 884                                 unsigned long action, void *unused)
 885 {
 886         int cpu;
 887         struct cpufreq_policy cpu_policy;
 888 
 889         rebooting = true;
 890         for_each_online_cpu(cpu) {
 891                 cpufreq_get_policy(&cpu_policy, cpu);
 892                 powernv_cpufreq_target_index(&cpu_policy, get_nominal_index());
 893         }
 894 
 895         return NOTIFY_DONE;
 896 }
 897 
/* Notifier block that routes reboot events to powernv_cpufreq_reboot_notifier(). */
static struct notifier_block powernv_cpufreq_reboot_nb = {
        .notifier_call = powernv_cpufreq_reboot_notifier,
};
 901 
 902 void powernv_cpufreq_work_fn(struct work_struct *work)
 903 {
 904         struct chip *chip = container_of(work, struct chip, throttle);
 905         unsigned int cpu;
 906         cpumask_t mask;
 907 
 908         get_online_cpus();
 909         cpumask_and(&mask, &chip->mask, cpu_online_mask);
 910         smp_call_function_any(&mask,
 911                               powernv_cpufreq_throttle_check, NULL, 0);
 912 
 913         if (!chip->restore)
 914                 goto out;
 915 
 916         chip->restore = false;
 917         for_each_cpu(cpu, &mask) {
 918                 int index;
 919                 struct cpufreq_policy policy;
 920 
 921                 cpufreq_get_policy(&policy, cpu);
 922                 index = cpufreq_table_find_index_c(&policy, policy.cur);
 923                 powernv_cpufreq_target_index(&policy, index);
 924                 cpumask_andnot(&mask, &mask, policy.cpus);
 925         }
 926 out:
 927         put_online_cpus();
 928 }
 929 
 930 static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
 931                                    unsigned long msg_type, void *_msg)
 932 {
 933         struct opal_msg *msg = _msg;
 934         struct opal_occ_msg omsg;
 935         int i;
 936 
 937         if (msg_type != OPAL_MSG_OCC)
 938                 return 0;
 939 
 940         omsg.type = be64_to_cpu(msg->params[0]);
 941 
 942         switch (omsg.type) {
 943         case OCC_RESET:
 944                 occ_reset = true;
 945                 pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n");
 946                 /*
 947                  * powernv_cpufreq_throttle_check() is called in
 948                  * target() callback which can detect the throttle state
 949                  * for governors like ondemand.
 950                  * But static governors will not call target() often thus
 951                  * report throttling here.
 952                  */
 953                 if (!throttled) {
 954                         throttled = true;
 955                         pr_warn("CPU frequency is throttled for duration\n");
 956                 }
 957 
 958                 break;
 959         case OCC_LOAD:
 960                 pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n");
 961                 break;
 962         case OCC_THROTTLE:
 963                 omsg.chip = be64_to_cpu(msg->params[1]);
 964                 omsg.throttle_status = be64_to_cpu(msg->params[2]);
 965 
 966                 if (occ_reset) {
 967                         occ_reset = false;
 968                         throttled = false;
 969                         pr_info("OCC Active, CPU frequency is no longer throttled\n");
 970 
 971                         for (i = 0; i < nr_chips; i++) {
 972                                 chips[i].restore = true;
 973                                 schedule_work(&chips[i].throttle);
 974                         }
 975 
 976                         return 0;
 977                 }
 978 
 979                 for (i = 0; i < nr_chips; i++)
 980                         if (chips[i].id == omsg.chip)
 981                                 break;
 982 
 983                 if (omsg.throttle_status >= 0 &&
 984                     omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
 985                         chips[i].throttle_reason = omsg.throttle_status;
 986                         chips[i].reason[omsg.throttle_status]++;
 987                 }
 988 
 989                 if (!omsg.throttle_status)
 990                         chips[i].restore = true;
 991 
 992                 schedule_work(&chips[i].throttle);
 993         }
 994         return 0;
 995 }
 996 
/*
 * Notifier block that routes OPAL messages to powernv_cpufreq_occ_msg();
 * it is registered for OPAL_MSG_OCC in powernv_cpufreq_init().
 */
static struct notifier_block powernv_cpufreq_opal_nb = {
        .notifier_call  = powernv_cpufreq_occ_msg,
        .next           = NULL,
        .priority       = 0,
};
1002 
1003 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
1004 {
1005         struct powernv_smp_call_data freq_data;
1006         struct global_pstate_info *gpstates = policy->driver_data;
1007 
1008         freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
1009         freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
1010         smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
1011         if (gpstates)
1012                 del_timer_sync(&gpstates->timer);
1013 }
1014 
1015 static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
1016                                         unsigned int target_freq)
1017 {
1018         int index;
1019         struct powernv_smp_call_data freq_data;
1020 
1021         index = cpufreq_table_find_index_dl(policy, target_freq);
1022         freq_data.pstate_id = powernv_freqs[index].driver_data;
1023         freq_data.gpstate_id = powernv_freqs[index].driver_data;
1024         set_pstate(&freq_data);
1025 
1026         return powernv_freqs[index].frequency;
1027 }
1028 
/*
 * cpufreq driver descriptor registered in powernv_cpufreq_init().
 * It wires the callbacks defined in this file into the cpufreq core;
 * .boost_enabled is set at init time when powernv_pstate_info.wof_enabled
 * is true.
 */
static struct cpufreq_driver powernv_cpufreq_driver = {
        .name           = "powernv-cpufreq",
        .flags          = CPUFREQ_CONST_LOOPS,
        .init           = powernv_cpufreq_cpu_init,
        .exit           = powernv_cpufreq_cpu_exit,
        .verify         = cpufreq_generic_frequency_table_verify,
        .target_index   = powernv_cpufreq_target_index,
        .fast_switch    = powernv_fast_switch,
        .get            = powernv_cpufreq_get,
        .stop_cpu       = powernv_cpufreq_stop_cpu,
        .attr           = powernv_cpu_freq_attr,
};
1041 
1042 static int init_chip_info(void)
1043 {
1044         unsigned int *chip;
1045         unsigned int cpu, i;
1046         unsigned int prev_chip_id = UINT_MAX;
1047         int ret = 0;
1048 
1049         chip = kcalloc(num_possible_cpus(), sizeof(*chip), GFP_KERNEL);
1050         if (!chip)
1051                 return -ENOMEM;
1052 
1053         for_each_possible_cpu(cpu) {
1054                 unsigned int id = cpu_to_chip_id(cpu);
1055 
1056                 if (prev_chip_id != id) {
1057                         prev_chip_id = id;
1058                         chip[nr_chips++] = id;
1059                 }
1060         }
1061 
1062         chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
1063         if (!chips) {
1064                 ret = -ENOMEM;
1065                 goto free_and_return;
1066         }
1067 
1068         for (i = 0; i < nr_chips; i++) {
1069                 chips[i].id = chip[i];
1070                 cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
1071                 INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
1072                 for_each_cpu(cpu, &chips[i].mask)
1073                         per_cpu(chip_info, cpu) =  &chips[i];
1074         }
1075 
1076 free_and_return:
1077         kfree(chip);
1078         return ret;
1079 }
1080 
1081 static inline void clean_chip_info(void)
1082 {
1083         int i;
1084 
1085         /* flush any pending work items */
1086         if (chips)
1087                 for (i = 0; i < nr_chips; i++)
1088                         cancel_work_sync(&chips[i].throttle);
1089         kfree(chips);
1090 }
1091 
/*
 * Unregister the OPAL OCC message notifier and the reboot notifier, i.e.
 * the two notifiers registered in powernv_cpufreq_init().
 */
static inline void unregister_all_notifiers(void)
{
        opal_message_notifier_unregister(OPAL_MSG_OCC,
                                         &powernv_cpufreq_opal_nb);
        unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
}
1098 
1099 static int __init powernv_cpufreq_init(void)
1100 {
1101         int rc = 0;
1102 
1103         /* Don't probe on pseries (guest) platforms */
1104         if (!firmware_has_feature(FW_FEATURE_OPAL))
1105                 return -ENODEV;
1106 
1107         /* Discover pstates from device tree and init */
1108         rc = init_powernv_pstates();
1109         if (rc)
1110                 goto out;
1111 
1112         /* Populate chip info */
1113         rc = init_chip_info();
1114         if (rc)
1115                 goto out;
1116 
1117         register_reboot_notifier(&powernv_cpufreq_reboot_nb);
1118         opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);
1119 
1120         if (powernv_pstate_info.wof_enabled)
1121                 powernv_cpufreq_driver.boost_enabled = true;
1122         else
1123                 powernv_cpu_freq_attr[SCALING_BOOST_FREQS_ATTR_INDEX] = NULL;
1124 
1125         rc = cpufreq_register_driver(&powernv_cpufreq_driver);
1126         if (rc) {
1127                 pr_info("Failed to register the cpufreq driver (%d)\n", rc);
1128                 goto cleanup_notifiers;
1129         }
1130 
1131         if (powernv_pstate_info.wof_enabled)
1132                 cpufreq_enable_boost_support();
1133 
1134         return 0;
1135 cleanup_notifiers:
1136         unregister_all_notifiers();
1137         clean_chip_info();
1138 out:
1139         pr_info("Platform driver disabled. System does not support PState control\n");
1140         return rc;
1141 }
1142 module_init(powernv_cpufreq_init);
1143 
/*
 * Module exit: unregister the cpufreq driver first, then remove the
 * notifiers, and finally cancel pending chip work and free the chip table.
 */
static void __exit powernv_cpufreq_exit(void)
{
        cpufreq_unregister_driver(&powernv_cpufreq_driver);
        unregister_all_notifiers();
        clean_chip_info();
}
module_exit(powernv_cpufreq_exit);
1151 
1152 MODULE_LICENSE("GPL");
1153 MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");

/* [<][>][^][v][top][bottom][index][help] */