root/drivers/powercap/intel_rapl_common.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. get_energy_counter
  2. get_max_energy_counter
  3. release_zone
  4. find_nr_power_limit
  5. set_domain_enable
  6. get_domain_enable
  7. contraint_to_pl
  8. set_power_limit
  9. get_current_power_limit
  10. set_time_window
  11. get_time_window
  12. get_constraint_name
  13. get_max_power
  14. rapl_init_domains
  15. rapl_unit_xlate
  16. rapl_read_data_raw
  17. rapl_write_data_raw
  18. rapl_check_unit_core
  19. rapl_check_unit_atom
  20. power_limit_irq_save_cpu
  21. package_power_limit_irq_save
  22. package_power_limit_irq_restore
  23. set_floor_freq_default
  24. set_floor_freq_atom
  25. rapl_compute_time_window_core
  26. rapl_compute_time_window_atom
  27. rapl_update_domain_data
  28. rapl_package_register_powercap
  29. rapl_add_platform_domain
  30. rapl_remove_platform_domain
  31. rapl_check_domain
  32. rapl_detect_powerlimit
  33. rapl_detect_domains
  34. rapl_remove_package
  35. rapl_find_package_domain
  36. rapl_add_package
  37. power_limit_state_save
  38. power_limit_state_restore
  39. rapl_pm_callback
  40. rapl_init
  41. rapl_exit

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Common code for Intel Running Average Power Limit (RAPL) support.
   4  * Copyright (c) 2019, Intel Corporation.
   5  */
   6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7 
   8 #include <linux/kernel.h>
   9 #include <linux/module.h>
  10 #include <linux/list.h>
  11 #include <linux/types.h>
  12 #include <linux/device.h>
  13 #include <linux/slab.h>
  14 #include <linux/log2.h>
  15 #include <linux/bitmap.h>
  16 #include <linux/delay.h>
  17 #include <linux/sysfs.h>
  18 #include <linux/cpu.h>
  19 #include <linux/powercap.h>
  20 #include <linux/suspend.h>
  21 #include <linux/intel_rapl.h>
  22 #include <linux/processor.h>
  23 #include <linux/platform_device.h>
  24 
  25 #include <asm/iosf_mbi.h>
  26 #include <asm/cpu_device_id.h>
  27 #include <asm/intel-family.h>
  28 
/* Local defines */
#define MSR_PLATFORM_POWER_LIMIT        0x0000065C

/* bitmasks for RAPL MSRs, used by primitive access functions */
/* energy status counter: low 32 bits of the status register */
#define ENERGY_STATUS_MASK      0xffffffff

/* power limit 1: value in bits 14:0, enable in bit 15, clamp in bit 16 */
#define POWER_LIMIT1_MASK       0x7FFF
#define POWER_LIMIT1_ENABLE     BIT(15)
#define POWER_LIMIT1_CLAMP      BIT(16)

/*
 * power limit 2: value in bits 46:32, enable in bit 47, clamp in bit 48.
 * The lock bit is bit 63 (POWER_HIGH_LOCK) or bit 31 (POWER_LOW_LOCK);
 * rapl_read_data_raw() uses the high one for domains with two limits.
 */
#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
#define POWER_HIGH_LOCK         BIT_ULL(63)
#define POWER_LOW_LOCK          BIT(31)

/* time windows: 7-bit fields at bits 23:17 and 55:49 */
#define TIME_WINDOW1_MASK       (0x7FULL<<17)
#define TIME_WINDOW2_MASK       (0x7FULL<<49)

/* unit register fields: power in bits 3:0, energy in 12:8, time in 19:16 */
#define POWER_UNIT_OFFSET       0
#define POWER_UNIT_MASK         0x0F

#define ENERGY_UNIT_OFFSET      0x08
#define ENERGY_UNIT_MASK        0x1F00

#define TIME_UNIT_OFFSET        0x10
#define TIME_UNIT_MASK          0xF0000

/*
 * info register fields: thermal spec power in bits 14:0, min power in
 * 30:16, max power in 46:32, max time window in 53:48
 */
#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff

/* perf status throttle time: low 32 bits; policy: low 5 bits */
#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
#define PP_POLICY_MASK         0x1F

/* Non HW constants */
/* flag bits for struct rapl_primitive_info.flag */
#define RAPL_PRIMITIVE_DERIVED       BIT(1)     /* not from raw data */
#define RAPL_PRIMITIVE_DUMMY         BIT(2)

/* NOTE(review): presumably bounds for the time window in msec; users are not in this part of the file */
#define TIME_WINDOW_MAX_MSEC 40000
#define TIME_WINDOW_MIN_MSEC 250
#define ENERGY_UNIT_SCALE    1000       /* scale from driver unit to powercap unit */
/*
 * Kind of unit attached to a primitive; rapl_unit_xlate() switches on it
 * to pick the conversion between raw register values and reported values.
 */
enum unit_type {
        ARBITRARY_UNIT,         /* no translation */
        POWER_UNIT,             /* scaled by the package power unit */
        ENERGY_UNIT,            /* scaled by the domain or package energy unit */
        TIME_UNIT,              /* converted by rapl_defaults->compute_time_window() */
};
  78 
/* per domain data, some are optional */
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)

/* bits kept in rapl_domain.state */
#define DOMAIN_STATE_INACTIVE           BIT(0)
#define DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
/* when set, the enable/limit setters in this file refuse with -EACCES */
#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)

/* constraint names assigned to rpl[0] and rpl[1] by rapl_init_domains() */
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";

/* map a powercap zone back to the rapl_domain that embeds it */
#define power_zone_to_rapl_domain(_zone) \
        container_of(_zone, struct rapl_domain, power_zone)
  91 
/*
 * Per-platform hooks and constants used by the common code.
 *
 * @floor_freq_reg_addr: register offset used by set_floor_freq_atom();
 *                       zero is treated as invalid there
 * @check_unit: reads the unit register and fills the package unit fields
 * @set_floor_freq: optional hook called by set_domain_enable() (may be NULL)
 * @compute_time_window: converts a time window to/from its raw encoding
 * @dram_domain_energy_unit: if non-zero, copied into the DRAM domain by
 *                           rapl_init_domains() and used instead of the
 *                           package energy unit
 */
struct rapl_defaults {
        u8 floor_freq_reg_addr;
        int (*check_unit)(struct rapl_package *rp, int cpu);
        void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
        u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
                                    bool to_raw);
        unsigned int dram_domain_energy_unit;
};
/* defaults in effect for the running platform; assigned outside this part of the file */
static struct rapl_defaults *rapl_defaults;
 101 
/* Sideband MBI registers */
#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)

/* set in rapl_package.power_limit_irq once the PLN interrupt state was saved */
#define PACKAGE_PLN_INT_SAVED   BIT(0)
#define MAX_PRIM_NAME (32)
 108 
/* per domain data. used to describe individual knobs such that access function
 * can be consolidated into one instead of many inline functions.
 *
 * @name:  primitive name; a NULL name makes rapl_read_data_raw() fail
 * @mask:  bits of the register that hold the field
 * @shift: position of the field's lowest bit
 * @id:    which per-domain register (index into rapl_domain.regs[]) to use
 * @unit:  unit conversion applied by rapl_unit_xlate()
 * @flag:  RAPL_PRIMITIVE_* flags
 */
struct rapl_primitive_info {
        const char *name;
        u64 mask;
        int shift;
        enum rapl_domain_reg_id id;
        enum unit_type unit;
        u32 flag;
};

/*
 * Build one struct rapl_primitive_info initializer; the primitive identifier
 * @p is stringified to become the entry's name.
 */
#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \
                .name = #p,                     \
                .mask = m,                      \
                .shift = s,                     \
                .id = i,                        \
                .unit = u,                      \
                .flag = f                       \
        }
 129 
/* forward declarations for helpers used before their definitions */
static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
                              enum rapl_primitives prim,
                              bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
                               enum rapl_primitives prim,
                               unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
                           enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);        /* guarded by CPU hotplug lock */

/* domain names, indexed by enum rapl_domain_type in rapl_init_domains() */
static const char *const rapl_domain_names[] = {
        "package",
        "core",
        "uncore",
        "dram",
        "psys",
};
 150 
 151 static int get_energy_counter(struct powercap_zone *power_zone,
 152                               u64 *energy_raw)
 153 {
 154         struct rapl_domain *rd;
 155         u64 energy_now;
 156 
 157         /* prevent CPU hotplug, make sure the RAPL domain does not go
 158          * away while reading the counter.
 159          */
 160         get_online_cpus();
 161         rd = power_zone_to_rapl_domain(power_zone);
 162 
 163         if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
 164                 *energy_raw = energy_now;
 165                 put_online_cpus();
 166 
 167                 return 0;
 168         }
 169         put_online_cpus();
 170 
 171         return -EIO;
 172 }
 173 
 174 static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
 175 {
 176         struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
 177 
 178         *energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
 179         return 0;
 180 }
 181 
 182 static int release_zone(struct powercap_zone *power_zone)
 183 {
 184         struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 185         struct rapl_package *rp = rd->rp;
 186 
 187         /* package zone is the last zone of a package, we can free
 188          * memory here since all children has been unregistered.
 189          */
 190         if (rd->id == RAPL_DOMAIN_PACKAGE) {
 191                 kfree(rd);
 192                 rp->domains = NULL;
 193         }
 194 
 195         return 0;
 196 
 197 }
 198 
 199 static int find_nr_power_limit(struct rapl_domain *rd)
 200 {
 201         int i, nr_pl = 0;
 202 
 203         for (i = 0; i < NR_POWER_LIMITS; i++) {
 204                 if (rd->rpl[i].name)
 205                         nr_pl++;
 206         }
 207 
 208         return nr_pl;
 209 }
 210 
 211 static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
 212 {
 213         struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 214 
 215         if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
 216                 return -EACCES;
 217 
 218         get_online_cpus();
 219         rapl_write_data_raw(rd, PL1_ENABLE, mode);
 220         if (rapl_defaults->set_floor_freq)
 221                 rapl_defaults->set_floor_freq(rd, mode);
 222         put_online_cpus();
 223 
 224         return 0;
 225 }
 226 
 227 static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
 228 {
 229         struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
 230         u64 val;
 231 
 232         if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
 233                 *mode = false;
 234                 return 0;
 235         }
 236         get_online_cpus();
 237         if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
 238                 put_online_cpus();
 239                 return -EIO;
 240         }
 241         *mode = val;
 242         put_online_cpus();
 243 
 244         return 0;
 245 }
 246 
 247 /* per RAPL domain ops, in the order of rapl_domain_type */
 248 static const struct powercap_zone_ops zone_ops[] = {
 249         /* RAPL_DOMAIN_PACKAGE */
 250         {
 251          .get_energy_uj = get_energy_counter,
 252          .get_max_energy_range_uj = get_max_energy_counter,
 253          .release = release_zone,
 254          .set_enable = set_domain_enable,
 255          .get_enable = get_domain_enable,
 256          },
 257         /* RAPL_DOMAIN_PP0 */
 258         {
 259          .get_energy_uj = get_energy_counter,
 260          .get_max_energy_range_uj = get_max_energy_counter,
 261          .release = release_zone,
 262          .set_enable = set_domain_enable,
 263          .get_enable = get_domain_enable,
 264          },
 265         /* RAPL_DOMAIN_PP1 */
 266         {
 267          .get_energy_uj = get_energy_counter,
 268          .get_max_energy_range_uj = get_max_energy_counter,
 269          .release = release_zone,
 270          .set_enable = set_domain_enable,
 271          .get_enable = get_domain_enable,
 272          },
 273         /* RAPL_DOMAIN_DRAM */
 274         {
 275          .get_energy_uj = get_energy_counter,
 276          .get_max_energy_range_uj = get_max_energy_counter,
 277          .release = release_zone,
 278          .set_enable = set_domain_enable,
 279          .get_enable = get_domain_enable,
 280          },
 281         /* RAPL_DOMAIN_PLATFORM */
 282         {
 283          .get_energy_uj = get_energy_counter,
 284          .get_max_energy_range_uj = get_max_energy_counter,
 285          .release = release_zone,
 286          .set_enable = set_domain_enable,
 287          .get_enable = get_domain_enable,
 288          },
 289 };
 290 
 291 /*
 292  * Constraint index used by powercap can be different than power limit (PL)
 293  * index in that some  PLs maybe missing due to non-existent MSRs. So we
 294  * need to convert here by finding the valid PLs only (name populated).
 295  */
 296 static int contraint_to_pl(struct rapl_domain *rd, int cid)
 297 {
 298         int i, j;
 299 
 300         for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
 301                 if ((rd->rpl[i].name) && j++ == cid) {
 302                         pr_debug("%s: index %d\n", __func__, i);
 303                         return i;
 304                 }
 305         }
 306         pr_err("Cannot find matching power limit for constraint %d\n", cid);
 307 
 308         return -EINVAL;
 309 }
 310 
 311 static int set_power_limit(struct powercap_zone *power_zone, int cid,
 312                            u64 power_limit)
 313 {
 314         struct rapl_domain *rd;
 315         struct rapl_package *rp;
 316         int ret = 0;
 317         int id;
 318 
 319         get_online_cpus();
 320         rd = power_zone_to_rapl_domain(power_zone);
 321         id = contraint_to_pl(rd, cid);
 322         if (id < 0) {
 323                 ret = id;
 324                 goto set_exit;
 325         }
 326 
 327         rp = rd->rp;
 328 
 329         if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
 330                 dev_warn(&power_zone->dev,
 331                          "%s locked by BIOS, monitoring only\n", rd->name);
 332                 ret = -EACCES;
 333                 goto set_exit;
 334         }
 335 
 336         switch (rd->rpl[id].prim_id) {
 337         case PL1_ENABLE:
 338                 rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
 339                 break;
 340         case PL2_ENABLE:
 341                 rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
 342                 break;
 343         default:
 344                 ret = -EINVAL;
 345         }
 346         if (!ret)
 347                 package_power_limit_irq_save(rp);
 348 set_exit:
 349         put_online_cpus();
 350         return ret;
 351 }
 352 
 353 static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
 354                                    u64 *data)
 355 {
 356         struct rapl_domain *rd;
 357         u64 val;
 358         int prim;
 359         int ret = 0;
 360         int id;
 361 
 362         get_online_cpus();
 363         rd = power_zone_to_rapl_domain(power_zone);
 364         id = contraint_to_pl(rd, cid);
 365         if (id < 0) {
 366                 ret = id;
 367                 goto get_exit;
 368         }
 369 
 370         switch (rd->rpl[id].prim_id) {
 371         case PL1_ENABLE:
 372                 prim = POWER_LIMIT1;
 373                 break;
 374         case PL2_ENABLE:
 375                 prim = POWER_LIMIT2;
 376                 break;
 377         default:
 378                 put_online_cpus();
 379                 return -EINVAL;
 380         }
 381         if (rapl_read_data_raw(rd, prim, true, &val))
 382                 ret = -EIO;
 383         else
 384                 *data = val;
 385 
 386 get_exit:
 387         put_online_cpus();
 388 
 389         return ret;
 390 }
 391 
 392 static int set_time_window(struct powercap_zone *power_zone, int cid,
 393                            u64 window)
 394 {
 395         struct rapl_domain *rd;
 396         int ret = 0;
 397         int id;
 398 
 399         get_online_cpus();
 400         rd = power_zone_to_rapl_domain(power_zone);
 401         id = contraint_to_pl(rd, cid);
 402         if (id < 0) {
 403                 ret = id;
 404                 goto set_time_exit;
 405         }
 406 
 407         switch (rd->rpl[id].prim_id) {
 408         case PL1_ENABLE:
 409                 rapl_write_data_raw(rd, TIME_WINDOW1, window);
 410                 break;
 411         case PL2_ENABLE:
 412                 rapl_write_data_raw(rd, TIME_WINDOW2, window);
 413                 break;
 414         default:
 415                 ret = -EINVAL;
 416         }
 417 
 418 set_time_exit:
 419         put_online_cpus();
 420         return ret;
 421 }
 422 
 423 static int get_time_window(struct powercap_zone *power_zone, int cid,
 424                            u64 *data)
 425 {
 426         struct rapl_domain *rd;
 427         u64 val;
 428         int ret = 0;
 429         int id;
 430 
 431         get_online_cpus();
 432         rd = power_zone_to_rapl_domain(power_zone);
 433         id = contraint_to_pl(rd, cid);
 434         if (id < 0) {
 435                 ret = id;
 436                 goto get_time_exit;
 437         }
 438 
 439         switch (rd->rpl[id].prim_id) {
 440         case PL1_ENABLE:
 441                 ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
 442                 break;
 443         case PL2_ENABLE:
 444                 ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
 445                 break;
 446         default:
 447                 put_online_cpus();
 448                 return -EINVAL;
 449         }
 450         if (!ret)
 451                 *data = val;
 452 
 453 get_time_exit:
 454         put_online_cpus();
 455 
 456         return ret;
 457 }
 458 
 459 static const char *get_constraint_name(struct powercap_zone *power_zone,
 460                                        int cid)
 461 {
 462         struct rapl_domain *rd;
 463         int id;
 464 
 465         rd = power_zone_to_rapl_domain(power_zone);
 466         id = contraint_to_pl(rd, cid);
 467         if (id >= 0)
 468                 return rd->rpl[id].name;
 469 
 470         return NULL;
 471 }
 472 
 473 static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
 474 {
 475         struct rapl_domain *rd;
 476         u64 val;
 477         int prim;
 478         int ret = 0;
 479 
 480         get_online_cpus();
 481         rd = power_zone_to_rapl_domain(power_zone);
 482         switch (rd->rpl[id].prim_id) {
 483         case PL1_ENABLE:
 484                 prim = THERMAL_SPEC_POWER;
 485                 break;
 486         case PL2_ENABLE:
 487                 prim = MAX_POWER;
 488                 break;
 489         default:
 490                 put_online_cpus();
 491                 return -EINVAL;
 492         }
 493         if (rapl_read_data_raw(rd, prim, true, &val))
 494                 ret = -EIO;
 495         else
 496                 *data = val;
 497 
 498         put_online_cpus();
 499 
 500         return ret;
 501 }
 502 
 503 static const struct powercap_zone_constraint_ops constraint_ops = {
 504         .set_power_limit_uw = set_power_limit,
 505         .get_power_limit_uw = get_current_power_limit,
 506         .set_time_window_us = set_time_window,
 507         .get_time_window_us = get_time_window,
 508         .get_max_power_uw = get_max_power,
 509         .get_name = get_constraint_name,
 510 };
 511 
 512 /* called after domain detection and package level data are set */
 513 static void rapl_init_domains(struct rapl_package *rp)
 514 {
 515         enum rapl_domain_type i;
 516         enum rapl_domain_reg_id j;
 517         struct rapl_domain *rd = rp->domains;
 518 
 519         for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 520                 unsigned int mask = rp->domain_map & (1 << i);
 521 
 522                 if (!mask)
 523                         continue;
 524 
 525                 rd->rp = rp;
 526                 rd->name = rapl_domain_names[i];
 527                 rd->id = i;
 528                 rd->rpl[0].prim_id = PL1_ENABLE;
 529                 rd->rpl[0].name = pl1_name;
 530                 /* some domain may support two power limits */
 531                 if (rp->priv->limits[i] == 2) {
 532                         rd->rpl[1].prim_id = PL2_ENABLE;
 533                         rd->rpl[1].name = pl2_name;
 534                 }
 535 
 536                 for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
 537                         rd->regs[j] = rp->priv->regs[i][j];
 538 
 539                 if (i == RAPL_DOMAIN_DRAM) {
 540                         rd->domain_energy_unit =
 541                             rapl_defaults->dram_domain_energy_unit;
 542                         if (rd->domain_energy_unit)
 543                                 pr_info("DRAM domain energy unit %dpj\n",
 544                                         rd->domain_energy_unit);
 545                 }
 546                 rd++;
 547         }
 548 }
 549 
 550 static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
 551                            u64 value, int to_raw)
 552 {
 553         u64 units = 1;
 554         struct rapl_package *rp = rd->rp;
 555         u64 scale = 1;
 556 
 557         switch (type) {
 558         case POWER_UNIT:
 559                 units = rp->power_unit;
 560                 break;
 561         case ENERGY_UNIT:
 562                 scale = ENERGY_UNIT_SCALE;
 563                 /* per domain unit takes precedence */
 564                 if (rd->domain_energy_unit)
 565                         units = rd->domain_energy_unit;
 566                 else
 567                         units = rp->energy_unit;
 568                 break;
 569         case TIME_UNIT:
 570                 return rapl_defaults->compute_time_window(rp, value, to_raw);
 571         case ARBITRARY_UNIT:
 572         default:
 573                 return value;
 574         };
 575 
 576         if (to_raw)
 577                 return div64_u64(value, units) * scale;
 578 
 579         value *= units;
 580 
 581         return div64_u64(value, scale);
 582 }
 583 
/*
 * Primitive description table, in the order of enum rapl_primitives:
 * rapl_read_data_raw() and rapl_write_data_raw() index it directly with the
 * primitive id, so entries must not be reordered.
 */
static struct rapl_primitive_info rpi[] = {
        /* name, mask, shift, register id, unit type, flag */
        PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
                            RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
        PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
                            RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
                            RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
        /* default lock bit; rapl_read_data_raw() uses bit 63 for 2-limit domains */
        PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
                            RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
        PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
                            RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
                            RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
                            0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
                            RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
                            RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
        PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
                            RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
                            RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
        PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
                            RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
        /* non-hardware */
        PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
                            RAPL_PRIMITIVE_DERIVED),
        /* terminator: the NULL name makes rapl_read_data_raw() return -EINVAL */
        {NULL, 0, 0, 0},
};
 624 
 625 /* Read primitive data based on its related struct rapl_primitive_info.
 626  * if xlate flag is set, return translated data based on data units, i.e.
 627  * time, energy, and power.
 628  * RAPL MSRs are non-architectual and are laid out not consistently across
 629  * domains. Here we use primitive info to allow writing consolidated access
 630  * functions.
 631  * For a given primitive, it is processed by MSR mask and shift. Unit conversion
 632  * is pre-assigned based on RAPL unit MSRs read at init time.
 633  * 63-------------------------- 31--------------------------- 0
 634  * |                           xxxxx (mask)                   |
 635  * |                                |<- shift ----------------|
 636  * 63-------------------------- 31--------------------------- 0
 637  */
 638 static int rapl_read_data_raw(struct rapl_domain *rd,
 639                               enum rapl_primitives prim, bool xlate, u64 *data)
 640 {
 641         u64 value;
 642         struct rapl_primitive_info *rp = &rpi[prim];
 643         struct reg_action ra;
 644         int cpu;
 645 
 646         if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
 647                 return -EINVAL;
 648 
 649         ra.reg = rd->regs[rp->id];
 650         if (!ra.reg)
 651                 return -EINVAL;
 652 
 653         cpu = rd->rp->lead_cpu;
 654 
 655         /* domain with 2 limits has different bit */
 656         if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
 657                 rp->mask = POWER_HIGH_LOCK;
 658                 rp->shift = 63;
 659         }
 660         /* non-hardware data are collected by the polling thread */
 661         if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
 662                 *data = rd->rdd.primitives[prim];
 663                 return 0;
 664         }
 665 
 666         ra.mask = rp->mask;
 667 
 668         if (rd->rp->priv->read_raw(cpu, &ra)) {
 669                 pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
 670                 return -EIO;
 671         }
 672 
 673         value = ra.value >> rp->shift;
 674 
 675         if (xlate)
 676                 *data = rapl_unit_xlate(rd, rp->unit, value, 0);
 677         else
 678                 *data = value;
 679 
 680         return 0;
 681 }
 682 
 683 /* Similar use of primitive info in the read counterpart */
 684 static int rapl_write_data_raw(struct rapl_domain *rd,
 685                                enum rapl_primitives prim,
 686                                unsigned long long value)
 687 {
 688         struct rapl_primitive_info *rp = &rpi[prim];
 689         int cpu;
 690         u64 bits;
 691         struct reg_action ra;
 692         int ret;
 693 
 694         cpu = rd->rp->lead_cpu;
 695         bits = rapl_unit_xlate(rd, rp->unit, value, 1);
 696         bits <<= rp->shift;
 697         bits &= rp->mask;
 698 
 699         memset(&ra, 0, sizeof(ra));
 700 
 701         ra.reg = rd->regs[rp->id];
 702         ra.mask = rp->mask;
 703         ra.value = bits;
 704 
 705         ret = rd->rp->priv->write_raw(cpu, &ra);
 706 
 707         return ret;
 708 }
 709 
 710 /*
 711  * Raw RAPL data stored in MSRs are in certain scales. We need to
 712  * convert them into standard units based on the units reported in
 713  * the RAPL unit MSRs. This is specific to CPUs as the method to
 714  * calculate units differ on different CPUs.
 715  * We convert the units to below format based on CPUs.
 716  * i.e.
 717  * energy unit: picoJoules  : Represented in picoJoules by default
 718  * power unit : microWatts  : Represented in milliWatts by default
 719  * time unit  : microseconds: Represented in seconds by default
 720  */
 721 static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
 722 {
 723         struct reg_action ra;
 724         u32 value;
 725 
 726         ra.reg = rp->priv->reg_unit;
 727         ra.mask = ~0;
 728         if (rp->priv->read_raw(cpu, &ra)) {
 729                 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 730                        rp->priv->reg_unit, cpu);
 731                 return -ENODEV;
 732         }
 733 
 734         value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
 735         rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
 736 
 737         value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
 738         rp->power_unit = 1000000 / (1 << value);
 739 
 740         value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
 741         rp->time_unit = 1000000 / (1 << value);
 742 
 743         pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
 744                  rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
 745 
 746         return 0;
 747 }
 748 
 749 static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
 750 {
 751         struct reg_action ra;
 752         u32 value;
 753 
 754         ra.reg = rp->priv->reg_unit;
 755         ra.mask = ~0;
 756         if (rp->priv->read_raw(cpu, &ra)) {
 757                 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
 758                        rp->priv->reg_unit, cpu);
 759                 return -ENODEV;
 760         }
 761 
 762         value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
 763         rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
 764 
 765         value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
 766         rp->power_unit = (1 << value) * 1000;
 767 
 768         value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
 769         rp->time_unit = 1000000 / (1 << value);
 770 
 771         pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
 772                  rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
 773 
 774         return 0;
 775 }
 776 
 777 static void power_limit_irq_save_cpu(void *info)
 778 {
 779         u32 l, h = 0;
 780         struct rapl_package *rp = (struct rapl_package *)info;
 781 
 782         /* save the state of PLN irq mask bit before disabling it */
 783         rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
 784         if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
 785                 rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
 786                 rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
 787         }
 788         l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
 789         wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 790 }
 791 
 792 /* REVISIT:
 793  * When package power limit is set artificially low by RAPL, LVT
 794  * thermal interrupt for package power limit should be ignored
 795  * since we are not really exceeding the real limit. The intention
 796  * is to avoid excessive interrupts while we are trying to save power.
 797  * A useful feature might be routing the package_power_limit interrupt
 798  * to userspace via eventfd. once we have a usecase, this is simple
 799  * to do by adding an atomic notifier.
 800  */
 801 
 802 static void package_power_limit_irq_save(struct rapl_package *rp)
 803 {
 804         if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
 805                 return;
 806 
 807         smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
 808 }
 809 
 810 /*
 811  * Restore per package power limit interrupt enable state. Called from cpu
 812  * hotplug code on package removal.
 813  */
 814 static void package_power_limit_irq_restore(struct rapl_package *rp)
 815 {
 816         u32 l, h;
 817 
 818         if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
 819                 return;
 820 
 821         /* irq enable state not saved, nothing to restore */
 822         if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
 823                 return;
 824 
 825         rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
 826 
 827         if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
 828                 l |= PACKAGE_THERM_INT_PLN_ENABLE;
 829         else
 830                 l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
 831 
 832         wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 833 }
 834 
 835 static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
 836 {
 837         int nr_powerlimit = find_nr_power_limit(rd);
 838 
 839         /* always enable clamp such that p-state can go below OS requested
 840          * range. power capping priority over guranteed frequency.
 841          */
 842         rapl_write_data_raw(rd, PL1_CLAMP, mode);
 843 
 844         /* some domains have pl2 */
 845         if (nr_powerlimit > 1) {
 846                 rapl_write_data_raw(rd, PL2_ENABLE, mode);
 847                 rapl_write_data_raw(rd, PL2_CLAMP, mode);
 848         }
 849 }
 850 
 851 static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
 852 {
 853         static u32 power_ctrl_orig_val;
 854         u32 mdata;
 855 
 856         if (!rapl_defaults->floor_freq_reg_addr) {
 857                 pr_err("Invalid floor frequency config register\n");
 858                 return;
 859         }
 860 
 861         if (!power_ctrl_orig_val)
 862                 iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
 863                               rapl_defaults->floor_freq_reg_addr,
 864                               &power_ctrl_orig_val);
 865         mdata = power_ctrl_orig_val;
 866         if (enable) {
 867                 mdata &= ~(0x7f << 8);
 868                 mdata |= 1 << 8;
 869         }
 870         iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
 871                        rapl_defaults->floor_freq_reg_addr, mdata);
 872 }
 873 
 874 static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
 875                                          bool to_raw)
 876 {
 877         u64 f, y;               /* fraction and exp. used for time unit */
 878 
 879         /*
 880          * Special processing based on 2^Y*(1+F/4), refer
 881          * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
 882          */
 883         if (!to_raw) {
 884                 f = (value & 0x60) >> 5;
 885                 y = value & 0x1f;
 886                 value = (1 << y) * (4 + f) * rp->time_unit / 4;
 887         } else {
 888                 do_div(value, rp->time_unit);
 889                 y = ilog2(value);
 890                 f = div64_u64(4 * (value - (1 << y)), 1 << y);
 891                 value = (y & 0x1f) | ((f & 0x3) << 5);
 892         }
 893         return value;
 894 }
 895 
 896 static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
 897                                          bool to_raw)
 898 {
 899         /*
 900          * Atom time unit encoding is straight forward val * time_unit,
 901          * where time_unit is default to 1 sec. Never 0.
 902          */
 903         if (!to_raw)
 904                 return (value) ? value *= rp->time_unit : rp->time_unit;
 905 
 906         value = div64_u64(value, rp->time_unit);
 907 
 908         return value;
 909 }
 910 
 911 static const struct rapl_defaults rapl_defaults_core = {
 912         .floor_freq_reg_addr = 0,
 913         .check_unit = rapl_check_unit_core,
 914         .set_floor_freq = set_floor_freq_default,
 915         .compute_time_window = rapl_compute_time_window_core,
 916 };
 917 
 918 static const struct rapl_defaults rapl_defaults_hsw_server = {
 919         .check_unit = rapl_check_unit_core,
 920         .set_floor_freq = set_floor_freq_default,
 921         .compute_time_window = rapl_compute_time_window_core,
 922         .dram_domain_energy_unit = 15300,
 923 };
 924 
 925 static const struct rapl_defaults rapl_defaults_byt = {
 926         .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
 927         .check_unit = rapl_check_unit_atom,
 928         .set_floor_freq = set_floor_freq_atom,
 929         .compute_time_window = rapl_compute_time_window_atom,
 930 };
 931 
 932 static const struct rapl_defaults rapl_defaults_tng = {
 933         .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
 934         .check_unit = rapl_check_unit_atom,
 935         .set_floor_freq = set_floor_freq_atom,
 936         .compute_time_window = rapl_compute_time_window_atom,
 937 };
 938 
 939 static const struct rapl_defaults rapl_defaults_ann = {
 940         .floor_freq_reg_addr = 0,
 941         .check_unit = rapl_check_unit_atom,
 942         .set_floor_freq = NULL,
 943         .compute_time_window = rapl_compute_time_window_atom,
 944 };
 945 
 946 static const struct rapl_defaults rapl_defaults_cht = {
 947         .floor_freq_reg_addr = 0,
 948         .check_unit = rapl_check_unit_atom,
 949         .set_floor_freq = NULL,
 950         .compute_time_window = rapl_compute_time_window_atom,
 951 };
 952 
 953 static const struct x86_cpu_id rapl_ids[] __initconst = {
 954         INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
 955         INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),
 956 
 957         INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
 958         INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),
 959 
 960         INTEL_CPU_FAM6(HASWELL, rapl_defaults_core),
 961         INTEL_CPU_FAM6(HASWELL_L, rapl_defaults_core),
 962         INTEL_CPU_FAM6(HASWELL_G, rapl_defaults_core),
 963         INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),
 964 
 965         INTEL_CPU_FAM6(BROADWELL, rapl_defaults_core),
 966         INTEL_CPU_FAM6(BROADWELL_G, rapl_defaults_core),
 967         INTEL_CPU_FAM6(BROADWELL_D, rapl_defaults_core),
 968         INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),
 969 
 970         INTEL_CPU_FAM6(SKYLAKE, rapl_defaults_core),
 971         INTEL_CPU_FAM6(SKYLAKE_L, rapl_defaults_core),
 972         INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
 973         INTEL_CPU_FAM6(KABYLAKE_L, rapl_defaults_core),
 974         INTEL_CPU_FAM6(KABYLAKE, rapl_defaults_core),
 975         INTEL_CPU_FAM6(CANNONLAKE_L, rapl_defaults_core),
 976         INTEL_CPU_FAM6(ICELAKE_L, rapl_defaults_core),
 977         INTEL_CPU_FAM6(ICELAKE, rapl_defaults_core),
 978         INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core),
 979         INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server),
 980         INTEL_CPU_FAM6(ICELAKE_D, rapl_defaults_hsw_server),
 981 
 982         INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
 983         INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
 984         INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
 985         INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
 986         INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
 987         INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
 988         INTEL_CPU_FAM6(ATOM_GOLDMONT_D, rapl_defaults_core),
 989         INTEL_CPU_FAM6(ATOM_TREMONT_D, rapl_defaults_core),
 990 
 991         INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
 992         INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
 993         {}
 994 };
 995 
 996 MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
 997 
 998 /* Read once for all raw primitive data for domains */
 999 static void rapl_update_domain_data(struct rapl_package *rp)
1000 {
1001         int dmn, prim;
1002         u64 val;
1003 
1004         for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1005                 pr_debug("update %s domain %s data\n", rp->name,
1006                          rp->domains[dmn].name);
1007                 /* exclude non-raw primitives */
1008                 for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1009                         if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1010                                                 rpi[prim].unit, &val))
1011                                 rp->domains[dmn].rdd.primitives[prim] = val;
1012                 }
1013         }
1014 
1015 }
1016 
1017 static int rapl_package_register_powercap(struct rapl_package *rp)
1018 {
1019         struct rapl_domain *rd;
1020         struct powercap_zone *power_zone = NULL;
1021         int nr_pl, ret;
1022 
1023         /* Update the domain data of the new package */
1024         rapl_update_domain_data(rp);
1025 
1026         /* first we register package domain as the parent zone */
1027         for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1028                 if (rd->id == RAPL_DOMAIN_PACKAGE) {
1029                         nr_pl = find_nr_power_limit(rd);
1030                         pr_debug("register package domain %s\n", rp->name);
1031                         power_zone = powercap_register_zone(&rd->power_zone,
1032                                             rp->priv->control_type, rp->name,
1033                                             NULL, &zone_ops[rd->id], nr_pl,
1034                                             &constraint_ops);
1035                         if (IS_ERR(power_zone)) {
1036                                 pr_debug("failed to register power zone %s\n",
1037                                          rp->name);
1038                                 return PTR_ERR(power_zone);
1039                         }
1040                         /* track parent zone in per package/socket data */
1041                         rp->power_zone = power_zone;
1042                         /* done, only one package domain per socket */
1043                         break;
1044                 }
1045         }
1046         if (!power_zone) {
1047                 pr_err("no package domain found, unknown topology!\n");
1048                 return -ENODEV;
1049         }
1050         /* now register domains as children of the socket/package */
1051         for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1052                 if (rd->id == RAPL_DOMAIN_PACKAGE)
1053                         continue;
1054                 /* number of power limits per domain varies */
1055                 nr_pl = find_nr_power_limit(rd);
1056                 power_zone = powercap_register_zone(&rd->power_zone,
1057                                                     rp->priv->control_type,
1058                                                     rd->name, rp->power_zone,
1059                                                     &zone_ops[rd->id], nr_pl,
1060                                                     &constraint_ops);
1061 
1062                 if (IS_ERR(power_zone)) {
1063                         pr_debug("failed to register power_zone, %s:%s\n",
1064                                  rp->name, rd->name);
1065                         ret = PTR_ERR(power_zone);
1066                         goto err_cleanup;
1067                 }
1068         }
1069         return 0;
1070 
1071 err_cleanup:
1072         /*
1073          * Clean up previously initialized domains within the package if we
1074          * failed after the first domain setup.
1075          */
1076         while (--rd >= rp->domains) {
1077                 pr_debug("unregister %s domain %s\n", rp->name, rd->name);
1078                 powercap_unregister_zone(rp->priv->control_type,
1079                                          &rd->power_zone);
1080         }
1081 
1082         return ret;
1083 }
1084 
1085 int rapl_add_platform_domain(struct rapl_if_priv *priv)
1086 {
1087         struct rapl_domain *rd;
1088         struct powercap_zone *power_zone;
1089         struct reg_action ra;
1090         int ret;
1091 
1092         ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1093         ra.mask = ~0;
1094         ret = priv->read_raw(0, &ra);
1095         if (ret || !ra.value)
1096                 return -ENODEV;
1097 
1098         ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1099         ra.mask = ~0;
1100         ret = priv->read_raw(0, &ra);
1101         if (ret || !ra.value)
1102                 return -ENODEV;
1103 
1104         rd = kzalloc(sizeof(*rd), GFP_KERNEL);
1105         if (!rd)
1106                 return -ENOMEM;
1107 
1108         rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
1109         rd->id = RAPL_DOMAIN_PLATFORM;
1110         rd->regs[RAPL_DOMAIN_REG_LIMIT] =
1111             priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
1112         rd->regs[RAPL_DOMAIN_REG_STATUS] =
1113             priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
1114         rd->rpl[0].prim_id = PL1_ENABLE;
1115         rd->rpl[0].name = pl1_name;
1116         rd->rpl[1].prim_id = PL2_ENABLE;
1117         rd->rpl[1].name = pl2_name;
1118         rd->rp = rapl_find_package_domain(0, priv);
1119 
1120         power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
1121                                             "psys", NULL,
1122                                             &zone_ops[RAPL_DOMAIN_PLATFORM],
1123                                             2, &constraint_ops);
1124 
1125         if (IS_ERR(power_zone)) {
1126                 kfree(rd);
1127                 return PTR_ERR(power_zone);
1128         }
1129 
1130         priv->platform_rapl_domain = rd;
1131 
1132         return 0;
1133 }
1134 EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
1135 
1136 void rapl_remove_platform_domain(struct rapl_if_priv *priv)
1137 {
1138         if (priv->platform_rapl_domain) {
1139                 powercap_unregister_zone(priv->control_type,
1140                                  &priv->platform_rapl_domain->power_zone);
1141                 kfree(priv->platform_rapl_domain);
1142         }
1143 }
1144 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
1145 
1146 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
1147 {
1148         struct reg_action ra;
1149 
1150         switch (domain) {
1151         case RAPL_DOMAIN_PACKAGE:
1152         case RAPL_DOMAIN_PP0:
1153         case RAPL_DOMAIN_PP1:
1154         case RAPL_DOMAIN_DRAM:
1155                 ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
1156                 break;
1157         case RAPL_DOMAIN_PLATFORM:
1158                 /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
1159                 return -EINVAL;
1160         default:
1161                 pr_err("invalid domain id %d\n", domain);
1162                 return -EINVAL;
1163         }
1164         /* make sure domain counters are available and contains non-zero
1165          * values, otherwise skip it.
1166          */
1167 
1168         ra.mask = ~0;
1169         if (rp->priv->read_raw(cpu, &ra) || !ra.value)
1170                 return -ENODEV;
1171 
1172         return 0;
1173 }
1174 
1175 /*
1176  * Check if power limits are available. Two cases when they are not available:
1177  * 1. Locked by BIOS, in this case we still provide read-only access so that
1178  *    users can see what limit is set by the BIOS.
1179  * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1180  *    exist at all. In this case, we do not show the constraints in powercap.
1181  *
1182  * Called after domains are detected and initialized.
1183  */
1184 static void rapl_detect_powerlimit(struct rapl_domain *rd)
1185 {
1186         u64 val64;
1187         int i;
1188 
1189         /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1190         if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
1191                 if (val64) {
1192                         pr_info("RAPL %s domain %s locked by BIOS\n",
1193                                 rd->rp->name, rd->name);
1194                         rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1195                 }
1196         }
1197         /* check if power limit MSR exists, otherwise domain is monitoring only */
1198         for (i = 0; i < NR_POWER_LIMITS; i++) {
1199                 int prim = rd->rpl[i].prim_id;
1200 
1201                 if (rapl_read_data_raw(rd, prim, false, &val64))
1202                         rd->rpl[i].name = NULL;
1203         }
1204 }
1205 
1206 /* Detect active and valid domains for the given CPU, caller must
1207  * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1208  */
1209 static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1210 {
1211         struct rapl_domain *rd;
1212         int i;
1213 
1214         for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1215                 /* use physical package id to read counters */
1216                 if (!rapl_check_domain(cpu, i, rp)) {
1217                         rp->domain_map |= 1 << i;
1218                         pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
1219                 }
1220         }
1221         rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1222         if (!rp->nr_domains) {
1223                 pr_debug("no valid rapl domains found in %s\n", rp->name);
1224                 return -ENODEV;
1225         }
1226         pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
1227 
1228         rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1229                               GFP_KERNEL);
1230         if (!rp->domains)
1231                 return -ENOMEM;
1232 
1233         rapl_init_domains(rp);
1234 
1235         for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1236                 rapl_detect_powerlimit(rd);
1237 
1238         return 0;
1239 }
1240 
1241 /* called from CPU hotplug notifier, hotplug lock held */
1242 void rapl_remove_package(struct rapl_package *rp)
1243 {
1244         struct rapl_domain *rd, *rd_package = NULL;
1245 
1246         package_power_limit_irq_restore(rp);
1247 
1248         for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1249                 rapl_write_data_raw(rd, PL1_ENABLE, 0);
1250                 rapl_write_data_raw(rd, PL1_CLAMP, 0);
1251                 if (find_nr_power_limit(rd) > 1) {
1252                         rapl_write_data_raw(rd, PL2_ENABLE, 0);
1253                         rapl_write_data_raw(rd, PL2_CLAMP, 0);
1254                 }
1255                 if (rd->id == RAPL_DOMAIN_PACKAGE) {
1256                         rd_package = rd;
1257                         continue;
1258                 }
1259                 pr_debug("remove package, undo power limit on %s: %s\n",
1260                          rp->name, rd->name);
1261                 powercap_unregister_zone(rp->priv->control_type,
1262                                          &rd->power_zone);
1263         }
1264         /* do parent zone last */
1265         powercap_unregister_zone(rp->priv->control_type,
1266                                  &rd_package->power_zone);
1267         list_del(&rp->plist);
1268         kfree(rp);
1269 }
1270 EXPORT_SYMBOL_GPL(rapl_remove_package);
1271 
1272 /* caller to ensure CPU hotplug lock is held */
1273 struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
1274 {
1275         int id = topology_logical_die_id(cpu);
1276         struct rapl_package *rp;
1277 
1278         list_for_each_entry(rp, &rapl_packages, plist) {
1279                 if (rp->id == id
1280                     && rp->priv->control_type == priv->control_type)
1281                         return rp;
1282         }
1283 
1284         return NULL;
1285 }
1286 EXPORT_SYMBOL_GPL(rapl_find_package_domain);
1287 
1288 /* called from CPU hotplug notifier, hotplug lock held */
1289 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
1290 {
1291         int id = topology_logical_die_id(cpu);
1292         struct rapl_package *rp;
1293         struct cpuinfo_x86 *c = &cpu_data(cpu);
1294         int ret;
1295 
1296         if (!rapl_defaults)
1297                 return ERR_PTR(-ENODEV);
1298 
1299         rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1300         if (!rp)
1301                 return ERR_PTR(-ENOMEM);
1302 
1303         /* add the new package to the list */
1304         rp->id = id;
1305         rp->lead_cpu = cpu;
1306         rp->priv = priv;
1307 
1308         if (topology_max_die_per_package() > 1)
1309                 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
1310                          "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
1311         else
1312                 snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
1313                          c->phys_proc_id);
1314 
1315         /* check if the package contains valid domains */
1316         if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
1317                 ret = -ENODEV;
1318                 goto err_free_package;
1319         }
1320         ret = rapl_package_register_powercap(rp);
1321         if (!ret) {
1322                 INIT_LIST_HEAD(&rp->plist);
1323                 list_add(&rp->plist, &rapl_packages);
1324                 return rp;
1325         }
1326 
1327 err_free_package:
1328         kfree(rp->domains);
1329         kfree(rp);
1330         return ERR_PTR(ret);
1331 }
1332 EXPORT_SYMBOL_GPL(rapl_add_package);
1333 
1334 static void power_limit_state_save(void)
1335 {
1336         struct rapl_package *rp;
1337         struct rapl_domain *rd;
1338         int nr_pl, ret, i;
1339 
1340         get_online_cpus();
1341         list_for_each_entry(rp, &rapl_packages, plist) {
1342                 if (!rp->power_zone)
1343                         continue;
1344                 rd = power_zone_to_rapl_domain(rp->power_zone);
1345                 nr_pl = find_nr_power_limit(rd);
1346                 for (i = 0; i < nr_pl; i++) {
1347                         switch (rd->rpl[i].prim_id) {
1348                         case PL1_ENABLE:
1349                                 ret = rapl_read_data_raw(rd,
1350                                                  POWER_LIMIT1, true,
1351                                                  &rd->rpl[i].last_power_limit);
1352                                 if (ret)
1353                                         rd->rpl[i].last_power_limit = 0;
1354                                 break;
1355                         case PL2_ENABLE:
1356                                 ret = rapl_read_data_raw(rd,
1357                                                  POWER_LIMIT2, true,
1358                                                  &rd->rpl[i].last_power_limit);
1359                                 if (ret)
1360                                         rd->rpl[i].last_power_limit = 0;
1361                                 break;
1362                         }
1363                 }
1364         }
1365         put_online_cpus();
1366 }
1367 
1368 static void power_limit_state_restore(void)
1369 {
1370         struct rapl_package *rp;
1371         struct rapl_domain *rd;
1372         int nr_pl, i;
1373 
1374         get_online_cpus();
1375         list_for_each_entry(rp, &rapl_packages, plist) {
1376                 if (!rp->power_zone)
1377                         continue;
1378                 rd = power_zone_to_rapl_domain(rp->power_zone);
1379                 nr_pl = find_nr_power_limit(rd);
1380                 for (i = 0; i < nr_pl; i++) {
1381                         switch (rd->rpl[i].prim_id) {
1382                         case PL1_ENABLE:
1383                                 if (rd->rpl[i].last_power_limit)
1384                                         rapl_write_data_raw(rd, POWER_LIMIT1,
1385                                             rd->rpl[i].last_power_limit);
1386                                 break;
1387                         case PL2_ENABLE:
1388                                 if (rd->rpl[i].last_power_limit)
1389                                         rapl_write_data_raw(rd, POWER_LIMIT2,
1390                                             rd->rpl[i].last_power_limit);
1391                                 break;
1392                         }
1393                 }
1394         }
1395         put_online_cpus();
1396 }
1397 
1398 static int rapl_pm_callback(struct notifier_block *nb,
1399                             unsigned long mode, void *_unused)
1400 {
1401         switch (mode) {
1402         case PM_SUSPEND_PREPARE:
1403                 power_limit_state_save();
1404                 break;
1405         case PM_POST_SUSPEND:
1406                 power_limit_state_restore();
1407                 break;
1408         }
1409         return NOTIFY_OK;
1410 }
1411 
1412 static struct notifier_block rapl_pm_notifier = {
1413         .notifier_call = rapl_pm_callback,
1414 };
1415 
1416 static struct platform_device *rapl_msr_platdev;
1417 
1418 static int __init rapl_init(void)
1419 {
1420         const struct x86_cpu_id *id;
1421         int ret;
1422 
1423         id = x86_match_cpu(rapl_ids);
1424         if (!id) {
1425                 pr_err("driver does not support CPU family %d model %d\n",
1426                        boot_cpu_data.x86, boot_cpu_data.x86_model);
1427 
1428                 return -ENODEV;
1429         }
1430 
1431         rapl_defaults = (struct rapl_defaults *)id->driver_data;
1432 
1433         ret = register_pm_notifier(&rapl_pm_notifier);
1434         if (ret)
1435                 return ret;
1436 
1437         rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
1438         if (!rapl_msr_platdev) {
1439                 ret = -ENOMEM;
1440                 goto end;
1441         }
1442 
1443         ret = platform_device_add(rapl_msr_platdev);
1444         if (ret)
1445                 platform_device_put(rapl_msr_platdev);
1446 
1447 end:
1448         if (ret)
1449                 unregister_pm_notifier(&rapl_pm_notifier);
1450 
1451         return ret;
1452 }
1453 
1454 static void __exit rapl_exit(void)
1455 {
1456         platform_device_unregister(rapl_msr_platdev);
1457         unregister_pm_notifier(&rapl_pm_notifier);
1458 }
1459 
1460 fs_initcall(rapl_init);
1461 module_exit(rapl_exit);
1462 
1463 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
1464 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1465 MODULE_LICENSE("GPL v2");

/* [<][>][^][v][top][bottom][index][help] */