1/* 2 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters 3 * Copyright (C) 2013 Google, Inc., Stephane Eranian 4 * 5 * Intel RAPL interface is specified in the IA-32 Manual Vol3b 6 * section 14.7.1 (September 2013) 7 * 8 * RAPL provides more controls than just reporting energy consumption 9 * however here we only expose the 3 energy consumption free running 10 * counters (pp0, pkg, dram). 11 * 12 * Each of those counters increments in a power unit defined by the 13 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules 14 * but it can vary. 15 * 16 * Counter to rapl events mappings: 17 * 18 * pp0 counter: consumption of all physical cores (power plane 0) 19 * event: rapl_energy_cores 20 * perf code: 0x1 21 * 22 * pkg counter: consumption of the whole processor package 23 * event: rapl_energy_pkg 24 * perf code: 0x2 25 * 26 * dram counter: consumption of the dram domain (servers only) 27 * event: rapl_energy_dram 28 * perf code: 0x3 29 * 30 * dram counter: consumption of the builtin-gpu domain (client only) 31 * event: rapl_energy_gpu 32 * perf code: 0x4 33 * 34 * We manage those counters as free running (read-only). They may be 35 * use simultaneously by other tools, such as turbostat. 36 * 37 * The events only support system-wide mode counting. There is no 38 * sampling support because it does not make sense and is not 39 * supported by the RAPL hardware. 40 * 41 * Because we want to avoid floating-point operations in the kernel, 42 * the events are all reported in fixed point arithmetic (32.32). 43 * Tools must adjust the counts to convert them to Watts using 44 * the duration of the measurement. Tools may use a function such as 45 * ldexp(raw_count, -32); 46 */ 47#include <linux/module.h> 48#include <linux/slab.h> 49#include <linux/perf_event.h> 50#include <asm/cpu_device_id.h> 51#include "perf_event.h" 52 53/* 54 * RAPL energy status counters 55 */ 56#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ 57#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ 58#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ 59#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ 60#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ 61#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ 62#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ 63#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ 64 65#define NR_RAPL_DOMAINS 0x4 66static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 67 "pp0-core", 68 "package", 69 "dram", 70 "pp1-gpu", 71}; 72 73/* Clients have PP0, PKG */ 74#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ 75 1<<RAPL_IDX_PKG_NRG_STAT|\ 76 1<<RAPL_IDX_PP1_NRG_STAT) 77 78/* Servers have PP0, PKG, RAM */ 79#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ 80 1<<RAPL_IDX_PKG_NRG_STAT|\ 81 1<<RAPL_IDX_RAM_NRG_STAT) 82 83/* Servers have PP0, PKG, RAM, PP1 */ 84#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ 85 1<<RAPL_IDX_PKG_NRG_STAT|\ 86 1<<RAPL_IDX_RAM_NRG_STAT|\ 87 1<<RAPL_IDX_PP1_NRG_STAT) 88 89/* Knights Landing has PKG, RAM */ 90#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ 91 1<<RAPL_IDX_RAM_NRG_STAT) 92 93/* 94 * event code: LSB 8 bits, passed in attr->config 95 * any other bit is reserved 96 */ 97#define RAPL_EVENT_MASK 0xFFULL 98 99#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ 100static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ 101 struct kobj_attribute *attr, \ 102 char *page) \ 103{ \ 104 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ 105 return sprintf(page, _format "\n"); \ 106} \ 107static struct kobj_attribute format_attr_##_var = \ 108 __ATTR(_name, 0444, __rapl_##_var##_show, NULL) 109 110#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ 111 112#define RAPL_EVENT_ATTR_STR(_name, v, str) \ 113static struct perf_pmu_events_attr event_attr_##v = { \ 114 .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \ 115 .id = 0, \ 116 .event_str = str, \ 117}; 118 119struct rapl_pmu { 120 spinlock_t lock; 121 int n_active; /* number of active events */ 122 struct list_head active_list; 123 struct pmu *pmu; /* pointer to rapl_pmu_class */ 124 ktime_t timer_interval; /* in ktime_t unit */ 125 struct hrtimer hrtimer; 126}; 127 128static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ 129static struct pmu rapl_pmu_class; 130static cpumask_t rapl_cpu_mask; 131static int rapl_cntr_mask; 132 133static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); 134static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); 135 136static struct x86_pmu_quirk *rapl_quirks; 137static inline u64 rapl_read_counter(struct perf_event *event) 138{ 139 u64 raw; 140 rdmsrl(event->hw.event_base, raw); 141 return raw; 142} 143 144#define rapl_add_quirk(func_) \ 145do { \ 146 static struct x86_pmu_quirk __quirk __initdata = { \ 147 .func = func_, \ 148 }; \ 149 __quirk.next = rapl_quirks; \ 150 rapl_quirks = &__quirk; \ 151} while (0) 152 153static inline u64 rapl_scale(u64 v, int cfg) 154{ 155 if (cfg > NR_RAPL_DOMAINS) { 156 pr_warn("invalid domain %d, failed to scale data\n", cfg); 157 return v; 158 } 159 /* 160 * scale delta to smallest unit (1/2^32) 161 * users must then scale back: count * 1/(1e9*2^32) to get Joules 162 * or use ldexp(count, -32). 163 * Watts = Joules/Time delta 164 */ 165 return v << (32 - rapl_hw_unit[cfg - 1]); 166} 167 168static u64 rapl_event_update(struct perf_event *event) 169{ 170 struct hw_perf_event *hwc = &event->hw; 171 u64 prev_raw_count, new_raw_count; 172 s64 delta, sdelta; 173 int shift = RAPL_CNTR_WIDTH; 174 175again: 176 prev_raw_count = local64_read(&hwc->prev_count); 177 rdmsrl(event->hw.event_base, new_raw_count); 178 179 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 180 new_raw_count) != prev_raw_count) { 181 cpu_relax(); 182 goto again; 183 } 184 185 /* 186 * Now we have the new raw value and have updated the prev 187 * timestamp already. We can now calculate the elapsed delta 188 * (event-)time and add that to the generic event. 189 * 190 * Careful, not all hw sign-extends above the physical width 191 * of the count. 192 */ 193 delta = (new_raw_count << shift) - (prev_raw_count << shift); 194 delta >>= shift; 195 196 sdelta = rapl_scale(delta, event->hw.config); 197 198 local64_add(sdelta, &event->count); 199 200 return new_raw_count; 201} 202 203static void rapl_start_hrtimer(struct rapl_pmu *pmu) 204{ 205 hrtimer_start(&pmu->hrtimer, pmu->timer_interval, 206 HRTIMER_MODE_REL_PINNED); 207} 208 209static void rapl_stop_hrtimer(struct rapl_pmu *pmu) 210{ 211 hrtimer_cancel(&pmu->hrtimer); 212} 213 214static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) 215{ 216 struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); 217 struct perf_event *event; 218 unsigned long flags; 219 220 if (!pmu->n_active) 221 return HRTIMER_NORESTART; 222 223 spin_lock_irqsave(&pmu->lock, flags); 224 225 list_for_each_entry(event, &pmu->active_list, active_entry) { 226 rapl_event_update(event); 227 } 228 229 spin_unlock_irqrestore(&pmu->lock, flags); 230 231 hrtimer_forward_now(hrtimer, pmu->timer_interval); 232 233 return HRTIMER_RESTART; 234} 235 236static void rapl_hrtimer_init(struct rapl_pmu *pmu) 237{ 238 struct hrtimer *hr = &pmu->hrtimer; 239 240 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 241 hr->function = rapl_hrtimer_handle; 242} 243 244static void __rapl_pmu_event_start(struct rapl_pmu *pmu, 245 struct perf_event *event) 246{ 247 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) 248 return; 249 250 event->hw.state = 0; 251 252 list_add_tail(&event->active_entry, &pmu->active_list); 253 254 local64_set(&event->hw.prev_count, rapl_read_counter(event)); 255 256 pmu->n_active++; 257 if (pmu->n_active == 1) 258 rapl_start_hrtimer(pmu); 259} 260 261static void rapl_pmu_event_start(struct perf_event *event, int mode) 262{ 263 struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); 264 unsigned long flags; 265 266 spin_lock_irqsave(&pmu->lock, flags); 267 __rapl_pmu_event_start(pmu, event); 268 spin_unlock_irqrestore(&pmu->lock, flags); 269} 270 271static void rapl_pmu_event_stop(struct perf_event *event, int mode) 272{ 273 struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); 274 struct hw_perf_event *hwc = &event->hw; 275 unsigned long flags; 276 277 spin_lock_irqsave(&pmu->lock, flags); 278 279 /* mark event as deactivated and stopped */ 280 if (!(hwc->state & PERF_HES_STOPPED)) { 281 WARN_ON_ONCE(pmu->n_active <= 0); 282 pmu->n_active--; 283 if (pmu->n_active == 0) 284 rapl_stop_hrtimer(pmu); 285 286 list_del(&event->active_entry); 287 288 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); 289 hwc->state |= PERF_HES_STOPPED; 290 } 291 292 /* check if update of sw counter is necessary */ 293 if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { 294 /* 295 * Drain the remaining delta count out of a event 296 * that we are disabling: 297 */ 298 rapl_event_update(event); 299 hwc->state |= PERF_HES_UPTODATE; 300 } 301 302 spin_unlock_irqrestore(&pmu->lock, flags); 303} 304 305static int rapl_pmu_event_add(struct perf_event *event, int mode) 306{ 307 struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); 308 struct hw_perf_event *hwc = &event->hw; 309 unsigned long flags; 310 311 spin_lock_irqsave(&pmu->lock, flags); 312 313 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 314 315 if (mode & PERF_EF_START) 316 __rapl_pmu_event_start(pmu, event); 317 318 spin_unlock_irqrestore(&pmu->lock, flags); 319 320 return 0; 321} 322 323static void rapl_pmu_event_del(struct perf_event *event, int flags) 324{ 325 rapl_pmu_event_stop(event, PERF_EF_UPDATE); 326} 327 328static int rapl_pmu_event_init(struct perf_event *event) 329{ 330 u64 cfg = event->attr.config & RAPL_EVENT_MASK; 331 int bit, msr, ret = 0; 332 333 /* only look at RAPL events */ 334 if (event->attr.type != rapl_pmu_class.type) 335 return -ENOENT; 336 337 /* check only supported bits are set */ 338 if (event->attr.config & ~RAPL_EVENT_MASK) 339 return -EINVAL; 340 341 /* 342 * check event is known (determines counter) 343 */ 344 switch (cfg) { 345 case INTEL_RAPL_PP0: 346 bit = RAPL_IDX_PP0_NRG_STAT; 347 msr = MSR_PP0_ENERGY_STATUS; 348 break; 349 case INTEL_RAPL_PKG: 350 bit = RAPL_IDX_PKG_NRG_STAT; 351 msr = MSR_PKG_ENERGY_STATUS; 352 break; 353 case INTEL_RAPL_RAM: 354 bit = RAPL_IDX_RAM_NRG_STAT; 355 msr = MSR_DRAM_ENERGY_STATUS; 356 break; 357 case INTEL_RAPL_PP1: 358 bit = RAPL_IDX_PP1_NRG_STAT; 359 msr = MSR_PP1_ENERGY_STATUS; 360 break; 361 default: 362 return -EINVAL; 363 } 364 /* check event supported */ 365 if (!(rapl_cntr_mask & (1 << bit))) 366 return -EINVAL; 367 368 /* unsupported modes and filters */ 369 if (event->attr.exclude_user || 370 event->attr.exclude_kernel || 371 event->attr.exclude_hv || 372 event->attr.exclude_idle || 373 event->attr.exclude_host || 374 event->attr.exclude_guest || 375 event->attr.sample_period) /* no sampling */ 376 return -EINVAL; 377 378 /* must be done before validate_group */ 379 event->hw.event_base = msr; 380 event->hw.config = cfg; 381 event->hw.idx = bit; 382 383 return ret; 384} 385 386static void rapl_pmu_event_read(struct perf_event *event) 387{ 388 rapl_event_update(event); 389} 390 391static ssize_t rapl_get_attr_cpumask(struct device *dev, 392 struct device_attribute *attr, char *buf) 393{ 394 return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); 395} 396 397static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); 398 399static struct attribute *rapl_pmu_attrs[] = { 400 &dev_attr_cpumask.attr, 401 NULL, 402}; 403 404static struct attribute_group rapl_pmu_attr_group = { 405 .attrs = rapl_pmu_attrs, 406}; 407 408static ssize_t rapl_sysfs_show(struct device *dev, 409 struct device_attribute *attr, 410 char *page) 411{ 412 struct perf_pmu_events_attr *pmu_attr = \ 413 container_of(attr, struct perf_pmu_events_attr, attr); 414 415 if (pmu_attr->event_str) 416 return sprintf(page, "%s", pmu_attr->event_str); 417 418 return 0; 419} 420 421RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); 422RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); 423RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); 424RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); 425 426RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); 427RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); 428RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); 429RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); 430 431/* 432 * we compute in 0.23 nJ increments regardless of MSR 433 */ 434RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); 435RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); 436RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); 437RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); 438 439static struct attribute *rapl_events_srv_attr[] = { 440 EVENT_PTR(rapl_cores), 441 EVENT_PTR(rapl_pkg), 442 EVENT_PTR(rapl_ram), 443 444 EVENT_PTR(rapl_cores_unit), 445 EVENT_PTR(rapl_pkg_unit), 446 EVENT_PTR(rapl_ram_unit), 447 448 EVENT_PTR(rapl_cores_scale), 449 EVENT_PTR(rapl_pkg_scale), 450 EVENT_PTR(rapl_ram_scale), 451 NULL, 452}; 453 454static struct attribute *rapl_events_cln_attr[] = { 455 EVENT_PTR(rapl_cores), 456 EVENT_PTR(rapl_pkg), 457 EVENT_PTR(rapl_gpu), 458 459 EVENT_PTR(rapl_cores_unit), 460 EVENT_PTR(rapl_pkg_unit), 461 EVENT_PTR(rapl_gpu_unit), 462 463 EVENT_PTR(rapl_cores_scale), 464 EVENT_PTR(rapl_pkg_scale), 465 EVENT_PTR(rapl_gpu_scale), 466 NULL, 467}; 468 469static struct attribute *rapl_events_hsw_attr[] = { 470 EVENT_PTR(rapl_cores), 471 EVENT_PTR(rapl_pkg), 472 EVENT_PTR(rapl_gpu), 473 EVENT_PTR(rapl_ram), 474 475 EVENT_PTR(rapl_cores_unit), 476 EVENT_PTR(rapl_pkg_unit), 477 EVENT_PTR(rapl_gpu_unit), 478 EVENT_PTR(rapl_ram_unit), 479 480 EVENT_PTR(rapl_cores_scale), 481 EVENT_PTR(rapl_pkg_scale), 482 EVENT_PTR(rapl_gpu_scale), 483 EVENT_PTR(rapl_ram_scale), 484 NULL, 485}; 486 487static struct attribute *rapl_events_knl_attr[] = { 488 EVENT_PTR(rapl_pkg), 489 EVENT_PTR(rapl_ram), 490 491 EVENT_PTR(rapl_pkg_unit), 492 EVENT_PTR(rapl_ram_unit), 493 494 EVENT_PTR(rapl_pkg_scale), 495 EVENT_PTR(rapl_ram_scale), 496 NULL, 497}; 498 499static struct attribute_group rapl_pmu_events_group = { 500 .name = "events", 501 .attrs = NULL, /* patched at runtime */ 502}; 503 504DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); 505static struct attribute *rapl_formats_attr[] = { 506 &format_attr_event.attr, 507 NULL, 508}; 509 510static struct attribute_group rapl_pmu_format_group = { 511 .name = "format", 512 .attrs = rapl_formats_attr, 513}; 514 515const struct attribute_group *rapl_attr_groups[] = { 516 &rapl_pmu_attr_group, 517 &rapl_pmu_format_group, 518 &rapl_pmu_events_group, 519 NULL, 520}; 521 522static struct pmu rapl_pmu_class = { 523 .attr_groups = rapl_attr_groups, 524 .task_ctx_nr = perf_invalid_context, /* system-wide only */ 525 .event_init = rapl_pmu_event_init, 526 .add = rapl_pmu_event_add, /* must have */ 527 .del = rapl_pmu_event_del, /* must have */ 528 .start = rapl_pmu_event_start, 529 .stop = rapl_pmu_event_stop, 530 .read = rapl_pmu_event_read, 531}; 532 533static void rapl_cpu_exit(int cpu) 534{ 535 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); 536 int i, phys_id = topology_physical_package_id(cpu); 537 int target = -1; 538 539 /* find a new cpu on same package */ 540 for_each_online_cpu(i) { 541 if (i == cpu) 542 continue; 543 if (phys_id == topology_physical_package_id(i)) { 544 target = i; 545 break; 546 } 547 } 548 /* 549 * clear cpu from cpumask 550 * if was set in cpumask and still some cpu on package, 551 * then move to new cpu 552 */ 553 if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) 554 cpumask_set_cpu(target, &rapl_cpu_mask); 555 556 WARN_ON(cpumask_empty(&rapl_cpu_mask)); 557 /* 558 * migrate events and context to new cpu 559 */ 560 if (target >= 0) 561 perf_pmu_migrate_context(pmu->pmu, cpu, target); 562 563 /* cancel overflow polling timer for CPU */ 564 rapl_stop_hrtimer(pmu); 565} 566 567static void rapl_cpu_init(int cpu) 568{ 569 int i, phys_id = topology_physical_package_id(cpu); 570 571 /* check if phys_is is already covered */ 572 for_each_cpu(i, &rapl_cpu_mask) { 573 if (phys_id == topology_physical_package_id(i)) 574 return; 575 } 576 /* was not found, so add it */ 577 cpumask_set_cpu(cpu, &rapl_cpu_mask); 578} 579 580static __init void rapl_hsw_server_quirk(void) 581{ 582 /* 583 * DRAM domain on HSW server has fixed energy unit which can be 584 * different than the unit from power unit MSR. 585 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 586 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " 587 */ 588 rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; 589} 590 591static int rapl_cpu_prepare(int cpu) 592{ 593 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); 594 int phys_id = topology_physical_package_id(cpu); 595 u64 ms; 596 597 if (pmu) 598 return 0; 599 600 if (phys_id < 0) 601 return -1; 602 603 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); 604 if (!pmu) 605 return -1; 606 spin_lock_init(&pmu->lock); 607 608 INIT_LIST_HEAD(&pmu->active_list); 609 610 pmu->pmu = &rapl_pmu_class; 611 612 /* 613 * use reference of 200W for scaling the timeout 614 * to avoid missing counter overflows. 615 * 200W = 200 Joules/sec 616 * divide interval by 2 to avoid lockstep (2 * 100) 617 * if hw unit is 32, then we use 2 ms 1/200/2 618 */ 619 if (rapl_hw_unit[0] < 32) 620 ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); 621 else 622 ms = 2; 623 624 pmu->timer_interval = ms_to_ktime(ms); 625 626 rapl_hrtimer_init(pmu); 627 628 /* set RAPL pmu for this cpu for now */ 629 per_cpu(rapl_pmu, cpu) = pmu; 630 per_cpu(rapl_pmu_to_free, cpu) = NULL; 631 632 return 0; 633} 634 635static void rapl_cpu_kfree(int cpu) 636{ 637 struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); 638 639 kfree(pmu); 640 641 per_cpu(rapl_pmu_to_free, cpu) = NULL; 642} 643 644static int rapl_cpu_dying(int cpu) 645{ 646 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); 647 648 if (!pmu) 649 return 0; 650 651 per_cpu(rapl_pmu, cpu) = NULL; 652 653 per_cpu(rapl_pmu_to_free, cpu) = pmu; 654 655 return 0; 656} 657 658static int rapl_cpu_notifier(struct notifier_block *self, 659 unsigned long action, void *hcpu) 660{ 661 unsigned int cpu = (long)hcpu; 662 663 switch (action & ~CPU_TASKS_FROZEN) { 664 case CPU_UP_PREPARE: 665 rapl_cpu_prepare(cpu); 666 break; 667 case CPU_STARTING: 668 rapl_cpu_init(cpu); 669 break; 670 case CPU_UP_CANCELED: 671 case CPU_DYING: 672 rapl_cpu_dying(cpu); 673 break; 674 case CPU_ONLINE: 675 case CPU_DEAD: 676 rapl_cpu_kfree(cpu); 677 break; 678 case CPU_DOWN_PREPARE: 679 rapl_cpu_exit(cpu); 680 break; 681 default: 682 break; 683 } 684 685 return NOTIFY_OK; 686} 687 688static int rapl_check_hw_unit(void) 689{ 690 u64 msr_rapl_power_unit_bits; 691 int i; 692 693 /* protect rdmsrl() to handle virtualization */ 694 if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) 695 return -1; 696 for (i = 0; i < NR_RAPL_DOMAINS; i++) 697 rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; 698 699 return 0; 700} 701 702static const struct x86_cpu_id rapl_cpu_match[] = { 703 [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, 704 [1] = {}, 705}; 706 707static int __init rapl_pmu_init(void) 708{ 709 struct rapl_pmu *pmu; 710 int cpu, ret; 711 struct x86_pmu_quirk *quirk; 712 int i; 713 714 /* 715 * check for Intel processor family 6 716 */ 717 if (!x86_match_cpu(rapl_cpu_match)) 718 return 0; 719 720 /* check supported CPU */ 721 switch (boot_cpu_data.x86_model) { 722 case 42: /* Sandy Bridge */ 723 case 58: /* Ivy Bridge */ 724 rapl_cntr_mask = RAPL_IDX_CLN; 725 rapl_pmu_events_group.attrs = rapl_events_cln_attr; 726 break; 727 case 63: /* Haswell-Server */ 728 rapl_add_quirk(rapl_hsw_server_quirk); 729 rapl_cntr_mask = RAPL_IDX_SRV; 730 rapl_pmu_events_group.attrs = rapl_events_srv_attr; 731 break; 732 case 60: /* Haswell */ 733 case 69: /* Haswell-Celeron */ 734 case 61: /* Broadwell */ 735 rapl_cntr_mask = RAPL_IDX_HSW; 736 rapl_pmu_events_group.attrs = rapl_events_hsw_attr; 737 break; 738 case 45: /* Sandy Bridge-EP */ 739 case 62: /* IvyTown */ 740 rapl_cntr_mask = RAPL_IDX_SRV; 741 rapl_pmu_events_group.attrs = rapl_events_srv_attr; 742 break; 743 case 87: /* Knights Landing */ 744 rapl_add_quirk(rapl_hsw_server_quirk); 745 rapl_cntr_mask = RAPL_IDX_KNL; 746 rapl_pmu_events_group.attrs = rapl_events_knl_attr; 747 748 default: 749 /* unsupported */ 750 return 0; 751 } 752 ret = rapl_check_hw_unit(); 753 if (ret) 754 return ret; 755 756 /* run cpu model quirks */ 757 for (quirk = rapl_quirks; quirk; quirk = quirk->next) 758 quirk->func(); 759 cpu_notifier_register_begin(); 760 761 for_each_online_cpu(cpu) { 762 ret = rapl_cpu_prepare(cpu); 763 if (ret) 764 goto out; 765 rapl_cpu_init(cpu); 766 } 767 768 __perf_cpu_notifier(rapl_cpu_notifier); 769 770 ret = perf_pmu_register(&rapl_pmu_class, "power", -1); 771 if (WARN_ON(ret)) { 772 pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); 773 cpu_notifier_register_done(); 774 return -1; 775 } 776 777 pmu = __this_cpu_read(rapl_pmu); 778 779 pr_info("RAPL PMU detected," 780 " API unit is 2^-32 Joules," 781 " %d fixed counters" 782 " %llu ms ovfl timer\n", 783 hweight32(rapl_cntr_mask), 784 ktime_to_ms(pmu->timer_interval)); 785 for (i = 0; i < NR_RAPL_DOMAINS; i++) { 786 if (rapl_cntr_mask & (1 << i)) { 787 pr_info("hw unit of domain %s 2^-%d Joules\n", 788 rapl_domain_names[i], rapl_hw_unit[i]); 789 } 790 } 791out: 792 cpu_notifier_register_done(); 793 794 return 0; 795} 796device_initcall(rapl_pmu_init); 797