/*
 * Intel Cache Quality-of-Service Monitoring (CQM) support.
 *
 * Based very, very heavily on work by Peter Zijlstra.
 */

#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

#define MSR_IA32_PQR_ASSOC	0x0c8f
#define MSR_IA32_QM_CTR		0x0c8e
#define MSR_IA32_QM_EVTSEL	0x0c8d

static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */

/**
 * struct intel_pqr_state - State cache for the PQR MSR
 * @rmid:		The cached Resource Monitoring ID
 * @closid:		The cached Class Of Service ID
 * @rmid_usecnt:	The usage counter for rmid
 *
 * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
 * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
 * contains both parts, so we need to cache them.
 *
 * The cache also helps to avoid pointless updates if the value does
 * not change.
 */
struct intel_pqr_state {
	u32			rmid;
	u32			closid;
	int			rmid_usecnt;
};

/*
 * The cached intel_pqr_state is strictly per CPU and can never be
 * updated from a remote CPU. Both functions which modify the state
 * (intel_cqm_event_start and intel_cqm_event_stop) are called with
 * interrupts disabled, which is sufficient for the protection.
 */
static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);

/*
 * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
 * Also protects event->hw.cqm_rmid
 *
 * Hold either for stability, both for modification of ->hw.cqm_rmid.
 */
static DEFINE_MUTEX(cache_mutex);
static DEFINE_RAW_SPINLOCK(cache_lock);

/*
 * Groups of events that have the same target(s), one RMID per group.
 */
static LIST_HEAD(cache_groups);

/*
 * Mask of CPUs for reading CQM values. We only need one per-socket.
 */
static cpumask_t cqm_cpumask;

#define RMID_VAL_ERROR		(1ULL << 63)
#define RMID_VAL_UNAVAIL	(1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID	(1 << 0)

#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID

/*
 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
 *
 * This rmid is always free and is guaranteed to have an associated
 * near-zero occupancy value, i.e. no cachelines are tagged with this
 * RMID, once __intel_cqm_rmid_rotate() returns.
 */
static u32 intel_cqm_rotation_rmid;

#define INVALID_RMID	(-1)

/*
 * Is @rmid valid for programming the hardware?
 *
 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
 * means that we should never come across an rmid with that value.
 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
 * assigned" and is used as part of the rotation code.
 */
static inline bool __rmid_valid(u32 rmid)
{
	if (!rmid || rmid == INVALID_RMID)
		return false;

	return true;
}

static u64 __rmid_read(u32 rmid)
{
	u64 val;

	/*
	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
	 * it just says that to increase confusion.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
	rdmsrl(MSR_IA32_QM_CTR, val);

	/*
	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
	 * the number of cachelines tagged with @rmid.
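	 *
	 * The hardware counts in units of cqm_l3_scale
	 * (boot_cpu_data.x86_cache_occ_scale) bytes. We do not multiply
	 * here; intel_cqm_init() exports that factor as the event's
	 * "scale" attribute so that userspace converts the count to Bytes.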
	 */
	return val;
}

enum rmid_recycle_state {
	RMID_YOUNG = 0,
	RMID_AVAILABLE,
	RMID_DIRTY,
};

struct cqm_rmid_entry {
	u32 rmid;
	enum rmid_recycle_state state;
	struct list_head list;
	unsigned long queue_time;
};

/*
 * cqm_rmid_free_lru - A least recently used list of RMIDs.
 *
 * Oldest entry at the head, newest (most recently used) entry at the
 * tail. This list is never traversed, it's only used to keep track of
 * the lru order. That is, we only pick entries off the head or insert
 * them on the tail.
 *
 * All entries on the list are 'free', and their RMIDs are not currently
 * in use. To mark an RMID as in use, remove its entry from the lru
 * list.
 *
 *
 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
 *
 * This list contains RMIDs that no one is currently using but that
 * may have a non-zero occupancy value associated with them. The
 * rotation worker moves RMIDs from the limbo list to the free list once
 * the occupancy value drops below __intel_cqm_threshold.
 *
 * Both lists are protected by cache_mutex.
 */
static LIST_HEAD(cqm_rmid_free_lru);
static LIST_HEAD(cqm_rmid_limbo_lru);

/*
 * We use a simple array of pointers so that we can lookup a struct
 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
 * and __put_rmid() from having to worry about dealing with struct
 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
 *
 * Once this array is initialized it is read-only. No locks are required
 * to access it.
 *
 * All entries for all RMIDs can be looked up in this array at all
 * times.
 */
static struct cqm_rmid_entry **cqm_rmid_ptrs;

static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
{
	struct cqm_rmid_entry *entry;

	entry = cqm_rmid_ptrs[rmid];
	WARN_ON(entry->rmid != rmid);

	return entry;
}

/*
 * Returns INVALID_RMID if no free RMIDs are available.
 *
 * We expect to be called with cache_mutex held.
 */
static u32 __get_rmid(void)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	if (list_empty(&cqm_rmid_free_lru))
		return INVALID_RMID;

	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
	list_del(&entry->list);

	return entry->rmid;
}

static void __put_rmid(u32 rmid)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	WARN_ON(!__rmid_valid(rmid));
	entry = __rmid_entry(rmid);

	entry->queue_time = jiffies;
	entry->state = RMID_YOUNG;

	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}

static int intel_cqm_setup_rmid_cache(void)
{
	struct cqm_rmid_entry *entry;
	unsigned int nr_rmids;
	int r = 0;

	nr_rmids = cqm_max_rmid + 1;
	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
				nr_rmids, GFP_KERNEL);
	if (!cqm_rmid_ptrs)
		return -ENOMEM;

	for (; r <= cqm_max_rmid; r++) {
		struct cqm_rmid_entry *entry;

		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			goto fail;

		INIT_LIST_HEAD(&entry->list);
		entry->rmid = r;
		cqm_rmid_ptrs[r] = entry;

		list_add_tail(&entry->list, &cqm_rmid_free_lru);
	}

	/*
	 * RMID 0 is special and is always allocated. It's used for all
	 * tasks that are not monitored.
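	 *
	 * Pull its entry off the free list here so that __get_rmid() can
	 * never hand RMID 0 out to an event.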
	 */
	entry = __rmid_entry(0);
	list_del(&entry->list);

	mutex_lock(&cache_mutex);
	intel_cqm_rotation_rmid = __get_rmid();
	mutex_unlock(&cache_mutex);

	return 0;
fail:
	while (r--)
		kfree(cqm_rmid_ptrs[r]);

	kfree(cqm_rmid_ptrs);
	return -ENOMEM;
}

/*
 * Determine if @a and @b measure the same set of tasks.
 *
 * If @a and @b measure the same set of tasks then we want to share a
 * single RMID.
 */
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
	/* Per-cpu and task events don't mix */
	if ((a->attach_state & PERF_ATTACH_TASK) !=
	    (b->attach_state & PERF_ATTACH_TASK))
		return false;

#ifdef CONFIG_CGROUP_PERF
	if (a->cgrp != b->cgrp)
		return false;
#endif

	/* If not task event, we're machine wide */
	if (!(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Events that target same task are placed into the same cache group.
	 */
	if (a->hw.target == b->hw.target)
		return true;

	/*
	 * Are we an inherited event?
	 */
	if (b->parent == a)
		return true;

	return false;
}

#ifdef CONFIG_CGROUP_PERF
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
	if (event->attach_state & PERF_ATTACH_TASK)
		return perf_cgroup_from_task(event->hw.target, event->ctx);

	return event->cgrp;
}
#endif

/*
 * Determine if @a's tasks intersect with @b's tasks.
 *
 * There are combinations of events that we explicitly prohibit,
 *
 *		   PROHIBITS
 * system-wide	-> cgroup and task
 * cgroup	-> system-wide
 *		-> task in cgroup
 * task		-> system-wide
 *		-> task in cgroup
 *
 * Call this function before allocating an RMID.
 */
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
#ifdef CONFIG_CGROUP_PERF
	/*
	 * We can have any number of cgroups but only one system-wide
	 * event at a time.
	 */
	if (a->cgrp && b->cgrp) {
		struct perf_cgroup *ac = a->cgrp;
		struct perf_cgroup *bc = b->cgrp;

		/*
		 * This condition should have been caught in
		 * __match_event() and we should be sharing an RMID.
		 */
		WARN_ON_ONCE(ac == bc);

		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}

	if (a->cgrp || b->cgrp) {
		struct perf_cgroup *ac, *bc;

		/*
		 * cgroup and system-wide events are mutually exclusive
		 */
		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
			return true;

		/*
		 * Ensure neither event is part of the other's cgroup
		 */
		ac = event_to_cgroup(a);
		bc = event_to_cgroup(b);
		if (ac == bc)
			return true;

		/*
		 * Must have cgroup and non-intersecting task events.
		 */
		if (!ac || !bc)
			return false;

		/*
		 * We have cgroup and task events, and the task belongs
		 * to a cgroup. Check for overlap.
		 */
		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}
#endif
	/*
	 * If one of them is not a task, same story as above with cgroups.
	 */
	if (!(a->attach_state & PERF_ATTACH_TASK) ||
	    !(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Must be non-overlapping.
	 */
	return false;
}

struct rmid_read {
	u32 rmid;
	atomic64_t value;
};

static void __intel_cqm_event_count(void *info);

/*
 * Exchange the RMID of a group of events.
 */
static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
{
	struct perf_event *event;
	struct list_head *head = &group->hw.cqm_group_entry;
	u32 old_rmid = group->hw.cqm_rmid;

	lockdep_assert_held(&cache_mutex);

	/*
	 * If our RMID is being deallocated, perform a read now.
	 */
	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
		struct rmid_read rr = {
			.value = ATOMIC64_INIT(0),
			.rmid = old_rmid,
		};

		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
				 &rr, 1);
		local64_set(&group->count, atomic64_read(&rr.value));
	}

	raw_spin_lock_irq(&cache_lock);

	group->hw.cqm_rmid = rmid;
	list_for_each_entry(event, head, hw.cqm_group_entry)
		event->hw.cqm_rmid = rmid;

	raw_spin_unlock_irq(&cache_lock);

	return old_rmid;
}

/*
 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
 * cachelines are still tagged with RMIDs in limbo, we progressively
 * increment the threshold until we find an RMID in limbo with <=
 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
 * problem where cachelines tagged with an RMID are not steadily being
 * evicted.
 *
 * On successful rotations we decrease the threshold back towards zero.
 *
 * __intel_cqm_max_threshold provides an upper bound on the threshold,
 * and is measured in bytes because it's exposed to userland.
 */
static unsigned int __intel_cqm_threshold;
static unsigned int __intel_cqm_max_threshold;

/*
 * Test whether an RMID has a zero occupancy value on this cpu.
 */
static void intel_cqm_stable(void *arg)
{
	struct cqm_rmid_entry *entry;

	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
		if (entry->state != RMID_AVAILABLE)
			break;

		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
			entry->state = RMID_DIRTY;
	}
}

/*
 * If we have group events waiting for an RMID that don't conflict with
 * events already running, assign @rmid.
 */
static bool intel_cqm_sched_in_event(u32 rmid)
{
	struct perf_event *leader, *event;

	lockdep_assert_held(&cache_mutex);

	leader = list_first_entry(&cache_groups, struct perf_event,
				  hw.cqm_groups_entry);
	event = leader;

	list_for_each_entry_continue(event, &cache_groups,
				     hw.cqm_groups_entry) {
		if (__rmid_valid(event->hw.cqm_rmid))
			continue;

		if (__conflict_event(event, leader))
			continue;

		intel_cqm_xchg_rmid(event, rmid);
		return true;
	}

	return false;
}

/*
 * Initially use this constant for both the limbo queue time and the
 * rotation timer interval, pmu::hrtimer_interval_ms.
 *
 * They don't need to be the same, but the two are related since if you
 * rotate faster than you recycle RMIDs, you may run out of available
 * RMIDs.
 */
#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */

static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;

/*
 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
 * @available: number of freeable RMIDs on the limbo list
 *
 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
 * cachelines are tagged with those RMIDs. After this we can reuse them
 * and know that the current set of active RMIDs is stable.
 *
 * Return %true or %false depending on whether stabilization needs to be
 * reattempted.
 *
 * If we return %true then @available is updated to indicate the
 * number of RMIDs on the limbo list that have been queued for the
 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
 * are above __intel_cqm_threshold.
 */
static bool intel_cqm_rmid_stabilize(unsigned int *available)
{
	struct cqm_rmid_entry *entry, *tmp;

	lockdep_assert_held(&cache_mutex);

	*available = 0;
	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
		unsigned long min_queue_time;
		unsigned long now = jiffies;

		/*
		 * We hold RMIDs placed into limbo for a minimum queue
		 * time. Before the minimum queue time has elapsed we do
		 * not recycle RMIDs.
		 *
		 * The reasoning is that until a sufficient time has
		 * passed since we stopped using an RMID, any RMID
		 * placed onto the limbo list will likely still have
		 * data tagged in the cache, which means we'll probably
		 * fail to recycle it anyway.
		 *
		 * We can save ourselves an expensive IPI by skipping
		 * any RMIDs that have not been queued for the minimum
		 * time.
		 */
		min_queue_time = entry->queue_time +
			msecs_to_jiffies(__rmid_queue_time_ms);

		if (time_after(min_queue_time, now))
			break;

		entry->state = RMID_AVAILABLE;
		(*available)++;
	}

	/*
	 * Fast return if none of the RMIDs on the limbo list have been
	 * sitting on the queue for the minimum queue time.
	 */
	if (!*available)
		return false;

	/*
	 * Test whether an RMID is free for each package.
	 */
	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);

	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
		/*
		 * Exhausted all RMIDs that have waited min queue time.
		 */
		if (entry->state == RMID_YOUNG)
			break;

		if (entry->state == RMID_DIRTY)
			continue;

		list_del(&entry->list);	/* remove from limbo */

		/*
		 * The rotation RMID gets priority if it's currently
		 * invalid, in which case skip adding the RMID to the
		 * free lru.
		 */
		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
			intel_cqm_rotation_rmid = entry->rmid;
			continue;
		}

		/*
		 * If we have groups waiting for RMIDs, hand
		 * them one now provided they don't conflict.
		 */
		if (intel_cqm_sched_in_event(entry->rmid))
			continue;

		/*
		 * Otherwise place it onto the free list.
		 */
		list_add_tail(&entry->list, &cqm_rmid_free_lru);
	}

	return __rmid_valid(intel_cqm_rotation_rmid);
}

/*
 * Pick a victim group and move it to the tail of the group list.
 * @next: The first group without an RMID
 */
static void __intel_cqm_pick_and_rotate(struct perf_event *next)
{
	struct perf_event *rotor;
	u32 rmid;

	lockdep_assert_held(&cache_mutex);

	rotor = list_first_entry(&cache_groups, struct perf_event,
				 hw.cqm_groups_entry);

	/*
	 * The group at the front of the list should always have a valid
	 * RMID. If it doesn't then no groups have RMIDs assigned and we
	 * don't need to rotate the list.
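	 *
	 * (@next is the first group without an RMID, so if it is also at
	 * the head of the list, no group currently owns one.)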
	 */
	if (next == rotor)
		return;

	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
	__put_rmid(rmid);

	list_rotate_left(&cache_groups);
}

/*
 * Deallocate the RMIDs from any events that conflict with @event, and
 * place them on the back of the group list.
 */
static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
{
	struct perf_event *group, *g;
	u32 rmid;

	lockdep_assert_held(&cache_mutex);

	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
		if (group == event)
			continue;

		rmid = group->hw.cqm_rmid;

		/*
		 * Skip events that don't have a valid RMID.
		 */
		if (!__rmid_valid(rmid))
			continue;

		/*
		 * No conflict? No problem! Leave the event alone.
		 */
		if (!__conflict_event(group, event))
			continue;

		intel_cqm_xchg_rmid(group, INVALID_RMID);
		__put_rmid(rmid);
	}
}

/*
 * Attempt to rotate the groups and assign new RMIDs.
 *
 * We rotate for two reasons,
 *   1. To handle the scheduling of conflicting events
 *   2. To recycle RMIDs
 *
 * Rotating RMIDs is complicated because the hardware doesn't give us
 * any clues.
 *
 * There are problems with the hardware interface; when you change the
 * task:RMID map, cachelines retain their 'old' tags, giving a skewed
 * picture. In order to work around this, we must always keep one free
 * RMID - intel_cqm_rotation_rmid.
 *
 * Rotation works by taking away an RMID from a group (the old RMID),
 * and assigning the free RMID to another group (the new RMID). We must
 * then wait for the old RMID to not be used (no cachelines tagged).
 * This ensures that all cachelines are tagged with 'active' RMIDs. At
 * this point we can start reading values for the new RMID and treat the
 * old RMID as the free RMID for the next rotation.
 *
 * Return %true or %false depending on whether we did any rotating.
 */
static bool __intel_cqm_rmid_rotate(void)
{
	struct perf_event *group, *start = NULL;
	unsigned int threshold_limit;
	unsigned int nr_needed = 0;
	unsigned int nr_available;
	bool rotated = false;

	mutex_lock(&cache_mutex);

again:
	/*
	 * Fast path through this function if there are no groups and no
	 * RMIDs that need cleaning.
	 */
	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
		goto out;

	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
		if (!__rmid_valid(group->hw.cqm_rmid)) {
			if (!start)
				start = group;
			nr_needed++;
		}
	}

	/*
	 * We have some event groups, but they all have RMIDs assigned
	 * and no RMIDs need cleaning.
	 */
	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
		goto out;

	if (!nr_needed)
		goto stabilize;

	/*
	 * We have more event groups without RMIDs than available RMIDs,
	 * or we have event groups that conflict with the ones currently
	 * scheduled.
	 *
	 * We force deallocate the rmid of the group at the head of
	 * cache_groups. The first event group without an RMID then gets
	 * assigned intel_cqm_rotation_rmid. This ensures we always make
	 * forward progress.
	 *
	 * Rotate the cache_groups list so the previous head is now the
	 * tail.
	 */
	__intel_cqm_pick_and_rotate(start);

	/*
	 * If the rotation is going to succeed, reduce the threshold so
	 * that we don't needlessly reuse dirty RMIDs.
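	 *
	 * (This is the "decrease the threshold back towards zero on
	 * successful rotations" policy described above
	 * __intel_cqm_threshold.)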
	 */
	if (__rmid_valid(intel_cqm_rotation_rmid)) {
		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
		intel_cqm_rotation_rmid = __get_rmid();

		intel_cqm_sched_out_conflicting_events(start);

		if (__intel_cqm_threshold)
			__intel_cqm_threshold--;
	}

	rotated = true;

stabilize:
	/*
	 * We now need to stabilize the RMID we freed above (if any) to
	 * ensure that the next time we rotate we have an RMID with zero
	 * occupancy value.
	 *
	 * Alternatively, if we didn't need to perform any rotation,
	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
	 */
	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;

	while (intel_cqm_rmid_stabilize(&nr_available) &&
	       __intel_cqm_threshold < threshold_limit) {
		unsigned int steal_limit;

		/*
		 * Don't spin if nobody is actively waiting for an RMID,
		 * the rotation worker will be kicked as soon as an
		 * event needs an RMID anyway.
		 */
		if (!nr_needed)
			break;

		/* Allow max 25% of RMIDs to be in limbo. */
		steal_limit = (cqm_max_rmid + 1) / 4;

		/*
		 * We failed to stabilize any RMIDs so our rotation
		 * logic is now stuck. In order to make forward progress
		 * we have a few options:
		 *
		 * 1. rotate ("steal") another RMID
		 * 2. increase the threshold
		 * 3. do nothing
		 *
		 * We do both of 1. and 2. until we hit the steal limit.
		 *
		 * The steal limit prevents all RMIDs ending up on the
		 * limbo list. This can happen if every RMID has a
		 * non-zero occupancy above threshold_limit, and the
		 * occupancy values aren't dropping fast enough.
		 *
		 * Note that there is prioritisation at work here - we'd
		 * rather increase the number of RMIDs on the limbo list
		 * than increase the threshold, because increasing the
		 * threshold skews the event data (because we reuse
		 * dirty RMIDs) - threshold bumps are a last resort.
		 */
		if (nr_available < steal_limit)
			goto again;

		__intel_cqm_threshold++;
	}

out:
	mutex_unlock(&cache_mutex);
	return rotated;
}

static void intel_cqm_rmid_rotate(struct work_struct *work);

static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);

static struct pmu intel_cqm_pmu;

static void intel_cqm_rmid_rotate(struct work_struct *work)
{
	unsigned long delay;

	__intel_cqm_rmid_rotate();

	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
	schedule_delayed_work(&intel_cqm_rmid_work, delay);
}

/*
 * Find a group and set up the RMID.
 *
 * If we're part of a group, we use the group's RMID.
 */
static void intel_cqm_setup_event(struct perf_event *event,
				  struct perf_event **group)
{
	struct perf_event *iter;
	bool conflict = false;
	u32 rmid;

	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
		rmid = iter->hw.cqm_rmid;

		if (__match_event(iter, event)) {
			/* All tasks in a group share an RMID */
			event->hw.cqm_rmid = rmid;
			*group = iter;
			return;
		}

		/*
		 * We only care about conflicts for events that are
		 * actually scheduled in (and hence have a valid RMID).
		 */
		if (__conflict_event(iter, event) && __rmid_valid(rmid))
			conflict = true;
	}

	if (conflict)
		rmid = INVALID_RMID;
	else
		rmid = __get_rmid();

	event->hw.cqm_rmid = rmid;
}

static void intel_cqm_event_read(struct perf_event *event)
{
	unsigned long flags;
	u32 rmid;
	u64 val;

	/*
	 * Task events are handled by intel_cqm_event_count().
	 */
	if (event->cpu == -1)
		return;

	raw_spin_lock_irqsave(&cache_lock, flags);
	rmid = event->hw.cqm_rmid;

	if (!__rmid_valid(rmid))
		goto out;

	val = __rmid_read(rmid);

	/*
	 * Ignore this reading on error states and do not update the value.
	 */
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		goto out;

	local64_set(&event->count, val);
out:
	raw_spin_unlock_irqrestore(&cache_lock, flags);
}

static void __intel_cqm_event_count(void *info)
{
	struct rmid_read *rr = info;
	u64 val;

	val = __rmid_read(rr->rmid);

	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return;

	atomic64_add(val, &rr->value);
}

static inline bool cqm_group_leader(struct perf_event *event)
{
	return !list_empty(&event->hw.cqm_groups_entry);
}

static u64 intel_cqm_event_count(struct perf_event *event)
{
	unsigned long flags;
	struct rmid_read rr = {
		.value = ATOMIC64_INIT(0),
	};

	/*
	 * We only need to worry about task events. System-wide events
	 * are handled like usual, i.e. entirely with
	 * intel_cqm_event_read().
	 */
	if (event->cpu != -1)
		return __perf_event_count(event);

	/*
	 * Only the group leader gets to report values. This stops us
	 * reporting duplicate values to userspace, and gives us a clear
	 * rule for which task gets to report the values.
	 *
	 * Note that it is impossible to attribute these values to
	 * specific packages - we forfeit that ability when we create
	 * task events.
	 */
	if (!cqm_group_leader(event))
		return 0;

	/*
	 * Getting up-to-date values requires an SMP IPI which is not
	 * possible if we're being called in interrupt context. Return
	 * the cached values instead.
	 */
	if (unlikely(in_interrupt()))
		goto out;

	/*
	 * Notice that we don't perform the reading of an RMID
	 * atomically, because we can't hold a spin lock across the
	 * IPIs.
	 *
	 * Speculatively perform the read, since @event might be
	 * assigned a different (possibly invalid) RMID while we're
	 * busy performing the IPI calls. It's therefore necessary to
	 * check @event's RMID afterwards, and if it has changed,
	 * discard the result of the read.
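	 *
	 * The cache_lock section below makes the RMID re-check and the
	 * count update atomic with respect to intel_cqm_xchg_rmid(),
	 * which changes ->hw.cqm_rmid under the same lock.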
	 */
	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);

	if (!__rmid_valid(rr.rmid))
		goto out;

	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);

	raw_spin_lock_irqsave(&cache_lock, flags);
	if (event->hw.cqm_rmid == rr.rmid)
		local64_set(&event->count, atomic64_read(&rr.value));
	raw_spin_unlock_irqrestore(&cache_lock, flags);
out:
	return __perf_event_count(event);
}

static void intel_cqm_event_start(struct perf_event *event, int mode)
{
	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
	u32 rmid = event->hw.cqm_rmid;

	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
		return;

	event->hw.cqm_state &= ~PERF_HES_STOPPED;

	if (state->rmid_usecnt++) {
		if (!WARN_ON_ONCE(state->rmid != rmid))
			return;
	} else {
		WARN_ON_ONCE(state->rmid);
	}

	state->rmid = rmid;
	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);

	if (event->hw.cqm_state & PERF_HES_STOPPED)
		return;

	event->hw.cqm_state |= PERF_HES_STOPPED;

	intel_cqm_event_read(event);

	if (!--state->rmid_usecnt) {
		state->rmid = 0;
		wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
	} else {
		WARN_ON_ONCE(!state->rmid);
	}
}

static int intel_cqm_event_add(struct perf_event *event, int mode)
{
	unsigned long flags;
	u32 rmid;

	raw_spin_lock_irqsave(&cache_lock, flags);

	event->hw.cqm_state = PERF_HES_STOPPED;
	rmid = event->hw.cqm_rmid;

	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
		intel_cqm_event_start(event, mode);

	raw_spin_unlock_irqrestore(&cache_lock, flags);

	return 0;
}

static void intel_cqm_event_destroy(struct perf_event *event)
{
	struct perf_event *group_other = NULL;

	mutex_lock(&cache_mutex);

	/*
	 * If there's another event in this group...
	 */
	if (!list_empty(&event->hw.cqm_group_entry)) {
		group_other = list_first_entry(&event->hw.cqm_group_entry,
					       struct perf_event,
					       hw.cqm_group_entry);
		list_del(&event->hw.cqm_group_entry);
	}

	/*
	 * And we're the group leader...
	 */
	if (cqm_group_leader(event)) {
		/*
		 * If there was a group_other, make that leader, otherwise
		 * destroy the group and return the RMID.
		 */
		if (group_other) {
			list_replace(&event->hw.cqm_groups_entry,
				     &group_other->hw.cqm_groups_entry);
		} else {
			u32 rmid = event->hw.cqm_rmid;

			if (__rmid_valid(rmid))
				__put_rmid(rmid);
			list_del(&event->hw.cqm_groups_entry);
		}
	}

	mutex_unlock(&cache_mutex);
}

static int intel_cqm_event_init(struct perf_event *event)
{
	struct perf_event *group = NULL;
	bool rotate = false;

	if (event->attr.type != intel_cqm_pmu.type)
		return -ENOENT;

	if (event->attr.config & ~QOS_EVENT_MASK)
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);

	event->destroy = intel_cqm_event_destroy;

	mutex_lock(&cache_mutex);

	/* Will also set rmid */
	intel_cqm_setup_event(event, &group);

	if (group) {
		list_add_tail(&event->hw.cqm_group_entry,
			      &group->hw.cqm_group_entry);
	} else {
		list_add_tail(&event->hw.cqm_groups_entry,
			      &cache_groups);

		/*
		 * All RMIDs are either in use or have recently been
		 * used. Kick the rotation worker to clean/free some.
		 *
		 * We only do this for the group leader, rather than for
		 * every event in a group to save on needless work.
		 */
		if (!__rmid_valid(event->hw.cqm_rmid))
			rotate = true;
	}

	mutex_unlock(&cache_mutex);

	if (rotate)
		schedule_delayed_work(&intel_cqm_rmid_work, 0);

	return 0;
}

EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

static struct attribute *intel_cqm_events_attr[] = {
	EVENT_PTR(intel_cqm_llc),
	EVENT_PTR(intel_cqm_llc_pkg),
	EVENT_PTR(intel_cqm_llc_unit),
	EVENT_PTR(intel_cqm_llc_scale),
	EVENT_PTR(intel_cqm_llc_snapshot),
	NULL,
};

static struct attribute_group intel_cqm_events_group = {
	.name = "events",
	.attrs = intel_cqm_events_attr,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group intel_cqm_format_group = {
	.name = "format",
	.attrs = intel_cqm_formats_attr,
};

static ssize_t
max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
			   char *page)
{
	ssize_t rv;

	mutex_lock(&cache_mutex);
	rv = snprintf(page, PAGE_SIZE - 1, "%u\n", __intel_cqm_max_threshold);
	mutex_unlock(&cache_mutex);

	return rv;
}

static ssize_t
max_recycle_threshold_store(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	unsigned int bytes, cachelines;
	int ret;

	ret = kstrtouint(buf, 0, &bytes);
	if (ret)
		return ret;

	mutex_lock(&cache_mutex);

	__intel_cqm_max_threshold = bytes;
	cachelines = bytes / cqm_l3_scale;

	/*
	 * The new maximum takes effect immediately.
	 */
	if (__intel_cqm_threshold > cachelines)
		__intel_cqm_threshold = cachelines;

	mutex_unlock(&cache_mutex);

	return count;
}

static DEVICE_ATTR_RW(max_recycle_threshold);

static struct attribute *intel_cqm_attrs[] = {
	&dev_attr_max_recycle_threshold.attr,
	NULL,
};

static const struct attribute_group intel_cqm_group = {
	.attrs = intel_cqm_attrs,
};

static const struct attribute_group *intel_cqm_attr_groups[] = {
	&intel_cqm_events_group,
	&intel_cqm_format_group,
	&intel_cqm_group,
	NULL,
};

static struct pmu intel_cqm_pmu = {
	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
	.attr_groups	= intel_cqm_attr_groups,
	.task_ctx_nr	= perf_sw_context,
	.event_init	= intel_cqm_event_init,
	.add		= intel_cqm_event_add,
	.del		= intel_cqm_event_stop,
	.start		= intel_cqm_event_start,
	.stop		= intel_cqm_event_stop,
	.read		= intel_cqm_event_read,
	.count		= intel_cqm_event_count,
};

static inline void cqm_pick_event_reader(int cpu)
{
	int phys_id = topology_physical_package_id(cpu);
	int i;

	for_each_cpu(i, &cqm_cpumask) {
		if (phys_id == topology_physical_package_id(i))
			return;	/* already got reader for this socket */
	}

	cpumask_set_cpu(cpu, &cqm_cpumask);
}

static void intel_cqm_cpu_starting(unsigned int cpu)
{
	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	state->rmid = 0;
	state->closid = 0;
	state->rmid_usecnt = 0;

	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
}

static void intel_cqm_cpu_exit(unsigned int cpu)
{
	int phys_id = topology_physical_package_id(cpu);
	int i;

	/*
	 * Is @cpu a designated cqm reader?
	 */
	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
		return;

	for_each_online_cpu(i) {
		if (i == cpu)
			continue;

		if (phys_id == topology_physical_package_id(i)) {
			cpumask_set_cpu(i, &cqm_cpumask);
			break;
		}
	}
}

static int intel_cqm_cpu_notifier(struct notifier_block *nb,
				  unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		intel_cqm_cpu_exit(cpu);
		break;
	case CPU_STARTING:
		intel_cqm_cpu_starting(cpu);
		cqm_pick_event_reader(cpu);
		break;
	}

	return NOTIFY_OK;
}

static const struct x86_cpu_id intel_cqm_match[] = {
	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
	{}
};

static int __init intel_cqm_init(void)
{
	char *str, scale[20];
	int i, cpu, ret;

	if (!x86_match_cpu(intel_cqm_match))
		return -ENODEV;

	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;

	/*
	 * It's possible that not all resources support the same number
	 * of RMIDs. Instead of making scheduling much more complicated
	 * (where we have to match a task's RMID to a cpu that supports
	 * that many RMIDs) just find the minimum RMIDs supported across
	 * all cpus.
	 *
	 * Also, check that the scales match on all cpus.
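	 *
	 * (cqm_max_rmid starts out as -1, i.e. UINT_MAX for a u32, so the
	 * first CPU examined always lowers it to its own limit.)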
	 */
	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		struct cpuinfo_x86 *c = &cpu_data(cpu);

		if (c->x86_cache_max_rmid < cqm_max_rmid)
			cqm_max_rmid = c->x86_cache_max_rmid;

		if (c->x86_cache_occ_scale != cqm_l3_scale) {
			pr_err("Multiple LLC scale values, disabling\n");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	__intel_cqm_max_threshold =
		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);

	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
	str = kstrdup(scale, GFP_KERNEL);
	if (!str) {
		ret = -ENOMEM;
		goto out;
	}

	event_attr_intel_cqm_llc_scale.event_str = str;

	ret = intel_cqm_setup_rmid_cache();
	if (ret)
		goto out;

	for_each_online_cpu(i) {
		intel_cqm_cpu_starting(i);
		cqm_pick_event_reader(i);
	}

	__perf_cpu_notifier(intel_cqm_cpu_notifier);

	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
	if (ret)
		pr_err("Intel CQM perf registration failed: %d\n", ret);
	else
		pr_info("Intel CQM monitoring enabled\n");

out:
	cpu_notifier_register_done();

	return ret;
}
device_initcall(intel_cqm_init);
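
/*
 * Example usage from userspace, once this PMU is registered, e.g.:
 *
 *   perf stat -e intel_cqm/llc_occupancy/ -a -I 1000 sleep 10
 *
 * The kernel reports the raw occupancy count together with the "scale"
 * attribute (cqm_l3_scale) and the "Bytes" unit set above, so perf
 * prints the LLC occupancy attributed to the monitored RMID(s) in bytes.
 */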