1/* 2 * x86_pkg_temp_thermal driver 3 * Copyright (c) 2013, Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 * 14 * You should have received a copy of the GNU General Public License along with 15 * this program; if not, write to the Free Software Foundation, Inc. 16 * 17 */ 18#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 19 20#include <linux/module.h> 21#include <linux/init.h> 22#include <linux/err.h> 23#include <linux/param.h> 24#include <linux/device.h> 25#include <linux/platform_device.h> 26#include <linux/cpu.h> 27#include <linux/smp.h> 28#include <linux/slab.h> 29#include <linux/pm.h> 30#include <linux/thermal.h> 31#include <linux/debugfs.h> 32#include <asm/cpu_device_id.h> 33#include <asm/mce.h> 34 35/* 36* Rate control delay: Idea is to introduce denounce effect 37* This should be long enough to avoid reduce events, when 38* threshold is set to a temperature, which is constantly 39* violated, but at the short enough to take any action. 40* The action can be remove threshold or change it to next 41* interesting setting. Based on experiments, in around 42* every 5 seconds under load will give us a significant 43* temperature change. 44*/ 45#define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000 46static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY; 47module_param(notify_delay_ms, int, 0644); 48MODULE_PARM_DESC(notify_delay_ms, 49 "User space notification delay in milli seconds."); 50 51/* Number of trip points in thermal zone. Currently it can't 52* be more than 2. MSR can allow setting and getting notifications 53* for only 2 thresholds. This define enforces this, if there 54* is some wrong values returned by cpuid for number of thresholds. 55*/ 56#define MAX_NUMBER_OF_TRIPS 2 57/* Limit number of package temp zones */ 58#define MAX_PKG_TEMP_ZONE_IDS 256 59 60struct phy_dev_entry { 61 struct list_head list; 62 u16 phys_proc_id; 63 u16 first_cpu; 64 u32 tj_max; 65 int ref_cnt; 66 u32 start_pkg_therm_low; 67 u32 start_pkg_therm_high; 68 struct thermal_zone_device *tzone; 69}; 70 71static const struct thermal_zone_params pkg_temp_tz_params = { 72 .no_hwmon = true, 73}; 74 75/* List maintaining number of package instances */ 76static LIST_HEAD(phy_dev_list); 77static DEFINE_MUTEX(phy_dev_list_mutex); 78 79/* Interrupt to work function schedule queue */ 80static DEFINE_PER_CPU(struct delayed_work, pkg_temp_thermal_threshold_work); 81 82/* To track if the work is already scheduled on a package */ 83static u8 *pkg_work_scheduled; 84 85/* Spin lock to prevent races with pkg_work_scheduled */ 86static spinlock_t pkg_work_lock; 87static u16 max_phy_id; 88 89/* Debug counters to show using debugfs */ 90static struct dentry *debugfs; 91static unsigned int pkg_interrupt_cnt; 92static unsigned int pkg_work_cnt; 93 94static int pkg_temp_debugfs_init(void) 95{ 96 struct dentry *d; 97 98 debugfs = debugfs_create_dir("pkg_temp_thermal", NULL); 99 if (!debugfs) 100 return -ENOENT; 101 102 d = debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs, 103 (u32 *)&pkg_interrupt_cnt); 104 if (!d) 105 goto err_out; 106 107 d = debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs, 108 (u32 *)&pkg_work_cnt); 109 if (!d) 110 goto err_out; 111 112 return 0; 113 114err_out: 115 debugfs_remove_recursive(debugfs); 116 return -ENOENT; 117} 118 119static struct phy_dev_entry 120 *pkg_temp_thermal_get_phy_entry(unsigned int cpu) 121{ 122 u16 phys_proc_id = topology_physical_package_id(cpu); 123 struct phy_dev_entry *phy_ptr; 124 125 mutex_lock(&phy_dev_list_mutex); 126 127 list_for_each_entry(phy_ptr, &phy_dev_list, list) 128 if (phy_ptr->phys_proc_id == phys_proc_id) { 129 mutex_unlock(&phy_dev_list_mutex); 130 return phy_ptr; 131 } 132 133 mutex_unlock(&phy_dev_list_mutex); 134 135 return NULL; 136} 137 138/* 139* tj-max is is interesting because threshold is set relative to this 140* temperature. 141*/ 142static int get_tj_max(int cpu, u32 *tj_max) 143{ 144 u32 eax, edx; 145 u32 val; 146 int err; 147 148 err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx); 149 if (err) 150 goto err_ret; 151 else { 152 val = (eax >> 16) & 0xff; 153 if (val) 154 *tj_max = val * 1000; 155 else { 156 err = -EINVAL; 157 goto err_ret; 158 } 159 } 160 161 return 0; 162err_ret: 163 *tj_max = 0; 164 return err; 165} 166 167static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp) 168{ 169 u32 eax, edx; 170 struct phy_dev_entry *phy_dev_entry; 171 172 phy_dev_entry = tzd->devdata; 173 rdmsr_on_cpu(phy_dev_entry->first_cpu, MSR_IA32_PACKAGE_THERM_STATUS, 174 &eax, &edx); 175 if (eax & 0x80000000) { 176 *temp = phy_dev_entry->tj_max - 177 ((eax >> 16) & 0x7f) * 1000; 178 pr_debug("sys_get_curr_temp %ld\n", *temp); 179 return 0; 180 } 181 182 return -EINVAL; 183} 184 185static int sys_get_trip_temp(struct thermal_zone_device *tzd, 186 int trip, unsigned long *temp) 187{ 188 u32 eax, edx; 189 struct phy_dev_entry *phy_dev_entry; 190 u32 mask, shift; 191 unsigned long thres_reg_value; 192 int ret; 193 194 if (trip >= MAX_NUMBER_OF_TRIPS) 195 return -EINVAL; 196 197 phy_dev_entry = tzd->devdata; 198 199 if (trip) { 200 mask = THERM_MASK_THRESHOLD1; 201 shift = THERM_SHIFT_THRESHOLD1; 202 } else { 203 mask = THERM_MASK_THRESHOLD0; 204 shift = THERM_SHIFT_THRESHOLD0; 205 } 206 207 ret = rdmsr_on_cpu(phy_dev_entry->first_cpu, 208 MSR_IA32_PACKAGE_THERM_INTERRUPT, &eax, &edx); 209 if (ret < 0) 210 return -EINVAL; 211 212 thres_reg_value = (eax & mask) >> shift; 213 if (thres_reg_value) 214 *temp = phy_dev_entry->tj_max - thres_reg_value * 1000; 215 else 216 *temp = 0; 217 pr_debug("sys_get_trip_temp %ld\n", *temp); 218 219 return 0; 220} 221 222static int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, 223 unsigned long temp) 224{ 225 u32 l, h; 226 struct phy_dev_entry *phy_dev_entry; 227 u32 mask, shift, intr; 228 int ret; 229 230 phy_dev_entry = tzd->devdata; 231 232 if (trip >= MAX_NUMBER_OF_TRIPS || temp >= phy_dev_entry->tj_max) 233 return -EINVAL; 234 235 ret = rdmsr_on_cpu(phy_dev_entry->first_cpu, 236 MSR_IA32_PACKAGE_THERM_INTERRUPT, 237 &l, &h); 238 if (ret < 0) 239 return -EINVAL; 240 241 if (trip) { 242 mask = THERM_MASK_THRESHOLD1; 243 shift = THERM_SHIFT_THRESHOLD1; 244 intr = THERM_INT_THRESHOLD1_ENABLE; 245 } else { 246 mask = THERM_MASK_THRESHOLD0; 247 shift = THERM_SHIFT_THRESHOLD0; 248 intr = THERM_INT_THRESHOLD0_ENABLE; 249 } 250 l &= ~mask; 251 /* 252 * When users space sets a trip temperature == 0, which is indication 253 * that, it is no longer interested in receiving notifications. 254 */ 255 if (!temp) 256 l &= ~intr; 257 else { 258 l |= (phy_dev_entry->tj_max - temp)/1000 << shift; 259 l |= intr; 260 } 261 262 return wrmsr_on_cpu(phy_dev_entry->first_cpu, 263 MSR_IA32_PACKAGE_THERM_INTERRUPT, 264 l, h); 265} 266 267static int sys_get_trip_type(struct thermal_zone_device *thermal, 268 int trip, enum thermal_trip_type *type) 269{ 270 271 *type = THERMAL_TRIP_PASSIVE; 272 273 return 0; 274} 275 276/* Thermal zone callback registry */ 277static struct thermal_zone_device_ops tzone_ops = { 278 .get_temp = sys_get_curr_temp, 279 .get_trip_temp = sys_get_trip_temp, 280 .get_trip_type = sys_get_trip_type, 281 .set_trip_temp = sys_set_trip_temp, 282}; 283 284static bool pkg_temp_thermal_platform_thermal_rate_control(void) 285{ 286 return true; 287} 288 289/* Enable threshold interrupt on local package/cpu */ 290static inline void enable_pkg_thres_interrupt(void) 291{ 292 u32 l, h; 293 u8 thres_0, thres_1; 294 295 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 296 /* only enable/disable if it had valid threshold value */ 297 thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0; 298 thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1; 299 if (thres_0) 300 l |= THERM_INT_THRESHOLD0_ENABLE; 301 if (thres_1) 302 l |= THERM_INT_THRESHOLD1_ENABLE; 303 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 304} 305 306/* Disable threshold interrupt on local package/cpu */ 307static inline void disable_pkg_thres_interrupt(void) 308{ 309 u32 l, h; 310 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 311 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, 312 l & (~THERM_INT_THRESHOLD0_ENABLE) & 313 (~THERM_INT_THRESHOLD1_ENABLE), h); 314} 315 316static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work) 317{ 318 __u64 msr_val; 319 int cpu = smp_processor_id(); 320 int phy_id = topology_physical_package_id(cpu); 321 struct phy_dev_entry *phdev = pkg_temp_thermal_get_phy_entry(cpu); 322 bool notify = false; 323 unsigned long flags; 324 325 if (!phdev) 326 return; 327 328 spin_lock_irqsave(&pkg_work_lock, flags); 329 ++pkg_work_cnt; 330 if (unlikely(phy_id > max_phy_id)) { 331 spin_unlock_irqrestore(&pkg_work_lock, flags); 332 return; 333 } 334 pkg_work_scheduled[phy_id] = 0; 335 spin_unlock_irqrestore(&pkg_work_lock, flags); 336 337 enable_pkg_thres_interrupt(); 338 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 339 if (msr_val & THERM_LOG_THRESHOLD0) { 340 wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, 341 msr_val & ~THERM_LOG_THRESHOLD0); 342 notify = true; 343 } 344 if (msr_val & THERM_LOG_THRESHOLD1) { 345 wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, 346 msr_val & ~THERM_LOG_THRESHOLD1); 347 notify = true; 348 } 349 if (notify) { 350 pr_debug("thermal_zone_device_update\n"); 351 thermal_zone_device_update(phdev->tzone); 352 } 353} 354 355static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val) 356{ 357 unsigned long flags; 358 int cpu = smp_processor_id(); 359 int phy_id = topology_physical_package_id(cpu); 360 361 /* 362 * When a package is in interrupted state, all CPU's in that package 363 * are in the same interrupt state. So scheduling on any one CPU in 364 * the package is enough and simply return for others. 365 */ 366 spin_lock_irqsave(&pkg_work_lock, flags); 367 ++pkg_interrupt_cnt; 368 if (unlikely(phy_id > max_phy_id) || unlikely(!pkg_work_scheduled) || 369 pkg_work_scheduled[phy_id]) { 370 disable_pkg_thres_interrupt(); 371 spin_unlock_irqrestore(&pkg_work_lock, flags); 372 return -EINVAL; 373 } 374 pkg_work_scheduled[phy_id] = 1; 375 spin_unlock_irqrestore(&pkg_work_lock, flags); 376 377 disable_pkg_thres_interrupt(); 378 schedule_delayed_work_on(cpu, 379 &per_cpu(pkg_temp_thermal_threshold_work, cpu), 380 msecs_to_jiffies(notify_delay_ms)); 381 return 0; 382} 383 384static int find_siblings_cpu(int cpu) 385{ 386 int i; 387 int id = topology_physical_package_id(cpu); 388 389 for_each_online_cpu(i) 390 if (i != cpu && topology_physical_package_id(i) == id) 391 return i; 392 393 return 0; 394} 395 396static int pkg_temp_thermal_device_add(unsigned int cpu) 397{ 398 int err; 399 u32 tj_max; 400 struct phy_dev_entry *phy_dev_entry; 401 int thres_count; 402 u32 eax, ebx, ecx, edx; 403 u8 *temp; 404 unsigned long flags; 405 406 cpuid(6, &eax, &ebx, &ecx, &edx); 407 thres_count = ebx & 0x07; 408 if (!thres_count) 409 return -ENODEV; 410 411 if (topology_physical_package_id(cpu) > MAX_PKG_TEMP_ZONE_IDS) 412 return -ENODEV; 413 414 thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS); 415 416 err = get_tj_max(cpu, &tj_max); 417 if (err) 418 goto err_ret; 419 420 mutex_lock(&phy_dev_list_mutex); 421 422 phy_dev_entry = kzalloc(sizeof(*phy_dev_entry), GFP_KERNEL); 423 if (!phy_dev_entry) { 424 err = -ENOMEM; 425 goto err_ret_unlock; 426 } 427 428 spin_lock_irqsave(&pkg_work_lock, flags); 429 if (topology_physical_package_id(cpu) > max_phy_id) 430 max_phy_id = topology_physical_package_id(cpu); 431 temp = krealloc(pkg_work_scheduled, 432 (max_phy_id+1) * sizeof(u8), GFP_ATOMIC); 433 if (!temp) { 434 spin_unlock_irqrestore(&pkg_work_lock, flags); 435 err = -ENOMEM; 436 goto err_ret_free; 437 } 438 pkg_work_scheduled = temp; 439 pkg_work_scheduled[topology_physical_package_id(cpu)] = 0; 440 spin_unlock_irqrestore(&pkg_work_lock, flags); 441 442 phy_dev_entry->phys_proc_id = topology_physical_package_id(cpu); 443 phy_dev_entry->first_cpu = cpu; 444 phy_dev_entry->tj_max = tj_max; 445 phy_dev_entry->ref_cnt = 1; 446 phy_dev_entry->tzone = thermal_zone_device_register("x86_pkg_temp", 447 thres_count, 448 (thres_count == MAX_NUMBER_OF_TRIPS) ? 449 0x03 : 0x01, 450 phy_dev_entry, &tzone_ops, &pkg_temp_tz_params, 0, 0); 451 if (IS_ERR(phy_dev_entry->tzone)) { 452 err = PTR_ERR(phy_dev_entry->tzone); 453 goto err_ret_free; 454 } 455 /* Store MSR value for package thermal interrupt, to restore at exit */ 456 rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, 457 &phy_dev_entry->start_pkg_therm_low, 458 &phy_dev_entry->start_pkg_therm_high); 459 460 list_add_tail(&phy_dev_entry->list, &phy_dev_list); 461 pr_debug("pkg_temp_thermal_device_add :phy_id %d cpu %d\n", 462 phy_dev_entry->phys_proc_id, cpu); 463 464 mutex_unlock(&phy_dev_list_mutex); 465 466 return 0; 467 468err_ret_free: 469 kfree(phy_dev_entry); 470err_ret_unlock: 471 mutex_unlock(&phy_dev_list_mutex); 472 473err_ret: 474 return err; 475} 476 477static int pkg_temp_thermal_device_remove(unsigned int cpu) 478{ 479 struct phy_dev_entry *n; 480 u16 phys_proc_id = topology_physical_package_id(cpu); 481 struct phy_dev_entry *phdev = 482 pkg_temp_thermal_get_phy_entry(cpu); 483 484 if (!phdev) 485 return -ENODEV; 486 487 mutex_lock(&phy_dev_list_mutex); 488 /* If we are loosing the first cpu for this package, we need change */ 489 if (phdev->first_cpu == cpu) { 490 phdev->first_cpu = find_siblings_cpu(cpu); 491 pr_debug("thermal_device_remove: first cpu switched %d\n", 492 phdev->first_cpu); 493 } 494 /* 495 * It is possible that no siblings left as this was the last cpu 496 * going offline. We don't need to worry about this assignment 497 * as the phydev entry will be removed in this case and 498 * thermal zone is removed. 499 */ 500 --phdev->ref_cnt; 501 pr_debug("thermal_device_remove: pkg: %d cpu %d ref_cnt %d\n", 502 phys_proc_id, cpu, phdev->ref_cnt); 503 if (!phdev->ref_cnt) 504 list_for_each_entry_safe(phdev, n, &phy_dev_list, list) { 505 if (phdev->phys_proc_id == phys_proc_id) { 506 thermal_zone_device_unregister(phdev->tzone); 507 list_del(&phdev->list); 508 kfree(phdev); 509 break; 510 } 511 } 512 mutex_unlock(&phy_dev_list_mutex); 513 514 return 0; 515} 516 517static int get_core_online(unsigned int cpu) 518{ 519 struct cpuinfo_x86 *c = &cpu_data(cpu); 520 struct phy_dev_entry *phdev = pkg_temp_thermal_get_phy_entry(cpu); 521 522 /* Check if there is already an instance for this package */ 523 if (!phdev) { 524 if (!cpu_has(c, X86_FEATURE_DTHERM) || 525 !cpu_has(c, X86_FEATURE_PTS)) 526 return -ENODEV; 527 if (pkg_temp_thermal_device_add(cpu)) 528 return -ENODEV; 529 } else { 530 mutex_lock(&phy_dev_list_mutex); 531 ++phdev->ref_cnt; 532 pr_debug("get_core_online: cpu %d ref_cnt %d\n", 533 cpu, phdev->ref_cnt); 534 mutex_unlock(&phy_dev_list_mutex); 535 } 536 INIT_DELAYED_WORK(&per_cpu(pkg_temp_thermal_threshold_work, cpu), 537 pkg_temp_thermal_threshold_work_fn); 538 539 pr_debug("get_core_online: cpu %d successful\n", cpu); 540 541 return 0; 542} 543 544static void put_core_offline(unsigned int cpu) 545{ 546 if (!pkg_temp_thermal_device_remove(cpu)) 547 cancel_delayed_work_sync( 548 &per_cpu(pkg_temp_thermal_threshold_work, cpu)); 549 550 pr_debug("put_core_offline: cpu %d\n", cpu); 551} 552 553static int pkg_temp_thermal_cpu_callback(struct notifier_block *nfb, 554 unsigned long action, void *hcpu) 555{ 556 unsigned int cpu = (unsigned long) hcpu; 557 558 switch (action) { 559 case CPU_ONLINE: 560 case CPU_DOWN_FAILED: 561 get_core_online(cpu); 562 break; 563 case CPU_DOWN_PREPARE: 564 put_core_offline(cpu); 565 break; 566 } 567 return NOTIFY_OK; 568} 569 570static struct notifier_block pkg_temp_thermal_notifier __refdata = { 571 .notifier_call = pkg_temp_thermal_cpu_callback, 572}; 573 574static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = { 575 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS }, 576 {} 577}; 578MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids); 579 580static int __init pkg_temp_thermal_init(void) 581{ 582 int i; 583 584 if (!x86_match_cpu(pkg_temp_thermal_ids)) 585 return -ENODEV; 586 587 spin_lock_init(&pkg_work_lock); 588 platform_thermal_package_notify = 589 pkg_temp_thermal_platform_thermal_notify; 590 platform_thermal_package_rate_control = 591 pkg_temp_thermal_platform_thermal_rate_control; 592 593 cpu_notifier_register_begin(); 594 for_each_online_cpu(i) 595 if (get_core_online(i)) 596 goto err_ret; 597 __register_hotcpu_notifier(&pkg_temp_thermal_notifier); 598 cpu_notifier_register_done(); 599 600 pkg_temp_debugfs_init(); /* Don't care if fails */ 601 602 return 0; 603 604err_ret: 605 for_each_online_cpu(i) 606 put_core_offline(i); 607 cpu_notifier_register_done(); 608 kfree(pkg_work_scheduled); 609 platform_thermal_package_notify = NULL; 610 platform_thermal_package_rate_control = NULL; 611 612 return -ENODEV; 613} 614 615static void __exit pkg_temp_thermal_exit(void) 616{ 617 struct phy_dev_entry *phdev, *n; 618 int i; 619 620 cpu_notifier_register_begin(); 621 __unregister_hotcpu_notifier(&pkg_temp_thermal_notifier); 622 mutex_lock(&phy_dev_list_mutex); 623 list_for_each_entry_safe(phdev, n, &phy_dev_list, list) { 624 /* Retore old MSR value for package thermal interrupt */ 625 wrmsr_on_cpu(phdev->first_cpu, 626 MSR_IA32_PACKAGE_THERM_INTERRUPT, 627 phdev->start_pkg_therm_low, 628 phdev->start_pkg_therm_high); 629 thermal_zone_device_unregister(phdev->tzone); 630 list_del(&phdev->list); 631 kfree(phdev); 632 } 633 mutex_unlock(&phy_dev_list_mutex); 634 platform_thermal_package_notify = NULL; 635 platform_thermal_package_rate_control = NULL; 636 for_each_online_cpu(i) 637 cancel_delayed_work_sync( 638 &per_cpu(pkg_temp_thermal_threshold_work, i)); 639 cpu_notifier_register_done(); 640 641 kfree(pkg_work_scheduled); 642 643 debugfs_remove_recursive(debugfs); 644} 645 646module_init(pkg_temp_thermal_init) 647module_exit(pkg_temp_thermal_exit) 648 649MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver"); 650MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>"); 651MODULE_LICENSE("GPL v2"); 652