root/drivers/thermal/intel/x86_pkg_temp_thermal.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. pkg_temp_debugfs_init
  2. pkg_temp_thermal_get_dev
  3. get_tj_max
  4. sys_get_curr_temp
  5. sys_get_trip_temp
  6. sys_set_trip_temp
  7. sys_get_trip_type
  8. pkg_thermal_rate_control
  9. enable_pkg_thres_interrupt
  10. disable_pkg_thres_interrupt
  11. pkg_temp_thermal_threshold_work_fn
  12. pkg_thermal_schedule_work
  13. pkg_thermal_notify
  14. pkg_temp_thermal_device_add
  15. pkg_thermal_cpu_offline
  16. pkg_thermal_cpu_online
  17. pkg_temp_thermal_init
  18. module_init

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * x86_pkg_temp_thermal driver
   4  * Copyright (c) 2013, Intel Corporation.
   5  */
   6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7 
   8 #include <linux/module.h>
   9 #include <linux/init.h>
  10 #include <linux/err.h>
  11 #include <linux/param.h>
  12 #include <linux/device.h>
  13 #include <linux/platform_device.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/slab.h>
  17 #include <linux/pm.h>
  18 #include <linux/thermal.h>
  19 #include <linux/debugfs.h>
  20 #include <asm/cpu_device_id.h>
  21 #include <asm/mce.h>
  22 
  23 /*
  24 * Rate control delay: Idea is to introduce denounce effect
  25 * This should be long enough to avoid reduce events, when
  26 * threshold is set to a temperature, which is constantly
  27 * violated, but at the short enough to take any action.
  28 * The action can be remove threshold or change it to next
  29 * interesting setting. Based on experiments, in around
  30 * every 5 seconds under load will give us a significant
  31 * temperature change.
  32 */
  33 #define PKG_TEMP_THERMAL_NOTIFY_DELAY   5000
  34 static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
  35 module_param(notify_delay_ms, int, 0644);
  36 MODULE_PARM_DESC(notify_delay_ms,
  37         "User space notification delay in milli seconds.");
  38 
  39 /* Number of trip points in thermal zone. Currently it can't
  40 * be more than 2. MSR can allow setting and getting notifications
  41 * for only 2 thresholds. This define enforces this, if there
  42 * is some wrong values returned by cpuid for number of thresholds.
  43 */
  44 #define MAX_NUMBER_OF_TRIPS     2
  45 
  46 struct zone_device {
  47         int                             cpu;
  48         bool                            work_scheduled;
  49         u32                             tj_max;
  50         u32                             msr_pkg_therm_low;
  51         u32                             msr_pkg_therm_high;
  52         struct delayed_work             work;
  53         struct thermal_zone_device      *tzone;
  54         struct cpumask                  cpumask;
  55 };
  56 
  57 static struct thermal_zone_params pkg_temp_tz_params = {
  58         .no_hwmon       = true,
  59 };
  60 
  61 /* Keep track of how many zone pointers we allocated in init() */
  62 static int max_id __read_mostly;
  63 /* Array of zone pointers */
  64 static struct zone_device **zones;
  65 /* Serializes interrupt notification, work and hotplug */
  66 static DEFINE_SPINLOCK(pkg_temp_lock);
  67 /* Protects zone operation in the work function against hotplug removal */
  68 static DEFINE_MUTEX(thermal_zone_mutex);
  69 
  70 /* The dynamically assigned cpu hotplug state for module_exit() */
  71 static enum cpuhp_state pkg_thermal_hp_state __read_mostly;
  72 
  73 /* Debug counters to show using debugfs */
  74 static struct dentry *debugfs;
  75 static unsigned int pkg_interrupt_cnt;
  76 static unsigned int pkg_work_cnt;
  77 
  78 static void pkg_temp_debugfs_init(void)
  79 {
  80         debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
  81 
  82         debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
  83                            &pkg_interrupt_cnt);
  84         debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
  85                            &pkg_work_cnt);
  86 }
  87 
  88 /*
  89  * Protection:
  90  *
  91  * - cpu hotplug: Read serialized by cpu hotplug lock
  92  *                Write must hold pkg_temp_lock
  93  *
  94  * - Other callsites: Must hold pkg_temp_lock
  95  */
  96 static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
  97 {
  98         int id = topology_logical_die_id(cpu);
  99 
 100         if (id >= 0 && id < max_id)
 101                 return zones[id];
 102         return NULL;
 103 }
 104 
 105 /*
 106 * tj-max is is interesting because threshold is set relative to this
 107 * temperature.
 108 */
 109 static int get_tj_max(int cpu, u32 *tj_max)
 110 {
 111         u32 eax, edx, val;
 112         int err;
 113 
 114         err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
 115         if (err)
 116                 return err;
 117 
 118         val = (eax >> 16) & 0xff;
 119         *tj_max = val * 1000;
 120 
 121         return val ? 0 : -EINVAL;
 122 }
 123 
 124 static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
 125 {
 126         struct zone_device *zonedev = tzd->devdata;
 127         u32 eax, edx;
 128 
 129         rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
 130                         &eax, &edx);
 131         if (eax & 0x80000000) {
 132                 *temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
 133                 pr_debug("sys_get_curr_temp %d\n", *temp);
 134                 return 0;
 135         }
 136         return -EINVAL;
 137 }
 138 
 139 static int sys_get_trip_temp(struct thermal_zone_device *tzd,
 140                              int trip, int *temp)
 141 {
 142         struct zone_device *zonedev = tzd->devdata;
 143         unsigned long thres_reg_value;
 144         u32 mask, shift, eax, edx;
 145         int ret;
 146 
 147         if (trip >= MAX_NUMBER_OF_TRIPS)
 148                 return -EINVAL;
 149 
 150         if (trip) {
 151                 mask = THERM_MASK_THRESHOLD1;
 152                 shift = THERM_SHIFT_THRESHOLD1;
 153         } else {
 154                 mask = THERM_MASK_THRESHOLD0;
 155                 shift = THERM_SHIFT_THRESHOLD0;
 156         }
 157 
 158         ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
 159                            &eax, &edx);
 160         if (ret < 0)
 161                 return ret;
 162 
 163         thres_reg_value = (eax & mask) >> shift;
 164         if (thres_reg_value)
 165                 *temp = zonedev->tj_max - thres_reg_value * 1000;
 166         else
 167                 *temp = 0;
 168         pr_debug("sys_get_trip_temp %d\n", *temp);
 169 
 170         return 0;
 171 }
 172 
 173 static int
 174 sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
 175 {
 176         struct zone_device *zonedev = tzd->devdata;
 177         u32 l, h, mask, shift, intr;
 178         int ret;
 179 
 180         if (trip >= MAX_NUMBER_OF_TRIPS || temp >= zonedev->tj_max)
 181                 return -EINVAL;
 182 
 183         ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
 184                            &l, &h);
 185         if (ret < 0)
 186                 return ret;
 187 
 188         if (trip) {
 189                 mask = THERM_MASK_THRESHOLD1;
 190                 shift = THERM_SHIFT_THRESHOLD1;
 191                 intr = THERM_INT_THRESHOLD1_ENABLE;
 192         } else {
 193                 mask = THERM_MASK_THRESHOLD0;
 194                 shift = THERM_SHIFT_THRESHOLD0;
 195                 intr = THERM_INT_THRESHOLD0_ENABLE;
 196         }
 197         l &= ~mask;
 198         /*
 199         * When users space sets a trip temperature == 0, which is indication
 200         * that, it is no longer interested in receiving notifications.
 201         */
 202         if (!temp) {
 203                 l &= ~intr;
 204         } else {
 205                 l |= (zonedev->tj_max - temp)/1000 << shift;
 206                 l |= intr;
 207         }
 208 
 209         return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
 210                         l, h);
 211 }
 212 
 213 static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
 214                              enum thermal_trip_type *type)
 215 {
 216         *type = THERMAL_TRIP_PASSIVE;
 217         return 0;
 218 }
 219 
 220 /* Thermal zone callback registry */
 221 static struct thermal_zone_device_ops tzone_ops = {
 222         .get_temp = sys_get_curr_temp,
 223         .get_trip_temp = sys_get_trip_temp,
 224         .get_trip_type = sys_get_trip_type,
 225         .set_trip_temp = sys_set_trip_temp,
 226 };
 227 
 228 static bool pkg_thermal_rate_control(void)
 229 {
 230         return true;
 231 }
 232 
 233 /* Enable threshold interrupt on local package/cpu */
 234 static inline void enable_pkg_thres_interrupt(void)
 235 {
 236         u8 thres_0, thres_1;
 237         u32 l, h;
 238 
 239         rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 240         /* only enable/disable if it had valid threshold value */
 241         thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
 242         thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
 243         if (thres_0)
 244                 l |= THERM_INT_THRESHOLD0_ENABLE;
 245         if (thres_1)
 246                 l |= THERM_INT_THRESHOLD1_ENABLE;
 247         wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 248 }
 249 
 250 /* Disable threshold interrupt on local package/cpu */
 251 static inline void disable_pkg_thres_interrupt(void)
 252 {
 253         u32 l, h;
 254 
 255         rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 256 
 257         l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
 258         wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 259 }
 260 
 261 static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
 262 {
 263         struct thermal_zone_device *tzone = NULL;
 264         int cpu = smp_processor_id();
 265         struct zone_device *zonedev;
 266         u64 msr_val, wr_val;
 267 
 268         mutex_lock(&thermal_zone_mutex);
 269         spin_lock_irq(&pkg_temp_lock);
 270         ++pkg_work_cnt;
 271 
 272         zonedev = pkg_temp_thermal_get_dev(cpu);
 273         if (!zonedev) {
 274                 spin_unlock_irq(&pkg_temp_lock);
 275                 mutex_unlock(&thermal_zone_mutex);
 276                 return;
 277         }
 278         zonedev->work_scheduled = false;
 279 
 280         rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
 281         wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
 282         if (wr_val != msr_val) {
 283                 wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
 284                 tzone = zonedev->tzone;
 285         }
 286 
 287         enable_pkg_thres_interrupt();
 288         spin_unlock_irq(&pkg_temp_lock);
 289 
 290         /*
 291          * If tzone is not NULL, then thermal_zone_mutex will prevent the
 292          * concurrent removal in the cpu offline callback.
 293          */
 294         if (tzone)
 295                 thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);
 296 
 297         mutex_unlock(&thermal_zone_mutex);
 298 }
 299 
 300 static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
 301 {
 302         unsigned long ms = msecs_to_jiffies(notify_delay_ms);
 303 
 304         schedule_delayed_work_on(cpu, work, ms);
 305 }
 306 
 307 static int pkg_thermal_notify(u64 msr_val)
 308 {
 309         int cpu = smp_processor_id();
 310         struct zone_device *zonedev;
 311         unsigned long flags;
 312 
 313         spin_lock_irqsave(&pkg_temp_lock, flags);
 314         ++pkg_interrupt_cnt;
 315 
 316         disable_pkg_thres_interrupt();
 317 
 318         /* Work is per package, so scheduling it once is enough. */
 319         zonedev = pkg_temp_thermal_get_dev(cpu);
 320         if (zonedev && !zonedev->work_scheduled) {
 321                 zonedev->work_scheduled = true;
 322                 pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
 323         }
 324 
 325         spin_unlock_irqrestore(&pkg_temp_lock, flags);
 326         return 0;
 327 }
 328 
 329 static int pkg_temp_thermal_device_add(unsigned int cpu)
 330 {
 331         int id = topology_logical_die_id(cpu);
 332         u32 tj_max, eax, ebx, ecx, edx;
 333         struct zone_device *zonedev;
 334         int thres_count, err;
 335 
 336         if (id >= max_id)
 337                 return -ENOMEM;
 338 
 339         cpuid(6, &eax, &ebx, &ecx, &edx);
 340         thres_count = ebx & 0x07;
 341         if (!thres_count)
 342                 return -ENODEV;
 343 
 344         thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);
 345 
 346         err = get_tj_max(cpu, &tj_max);
 347         if (err)
 348                 return err;
 349 
 350         zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
 351         if (!zonedev)
 352                 return -ENOMEM;
 353 
 354         INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
 355         zonedev->cpu = cpu;
 356         zonedev->tj_max = tj_max;
 357         zonedev->tzone = thermal_zone_device_register("x86_pkg_temp",
 358                         thres_count,
 359                         (thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
 360                         zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
 361         if (IS_ERR(zonedev->tzone)) {
 362                 err = PTR_ERR(zonedev->tzone);
 363                 kfree(zonedev);
 364                 return err;
 365         }
 366         /* Store MSR value for package thermal interrupt, to restore at exit */
 367         rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
 368               zonedev->msr_pkg_therm_high);
 369 
 370         cpumask_set_cpu(cpu, &zonedev->cpumask);
 371         spin_lock_irq(&pkg_temp_lock);
 372         zones[id] = zonedev;
 373         spin_unlock_irq(&pkg_temp_lock);
 374         return 0;
 375 }
 376 
 377 static int pkg_thermal_cpu_offline(unsigned int cpu)
 378 {
 379         struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
 380         bool lastcpu, was_target;
 381         int target;
 382 
 383         if (!zonedev)
 384                 return 0;
 385 
 386         target = cpumask_any_but(&zonedev->cpumask, cpu);
 387         cpumask_clear_cpu(cpu, &zonedev->cpumask);
 388         lastcpu = target >= nr_cpu_ids;
 389         /*
 390          * Remove the sysfs files, if this is the last cpu in the package
 391          * before doing further cleanups.
 392          */
 393         if (lastcpu) {
 394                 struct thermal_zone_device *tzone = zonedev->tzone;
 395 
 396                 /*
 397                  * We must protect against a work function calling
 398                  * thermal_zone_update, after/while unregister. We null out
 399                  * the pointer under the zone mutex, so the worker function
 400                  * won't try to call.
 401                  */
 402                 mutex_lock(&thermal_zone_mutex);
 403                 zonedev->tzone = NULL;
 404                 mutex_unlock(&thermal_zone_mutex);
 405 
 406                 thermal_zone_device_unregister(tzone);
 407         }
 408 
 409         /* Protect against work and interrupts */
 410         spin_lock_irq(&pkg_temp_lock);
 411 
 412         /*
 413          * Check whether this cpu was the current target and store the new
 414          * one. When we drop the lock, then the interrupt notify function
 415          * will see the new target.
 416          */
 417         was_target = zonedev->cpu == cpu;
 418         zonedev->cpu = target;
 419 
 420         /*
 421          * If this is the last CPU in the package remove the package
 422          * reference from the array and restore the interrupt MSR. When we
 423          * drop the lock neither the interrupt notify function nor the
 424          * worker will see the package anymore.
 425          */
 426         if (lastcpu) {
 427                 zones[topology_logical_die_id(cpu)] = NULL;
 428                 /* After this point nothing touches the MSR anymore. */
 429                 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
 430                       zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
 431         }
 432 
 433         /*
 434          * Check whether there is work scheduled and whether the work is
 435          * targeted at the outgoing CPU.
 436          */
 437         if (zonedev->work_scheduled && was_target) {
 438                 /*
 439                  * To cancel the work we need to drop the lock, otherwise
 440                  * we might deadlock if the work needs to be flushed.
 441                  */
 442                 spin_unlock_irq(&pkg_temp_lock);
 443                 cancel_delayed_work_sync(&zonedev->work);
 444                 spin_lock_irq(&pkg_temp_lock);
 445                 /*
 446                  * If this is not the last cpu in the package and the work
 447                  * did not run after we dropped the lock above, then we
 448                  * need to reschedule the work, otherwise the interrupt
 449                  * stays disabled forever.
 450                  */
 451                 if (!lastcpu && zonedev->work_scheduled)
 452                         pkg_thermal_schedule_work(target, &zonedev->work);
 453         }
 454 
 455         spin_unlock_irq(&pkg_temp_lock);
 456 
 457         /* Final cleanup if this is the last cpu */
 458         if (lastcpu)
 459                 kfree(zonedev);
 460         return 0;
 461 }
 462 
 463 static int pkg_thermal_cpu_online(unsigned int cpu)
 464 {
 465         struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
 466         struct cpuinfo_x86 *c = &cpu_data(cpu);
 467 
 468         /* Paranoia check */
 469         if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
 470                 return -ENODEV;
 471 
 472         /* If the package exists, nothing to do */
 473         if (zonedev) {
 474                 cpumask_set_cpu(cpu, &zonedev->cpumask);
 475                 return 0;
 476         }
 477         return pkg_temp_thermal_device_add(cpu);
 478 }
 479 
 480 static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
 481         { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS },
 482         {}
 483 };
 484 MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
 485 
 486 static int __init pkg_temp_thermal_init(void)
 487 {
 488         int ret;
 489 
 490         if (!x86_match_cpu(pkg_temp_thermal_ids))
 491                 return -ENODEV;
 492 
 493         max_id = topology_max_packages() * topology_max_die_per_package();
 494         zones = kcalloc(max_id, sizeof(struct zone_device *),
 495                            GFP_KERNEL);
 496         if (!zones)
 497                 return -ENOMEM;
 498 
 499         ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
 500                                 pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
 501         if (ret < 0)
 502                 goto err;
 503 
 504         /* Store the state for module exit */
 505         pkg_thermal_hp_state = ret;
 506 
 507         platform_thermal_package_notify = pkg_thermal_notify;
 508         platform_thermal_package_rate_control = pkg_thermal_rate_control;
 509 
 510          /* Don't care if it fails */
 511         pkg_temp_debugfs_init();
 512         return 0;
 513 
 514 err:
 515         kfree(zones);
 516         return ret;
 517 }
 518 module_init(pkg_temp_thermal_init)
 519 
 520 static void __exit pkg_temp_thermal_exit(void)
 521 {
 522         platform_thermal_package_notify = NULL;
 523         platform_thermal_package_rate_control = NULL;
 524 
 525         cpuhp_remove_state(pkg_thermal_hp_state);
 526         debugfs_remove_recursive(debugfs);
 527         kfree(zones);
 528 }
 529 module_exit(pkg_temp_thermal_exit)
 530 
 531 MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
 532 MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
 533 MODULE_LICENSE("GPL v2");

/* [<][>][^][v][top][bottom][index][help] */