kernel/time/hrtimer.c


DEFINITIONS

This source file includes the following definitions.
  1. is_migration_base
  2. lock_hrtimer_base
  3. hrtimer_check_target
  4. get_target_base
  5. switch_hrtimer_base
  6. is_migration_base
  7. lock_hrtimer_base
  8. __ktime_divns
  9. ktime_add_safe
  10. hrtimer_debug_hint
  11. hrtimer_fixup_init
  12. hrtimer_fixup_activate
  13. hrtimer_fixup_free
  14. debug_hrtimer_init
  15. debug_hrtimer_activate
  16. debug_hrtimer_deactivate
  17. debug_hrtimer_free
  18. hrtimer_init_on_stack
  19. hrtimer_init_sleeper_on_stack
  20. destroy_hrtimer_on_stack
  21. debug_hrtimer_init
  22. debug_hrtimer_activate
  23. debug_hrtimer_deactivate
  24. debug_init
  25. debug_activate
  26. debug_deactivate
  27. __next_base
  28. __hrtimer_next_event_base
  29. __hrtimer_get_next_event
  30. hrtimer_update_base
  31. __hrtimer_hres_active
  32. hrtimer_hres_active
  33. hrtimer_force_reprogram
  34. setup_hrtimer_hres
  35. hrtimer_is_hres_enabled
  36. retrigger_next_event
  37. hrtimer_switch_to_hres
  38. clock_was_set_work
  39. clock_was_set_delayed
  40. hrtimer_is_hres_enabled
  41. hrtimer_switch_to_hres
  42. retrigger_next_event
  43. hrtimer_reprogram
  44. clock_was_set
  45. hrtimers_resume
  46. unlock_hrtimer_base
  47. hrtimer_forward
  48. enqueue_hrtimer
  49. __remove_hrtimer
  50. remove_hrtimer
  51. hrtimer_update_lowres
  52. hrtimer_update_softirq_timer
  53. __hrtimer_start_range_ns
  54. hrtimer_start_range_ns
  55. hrtimer_try_to_cancel
  56. hrtimer_cpu_base_init_expiry_lock
  57. hrtimer_cpu_base_lock_expiry
  58. hrtimer_cpu_base_unlock_expiry
  59. hrtimer_sync_wait_running
  60. hrtimer_cancel_wait_running
  61. hrtimer_cpu_base_init_expiry_lock
  62. hrtimer_cpu_base_lock_expiry
  63. hrtimer_cpu_base_unlock_expiry
  64. hrtimer_sync_wait_running
  65. hrtimer_cancel
  66. __hrtimer_get_remaining
  67. hrtimer_get_next_event
  68. hrtimer_next_event_without
  69. hrtimer_clockid_to_base
  70. __hrtimer_init
  71. hrtimer_init
  72. hrtimer_active
  73. __run_hrtimer
  74. __hrtimer_run_queues
  75. hrtimer_run_softirq
  76. hrtimer_interrupt
  77. __hrtimer_peek_ahead_timers
  78. __hrtimer_peek_ahead_timers
  79. hrtimer_run_queues
  80. hrtimer_wakeup
  81. hrtimer_sleeper_start_expires
  82. __hrtimer_init_sleeper
  83. hrtimer_init_sleeper
  84. nanosleep_copyout
  85. do_nanosleep
  86. hrtimer_nanosleep_restart
  87. hrtimer_nanosleep
  88. SYSCALL_DEFINE2
  89. SYSCALL_DEFINE2
  90. hrtimers_prepare_cpu
  91. migrate_hrtimer_list
  92. hrtimers_dead_cpu
  93. hrtimers_init
  94. schedule_hrtimeout_range_clock
  95. schedule_hrtimeout_range
  96. schedule_hrtimeout

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
   4  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
   5  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
   6  *
   7  *  High-resolution kernel timers
   8  *
   9  *  In contrast to the low-resolution timeout API, aka timer wheel,
  10  *  hrtimers provide finer resolution and accuracy depending on system
  11  *  configuration and capabilities.
  12  *
  13  *  Started by: Thomas Gleixner and Ingo Molnar
  14  *
  15  *  Credits:
  16  *      Based on the original timer wheel code
  17  *
  18  *      Help, testing, suggestions, bugfixes, improvements were
  19  *      provided by:
  20  *
  21  *      George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
  22  *      et al.
  23  */
  24 
  25 #include <linux/cpu.h>
  26 #include <linux/export.h>
  27 #include <linux/percpu.h>
  28 #include <linux/hrtimer.h>
  29 #include <linux/notifier.h>
  30 #include <linux/syscalls.h>
  31 #include <linux/interrupt.h>
  32 #include <linux/tick.h>
  33 #include <linux/err.h>
  34 #include <linux/debugobjects.h>
  35 #include <linux/sched/signal.h>
  36 #include <linux/sched/sysctl.h>
  37 #include <linux/sched/rt.h>
  38 #include <linux/sched/deadline.h>
  39 #include <linux/sched/nohz.h>
  40 #include <linux/sched/debug.h>
  41 #include <linux/timer.h>
  42 #include <linux/freezer.h>
  43 #include <linux/compat.h>
  44 
  45 #include <linux/uaccess.h>
  46 
  47 #include <trace/events/timer.h>
  48 
  49 #include "tick-internal.h"
  50 
  51 /*
  52  * Masks for selecting the soft and hard context timers from
  53  * cpu_base->active
  54  */
  55 #define MASK_SHIFT              (HRTIMER_BASE_MONOTONIC_SOFT)
  56 #define HRTIMER_ACTIVE_HARD     ((1U << MASK_SHIFT) - 1)
  57 #define HRTIMER_ACTIVE_SOFT     (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
  58 #define HRTIMER_ACTIVE_ALL      (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
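/*
 * Editor's note (illustrative, based on the eight clock bases initialized
 * below, i.e. HRTIMER_BASE_MONOTONIC_SOFT == 4): MASK_SHIFT is 4, so
 * HRTIMER_ACTIVE_HARD = (1U << 4) - 1 = 0x0f selects the four hard bases,
 * HRTIMER_ACTIVE_SOFT = 0x0f << 4 = 0xf0 selects the four soft bases, and
 * HRTIMER_ACTIVE_ALL = 0xff covers both halves of cpu_base->active_bases.
 */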
  59 
  60 /*
  61  * The timer bases:
  62  *
  63  * There are more clockids than hrtimer bases. Thus, we index
  64  * into the timer bases by the hrtimer_base_type enum. When trying
  65  * to reach a base using a clockid, hrtimer_clockid_to_base()
  66  * is used to convert from clockid to the proper hrtimer_base_type.
  67  */
  68 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
  69 {
  70         .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
  71         .clock_base =
  72         {
  73                 {
  74                         .index = HRTIMER_BASE_MONOTONIC,
  75                         .clockid = CLOCK_MONOTONIC,
  76                         .get_time = &ktime_get,
  77                 },
  78                 {
  79                         .index = HRTIMER_BASE_REALTIME,
  80                         .clockid = CLOCK_REALTIME,
  81                         .get_time = &ktime_get_real,
  82                 },
  83                 {
  84                         .index = HRTIMER_BASE_BOOTTIME,
  85                         .clockid = CLOCK_BOOTTIME,
  86                         .get_time = &ktime_get_boottime,
  87                 },
  88                 {
  89                         .index = HRTIMER_BASE_TAI,
  90                         .clockid = CLOCK_TAI,
  91                         .get_time = &ktime_get_clocktai,
  92                 },
  93                 {
  94                         .index = HRTIMER_BASE_MONOTONIC_SOFT,
  95                         .clockid = CLOCK_MONOTONIC,
  96                         .get_time = &ktime_get,
  97                 },
  98                 {
  99                         .index = HRTIMER_BASE_REALTIME_SOFT,
 100                         .clockid = CLOCK_REALTIME,
 101                         .get_time = &ktime_get_real,
 102                 },
 103                 {
 104                         .index = HRTIMER_BASE_BOOTTIME_SOFT,
 105                         .clockid = CLOCK_BOOTTIME,
 106                         .get_time = &ktime_get_boottime,
 107                 },
 108                 {
 109                         .index = HRTIMER_BASE_TAI_SOFT,
 110                         .clockid = CLOCK_TAI,
 111                         .get_time = &ktime_get_clocktai,
 112                 },
 113         }
 114 };
 115 
 116 static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
 117         /* Make sure we catch unsupported clockids */
 118         [0 ... MAX_CLOCKS - 1]  = HRTIMER_MAX_CLOCK_BASES,
 119 
 120         [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
 121         [CLOCK_MONOTONIC]       = HRTIMER_BASE_MONOTONIC,
 122         [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
 123         [CLOCK_TAI]             = HRTIMER_BASE_TAI,
 124 };
 125 
 126 /*
 127  * Functions and macros which are different for UP/SMP systems are kept in a
 128  * single place
 129  */
 130 #ifdef CONFIG_SMP
 131 
 132 /*
 133  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
 134  * such that hrtimer_callback_running() can unconditionally dereference
 135  * timer->base->cpu_base
 136  */
 137 static struct hrtimer_cpu_base migration_cpu_base = {
 138         .clock_base = { { .cpu_base = &migration_cpu_base, }, },
 139 };
 140 
 141 #define migration_base  migration_cpu_base.clock_base[0]
 142 
 143 static inline bool is_migration_base(struct hrtimer_clock_base *base)
 144 {
 145         return base == &migration_base;
 146 }
 147 
 148 /*
 149  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 150  * means that all timers which are tied to this base via timer->base are
 151  * locked, and the base itself is locked too.
 152  *
 153  * So __run_timers/migrate_timers can safely modify all timers which could
 154  * be found on the lists/queues.
 155  *
 156  * When the timer's base is locked, and the timer removed from the list, it is
 157  * possible to set timer->base = &migration_base and drop the lock: the timer
 158  * remains locked.
 159  */
 160 static
 161 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 162                                              unsigned long *flags)
 163 {
 164         struct hrtimer_clock_base *base;
 165 
 166         for (;;) {
 167                 base = READ_ONCE(timer->base);
 168                 if (likely(base != &migration_base)) {
 169                         raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 170                         if (likely(base == timer->base))
 171                                 return base;
 172                         /* The timer has migrated to another CPU: */
 173                         raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
 174                 }
 175                 cpu_relax();
 176         }
 177 }
 178 
 179 /*
 180  * We do not migrate the timer when it is expiring before the next
 181  * event on the target cpu. When high resolution is enabled, we cannot
 182  * reprogram the target cpu hardware and we would cause it to fire
 183  * late. To keep it simple, we handle the high resolution enabled and
 184  * disabled cases the same way.
 185  *
 186  * Called with cpu_base->lock of target cpu held.
 187  */
 188 static int
 189 hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
 190 {
 191         ktime_t expires;
 192 
 193         expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
 194         return expires < new_base->cpu_base->expires_next;
 195 }
 196 
 197 static inline
 198 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
 199                                          int pinned)
 200 {
 201 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 202         if (static_branch_likely(&timers_migration_enabled) && !pinned)
 203                 return &per_cpu(hrtimer_bases, get_nohz_timer_target());
 204 #endif
 205         return base;
 206 }
 207 
 208 /*
 209  * We switch the timer base to a power-optimized selected CPU target,
 210  * if:
 211  *      - NO_HZ_COMMON is enabled
 212  *      - timer migration is enabled
 213  *      - the timer callback is not running
 214  *      - the timer is not the first expiring timer on the new target
 215  *
 216  * If one of the above requirements is not fulfilled we move the timer
 217  * to the current CPU or leave it on the previously assigned CPU if
 218  * the timer callback is currently running.
 219  */
 220 static inline struct hrtimer_clock_base *
 221 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 222                     int pinned)
 223 {
 224         struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
 225         struct hrtimer_clock_base *new_base;
 226         int basenum = base->index;
 227 
 228         this_cpu_base = this_cpu_ptr(&hrtimer_bases);
 229         new_cpu_base = get_target_base(this_cpu_base, pinned);
 230 again:
 231         new_base = &new_cpu_base->clock_base[basenum];
 232 
 233         if (base != new_base) {
 234                 /*
 235                  * We are trying to move timer to new_base.
 236                  * However we can't change timer's base while it is running,
 237                  * so we keep it on the same CPU. No hassle vs. reprogramming
 238                  * the event source in the high resolution case. The softirq
 239                  * code will take care of this when the timer function has
 240                  * completed. There is no conflict as we hold the lock until
 241                  * the timer is enqueued.
 242                  */
 243                 if (unlikely(hrtimer_callback_running(timer)))
 244                         return base;
 245 
 246                 /* See the comment in lock_hrtimer_base() */
 247                 WRITE_ONCE(timer->base, &migration_base);
 248                 raw_spin_unlock(&base->cpu_base->lock);
 249                 raw_spin_lock(&new_base->cpu_base->lock);
 250 
 251                 if (new_cpu_base != this_cpu_base &&
 252                     hrtimer_check_target(timer, new_base)) {
 253                         raw_spin_unlock(&new_base->cpu_base->lock);
 254                         raw_spin_lock(&base->cpu_base->lock);
 255                         new_cpu_base = this_cpu_base;
 256                         WRITE_ONCE(timer->base, base);
 257                         goto again;
 258                 }
 259                 WRITE_ONCE(timer->base, new_base);
 260         } else {
 261                 if (new_cpu_base != this_cpu_base &&
 262                     hrtimer_check_target(timer, new_base)) {
 263                         new_cpu_base = this_cpu_base;
 264                         goto again;
 265                 }
 266         }
 267         return new_base;
 268 }
 269 
 270 #else /* CONFIG_SMP */
 271 
 272 static inline bool is_migration_base(struct hrtimer_clock_base *base)
 273 {
 274         return false;
 275 }
 276 
 277 static inline struct hrtimer_clock_base *
 278 lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 279 {
 280         struct hrtimer_clock_base *base = timer->base;
 281 
 282         raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 283 
 284         return base;
 285 }
 286 
 287 # define switch_hrtimer_base(t, b, p)   (b)
 288 
 289 #endif  /* !CONFIG_SMP */
 290 
 291 /*
 292  * Functions for the union type storage format of ktime_t which are
 293  * too large for inlining:
 294  */
 295 #if BITS_PER_LONG < 64
 296 /*
 297  * Divide a ktime value by a nanosecond value
 298  */
 299 s64 __ktime_divns(const ktime_t kt, s64 div)
 300 {
 301         int sft = 0;
 302         s64 dclc;
 303         u64 tmp;
 304 
 305         dclc = ktime_to_ns(kt);
 306         tmp = dclc < 0 ? -dclc : dclc;
 307 
 308         /* Make sure the divisor is less than 2^32: */
 309         while (div >> 32) {
 310                 sft++;
 311                 div >>= 1;
 312         }
 313         tmp >>= sft;
 314         do_div(tmp, (unsigned long) div);
 315         return dclc < 0 ? -tmp : tmp;
 316 }
 317 EXPORT_SYMBOL_GPL(__ktime_divns);
 318 #endif /* BITS_PER_LONG < 64 */
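/*
 * Editor's sketch (illustration only, not part of the kernel build): the
 * same "shift the divisor below 2^32" idea in plain C, using an ordinary
 * 64-bit division instead of do_div(). For example, dividing 10e9 ns by a
 * 6e9 ns interval first halves both values (6e9 does not fit in 32 bits),
 * then divides 5e9 by 3e9, giving 1 as expected.
 */
static inline long long example_divns(long long dclc, long long div)
{
	int sft = 0;
	unsigned long long tmp = dclc < 0 ? -dclc : dclc;

	while (div >> 32) {	/* make the divisor fit into 32 bits */
		sft++;
		div >>= 1;
	}
	tmp >>= sft;		/* scale the dividend by the same amount */
	tmp /= (unsigned int) div;
	return dclc < 0 ? -(long long)tmp : (long long)tmp;
}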
 319 
 320 /*
 321  * Add two ktime values and do a safety check for overflow:
 322  */
 323 ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
 324 {
 325         ktime_t res = ktime_add_unsafe(lhs, rhs);
 326 
 327         /*
 328          * We use KTIME_SEC_MAX here, the maximum timeout which we can
 329          * return to user space in a timespec:
 330          */
 331         if (res < 0 || res < lhs || res < rhs)
 332                 res = ktime_set(KTIME_SEC_MAX, 0);
 333 
 334         return res;
 335 }
 336 
 337 EXPORT_SYMBOL_GPL(ktime_add_safe);
 338 
 339 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 340 
 341 static struct debug_obj_descr hrtimer_debug_descr;
 342 
 343 static void *hrtimer_debug_hint(void *addr)
 344 {
 345         return ((struct hrtimer *) addr)->function;
 346 }
 347 
 348 /*
 349  * fixup_init is called when:
 350  * - an active object is initialized
 351  */
 352 static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 353 {
 354         struct hrtimer *timer = addr;
 355 
 356         switch (state) {
 357         case ODEBUG_STATE_ACTIVE:
 358                 hrtimer_cancel(timer);
 359                 debug_object_init(timer, &hrtimer_debug_descr);
 360                 return true;
 361         default:
 362                 return false;
 363         }
 364 }
 365 
 366 /*
 367  * fixup_activate is called when:
 368  * - an active object is activated
 369  * - an unknown non-static object is activated
 370  */
 371 static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
 372 {
 373         switch (state) {
 374         case ODEBUG_STATE_ACTIVE:
 375                 WARN_ON(1);
 376                 /* fall through */
 377         default:
 378                 return false;
 379         }
 380 }
 381 
 382 /*
 383  * fixup_free is called when:
 384  * - an active object is freed
 385  */
 386 static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
 387 {
 388         struct hrtimer *timer = addr;
 389 
 390         switch (state) {
 391         case ODEBUG_STATE_ACTIVE:
 392                 hrtimer_cancel(timer);
 393                 debug_object_free(timer, &hrtimer_debug_descr);
 394                 return true;
 395         default:
 396                 return false;
 397         }
 398 }
 399 
 400 static struct debug_obj_descr hrtimer_debug_descr = {
 401         .name           = "hrtimer",
 402         .debug_hint     = hrtimer_debug_hint,
 403         .fixup_init     = hrtimer_fixup_init,
 404         .fixup_activate = hrtimer_fixup_activate,
 405         .fixup_free     = hrtimer_fixup_free,
 406 };
 407 
 408 static inline void debug_hrtimer_init(struct hrtimer *timer)
 409 {
 410         debug_object_init(timer, &hrtimer_debug_descr);
 411 }
 412 
 413 static inline void debug_hrtimer_activate(struct hrtimer *timer,
 414                                           enum hrtimer_mode mode)
 415 {
 416         debug_object_activate(timer, &hrtimer_debug_descr);
 417 }
 418 
 419 static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
 420 {
 421         debug_object_deactivate(timer, &hrtimer_debug_descr);
 422 }
 423 
 424 static inline void debug_hrtimer_free(struct hrtimer *timer)
 425 {
 426         debug_object_free(timer, &hrtimer_debug_descr);
 427 }
 428 
 429 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 430                            enum hrtimer_mode mode);
 431 
 432 void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
 433                            enum hrtimer_mode mode)
 434 {
 435         debug_object_init_on_stack(timer, &hrtimer_debug_descr);
 436         __hrtimer_init(timer, clock_id, mode);
 437 }
 438 EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
 439 
 440 static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 441                                    clockid_t clock_id, enum hrtimer_mode mode);
 442 
 443 void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
 444                                    clockid_t clock_id, enum hrtimer_mode mode)
 445 {
 446         debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
 447         __hrtimer_init_sleeper(sl, clock_id, mode);
 448 }
 449 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
 450 
 451 void destroy_hrtimer_on_stack(struct hrtimer *timer)
 452 {
 453         debug_object_free(timer, &hrtimer_debug_descr);
 454 }
 455 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
 456 
 457 #else
 458 
 459 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
 460 static inline void debug_hrtimer_activate(struct hrtimer *timer,
 461                                           enum hrtimer_mode mode) { }
 462 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
 463 #endif
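/*
 * Editor's sketch (illustration only): the usual pairing for an on-stack
 * hrtimer under CONFIG_DEBUG_OBJECTS_TIMERS. All example_* names are
 * hypothetical; the point is that an on-stack timer is set up with the
 * _on_stack variant and must be cancelled and destroyed before the stack
 * frame goes away.
 */
static enum hrtimer_restart example_stack_timer_cb(struct hrtimer *t)
{
	/* e.g. wake up a waiter or set a "timed out" flag */
	return HRTIMER_NORESTART;
}

static void example_with_stack_timer(ktime_t timeout)
{
	struct hrtimer t;

	hrtimer_init_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	t.function = example_stack_timer_cb;
	hrtimer_start(&t, timeout, HRTIMER_MODE_REL);

	/* ... do the work which the timeout is guarding ... */

	hrtimer_cancel(&t);
	destroy_hrtimer_on_stack(&t);
}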
 464 
 465 static inline void
 466 debug_init(struct hrtimer *timer, clockid_t clockid,
 467            enum hrtimer_mode mode)
 468 {
 469         debug_hrtimer_init(timer);
 470         trace_hrtimer_init(timer, clockid, mode);
 471 }
 472 
 473 static inline void debug_activate(struct hrtimer *timer,
 474                                   enum hrtimer_mode mode)
 475 {
 476         debug_hrtimer_activate(timer, mode);
 477         trace_hrtimer_start(timer, mode);
 478 }
 479 
 480 static inline void debug_deactivate(struct hrtimer *timer)
 481 {
 482         debug_hrtimer_deactivate(timer);
 483         trace_hrtimer_cancel(timer);
 484 }
 485 
 486 static struct hrtimer_clock_base *
 487 __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
 488 {
 489         unsigned int idx;
 490 
 491         if (!*active)
 492                 return NULL;
 493 
 494         idx = __ffs(*active);
 495         *active &= ~(1U << idx);
 496 
 497         return &cpu_base->clock_base[idx];
 498 }
 499 
 500 #define for_each_active_base(base, cpu_base, active)    \
 501         while ((base = __next_base((cpu_base), &(active))))
 502 
 503 static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
 504                                          const struct hrtimer *exclude,
 505                                          unsigned int active,
 506                                          ktime_t expires_next)
 507 {
 508         struct hrtimer_clock_base *base;
 509         ktime_t expires;
 510 
 511         for_each_active_base(base, cpu_base, active) {
 512                 struct timerqueue_node *next;
 513                 struct hrtimer *timer;
 514 
 515                 next = timerqueue_getnext(&base->active);
 516                 timer = container_of(next, struct hrtimer, node);
 517                 if (timer == exclude) {
 518                         /* Get to the next timer in the queue. */
 519                         next = timerqueue_iterate_next(next);
 520                         if (!next)
 521                                 continue;
 522 
 523                         timer = container_of(next, struct hrtimer, node);
 524                 }
 525                 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
 526                 if (expires < expires_next) {
 527                         expires_next = expires;
 528 
 529                         /* Skip cpu_base update if a timer is being excluded. */
 530                         if (exclude)
 531                                 continue;
 532 
 533                         if (timer->is_soft)
 534                                 cpu_base->softirq_next_timer = timer;
 535                         else
 536                                 cpu_base->next_timer = timer;
 537                 }
 538         }
 539         /*
 540          * clock_was_set() might have changed base->offset of any of
 541          * the clock bases so the result might be negative. Fix it up
 542          * to prevent a false positive in clockevents_program_event().
 543          */
 544         if (expires_next < 0)
 545                 expires_next = 0;
 546         return expires_next;
 547 }
 548 
 549 /*
 550  * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
 551  * does not set cpu_base::*expires_next; that is done by hrtimer_reprogram().
 552  *
 553  * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
 554  * those timers will get run whenever the softirq gets handled. At the end of
 555  * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
 556  *
 557  * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
 558  * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
 559  * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
 560  *
 561  * @active_mask must be one of:
 562  *  - HRTIMER_ACTIVE_ALL,
 563  *  - HRTIMER_ACTIVE_SOFT, or
 564  *  - HRTIMER_ACTIVE_HARD.
 565  */
 566 static ktime_t
 567 __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
 568 {
 569         unsigned int active;
 570         struct hrtimer *next_timer = NULL;
 571         ktime_t expires_next = KTIME_MAX;
 572 
 573         if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
 574                 active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
 575                 cpu_base->softirq_next_timer = NULL;
 576                 expires_next = __hrtimer_next_event_base(cpu_base, NULL,
 577                                                          active, KTIME_MAX);
 578 
 579                 next_timer = cpu_base->softirq_next_timer;
 580         }
 581 
 582         if (active_mask & HRTIMER_ACTIVE_HARD) {
 583                 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
 584                 cpu_base->next_timer = next_timer;
 585                 expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
 586                                                          expires_next);
 587         }
 588 
 589         return expires_next;
 590 }
 591 
 592 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 593 {
 594         ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
 595         ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 596         ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
 597 
 598         ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
 599                                             offs_real, offs_boot, offs_tai);
 600 
 601         base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
 602         base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
 603         base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
 604 
 605         return now;
 606 }
 607 
 608 /*
 609  * Is the high resolution mode active?
 610  */
 611 static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
 612 {
 613         return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
 614                 cpu_base->hres_active : 0;
 615 }
 616 
 617 static inline int hrtimer_hres_active(void)
 618 {
 619         return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
 620 }
 621 
 622 /*
 623  * Reprogram the event source, checking both queues for the
 624  * next event.
 625  * Called with interrupts disabled and base->lock held
 626  */
 627 static void
 628 hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 629 {
 630         ktime_t expires_next;
 631 
 632         /*
 633          * Find the current next expiration time.
 634          */
 635         expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
 636 
 637         if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
 638                 /*
 639                  * When the softirq is activated, the hrtimer hardware has
 640                  * to be programmed with the first hard hrtimer because the
 641                  * soft timer interrupt could occur too late.
 642                  */
 643                 if (cpu_base->softirq_activated)
 644                         expires_next = __hrtimer_get_next_event(cpu_base,
 645                                                                 HRTIMER_ACTIVE_HARD);
 646                 else
 647                         cpu_base->softirq_expires_next = expires_next;
 648         }
 649 
 650         if (skip_equal && expires_next == cpu_base->expires_next)
 651                 return;
 652 
 653         cpu_base->expires_next = expires_next;
 654 
 655         /*
 656          * If hres is not active, hardware does not have to be
 657          * reprogrammed yet.
 658          *
 659          * If a hang was detected in the last timer interrupt then we
 660          * leave the hang delay active in the hardware. We want the
 661          * system to make progress. That also prevents the following
 662          * scenario:
 663          * T1 expires 50ms from now
 664          * T2 expires 5s from now
 665          *
 666          * T1 is removed, so this code is called and would reprogram
 667          * the hardware to 5s from now. Any hrtimer_start after that
 668          * will not reprogram the hardware due to hang_detected being
 669          * set. So we'd effectively block all timers until the T2 event
 670          * fires.
 671          */
 672         if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
 673                 return;
 674 
 675         tick_program_event(cpu_base->expires_next, 1);
 676 }
 677 
 678 /* High resolution timer related functions */
 679 #ifdef CONFIG_HIGH_RES_TIMERS
 680 
 681 /*
 682  * High resolution timer enabled?
 683  */
 684 static bool hrtimer_hres_enabled __read_mostly  = true;
 685 unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
 686 EXPORT_SYMBOL_GPL(hrtimer_resolution);
 687 
 688 /*
 689  * Enable / Disable high resolution mode
 690  */
 691 static int __init setup_hrtimer_hres(char *str)
 692 {
 693         return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
 694 }
 695 
 696 __setup("highres=", setup_hrtimer_hres);
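/*
 * Editor's note: this registers the "highres=" kernel command line option;
 * for example, booting with "highres=off" keeps the system in low
 * resolution mode even when CONFIG_HIGH_RES_TIMERS is built in.
 */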
 697 
 698 /*
 699  * hrtimer_is_hres_enabled - query whether the highres mode is enabled
 700  */
 701 static inline int hrtimer_is_hres_enabled(void)
 702 {
 703         return hrtimer_hres_enabled;
 704 }
 705 
 706 /*
 707  * Retrigger next event is called after the clock was set
 708  *
 709  * Called with interrupts disabled via on_each_cpu()
 710  */
 711 static void retrigger_next_event(void *arg)
 712 {
 713         struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 714 
 715         if (!__hrtimer_hres_active(base))
 716                 return;
 717 
 718         raw_spin_lock(&base->lock);
 719         hrtimer_update_base(base);
 720         hrtimer_force_reprogram(base, 0);
 721         raw_spin_unlock(&base->lock);
 722 }
 723 
 724 /*
 725  * Switch to high resolution mode
 726  */
 727 static void hrtimer_switch_to_hres(void)
 728 {
 729         struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 730 
 731         if (tick_init_highres()) {
 732                 pr_warn("Could not switch to high resolution mode on CPU %u\n",
 733                         base->cpu);
 734                 return;
 735         }
 736         base->hres_active = 1;
 737         hrtimer_resolution = HIGH_RES_NSEC;
 738 
 739         tick_setup_sched_timer();
 740         /* "Retrigger" the interrupt to get things going */
 741         retrigger_next_event(NULL);
 742 }
 743 
 744 static void clock_was_set_work(struct work_struct *work)
 745 {
 746         clock_was_set();
 747 }
 748 
 749 static DECLARE_WORK(hrtimer_work, clock_was_set_work);
 750 
 751 /*
 752  * Called from timekeeping and resume code to reprogram the hrtimer
 753  * interrupt device on all cpus.
 754  */
 755 void clock_was_set_delayed(void)
 756 {
 757         schedule_work(&hrtimer_work);
 758 }
 759 
 760 #else
 761 
 762 static inline int hrtimer_is_hres_enabled(void) { return 0; }
 763 static inline void hrtimer_switch_to_hres(void) { }
 764 static inline void retrigger_next_event(void *arg) { }
 765 
 766 #endif /* CONFIG_HIGH_RES_TIMERS */
 767 
 768 /*
 769  * When a timer is enqueued and expires earlier than the already enqueued
 770  * timers, we have to check whether it expires earlier than the timer for
 771  * which the clock event device was armed.
 772  *
 773  * Called with interrupts disabled and base->cpu_base.lock held
 774  */
 775 static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
 776 {
 777         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 778         struct hrtimer_clock_base *base = timer->base;
 779         ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
 780 
 781         WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
 782 
 783         /*
 784          * CLOCK_REALTIME timer might be requested with an absolute
 785          * expiry time which is less than base->offset. Set it to 0.
 786          */
 787         if (expires < 0)
 788                 expires = 0;
 789 
 790         if (timer->is_soft) {
 791                 /*
 792                  * soft hrtimer could be started on a remote CPU. In this
 793                  * case softirq_expires_next needs to be updated on the
 794                  * remote CPU. The soft hrtimer will not expire before the
 795                  * first hard hrtimer on the remote CPU -
 796                  * hrtimer_check_target() prevents this case.
 797                  */
 798                 struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
 799 
 800                 if (timer_cpu_base->softirq_activated)
 801                         return;
 802 
 803                 if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
 804                         return;
 805 
 806                 timer_cpu_base->softirq_next_timer = timer;
 807                 timer_cpu_base->softirq_expires_next = expires;
 808 
 809                 if (!ktime_before(expires, timer_cpu_base->expires_next) ||
 810                     !reprogram)
 811                         return;
 812         }
 813 
 814         /*
 815          * If the timer is not on the current cpu, we cannot reprogram
 816          * the other cpus clock event device.
 817          */
 818         if (base->cpu_base != cpu_base)
 819                 return;
 820 
 821         /*
 822          * If the hrtimer interrupt is running, then it will
 823          * reevaluate the clock bases and reprogram the clock event
 824          * device. The callbacks are always executed in hard interrupt
 825          * context so we don't need an extra check for a running
 826          * callback.
 827          */
 828         if (cpu_base->in_hrtirq)
 829                 return;
 830 
 831         if (expires >= cpu_base->expires_next)
 832                 return;
 833 
 834         /* Update the pointer to the next expiring timer */
 835         cpu_base->next_timer = timer;
 836         cpu_base->expires_next = expires;
 837 
 838         /*
 839          * If hres is not active, hardware does not have to be
 840          * programmed yet.
 841          *
 842          * If a hang was detected in the last timer interrupt then we
 843          * do not schedule a timer which is earlier than the expiry
 844          * which we enforced in the hang detection. We want the system
 845          * to make progress.
 846          */
 847         if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
 848                 return;
 849 
 850         /*
 851          * Program the timer hardware. We enforce the expiry for
 852          * events which are already in the past.
 853          */
 854         tick_program_event(expires, 1);
 855 }
 856 
 857 /*
 858  * Clock realtime was set
 859  *
 860  * Change the offset of the realtime clock vs. the monotonic
 861  * clock.
 862  *
 863  * We might have to reprogram the high resolution timer interrupt. On
 864  * SMP we call the architecture specific code to retrigger _all_ high
 865  * resolution timer interrupts. On UP we just disable interrupts and
 866  * call the high resolution interrupt code.
 867  */
 868 void clock_was_set(void)
 869 {
 870 #ifdef CONFIG_HIGH_RES_TIMERS
 871         /* Retrigger the CPU local events everywhere */
 872         on_each_cpu(retrigger_next_event, NULL, 1);
 873 #endif
 874         timerfd_clock_was_set();
 875 }
 876 
 877 /*
 878  * During resume we might have to reprogram the high resolution timer
 879  * interrupt on all online CPUs.  However, all other CPUs will be
 880  * stopped with interrupts disabled so the clock_was_set() call
 881  * must be deferred.
 882  */
 883 void hrtimers_resume(void)
 884 {
 885         lockdep_assert_irqs_disabled();
 886         /* Retrigger on the local CPU */
 887         retrigger_next_event(NULL);
 888         /* And schedule a retrigger for all others */
 889         clock_was_set_delayed();
 890 }
 891 
 892 /*
 893  * Counterpart to lock_hrtimer_base above:
 894  */
 895 static inline
 896 void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 897 {
 898         raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
 899 }
 900 
 901 /**
 902  * hrtimer_forward - forward the timer expiry
 903  * @timer:      hrtimer to forward
 904  * @now:        forward past this time
 905  * @interval:   the interval to forward
 906  *
 907  * Forward the timer expiry so it will expire in the future.
 908  * Returns the number of overruns.
 909  *
 910  * Can be safely called from the callback function of @timer. If
 911  * called from other contexts @timer must neither be enqueued nor
 912  * running the callback and the caller needs to take care of
 913  * serialization.
 914  *
 915  * Note: This only updates the timer expiry value and does not requeue
 916  * the timer.
 917  */
 918 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 919 {
 920         u64 orun = 1;
 921         ktime_t delta;
 922 
 923         delta = ktime_sub(now, hrtimer_get_expires(timer));
 924 
 925         if (delta < 0)
 926                 return 0;
 927 
 928         if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
 929                 return 0;
 930 
 931         if (interval < hrtimer_resolution)
 932                 interval = hrtimer_resolution;
 933 
 934         if (unlikely(delta >= interval)) {
 935                 s64 incr = ktime_to_ns(interval);
 936 
 937                 orun = ktime_divns(delta, incr);
 938                 hrtimer_add_expires_ns(timer, incr * orun);
 939                 if (hrtimer_get_expires_tv64(timer) > now)
 940                         return orun;
 941                 /*
 942                  * This (and the ktime_add() below) is the
 943                  * correction for exact:
 944                  */
 945                 orun++;
 946         }
 947         hrtimer_add_expires(timer, interval);
 948 
 949         return orun;
 950 }
 951 EXPORT_SYMBOL_GPL(hrtimer_forward);
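/*
 * Editor's sketch (illustration only): the typical use of hrtimer_forward()
 * from a periodic timer callback, here via the hrtimer_forward_now() helper
 * which forwards relative to the current time of the timer's base. The
 * example_* name and the 100 ms period are hypothetical.
 */
static enum hrtimer_restart example_periodic_cb(struct hrtimer *timer)
{
	/* Push the expiry past "now" in 100 ms steps, then ask to be requeued. */
	hrtimer_forward_now(timer, ms_to_ktime(100));
	return HRTIMER_RESTART;
}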
 952 
 953 /*
 954  * enqueue_hrtimer - internal function to (re)start a timer
 955  *
 956  * The timer is inserted in expiry order. Insertion into the
 957  * red black tree is O(log(n)). Must hold the base lock.
 958  *
 959  * Returns 1 when the new timer is the leftmost timer in the tree.
 960  */
 961 static int enqueue_hrtimer(struct hrtimer *timer,
 962                            struct hrtimer_clock_base *base,
 963                            enum hrtimer_mode mode)
 964 {
 965         debug_activate(timer, mode);
 966 
 967         base->cpu_base->active_bases |= 1 << base->index;
 968 
 969         /* Pairs with the lockless read in hrtimer_is_queued() */
 970         WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
 971 
 972         return timerqueue_add(&base->active, &timer->node);
 973 }
 974 
 975 /*
 976  * __remove_hrtimer - internal function to remove a timer
 977  *
 978  * Caller must hold the base lock.
 979  *
 980  * High resolution timer mode reprograms the clock event device when the
 981  * timer is the one which expires next. The caller can disable this by setting
 982  * reprogram to zero. This is useful, when the context does a reprogramming
 983  * anyway (e.g. timer interrupt)
 984  */
 985 static void __remove_hrtimer(struct hrtimer *timer,
 986                              struct hrtimer_clock_base *base,
 987                              u8 newstate, int reprogram)
 988 {
 989         struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 990         u8 state = timer->state;
 991 
 992         /* Pairs with the lockless read in hrtimer_is_queued() */
 993         WRITE_ONCE(timer->state, newstate);
 994         if (!(state & HRTIMER_STATE_ENQUEUED))
 995                 return;
 996 
 997         if (!timerqueue_del(&base->active, &timer->node))
 998                 cpu_base->active_bases &= ~(1 << base->index);
 999 
1000         /*
1001          * Note: If reprogram is false we do not update
1002          * cpu_base->next_timer. This happens when we remove the first
1003          * timer on a remote cpu. No harm as we never dereference
1004          * cpu_base->next_timer. So the worst thing that can happen is
1005          * a superfluous call to hrtimer_force_reprogram() on the
1006          * remote cpu later on if the same timer gets enqueued again.
1007          */
1008         if (reprogram && timer == cpu_base->next_timer)
1009                 hrtimer_force_reprogram(cpu_base, 1);
1010 }
1011 
1012 /*
1013  * remove hrtimer, called with base lock held
1014  */
1015 static inline int
1016 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
1017 {
1018         u8 state = timer->state;
1019 
1020         if (state & HRTIMER_STATE_ENQUEUED) {
1021                 int reprogram;
1022 
1023                 /*
1024                  * Remove the timer and force reprogramming when high
1025                  * resolution mode is active and the timer is on the current
1026                  * CPU. If we remove a timer on another CPU, reprogramming is
1027                  * skipped. The interrupt event on this CPU is fired and
1028                  * reprogramming happens in the interrupt handler. This is a
1029                  * rare case and less expensive than a smp call.
1030                  */
1031                 debug_deactivate(timer);
1032                 reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
1033 
1034                 if (!restart)
1035                         state = HRTIMER_STATE_INACTIVE;
1036 
1037                 __remove_hrtimer(timer, base, state, reprogram);
1038                 return 1;
1039         }
1040         return 0;
1041 }
1042 
1043 static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
1044                                             const enum hrtimer_mode mode)
1045 {
1046 #ifdef CONFIG_TIME_LOW_RES
1047         /*
1048          * CONFIG_TIME_LOW_RES indicates that the system has no way to return
1049          * granular time values. For relative timers we add hrtimer_resolution
1050          * (i.e. one jiffy) to prevent short timeouts.
1051          */
1052         timer->is_rel = mode & HRTIMER_MODE_REL;
1053         if (timer->is_rel)
1054                 tim = ktime_add_safe(tim, hrtimer_resolution);
1055 #endif
1056         return tim;
1057 }
1058 
1059 static void
1060 hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
1061 {
1062         ktime_t expires;
1063 
1064         /*
1065          * Find the next SOFT expiration.
1066          */
1067         expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
1068 
1069         /*
1070          * Reprogramming needs to be triggered, even if the next soft
1071          * hrtimer expires at the same time as the next hard
1072          * hrtimer. cpu_base->softirq_expires_next needs to be updated!
1073          */
1074         if (expires == KTIME_MAX)
1075                 return;
1076 
1077         /*
1078          * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
1079          * cpu_base->*expires_next is only set by hrtimer_reprogram()
1080          */
1081         hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
1082 }
1083 
1084 static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1085                                     u64 delta_ns, const enum hrtimer_mode mode,
1086                                     struct hrtimer_clock_base *base)
1087 {
1088         struct hrtimer_clock_base *new_base;
1089 
1090         /* Remove an active timer from the queue: */
1091         remove_hrtimer(timer, base, true);
1092 
1093         if (mode & HRTIMER_MODE_REL)
1094                 tim = ktime_add_safe(tim, base->get_time());
1095 
1096         tim = hrtimer_update_lowres(timer, tim, mode);
1097 
1098         hrtimer_set_expires_range_ns(timer, tim, delta_ns);
1099 
1100         /* Switch the timer base, if necessary: */
1101         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
1102 
1103         return enqueue_hrtimer(timer, new_base, mode);
1104 }
1105 
1106 /**
1107  * hrtimer_start_range_ns - (re)start an hrtimer
1108  * @timer:      the timer to be added
1109  * @tim:        expiry time
1110  * @delta_ns:   "slack" range for the timer
1111  * @mode:       timer mode: absolute (HRTIMER_MODE_ABS) or
1112  *              relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
1113  *              softirq based mode is considered for debug purpose only!
1114  */
1115 void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1116                             u64 delta_ns, const enum hrtimer_mode mode)
1117 {
1118         struct hrtimer_clock_base *base;
1119         unsigned long flags;
1120 
1121         /*
1122          * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
1123          * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
1124          * expiry mode because unmarked timers are moved to softirq expiry.
1125          */
1126         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
1127                 WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
1128         else
1129                 WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
1130 
1131         base = lock_hrtimer_base(timer, &flags);
1132 
1133         if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
1134                 hrtimer_reprogram(timer, true);
1135 
1136         unlock_hrtimer_base(timer, &flags);
1137 }
1138 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
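/*
 * Editor's sketch (illustration only): arming a timer 100 ms from now while
 * granting 10 ms of slack, so the core may coalesce the expiry with other
 * timers in that window. The timer is assumed to have been initialized with
 * hrtimer_init() and to have a callback assigned; example_* is hypothetical.
 */
static void example_arm_with_slack(struct hrtimer *timer)
{
	hrtimer_start_range_ns(timer, ms_to_ktime(100),
			       10 * NSEC_PER_MSEC, HRTIMER_MODE_REL);
}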
1139 
1140 /**
1141  * hrtimer_try_to_cancel - try to deactivate a timer
1142  * @timer:      hrtimer to stop
1143  *
1144  * Returns:
1145  *
1146  *  *  0 when the timer was not active
1147  *  *  1 when the timer was active
1148  *  * -1 when the timer is currently executing the callback function and
1149  *    cannot be stopped
1150  */
1151 int hrtimer_try_to_cancel(struct hrtimer *timer)
1152 {
1153         struct hrtimer_clock_base *base;
1154         unsigned long flags;
1155         int ret = -1;
1156 
1157         /*
1158          * Check lockless first. If the timer is not active (neither
1159          * enqueued nor running the callback), nothing to do here.  The
1160          * base lock does not serialize against a concurrent enqueue,
1161          * so we can avoid taking it.
1162          */
1163         if (!hrtimer_active(timer))
1164                 return 0;
1165 
1166         base = lock_hrtimer_base(timer, &flags);
1167 
1168         if (!hrtimer_callback_running(timer))
1169                 ret = remove_hrtimer(timer, base, false);
1170 
1171         unlock_hrtimer_base(timer, &flags);
1172 
1173         return ret;
1174 
1175 }
1176 EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
1177 
1178 #ifdef CONFIG_PREEMPT_RT
1179 static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
1180 {
1181         spin_lock_init(&base->softirq_expiry_lock);
1182 }
1183 
1184 static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
1185 {
1186         spin_lock(&base->softirq_expiry_lock);
1187 }
1188 
1189 static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
1190 {
1191         spin_unlock(&base->softirq_expiry_lock);
1192 }
1193 
1194 /*
1195  * The counterpart to hrtimer_cancel_wait_running().
1196  *
1197  * If there is a waiter for cpu_base->expiry_lock, then it was waiting for
1198  * the timer callback to finish. Drop expiry_lock and reacquire it. That
1199  * allows the waiter to acquire the lock and make progress.
1200  */
1201 static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
1202                                       unsigned long flags)
1203 {
1204         if (atomic_read(&cpu_base->timer_waiters)) {
1205                 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1206                 spin_unlock(&cpu_base->softirq_expiry_lock);
1207                 spin_lock(&cpu_base->softirq_expiry_lock);
1208                 raw_spin_lock_irq(&cpu_base->lock);
1209         }
1210 }
1211 
1212 /*
1213  * This function is called on PREEMPT_RT kernels when the fast path
1214  * deletion of a timer failed because the timer callback function was
1215  * running.
1216  *
1217  * This prevents priority inversion: if the soft irq thread is preempted
1218  * in the middle of a timer callback, then calling del_timer_sync() can
1219  * lead to two issues:
1220  *
1221  *  - If the caller is on a remote CPU then it has to spin wait for the timer
1222  *    handler to complete. This can result in unbounded priority inversion.
1223  *
1224  *  - If the caller originates from the task which preempted the timer
1225  *    handler on the same CPU, then spin waiting for the timer handler to
1226  *    complete is never going to end.
1227  */
1228 void hrtimer_cancel_wait_running(const struct hrtimer *timer)
1229 {
1230         /* Lockless read. Prevent the compiler from reloading it below */
1231         struct hrtimer_clock_base *base = READ_ONCE(timer->base);
1232 
1233         /*
1234          * Just relax if the timer expires in hard interrupt context or if
1235          * it is currently on the migration base.
1236          */
1237         if (!timer->is_soft || is_migration_base(base)) {
1238                 cpu_relax();
1239                 return;
1240         }
1241 
1242         /*
1243          * Mark the base as contended and grab the expiry lock, which is
1244          * held by the softirq across the timer callback. Drop the lock
1245          * immediately so the softirq can expire the next timer. In theory
1246          * the timer could already be running again, but that's more than
1247          * unlikely and just causes another wait loop.
1248          */
1249         atomic_inc(&base->cpu_base->timer_waiters);
1250         spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
1251         atomic_dec(&base->cpu_base->timer_waiters);
1252         spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
1253 }
1254 #else
1255 static inline void
1256 hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
1257 static inline void
1258 hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
1259 static inline void
1260 hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
1261 static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
1262                                              unsigned long flags) { }
1263 #endif
1264 
1265 /**
1266  * hrtimer_cancel - cancel a timer and wait for the handler to finish.
1267  * @timer:      the timer to be cancelled
1268  *
1269  * Returns:
1270  *  0 when the timer was not active
1271  *  1 when the timer was active
1272  */
1273 int hrtimer_cancel(struct hrtimer *timer)
1274 {
1275         int ret;
1276 
1277         do {
1278                 ret = hrtimer_try_to_cancel(timer);
1279 
1280                 if (ret < 0)
1281                         hrtimer_cancel_wait_running(timer);
1282         } while (ret < 0);
1283         return ret;
1284 }
1285 EXPORT_SYMBOL_GPL(hrtimer_cancel);
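/*
 * Editor's sketch (illustration only): typical teardown ordering. Cancel the
 * timer and wait for a running callback to finish *before* tearing down the
 * data the callback uses. "struct example_dev" and its members are
 * hypothetical.
 */
struct example_dev {
	struct hrtimer poll_timer;
	void *state;
};

static void example_dev_teardown(struct example_dev *dev)
{
	/* After this returns the callback is neither queued nor running. */
	hrtimer_cancel(&dev->poll_timer);
	/* Now it is safe to free or reuse dev->state. */
}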
1286 
1287 /**
1288  * hrtimer_get_remaining - get remaining time for the timer
1289  * @timer:      the timer to read
1290  * @adjust:     adjust relative timers when CONFIG_TIME_LOW_RES=y
1291  */
1292 ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
1293 {
1294         unsigned long flags;
1295         ktime_t rem;
1296 
1297         lock_hrtimer_base(timer, &flags);
1298         if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
1299                 rem = hrtimer_expires_remaining_adjusted(timer);
1300         else
1301                 rem = hrtimer_expires_remaining(timer);
1302         unlock_hrtimer_base(timer, &flags);
1303 
1304         return rem;
1305 }
1306 EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
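/*
 * Editor's sketch (illustration only): hrtimer_get_remaining() is the public
 * wrapper around this helper. A caller might use it to decide whether an
 * already armed timer expires soon enough that rearming it is pointless;
 * example_* is hypothetical.
 */
static bool example_expires_within(struct hrtimer *timer, ktime_t window)
{
	return ktime_compare(hrtimer_get_remaining(timer), window) <= 0;
}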
1307 
1308 #ifdef CONFIG_NO_HZ_COMMON
1309 /**
1310  * hrtimer_get_next_event - get the time until next expiry event
1311  *
1312  * Returns the next expiry time or KTIME_MAX if no timer is pending.
1313  */
1314 u64 hrtimer_get_next_event(void)
1315 {
1316         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1317         u64 expires = KTIME_MAX;
1318         unsigned long flags;
1319 
1320         raw_spin_lock_irqsave(&cpu_base->lock, flags);
1321 
1322         if (!__hrtimer_hres_active(cpu_base))
1323                 expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
1324 
1325         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1326 
1327         return expires;
1328 }
1329 
1330 /**
1331  * hrtimer_next_event_without - time until next expiry event w/o one timer
1332  * @exclude:    timer to exclude
1333  *
1334  * Returns the next expiry time over all timers except for the @exclude one or
1335  * KTIME_MAX if none of them is pending.
1336  */
1337 u64 hrtimer_next_event_without(const struct hrtimer *exclude)
1338 {
1339         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1340         u64 expires = KTIME_MAX;
1341         unsigned long flags;
1342 
1343         raw_spin_lock_irqsave(&cpu_base->lock, flags);
1344 
1345         if (__hrtimer_hres_active(cpu_base)) {
1346                 unsigned int active;
1347 
1348                 if (!cpu_base->softirq_activated) {
1349                         active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
1350                         expires = __hrtimer_next_event_base(cpu_base, exclude,
1351                                                             active, KTIME_MAX);
1352                 }
1353                 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
1354                 expires = __hrtimer_next_event_base(cpu_base, exclude, active,
1355                                                     expires);
1356         }
1357 
1358         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1359 
1360         return expires;
1361 }
1362 #endif
1363 
1364 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
1365 {
1366         if (likely(clock_id < MAX_CLOCKS)) {
1367                 int base = hrtimer_clock_to_base_table[clock_id];
1368 
1369                 if (likely(base != HRTIMER_MAX_CLOCK_BASES))
1370                         return base;
1371         }
1372         WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
1373         return HRTIMER_BASE_MONOTONIC;
1374 }
1375 
1376 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1377                            enum hrtimer_mode mode)
1378 {
1379         bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
1380         struct hrtimer_cpu_base *cpu_base;
1381         int base;
1382 
1383         /*
1384          * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
1385          * marked for hard interrupt expiry mode are moved into soft
1386          * interrupt context for latency reasons and because the callbacks
1387          * can invoke functions which might sleep on RT, e.g. spin_lock().
1388          */
1389         if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
1390                 softtimer = true;
1391 
1392         memset(timer, 0, sizeof(struct hrtimer));
1393 
1394         cpu_base = raw_cpu_ptr(&hrtimer_bases);
1395 
1396         /*
1397          * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
1398          * clock modifications, so they need to become CLOCK_MONOTONIC to
1399          * ensure POSIX compliance.
1400          */
1401         if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
1402                 clock_id = CLOCK_MONOTONIC;
1403 
1404         base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
1405         base += hrtimer_clockid_to_base(clock_id);
1406         timer->is_soft = softtimer;
1407         timer->is_hard = !softtimer;
1408         timer->base = &cpu_base->clock_base[base];
1409         timerqueue_init(&timer->node);
1410 }
1411 
1412 /**
1413  * hrtimer_init - initialize a timer to the given clock
1414  * @timer:      the timer to be initialized
1415  * @clock_id:   the clock to be used
1416  * @mode:       The modes which are relevant for initialization:
1417  *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
1418  *              HRTIMER_MODE_REL_SOFT
1419  *
1420  *              The PINNED variants of the above can be handed in,
1421  *              but the PINNED bit is ignored as pinning happens
1422  *              when the hrtimer is started
1423  */
1424 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1425                   enum hrtimer_mode mode)
1426 {
1427         debug_init(timer, clock_id, mode);
1428         __hrtimer_init(timer, clock_id, mode);
1429 }
1430 EXPORT_SYMBOL_GPL(hrtimer_init);
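
/*
 * Editorial example, not part of the kernel source: a minimal sketch of how
 * a driver typically pairs hrtimer_init() with hrtimer_start(). The names
 * my_timer, my_timer_fn, my_timer_setup, my_timer_teardown and MY_PERIOD_NS
 * are invented for illustration; hrtimer_init(), hrtimer_start(),
 * hrtimer_forward_now() and hrtimer_cancel() are the real APIs.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

#define MY_PERIOD_NS	(100 * NSEC_PER_MSEC)	/* hypothetical 100ms period */

static struct hrtimer my_timer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* Do the periodic work, then push the expiry one period forward. */
	hrtimer_forward_now(timer, ns_to_ktime(MY_PERIOD_NS));
	return HRTIMER_RESTART;
}

static void my_timer_setup(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	hrtimer_start(&my_timer, ns_to_ktime(MY_PERIOD_NS), HRTIMER_MODE_REL);
}

static void my_timer_teardown(void)
{
	/* hrtimer_cancel() also waits for a running callback to finish. */
	hrtimer_cancel(&my_timer);
}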
1431 
1432 /*
1433  * A timer is active when it is enqueued into the rbtree, when its
1434  * callback function is running, or when it is in the state of being
1435  * migrated to another CPU.
1436  *
1437  * It is important for this function to not return a false negative.
1438  */
1439 bool hrtimer_active(const struct hrtimer *timer)
1440 {
1441         struct hrtimer_clock_base *base;
1442         unsigned int seq;
1443 
1444         do {
1445                 base = READ_ONCE(timer->base);
1446                 seq = raw_read_seqcount_begin(&base->seq);
1447 
1448                 if (timer->state != HRTIMER_STATE_INACTIVE ||
1449                     base->running == timer)
1450                         return true;
1451 
1452         } while (read_seqcount_retry(&base->seq, seq) ||
1453                  base != READ_ONCE(timer->base));
1454 
1455         return false;
1456 }
1457 EXPORT_SYMBOL_GPL(hrtimer_active);
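
/*
 * Editorial example, not part of the kernel source: a common caller-side
 * pattern built on the guarantee above. Because hrtimer_active() never
 * returns a false negative, it can be used to skip re-arming a timer that
 * is already queued or whose callback is currently running. my_maybe_arm()
 * is an invented name; note that hrtimer_start() would also handle a
 * re-arm correctly, so this is purely an optimization sketch.
 */
static void my_maybe_arm(struct hrtimer *timer, u64 delay_ns)
{
	if (hrtimer_active(timer))
		return;		/* queued, running or being migrated */

	hrtimer_start(timer, ns_to_ktime(delay_ns), HRTIMER_MODE_REL);
}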
1458 
1459 /*
1460  * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
1461  * distinct sections:
1462  *
1463  *  - queued:   the timer is queued
1464  *  - callback: the timer is being run
1465  *  - post:     the timer is inactive or (re)queued
1466  *
1467  * On the read side we ensure we observe timer->state and cpu_base->running
1468  * from the same section, if anything changed while we looked at it, we retry.
1469  * This includes timer->base changing because sequence numbers alone are
1470  * insufficient for that.
1471  *
1472  * The sequence numbers are required because otherwise we could still observe
1473  * a false negative if the read side got smeared over multiple consecutive
1474  * __run_hrtimer() invocations.
1475  */
1476 
1477 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
1478                           struct hrtimer_clock_base *base,
1479                           struct hrtimer *timer, ktime_t *now,
1480                           unsigned long flags)
1481 {
1482         enum hrtimer_restart (*fn)(struct hrtimer *);
1483         int restart;
1484 
1485         lockdep_assert_held(&cpu_base->lock);
1486 
1487         debug_deactivate(timer);
1488         base->running = timer;
1489 
1490         /*
1491          * Separate the ->running assignment from the ->state assignment.
1492          *
1493          * As with a regular write barrier, this ensures the read side in
1494          * hrtimer_active() cannot observe base->running == NULL &&
1495          * timer->state == INACTIVE.
1496          */
1497         raw_write_seqcount_barrier(&base->seq);
1498 
1499         __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
1500         fn = timer->function;
1501 
1502         /*
1503          * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
1504          * timer is restarted with a period then it becomes an absolute
1505          * timer. If it's not restarted, it does not matter.
1506          */
1507         if (IS_ENABLED(CONFIG_TIME_LOW_RES))
1508                 timer->is_rel = false;
1509 
1510         /*
1511          * The timer is marked as running in the CPU base, so it is
1512          * protected against migration to a different CPU even if the lock
1513          * is dropped.
1514          */
1515         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1516         trace_hrtimer_expire_entry(timer, now);
1517         restart = fn(timer);
1518         trace_hrtimer_expire_exit(timer);
1519         raw_spin_lock_irq(&cpu_base->lock);
1520 
1521         /*
1522          * Note: We clear the running state after enqueue_hrtimer and
1523          * we do not reprogram the event hardware. Reprogramming happens
1524          * either in hrtimer_start_range_ns() or in hrtimer_interrupt().
1525          *
1526          * Note: Because we dropped the cpu_base->lock above,
1527          * hrtimer_start_range_ns() can have popped in and enqueued the timer
1528          * for us already.
1529          */
1530         if (restart != HRTIMER_NORESTART &&
1531             !(timer->state & HRTIMER_STATE_ENQUEUED))
1532                 enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
1533 
1534         /*
1535          * Separate the ->running assignment from the ->state assignment.
1536          *
1537          * As with a regular write barrier, this ensures the read side in
1538          * hrtimer_active() cannot observe base->running == NULL &&
1539          * timer->state == INACTIVE.
1540          */
1541         raw_write_seqcount_barrier(&base->seq);
1542 
1543         WARN_ON_ONCE(base->running != timer);
1544         base->running = NULL;
1545 }
1546 
1547 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
1548                                  unsigned long flags, unsigned int active_mask)
1549 {
1550         struct hrtimer_clock_base *base;
1551         unsigned int active = cpu_base->active_bases & active_mask;
1552 
1553         for_each_active_base(base, cpu_base, active) {
1554                 struct timerqueue_node *node;
1555                 ktime_t basenow;
1556 
1557                 basenow = ktime_add(now, base->offset);
1558 
1559                 while ((node = timerqueue_getnext(&base->active))) {
1560                         struct hrtimer *timer;
1561 
1562                         timer = container_of(node, struct hrtimer, node);
1563 
1564                         /*
1565                          * The immediate goal for using the softexpires is
1566                          * minimizing wakeups, not running timers at the
1567                          * earliest interrupt after their soft expiration.
1568                          * This allows us to avoid using a Priority Search
1569                          * Tree, which can answer a stabbing query for
1570                          * overlapping intervals and instead use the simple
1571                          * BST we already have.
1572                          * We don't add extra wakeups by delaying timers that
1573                          * are to the right of a not yet expired timer, because that
1574                          * timer will have to trigger a wakeup anyway.
1575                          */
1576                         if (basenow < hrtimer_get_softexpires_tv64(timer))
1577                                 break;
1578 
1579                         __run_hrtimer(cpu_base, base, timer, &basenow, flags);
1580                         if (active_mask == HRTIMER_ACTIVE_SOFT)
1581                                 hrtimer_sync_wait_running(cpu_base, flags);
1582                 }
1583         }
1584 }
1585 
1586 static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
1587 {
1588         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1589         unsigned long flags;
1590         ktime_t now;
1591 
1592         hrtimer_cpu_base_lock_expiry(cpu_base);
1593         raw_spin_lock_irqsave(&cpu_base->lock, flags);
1594 
1595         now = hrtimer_update_base(cpu_base);
1596         __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
1597 
1598         cpu_base->softirq_activated = 0;
1599         hrtimer_update_softirq_timer(cpu_base, true);
1600 
1601         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1602         hrtimer_cpu_base_unlock_expiry(cpu_base);
1603 }
1604 
1605 #ifdef CONFIG_HIGH_RES_TIMERS
1606 
1607 /*
1608  * High resolution timer interrupt
1609  * Called with interrupts disabled
1610  */
1611 void hrtimer_interrupt(struct clock_event_device *dev)
1612 {
1613         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1614         ktime_t expires_next, now, entry_time, delta;
1615         unsigned long flags;
1616         int retries = 0;
1617 
1618         BUG_ON(!cpu_base->hres_active);
1619         cpu_base->nr_events++;
1620         dev->next_event = KTIME_MAX;
1621 
1622         raw_spin_lock_irqsave(&cpu_base->lock, flags);
1623         entry_time = now = hrtimer_update_base(cpu_base);
1624 retry:
1625         cpu_base->in_hrtirq = 1;
1626         /*
1627          * We set expires_next to KTIME_MAX here with cpu_base->lock
1628          * held to prevent a timer from being enqueued in our queue via
1629          * the migration code. This does not affect enqueueing of
1630          * timers which run their callback and need to be requeued on
1631          * this CPU.
1632          */
1633         cpu_base->expires_next = KTIME_MAX;
1634 
1635         if (!ktime_before(now, cpu_base->softirq_expires_next)) {
1636                 cpu_base->softirq_expires_next = KTIME_MAX;
1637                 cpu_base->softirq_activated = 1;
1638                 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1639         }
1640 
1641         __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
1642 
1643         /* Reevaluate the clock bases for the next expiry */
1644         expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
1645         /*
1646          * Store the new expiry value so the migration code can verify
1647          * against it.
1648          */
1649         cpu_base->expires_next = expires_next;
1650         cpu_base->in_hrtirq = 0;
1651         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1652 
1653         /* Reprogramming necessary ? */
1654         if (!tick_program_event(expires_next, 0)) {
1655                 cpu_base->hang_detected = 0;
1656                 return;
1657         }
1658 
1659         /*
1660          * The next timer was already expired due to:
1661          * - tracing
1662          * - long lasting callbacks
1663          * - being scheduled away when running in a VM
1664          *
1665          * We need to prevent looping forever in the hrtimer
1666          * interrupt routine. We give it 3 attempts to avoid
1667          * overreacting to some spurious event.
1668          *
1669          * Acquire base lock for updating the offsets and retrieving
1670          * the current time.
1671          */
1672         raw_spin_lock_irqsave(&cpu_base->lock, flags);
1673         now = hrtimer_update_base(cpu_base);
1674         cpu_base->nr_retries++;
1675         if (++retries < 3)
1676                 goto retry;
1677         /*
1678          * Give the system a chance to do something other than looping
1679          * here. We stored the entry time, so we know exactly how long
1680          * we spent here. We schedule the next event this amount of
1681          * time away.
1682          */
1683         cpu_base->nr_hangs++;
1684         cpu_base->hang_detected = 1;
1685         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1686 
1687         delta = ktime_sub(now, entry_time);
1688         if ((unsigned int)delta > cpu_base->max_hang_time)
1689                 cpu_base->max_hang_time = (unsigned int) delta;
1690         /*
1691          * Limit it to a sensible value as we enforce a longer
1692          * delay. Give the CPU at least 100ms to catch up.
1693          */
1694         if (delta > 100 * NSEC_PER_MSEC)
1695                 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1696         else
1697                 expires_next = ktime_add(now, delta);
1698         tick_program_event(expires_next, 1);
1699         pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
1700 }
1701 
1702 /* called with interrupts disabled */
1703 static inline void __hrtimer_peek_ahead_timers(void)
1704 {
1705         struct tick_device *td;
1706 
1707         if (!hrtimer_hres_active())
1708                 return;
1709 
1710         td = this_cpu_ptr(&tick_cpu_device);
1711         if (td && td->evtdev)
1712                 hrtimer_interrupt(td->evtdev);
1713 }
1714 
1715 #else /* CONFIG_HIGH_RES_TIMERS */
1716 
1717 static inline void __hrtimer_peek_ahead_timers(void) { }
1718 
1719 #endif  /* !CONFIG_HIGH_RES_TIMERS */
1720 
1721 /*
1722  * Called from run_local_timers in hardirq context every jiffy
1723  */
1724 void hrtimer_run_queues(void)
1725 {
1726         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1727         unsigned long flags;
1728         ktime_t now;
1729 
1730         if (__hrtimer_hres_active(cpu_base))
1731                 return;
1732 
1733         /*
1734          * This _is_ ugly: We have to check periodically whether we
1735          * can switch to highres and / or nohz mode. The clocksource
1736          * switch happens with xtime_lock held. Notification from
1737          * there only sets the check bit in the tick_oneshot code,
1738          * otherwise we might deadlock vs. xtime_lock.
1739          */
1740         if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
1741                 hrtimer_switch_to_hres();
1742                 return;
1743         }
1744 
1745         raw_spin_lock_irqsave(&cpu_base->lock, flags);
1746         now = hrtimer_update_base(cpu_base);
1747 
1748         if (!ktime_before(now, cpu_base->softirq_expires_next)) {
1749                 cpu_base->softirq_expires_next = KTIME_MAX;
1750                 cpu_base->softirq_activated = 1;
1751                 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1752         }
1753 
1754         __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
1755         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1756 }
1757 
1758 /*
1759  * Sleep related functions:
1760  */
1761 static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
1762 {
1763         struct hrtimer_sleeper *t =
1764                 container_of(timer, struct hrtimer_sleeper, timer);
1765         struct task_struct *task = t->task;
1766 
1767         t->task = NULL;
1768         if (task)
1769                 wake_up_process(task);
1770 
1771         return HRTIMER_NORESTART;
1772 }
1773 
1774 /**
1775  * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
1776  * @sl:         sleeper to be started
1777  * @mode:       timer mode abs/rel
1778  *
1779  * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
1780  * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
1781  */
1782 void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
1783                                    enum hrtimer_mode mode)
1784 {
1785         /*
1786          * Make the enqueue delivery mode check work on RT. If the sleeper
1787          * was initialized for hard interrupt delivery, force the mode bit.
1788          * This is a special case for hrtimer_sleepers because
1789          * hrtimer_init_sleeper() determines the delivery mode on RT so the
1790          * hrtimer_init_sleeper() determines the delivery mode on RT, which
1791          * avoids fiddling with this decision at the call sites.
1792         if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
1793                 mode |= HRTIMER_MODE_HARD;
1794 
1795         hrtimer_start_expires(&sl->timer, mode);
1796 }
1797 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
1798 
1799 static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
1800                                    clockid_t clock_id, enum hrtimer_mode mode)
1801 {
1802         /*
1803          * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
1804          * marked for hard interrupt expiry mode are moved into soft
1805          * interrupt context either for latency reasons or because the
1806          * hrtimer callback takes regular spinlocks or invokes other
1807          * functions which are not suitable for hard interrupt context on
1808          * PREEMPT_RT.
1809          *
1810          * The hrtimer_sleeper callback is RT compatible in hard interrupt
1811          * context, but there is a latency concern: Untrusted userspace can
1812          * spawn many threads which arm timers for the same expiry time on
1813          * the same CPU. That causes a latency spike due to the wakeup of
1814          * a gazillion threads.
1815          *
1816          * OTOH, privileged real-time user space applications rely on the
1817          * low latency of hard interrupt wakeups. If the current task is in
1818          * a real-time scheduling class, mark the mode for hard interrupt
1819          * expiry.
1820          */
1821         if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
1822                 if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
1823                         mode |= HRTIMER_MODE_HARD;
1824         }
1825 
1826         __hrtimer_init(&sl->timer, clock_id, mode);
1827         sl->timer.function = hrtimer_wakeup;
1828         sl->task = current;
1829 }
1830 
1831 /**
1832  * hrtimer_init_sleeper - initialize sleeper to the given clock
1833  * @sl:         sleeper to be initialized
1834  * @clock_id:   the clock to be used
1835  * @mode:       timer mode abs/rel
1836  */
1837 void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
1838                           enum hrtimer_mode mode)
1839 {
1840         debug_init(&sl->timer, clock_id, mode);
1841         __hrtimer_init_sleeper(sl, clock_id, mode);
1842 
1843 }
1844 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
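
/*
 * Editorial example, not part of the kernel source: the usual sleeper
 * pattern, modelled on do_nanosleep() below. my_sleep_ns() is an invented
 * helper; hrtimer_init_sleeper(), hrtimer_set_expires(),
 * hrtimer_sleeper_start_expires() and hrtimer_cancel() are the real APIs.
 * On-stack users with debugobjects should prefer
 * hrtimer_init_sleeper_on_stack() plus destroy_hrtimer_on_stack().
 */
static int my_sleep_ns(u64 delay_ns)
{
	struct hrtimer_sleeper sl;

	hrtimer_init_sleeper(&sl, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_set_expires(&sl.timer, ns_to_ktime(delay_ns));

	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&sl, HRTIMER_MODE_REL);

	if (likely(sl.task))
		schedule();

	hrtimer_cancel(&sl.timer);
	__set_current_state(TASK_RUNNING);

	/* hrtimer_wakeup() clears sl.task when the timer actually fired. */
	return sl.task ? -EINTR : 0;
}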
1845 
1846 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
1847 {
1848         switch (restart->nanosleep.type) {
1849 #ifdef CONFIG_COMPAT_32BIT_TIME
1850         case TT_COMPAT:
1851                 if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
1852                         return -EFAULT;
1853                 break;
1854 #endif
1855         case TT_NATIVE:
1856                 if (put_timespec64(ts, restart->nanosleep.rmtp))
1857                         return -EFAULT;
1858                 break;
1859         default:
1860                 BUG();
1861         }
1862         return -ERESTART_RESTARTBLOCK;
1863 }
1864 
1865 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1866 {
1867         struct restart_block *restart;
1868 
1869         do {
1870                 set_current_state(TASK_INTERRUPTIBLE);
1871                 hrtimer_sleeper_start_expires(t, mode);
1872 
1873                 if (likely(t->task))
1874                         freezable_schedule();
1875 
1876                 hrtimer_cancel(&t->timer);
1877                 mode = HRTIMER_MODE_ABS;
1878 
1879         } while (t->task && !signal_pending(current));
1880 
1881         __set_current_state(TASK_RUNNING);
1882 
1883         if (!t->task)
1884                 return 0;
1885 
1886         restart = &current->restart_block;
1887         if (restart->nanosleep.type != TT_NONE) {
1888                 ktime_t rem = hrtimer_expires_remaining(&t->timer);
1889                 struct timespec64 rmt;
1890 
1891                 if (rem <= 0)
1892                         return 0;
1893                 rmt = ktime_to_timespec64(rem);
1894 
1895                 return nanosleep_copyout(restart, &rmt);
1896         }
1897         return -ERESTART_RESTARTBLOCK;
1898 }
1899 
1900 static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1901 {
1902         struct hrtimer_sleeper t;
1903         int ret;
1904 
1905         hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
1906                                       HRTIMER_MODE_ABS);
1907         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1908         ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
1909         destroy_hrtimer_on_stack(&t.timer);
1910         return ret;
1911 }
1912 
1913 long hrtimer_nanosleep(const struct timespec64 *rqtp,
1914                        const enum hrtimer_mode mode, const clockid_t clockid)
1915 {
1916         struct restart_block *restart;
1917         struct hrtimer_sleeper t;
1918         int ret = 0;
1919         u64 slack;
1920 
1921         slack = current->timer_slack_ns;
1922         if (dl_task(current) || rt_task(current))
1923                 slack = 0;
1924 
1925         hrtimer_init_sleeper_on_stack(&t, clockid, mode);
1926         hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
1927         ret = do_nanosleep(&t, mode);
1928         if (ret != -ERESTART_RESTARTBLOCK)
1929                 goto out;
1930 
1931         /* Absolute timers do not update the rmtp value and restart: */
1932         if (mode == HRTIMER_MODE_ABS) {
1933                 ret = -ERESTARTNOHAND;
1934                 goto out;
1935         }
1936 
1937         restart = &current->restart_block;
1938         restart->fn = hrtimer_nanosleep_restart;
1939         restart->nanosleep.clockid = t.timer.base->clockid;
1940         restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1941 out:
1942         destroy_hrtimer_on_stack(&t.timer);
1943         return ret;
1944 }
1945 
1946 #if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
1947 
1948 SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
1949                 struct __kernel_timespec __user *, rmtp)
1950 {
1951         struct timespec64 tu;
1952 
1953         if (get_timespec64(&tu, rqtp))
1954                 return -EFAULT;
1955 
1956         if (!timespec64_valid(&tu))
1957                 return -EINVAL;
1958 
1959         current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
1960         current->restart_block.nanosleep.rmtp = rmtp;
1961         return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1962 }
1963 
1964 #endif
1965 
1966 #ifdef CONFIG_COMPAT_32BIT_TIME
1967 
1968 SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
1969                        struct old_timespec32 __user *, rmtp)
1970 {
1971         struct timespec64 tu;
1972 
1973         if (get_old_timespec32(&tu, rqtp))
1974                 return -EFAULT;
1975 
1976         if (!timespec64_valid(&tu))
1977                 return -EINVAL;
1978 
1979         current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
1980         current->restart_block.nanosleep.compat_rmtp = rmtp;
1981         return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1982 }
1983 #endif
1984 
1985 /*
1986  * Functions related to boot-time initialization:
1987  */
1988 int hrtimers_prepare_cpu(unsigned int cpu)
1989 {
1990         struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1991         int i;
1992 
1993         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1994                 cpu_base->clock_base[i].cpu_base = cpu_base;
1995                 timerqueue_init_head(&cpu_base->clock_base[i].active);
1996         }
1997 
1998         cpu_base->cpu = cpu;
1999         cpu_base->active_bases = 0;
2000         cpu_base->hres_active = 0;
2001         cpu_base->hang_detected = 0;
2002         cpu_base->next_timer = NULL;
2003         cpu_base->softirq_next_timer = NULL;
2004         cpu_base->expires_next = KTIME_MAX;
2005         cpu_base->softirq_expires_next = KTIME_MAX;
2006         hrtimer_cpu_base_init_expiry_lock(cpu_base);
2007         return 0;
2008 }
2009 
2010 #ifdef CONFIG_HOTPLUG_CPU
2011 
2012 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
2013                                 struct hrtimer_clock_base *new_base)
2014 {
2015         struct hrtimer *timer;
2016         struct timerqueue_node *node;
2017 
2018         while ((node = timerqueue_getnext(&old_base->active))) {
2019                 timer = container_of(node, struct hrtimer, node);
2020                 BUG_ON(hrtimer_callback_running(timer));
2021                 debug_deactivate(timer);
2022 
2023                 /*
2024                  * Mark it as ENQUEUED, not INACTIVE, otherwise the
2025                  * timer could be seen as !active and just vanish away
2026                  * under us on another CPU.
2027                  */
2028                 __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
2029                 timer->base = new_base;
2030                 /*
2031                  * Enqueue the timers on the new cpu. This does not
2032                  * reprogram the event device in case the timer
2033                  * expires before the earliest on this CPU, but we run
2034                  * hrtimer_interrupt after we migrated everything to
2035                  * sort out already expired timers and reprogram the
2036                  * event device.
2037                  */
2038                 enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
2039         }
2040 }
2041 
2042 int hrtimers_dead_cpu(unsigned int scpu)
2043 {
2044         struct hrtimer_cpu_base *old_base, *new_base;
2045         int i;
2046 
2047         BUG_ON(cpu_online(scpu));
2048         tick_cancel_sched_timer(scpu);
2049 
2050         /*
2051          * this BH disable ensures that raise_softirq_irqoff() does
2052          * not wake up ksoftirqd (and acquire the pi-lock) while
2053          * holding the cpu_base lock.
2054          */
2055         local_bh_disable();
2056         local_irq_disable();
2057         old_base = &per_cpu(hrtimer_bases, scpu);
2058         new_base = this_cpu_ptr(&hrtimer_bases);
2059         /*
2060          * The caller is globally serialized and nobody else
2061          * takes two locks at once, so deadlock is not possible.
2062          */
2063         raw_spin_lock(&new_base->lock);
2064         raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
2065 
2066         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
2067                 migrate_hrtimer_list(&old_base->clock_base[i],
2068                                      &new_base->clock_base[i]);
2069         }
2070 
2071         /*
2072          * The migration might have changed the first expiring softirq
2073          * timer on this CPU. Update it.
2074          */
2075         hrtimer_update_softirq_timer(new_base, false);
2076 
2077         raw_spin_unlock(&old_base->lock);
2078         raw_spin_unlock(&new_base->lock);
2079 
2080         /* Check if we got expired work to do */
2081         __hrtimer_peek_ahead_timers();
2082         local_irq_enable();
2083         local_bh_enable();
2084         return 0;
2085 }
2086 
2087 #endif /* CONFIG_HOTPLUG_CPU */
2088 
2089 void __init hrtimers_init(void)
2090 {
2091         hrtimers_prepare_cpu(smp_processor_id());
2092         open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
2093 }
2094 
2095 /**
2096  * schedule_hrtimeout_range_clock - sleep until timeout
2097  * @expires:    timeout value (ktime_t)
2098  * @delta:      slack in expires timeout (ktime_t)
2099  * @mode:       timer mode
2100  * @clock_id:   timer clock to be used
2101  */
2102 int __sched
2103 schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
2104                                const enum hrtimer_mode mode, clockid_t clock_id)
2105 {
2106         struct hrtimer_sleeper t;
2107 
2108         /*
2109          * Optimize when a zero timeout value is given. It does not
2110          * matter whether this is an absolute or a relative time.
2111          */
2112         if (expires && *expires == 0) {
2113                 __set_current_state(TASK_RUNNING);
2114                 return 0;
2115         }
2116 
2117         /*
2118          * A NULL parameter means "infinite"
2119          */
2120         if (!expires) {
2121                 schedule();
2122                 return -EINTR;
2123         }
2124 
2125         hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
2126         hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
2127         hrtimer_sleeper_start_expires(&t, mode);
2128 
2129         if (likely(t.task))
2130                 schedule();
2131 
2132         hrtimer_cancel(&t.timer);
2133         destroy_hrtimer_on_stack(&t.timer);
2134 
2135         __set_current_state(TASK_RUNNING);
2136 
2137         return !t.task ? 0 : -EINTR;
2138 }
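
/*
 * Editorial example, not part of the kernel source: waiting until an
 * absolute CLOCK_REALTIME deadline. my_wait_until() is an invented name;
 * only schedule_hrtimeout_range_clock() is the real interface. Returns 0
 * when the deadline passed, -EINTR on an earlier wakeup.
 */
static int __sched my_wait_until(ktime_t wall_deadline)
{
	set_current_state(TASK_INTERRUPTIBLE);
	return schedule_hrtimeout_range_clock(&wall_deadline, 0,
					      HRTIMER_MODE_ABS, CLOCK_REALTIME);
}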
2139 
2140 /**
2141  * schedule_hrtimeout_range - sleep until timeout
2142  * @expires:    timeout value (ktime_t)
2143  * @delta:      slack in expires timeout (ktime_t)
2144  * @mode:       timer mode
2145  *
2146  * Make the current task sleep until the given expiry time has
2147  * elapsed. The routine will return immediately unless
2148  * the current task state has been set (see set_current_state()).
2149  *
2150  * The @delta argument gives the kernel the freedom to schedule the
2151  * actual wakeup to a time that is both power and performance friendly.
2152  * The kernel gives the normal best effort behavior for "@expires+@delta",
2153  * and may decide to fire the timer earlier, but no earlier than @expires.
2154  *
2155  * You can set the task state as follows -
2156  *
2157  * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to
2158  * pass before the routine returns unless the current task is explicitly
2159  * woken up, (e.g. by wake_up_process()).
2160  *
2161  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2162  * delivered to the current task or the current task is explicitly woken
2163  * up.
2164  *
2165  * The current task state is guaranteed to be TASK_RUNNING when this
2166  * routine returns.
2167  *
2168  * Returns 0 when the timer has expired. If the task was woken before the
2169  * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
2170  * by an explicit wakeup, it returns -EINTR.
2171  */
2172 int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
2173                                      const enum hrtimer_mode mode)
2174 {
2175         return schedule_hrtimeout_range_clock(expires, delta, mode,
2176                                               CLOCK_MONOTONIC);
2177 }
2178 EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
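
/*
 * Editorial example, not part of the kernel source: sleeping for roughly
 * 10ms while granting the kernel 1ms of slack so the wakeup can be
 * coalesced with other timers. my_wait_10ms() is an invented name.
 */
static int __sched my_wait_10ms(void)
{
	ktime_t to = ms_to_ktime(10);

	set_current_state(TASK_INTERRUPTIBLE);
	return schedule_hrtimeout_range(&to, NSEC_PER_MSEC, HRTIMER_MODE_REL);
}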
2179 
2180 /**
2181  * schedule_hrtimeout - sleep until timeout
2182  * @expires:    timeout value (ktime_t)
2183  * @mode:       timer mode
2184  *
2185  * Make the current task sleep until the given expiry time has
2186  * elapsed. The routine will return immediately unless
2187  * the current task state has been set (see set_current_state()).
2188  *
2189  * You can set the task state as follows -
2190  *
2191  * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to
2192  * pass before the routine returns unless the current task is explicitly
2193  * woken up, (e.g. by wake_up_process()).
2194  *
2195  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2196  * delivered to the current task or the current task is explicitly woken
2197  * up.
2198  *
2199  * The current task state is guaranteed to be TASK_RUNNING when this
2200  * routine returns.
2201  *
2202  * Returns 0 when the timer has expired. If the task was woken before the
2203  * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
2204  * by an explicit wakeup, it returns -EINTR.
2205  */
2206 int __sched schedule_hrtimeout(ktime_t *expires,
2207                                const enum hrtimer_mode mode)
2208 {
2209         return schedule_hrtimeout_range(expires, 0, mode);
2210 }
2211 EXPORT_SYMBOL_GPL(schedule_hrtimeout);
