This source file includes the following definitions.
- setup_psi
- group_init
- psi_init
- test_state
- get_recent_times
- calc_avgs
- collect_percpu_times
- update_averages
- psi_avgs_work
- window_reset
- window_update
- init_triggers
- update_triggers
- psi_schedule_poll_work
- psi_poll_work
- record_times
- psi_group_change
- iterate_groups
- psi_task_change
- psi_memstall_tick
- psi_memstall_enter
- psi_memstall_leave
- psi_cgroup_alloc
- psi_cgroup_free
- cgroup_move_task
- psi_show
- psi_io_show
- psi_memory_show
- psi_cpu_show
- psi_io_open
- psi_memory_open
- psi_cpu_open
- psi_trigger_create
- psi_trigger_destroy
- psi_trigger_replace
- psi_trigger_poll
- psi_write
- psi_io_write
- psi_memory_write
- psi_cpu_write
- psi_fop_poll
- psi_fop_release
- psi_proc_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130 #include "../workqueue_internal.h"
131 #include <linux/sched/loadavg.h>
132 #include <linux/seq_file.h>
133 #include <linux/proc_fs.h>
134 #include <linux/seqlock.h>
135 #include <linux/uaccess.h>
136 #include <linux/cgroup.h>
137 #include <linux/module.h>
138 #include <linux/sched.h>
139 #include <linux/ctype.h>
140 #include <linux/file.h>
141 #include <linux/poll.h>
142 #include <linux/psi.h>
143 #include "sched.h"
144
145 static int psi_bug __read_mostly;
146
147 DEFINE_STATIC_KEY_FALSE(psi_disabled);
148
149 #ifdef CONFIG_PSI_DEFAULT_DISABLED
150 static bool psi_enable;
151 #else
152 static bool psi_enable = true;
153 #endif
154 static int __init setup_psi(char *str)
155 {
156 return kstrtobool(str, &psi_enable) == 0;
157 }
158 __setup("psi=", setup_psi);
159
160
161 #define PSI_FREQ (2*HZ+1)
162 #define EXP_10s 1677
163 #define EXP_60s 1981
164 #define EXP_300s 2034
165
166
167 #define WINDOW_MIN_US 500000
168 #define WINDOW_MAX_US 10000000
169 #define UPDATES_PER_WINDOW 10
170
171
172 static u64 psi_period __read_mostly;
173
174
175 static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
176 struct psi_group psi_system = {
177 .pcpu = &system_group_pcpu,
178 };
179
180 static void psi_avgs_work(struct work_struct *work);
181
182 static void group_init(struct psi_group *group)
183 {
184 int cpu;
185
186 for_each_possible_cpu(cpu)
187 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
188 group->avg_last_update = sched_clock();
189 group->avg_next_update = group->avg_last_update + psi_period;
190 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
191 mutex_init(&group->avgs_lock);
192
193 atomic_set(&group->poll_scheduled, 0);
194 mutex_init(&group->trigger_lock);
195 INIT_LIST_HEAD(&group->triggers);
196 memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
197 group->poll_states = 0;
198 group->poll_min_period = U32_MAX;
199 memset(group->polling_total, 0, sizeof(group->polling_total));
200 group->polling_next_update = ULLONG_MAX;
201 group->polling_until = 0;
202 rcu_assign_pointer(group->poll_kworker, NULL);
203 }
204
205 void __init psi_init(void)
206 {
207 if (!psi_enable) {
208 static_branch_enable(&psi_disabled);
209 return;
210 }
211
212 psi_period = jiffies_to_nsecs(PSI_FREQ);
213 group_init(&psi_system);
214 }
215
216 static bool test_state(unsigned int *tasks, enum psi_states state)
217 {
218 switch (state) {
219 case PSI_IO_SOME:
220 return tasks[NR_IOWAIT];
221 case PSI_IO_FULL:
222 return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
223 case PSI_MEM_SOME:
224 return tasks[NR_MEMSTALL];
225 case PSI_MEM_FULL:
226 return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
227 case PSI_CPU_SOME:
228 return tasks[NR_RUNNING] > 1;
229 case PSI_NONIDLE:
230 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
231 tasks[NR_RUNNING];
232 default:
233 return false;
234 }
235 }
236
237 static void get_recent_times(struct psi_group *group, int cpu,
238 enum psi_aggregators aggregator, u32 *times,
239 u32 *pchanged_states)
240 {
241 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
242 u64 now, state_start;
243 enum psi_states s;
244 unsigned int seq;
245 u32 state_mask;
246
247 *pchanged_states = 0;
248
249
250 do {
251 seq = read_seqcount_begin(&groupc->seq);
252 now = cpu_clock(cpu);
253 memcpy(times, groupc->times, sizeof(groupc->times));
254 state_mask = groupc->state_mask;
255 state_start = groupc->state_start;
256 } while (read_seqcount_retry(&groupc->seq, seq));
257
258
259 for (s = 0; s < NR_PSI_STATES; s++) {
260 u32 delta;
261
262
263
264
265
266
267
268
269
270 if (state_mask & (1 << s))
271 times[s] += now - state_start;
272
273 delta = times[s] - groupc->times_prev[aggregator][s];
274 groupc->times_prev[aggregator][s] = times[s];
275
276 times[s] = delta;
277 if (delta)
278 *pchanged_states |= (1 << s);
279 }
280 }
281
282 static void calc_avgs(unsigned long avg[3], int missed_periods,
283 u64 time, u64 period)
284 {
285 unsigned long pct;
286
287
288 if (missed_periods) {
289 avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
290 avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
291 avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
292 }
293
294
295 pct = div_u64(time * 100, period);
296 pct *= FIXED_1;
297 avg[0] = calc_load(avg[0], EXP_10s, pct);
298 avg[1] = calc_load(avg[1], EXP_60s, pct);
299 avg[2] = calc_load(avg[2], EXP_300s, pct);
300 }
301
302 static void collect_percpu_times(struct psi_group *group,
303 enum psi_aggregators aggregator,
304 u32 *pchanged_states)
305 {
306 u64 deltas[NR_PSI_STATES - 1] = { 0, };
307 unsigned long nonidle_total = 0;
308 u32 changed_states = 0;
309 int cpu;
310 int s;
311
312
313
314
315
316
317
318
319
320 for_each_possible_cpu(cpu) {
321 u32 times[NR_PSI_STATES];
322 u32 nonidle;
323 u32 cpu_changed_states;
324
325 get_recent_times(group, cpu, aggregator, times,
326 &cpu_changed_states);
327 changed_states |= cpu_changed_states;
328
329 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
330 nonidle_total += nonidle;
331
332 for (s = 0; s < PSI_NONIDLE; s++)
333 deltas[s] += (u64)times[s] * nonidle;
334 }
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349 for (s = 0; s < NR_PSI_STATES - 1; s++)
350 group->total[aggregator][s] +=
351 div_u64(deltas[s], max(nonidle_total, 1UL));
352
353 if (pchanged_states)
354 *pchanged_states = changed_states;
355 }
356
357 static u64 update_averages(struct psi_group *group, u64 now)
358 {
359 unsigned long missed_periods = 0;
360 u64 expires, period;
361 u64 avg_next_update;
362 int s;
363
364
365 expires = group->avg_next_update;
366 if (now - expires >= psi_period)
367 missed_periods = div_u64(now - expires, psi_period);
368
369
370
371
372
373
374
375
376 avg_next_update = expires + ((1 + missed_periods) * psi_period);
377 period = now - (group->avg_last_update + (missed_periods * psi_period));
378 group->avg_last_update = now;
379
380 for (s = 0; s < NR_PSI_STATES - 1; s++) {
381 u32 sample;
382
383 sample = group->total[PSI_AVGS][s] - group->avg_total[s];
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401 if (sample > period)
402 sample = period;
403 group->avg_total[s] += sample;
404 calc_avgs(group->avg[s], missed_periods, sample, period);
405 }
406
407 return avg_next_update;
408 }
409
410 static void psi_avgs_work(struct work_struct *work)
411 {
412 struct delayed_work *dwork;
413 struct psi_group *group;
414 u32 changed_states;
415 bool nonidle;
416 u64 now;
417
418 dwork = to_delayed_work(work);
419 group = container_of(dwork, struct psi_group, avgs_work);
420
421 mutex_lock(&group->avgs_lock);
422
423 now = sched_clock();
424
425 collect_percpu_times(group, PSI_AVGS, &changed_states);
426 nonidle = changed_states & (1 << PSI_NONIDLE);
427
428
429
430
431
432
433
434 if (now >= group->avg_next_update)
435 group->avg_next_update = update_averages(group, now);
436
437 if (nonidle) {
438 schedule_delayed_work(dwork, nsecs_to_jiffies(
439 group->avg_next_update - now) + 1);
440 }
441
442 mutex_unlock(&group->avgs_lock);
443 }
444
445
446 static void window_reset(struct psi_window *win, u64 now, u64 value,
447 u64 prev_growth)
448 {
449 win->start_time = now;
450 win->start_value = value;
451 win->prev_growth = prev_growth;
452 }
453
454
455
456
457
458
459
460
461
462
463
464
465 static u64 window_update(struct psi_window *win, u64 now, u64 value)
466 {
467 u64 elapsed;
468 u64 growth;
469
470 elapsed = now - win->start_time;
471 growth = value - win->start_value;
472
473
474
475
476
477
478
479 if (elapsed > win->size)
480 window_reset(win, now, value, growth);
481 else {
482 u32 remaining;
483
484 remaining = win->size - elapsed;
485 growth += div64_u64(win->prev_growth * remaining, win->size);
486 }
487
488 return growth;
489 }
490
491 static void init_triggers(struct psi_group *group, u64 now)
492 {
493 struct psi_trigger *t;
494
495 list_for_each_entry(t, &group->triggers, node)
496 window_reset(&t->win, now,
497 group->total[PSI_POLL][t->state], 0);
498 memcpy(group->polling_total, group->total[PSI_POLL],
499 sizeof(group->polling_total));
500 group->polling_next_update = now + group->poll_min_period;
501 }
502
503 static u64 update_triggers(struct psi_group *group, u64 now)
504 {
505 struct psi_trigger *t;
506 bool new_stall = false;
507 u64 *total = group->total[PSI_POLL];
508
509
510
511
512
513 list_for_each_entry(t, &group->triggers, node) {
514 u64 growth;
515
516
517 if (group->polling_total[t->state] == total[t->state])
518 continue;
519
520
521
522
523
524
525
526 new_stall = true;
527
528
529 growth = window_update(&t->win, now, total[t->state]);
530 if (growth < t->threshold)
531 continue;
532
533
534 if (now < t->last_event_time + t->win.size)
535 continue;
536
537
538 if (cmpxchg(&t->event, 0, 1) == 0)
539 wake_up_interruptible(&t->event_wait);
540 t->last_event_time = now;
541 }
542
543 if (new_stall)
544 memcpy(group->polling_total, total,
545 sizeof(group->polling_total));
546
547 return now + group->poll_min_period;
548 }
549
550
551
552
553
554
555
556 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
557 {
558 struct kthread_worker *kworker;
559
560
561 if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
562 return;
563
564 rcu_read_lock();
565
566 kworker = rcu_dereference(group->poll_kworker);
567
568
569
570
571 if (likely(kworker))
572 kthread_queue_delayed_work(kworker, &group->poll_work, delay);
573 else
574 atomic_set(&group->poll_scheduled, 0);
575
576 rcu_read_unlock();
577 }
578
579 static void psi_poll_work(struct kthread_work *work)
580 {
581 struct kthread_delayed_work *dwork;
582 struct psi_group *group;
583 u32 changed_states;
584 u64 now;
585
586 dwork = container_of(work, struct kthread_delayed_work, work);
587 group = container_of(dwork, struct psi_group, poll_work);
588
589 atomic_set(&group->poll_scheduled, 0);
590
591 mutex_lock(&group->trigger_lock);
592
593 now = sched_clock();
594
595 collect_percpu_times(group, PSI_POLL, &changed_states);
596
597 if (changed_states & group->poll_states) {
598
599 if (now > group->polling_until)
600 init_triggers(group, now);
601
602
603
604
605
606
607 group->polling_until = now +
608 group->poll_min_period * UPDATES_PER_WINDOW;
609 }
610
611 if (now > group->polling_until) {
612 group->polling_next_update = ULLONG_MAX;
613 goto out;
614 }
615
616 if (now >= group->polling_next_update)
617 group->polling_next_update = update_triggers(group, now);
618
619 psi_schedule_poll_work(group,
620 nsecs_to_jiffies(group->polling_next_update - now) + 1);
621
622 out:
623 mutex_unlock(&group->trigger_lock);
624 }
625
626 static void record_times(struct psi_group_cpu *groupc, int cpu,
627 bool memstall_tick)
628 {
629 u32 delta;
630 u64 now;
631
632 now = cpu_clock(cpu);
633 delta = now - groupc->state_start;
634 groupc->state_start = now;
635
636 if (groupc->state_mask & (1 << PSI_IO_SOME)) {
637 groupc->times[PSI_IO_SOME] += delta;
638 if (groupc->state_mask & (1 << PSI_IO_FULL))
639 groupc->times[PSI_IO_FULL] += delta;
640 }
641
642 if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
643 groupc->times[PSI_MEM_SOME] += delta;
644 if (groupc->state_mask & (1 << PSI_MEM_FULL))
645 groupc->times[PSI_MEM_FULL] += delta;
646 else if (memstall_tick) {
647 u32 sample;
648
649
650
651
652
653
654
655
656
657
658
659
660 sample = min(delta, (u32)jiffies_to_nsecs(1));
661 groupc->times[PSI_MEM_FULL] += sample;
662 }
663 }
664
665 if (groupc->state_mask & (1 << PSI_CPU_SOME))
666 groupc->times[PSI_CPU_SOME] += delta;
667
668 if (groupc->state_mask & (1 << PSI_NONIDLE))
669 groupc->times[PSI_NONIDLE] += delta;
670 }
671
672 static u32 psi_group_change(struct psi_group *group, int cpu,
673 unsigned int clear, unsigned int set)
674 {
675 struct psi_group_cpu *groupc;
676 unsigned int t, m;
677 enum psi_states s;
678 u32 state_mask = 0;
679
680 groupc = per_cpu_ptr(group->pcpu, cpu);
681
682
683
684
685
686
687
688
689
690 write_seqcount_begin(&groupc->seq);
691
692 record_times(groupc, cpu, false);
693
694 for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
695 if (!(m & (1 << t)))
696 continue;
697 if (groupc->tasks[t] == 0 && !psi_bug) {
698 printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
699 cpu, t, groupc->tasks[0],
700 groupc->tasks[1], groupc->tasks[2],
701 clear, set);
702 psi_bug = 1;
703 }
704 groupc->tasks[t]--;
705 }
706
707 for (t = 0; set; set &= ~(1 << t), t++)
708 if (set & (1 << t))
709 groupc->tasks[t]++;
710
711
712 for (s = 0; s < NR_PSI_STATES; s++) {
713 if (test_state(groupc->tasks, s))
714 state_mask |= (1 << s);
715 }
716 groupc->state_mask = state_mask;
717
718 write_seqcount_end(&groupc->seq);
719
720 return state_mask;
721 }
722
723 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
724 {
725 #ifdef CONFIG_CGROUPS
726 struct cgroup *cgroup = NULL;
727
728 if (!*iter)
729 cgroup = task->cgroups->dfl_cgrp;
730 else if (*iter == &psi_system)
731 return NULL;
732 else
733 cgroup = cgroup_parent(*iter);
734
735 if (cgroup && cgroup_parent(cgroup)) {
736 *iter = cgroup;
737 return cgroup_psi(cgroup);
738 }
739 #else
740 if (*iter)
741 return NULL;
742 #endif
743 *iter = &psi_system;
744 return &psi_system;
745 }
746
747 void psi_task_change(struct task_struct *task, int clear, int set)
748 {
749 int cpu = task_cpu(task);
750 struct psi_group *group;
751 bool wake_clock = true;
752 void *iter = NULL;
753
754 if (!task->pid)
755 return;
756
757 if (((task->psi_flags & set) ||
758 (task->psi_flags & clear) != clear) &&
759 !psi_bug) {
760 printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
761 task->pid, task->comm, cpu,
762 task->psi_flags, clear, set);
763 psi_bug = 1;
764 }
765
766 task->psi_flags &= ~clear;
767 task->psi_flags |= set;
768
769
770
771
772
773
774
775 if (unlikely((clear & TSK_RUNNING) &&
776 (task->flags & PF_WQ_WORKER) &&
777 wq_worker_last_func(task) == psi_avgs_work))
778 wake_clock = false;
779
780 while ((group = iterate_groups(task, &iter))) {
781 u32 state_mask = psi_group_change(group, cpu, clear, set);
782
783 if (state_mask & group->poll_states)
784 psi_schedule_poll_work(group, 1);
785
786 if (wake_clock && !delayed_work_pending(&group->avgs_work))
787 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
788 }
789 }
790
791 void psi_memstall_tick(struct task_struct *task, int cpu)
792 {
793 struct psi_group *group;
794 void *iter = NULL;
795
796 while ((group = iterate_groups(task, &iter))) {
797 struct psi_group_cpu *groupc;
798
799 groupc = per_cpu_ptr(group->pcpu, cpu);
800 write_seqcount_begin(&groupc->seq);
801 record_times(groupc, cpu, true);
802 write_seqcount_end(&groupc->seq);
803 }
804 }
805
806
807
808
809
810
811
812
813 void psi_memstall_enter(unsigned long *flags)
814 {
815 struct rq_flags rf;
816 struct rq *rq;
817
818 if (static_branch_likely(&psi_disabled))
819 return;
820
821 *flags = current->flags & PF_MEMSTALL;
822 if (*flags)
823 return;
824
825
826
827
828
829 rq = this_rq_lock_irq(&rf);
830
831 current->flags |= PF_MEMSTALL;
832 psi_task_change(current, 0, TSK_MEMSTALL);
833
834 rq_unlock_irq(rq, &rf);
835 }
836
837
838
839
840
841
842
843 void psi_memstall_leave(unsigned long *flags)
844 {
845 struct rq_flags rf;
846 struct rq *rq;
847
848 if (static_branch_likely(&psi_disabled))
849 return;
850
851 if (*flags)
852 return;
853
854
855
856
857
858 rq = this_rq_lock_irq(&rf);
859
860 current->flags &= ~PF_MEMSTALL;
861 psi_task_change(current, TSK_MEMSTALL, 0);
862
863 rq_unlock_irq(rq, &rf);
864 }
865
866 #ifdef CONFIG_CGROUPS
867 int psi_cgroup_alloc(struct cgroup *cgroup)
868 {
869 if (static_branch_likely(&psi_disabled))
870 return 0;
871
872 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
873 if (!cgroup->psi.pcpu)
874 return -ENOMEM;
875 group_init(&cgroup->psi);
876 return 0;
877 }
878
879 void psi_cgroup_free(struct cgroup *cgroup)
880 {
881 if (static_branch_likely(&psi_disabled))
882 return;
883
884 cancel_delayed_work_sync(&cgroup->psi.avgs_work);
885 free_percpu(cgroup->psi.pcpu);
886
887 WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
888 }
889
890
891
892
893
894
895
896
897
898
899
900
901
902 void cgroup_move_task(struct task_struct *task, struct css_set *to)
903 {
904 unsigned int task_flags = 0;
905 struct rq_flags rf;
906 struct rq *rq;
907
908 if (static_branch_likely(&psi_disabled)) {
909
910
911
912
913 rcu_assign_pointer(task->cgroups, to);
914 return;
915 }
916
917 rq = task_rq_lock(task, &rf);
918
919 if (task_on_rq_queued(task))
920 task_flags = TSK_RUNNING;
921 else if (task->in_iowait)
922 task_flags = TSK_IOWAIT;
923
924 if (task->flags & PF_MEMSTALL)
925 task_flags |= TSK_MEMSTALL;
926
927 if (task_flags)
928 psi_task_change(task, task_flags, 0);
929
930
931 rcu_assign_pointer(task->cgroups, to);
932
933 if (task_flags)
934 psi_task_change(task, 0, task_flags);
935
936 task_rq_unlock(rq, task, &rf);
937 }
938 #endif
939
940 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
941 {
942 int full;
943 u64 now;
944
945 if (static_branch_likely(&psi_disabled))
946 return -EOPNOTSUPP;
947
948
949 mutex_lock(&group->avgs_lock);
950 now = sched_clock();
951 collect_percpu_times(group, PSI_AVGS, NULL);
952 if (now >= group->avg_next_update)
953 group->avg_next_update = update_averages(group, now);
954 mutex_unlock(&group->avgs_lock);
955
956 for (full = 0; full < 2 - (res == PSI_CPU); full++) {
957 unsigned long avg[3];
958 u64 total;
959 int w;
960
961 for (w = 0; w < 3; w++)
962 avg[w] = group->avg[res * 2 + full][w];
963 total = div_u64(group->total[PSI_AVGS][res * 2 + full],
964 NSEC_PER_USEC);
965
966 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
967 full ? "full" : "some",
968 LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
969 LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
970 LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
971 total);
972 }
973
974 return 0;
975 }
976
977 static int psi_io_show(struct seq_file *m, void *v)
978 {
979 return psi_show(m, &psi_system, PSI_IO);
980 }
981
982 static int psi_memory_show(struct seq_file *m, void *v)
983 {
984 return psi_show(m, &psi_system, PSI_MEM);
985 }
986
987 static int psi_cpu_show(struct seq_file *m, void *v)
988 {
989 return psi_show(m, &psi_system, PSI_CPU);
990 }
991
992 static int psi_io_open(struct inode *inode, struct file *file)
993 {
994 return single_open(file, psi_io_show, NULL);
995 }
996
997 static int psi_memory_open(struct inode *inode, struct file *file)
998 {
999 return single_open(file, psi_memory_show, NULL);
1000 }
1001
1002 static int psi_cpu_open(struct inode *inode, struct file *file)
1003 {
1004 return single_open(file, psi_cpu_show, NULL);
1005 }
1006
1007 struct psi_trigger *psi_trigger_create(struct psi_group *group,
1008 char *buf, size_t nbytes, enum psi_res res)
1009 {
1010 struct psi_trigger *t;
1011 enum psi_states state;
1012 u32 threshold_us;
1013 u32 window_us;
1014
1015 if (static_branch_likely(&psi_disabled))
1016 return ERR_PTR(-EOPNOTSUPP);
1017
1018 if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
1019 state = PSI_IO_SOME + res * 2;
1020 else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
1021 state = PSI_IO_FULL + res * 2;
1022 else
1023 return ERR_PTR(-EINVAL);
1024
1025 if (state >= PSI_NONIDLE)
1026 return ERR_PTR(-EINVAL);
1027
1028 if (window_us < WINDOW_MIN_US ||
1029 window_us > WINDOW_MAX_US)
1030 return ERR_PTR(-EINVAL);
1031
1032
1033 if (threshold_us == 0 || threshold_us > window_us)
1034 return ERR_PTR(-EINVAL);
1035
1036 t = kmalloc(sizeof(*t), GFP_KERNEL);
1037 if (!t)
1038 return ERR_PTR(-ENOMEM);
1039
1040 t->group = group;
1041 t->state = state;
1042 t->threshold = threshold_us * NSEC_PER_USEC;
1043 t->win.size = window_us * NSEC_PER_USEC;
1044 window_reset(&t->win, 0, 0, 0);
1045
1046 t->event = 0;
1047 t->last_event_time = 0;
1048 init_waitqueue_head(&t->event_wait);
1049 kref_init(&t->refcount);
1050
1051 mutex_lock(&group->trigger_lock);
1052
1053 if (!rcu_access_pointer(group->poll_kworker)) {
1054 struct sched_param param = {
1055 .sched_priority = 1,
1056 };
1057 struct kthread_worker *kworker;
1058
1059 kworker = kthread_create_worker(0, "psimon");
1060 if (IS_ERR(kworker)) {
1061 kfree(t);
1062 mutex_unlock(&group->trigger_lock);
1063 return ERR_CAST(kworker);
1064 }
1065 sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m);
1066 kthread_init_delayed_work(&group->poll_work,
1067 psi_poll_work);
1068 rcu_assign_pointer(group->poll_kworker, kworker);
1069 }
1070
1071 list_add(&t->node, &group->triggers);
1072 group->poll_min_period = min(group->poll_min_period,
1073 div_u64(t->win.size, UPDATES_PER_WINDOW));
1074 group->nr_triggers[t->state]++;
1075 group->poll_states |= (1 << t->state);
1076
1077 mutex_unlock(&group->trigger_lock);
1078
1079 return t;
1080 }
1081
1082 static void psi_trigger_destroy(struct kref *ref)
1083 {
1084 struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
1085 struct psi_group *group = t->group;
1086 struct kthread_worker *kworker_to_destroy = NULL;
1087
1088 if (static_branch_likely(&psi_disabled))
1089 return;
1090
1091
1092
1093
1094
1095 wake_up_interruptible(&t->event_wait);
1096
1097 mutex_lock(&group->trigger_lock);
1098
1099 if (!list_empty(&t->node)) {
1100 struct psi_trigger *tmp;
1101 u64 period = ULLONG_MAX;
1102
1103 list_del(&t->node);
1104 group->nr_triggers[t->state]--;
1105 if (!group->nr_triggers[t->state])
1106 group->poll_states &= ~(1 << t->state);
1107
1108 list_for_each_entry(tmp, &group->triggers, node)
1109 period = min(period, div_u64(tmp->win.size,
1110 UPDATES_PER_WINDOW));
1111 group->poll_min_period = period;
1112
1113 if (group->poll_states == 0) {
1114 group->polling_until = 0;
1115 kworker_to_destroy = rcu_dereference_protected(
1116 group->poll_kworker,
1117 lockdep_is_held(&group->trigger_lock));
1118 rcu_assign_pointer(group->poll_kworker, NULL);
1119 }
1120 }
1121
1122 mutex_unlock(&group->trigger_lock);
1123
1124
1125
1126
1127
1128
1129 synchronize_rcu();
1130
1131
1132
1133
1134 if (kworker_to_destroy) {
1135
1136
1137
1138
1139
1140
1141 kthread_cancel_delayed_work_sync(&group->poll_work);
1142 atomic_set(&group->poll_scheduled, 0);
1143
1144 kthread_destroy_worker(kworker_to_destroy);
1145 }
1146 kfree(t);
1147 }
1148
1149 void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
1150 {
1151 struct psi_trigger *old = *trigger_ptr;
1152
1153 if (static_branch_likely(&psi_disabled))
1154 return;
1155
1156 rcu_assign_pointer(*trigger_ptr, new);
1157 if (old)
1158 kref_put(&old->refcount, psi_trigger_destroy);
1159 }
1160
1161 __poll_t psi_trigger_poll(void **trigger_ptr,
1162 struct file *file, poll_table *wait)
1163 {
1164 __poll_t ret = DEFAULT_POLLMASK;
1165 struct psi_trigger *t;
1166
1167 if (static_branch_likely(&psi_disabled))
1168 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1169
1170 rcu_read_lock();
1171
1172 t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
1173 if (!t) {
1174 rcu_read_unlock();
1175 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1176 }
1177 kref_get(&t->refcount);
1178
1179 rcu_read_unlock();
1180
1181 poll_wait(file, &t->event_wait, wait);
1182
1183 if (cmpxchg(&t->event, 1, 0) == 1)
1184 ret |= EPOLLPRI;
1185
1186 kref_put(&t->refcount, psi_trigger_destroy);
1187
1188 return ret;
1189 }
1190
1191 static ssize_t psi_write(struct file *file, const char __user *user_buf,
1192 size_t nbytes, enum psi_res res)
1193 {
1194 char buf[32];
1195 size_t buf_size;
1196 struct seq_file *seq;
1197 struct psi_trigger *new;
1198
1199 if (static_branch_likely(&psi_disabled))
1200 return -EOPNOTSUPP;
1201
1202 if (!nbytes)
1203 return -EINVAL;
1204
1205 buf_size = min(nbytes, sizeof(buf));
1206 if (copy_from_user(buf, user_buf, buf_size))
1207 return -EFAULT;
1208
1209 buf[buf_size - 1] = '\0';
1210
1211 new = psi_trigger_create(&psi_system, buf, nbytes, res);
1212 if (IS_ERR(new))
1213 return PTR_ERR(new);
1214
1215 seq = file->private_data;
1216
1217 mutex_lock(&seq->lock);
1218 psi_trigger_replace(&seq->private, new);
1219 mutex_unlock(&seq->lock);
1220
1221 return nbytes;
1222 }
1223
1224 static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
1225 size_t nbytes, loff_t *ppos)
1226 {
1227 return psi_write(file, user_buf, nbytes, PSI_IO);
1228 }
1229
1230 static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
1231 size_t nbytes, loff_t *ppos)
1232 {
1233 return psi_write(file, user_buf, nbytes, PSI_MEM);
1234 }
1235
1236 static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
1237 size_t nbytes, loff_t *ppos)
1238 {
1239 return psi_write(file, user_buf, nbytes, PSI_CPU);
1240 }
1241
1242 static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
1243 {
1244 struct seq_file *seq = file->private_data;
1245
1246 return psi_trigger_poll(&seq->private, file, wait);
1247 }
1248
1249 static int psi_fop_release(struct inode *inode, struct file *file)
1250 {
1251 struct seq_file *seq = file->private_data;
1252
1253 psi_trigger_replace(&seq->private, NULL);
1254 return single_release(inode, file);
1255 }
1256
1257 static const struct file_operations psi_io_fops = {
1258 .open = psi_io_open,
1259 .read = seq_read,
1260 .llseek = seq_lseek,
1261 .write = psi_io_write,
1262 .poll = psi_fop_poll,
1263 .release = psi_fop_release,
1264 };
1265
1266 static const struct file_operations psi_memory_fops = {
1267 .open = psi_memory_open,
1268 .read = seq_read,
1269 .llseek = seq_lseek,
1270 .write = psi_memory_write,
1271 .poll = psi_fop_poll,
1272 .release = psi_fop_release,
1273 };
1274
1275 static const struct file_operations psi_cpu_fops = {
1276 .open = psi_cpu_open,
1277 .read = seq_read,
1278 .llseek = seq_lseek,
1279 .write = psi_cpu_write,
1280 .poll = psi_fop_poll,
1281 .release = psi_fop_release,
1282 };
1283
1284 static int __init psi_proc_init(void)
1285 {
1286 proc_mkdir("pressure", NULL);
1287 proc_create("pressure/io", 0, NULL, &psi_io_fops);
1288 proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
1289 proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
1290 return 0;
1291 }
1292 module_init(psi_proc_init);