This source file includes the following definitions:
- rqos_to_ioc
- q_to_ioc
- q_name
- ioc_name
- pd_to_iocg
- blkg_to_iocg
- iocg_to_blkg
- blkcg_to_iocc
- abs_cost_to_cost
- cost_to_abs_cost
- iocg_commit_bio
- ioc_refresh_period_us
- ioc_autop_idx
- calc_lcoefs
- ioc_refresh_lcoefs
- ioc_refresh_params
- ioc_now
- ioc_start_period
- __propagate_active_weight
- commit_active_weights
- propagate_active_weight
- current_hweight
- weight_updated
- iocg_activate
- iocg_wake_fn
- iocg_kick_waitq
- iocg_waitq_timer_fn
- iocg_kick_delay
- iocg_delay_timer_fn
- ioc_lat_stat
- iocg_is_idle
- surplus_adjusted_hweight_inuse
- ioc_timer_fn
- calc_vtime_cost_builtin
- calc_vtime_cost
- ioc_rqos_throttle
- ioc_rqos_merge
- ioc_rqos_done_bio
- ioc_rqos_done
- ioc_rqos_queue_depth_changed
- ioc_rqos_exit
- blk_iocost_init
- ioc_cpd_alloc
- ioc_cpd_free
- ioc_pd_alloc
- ioc_pd_init
- ioc_pd_free
- ioc_weight_prfill
- ioc_weight_show
- ioc_weight_write
- ioc_qos_prfill
- ioc_qos_show
- ioc_qos_write
- ioc_cost_model_prfill
- ioc_cost_model_show
- ioc_cost_model_write
- ioc_init
- ioc_exit
178 #include <linux/kernel.h>
179 #include <linux/module.h>
180 #include <linux/timer.h>
181 #include <linux/time64.h>
182 #include <linux/parser.h>
183 #include <linux/sched/signal.h>
184 #include <linux/blk-cgroup.h>
185 #include "blk-rq-qos.h"
186 #include "blk-stat.h"
187 #include "blk-wbt.h"
188
189 #ifdef CONFIG_TRACEPOINTS
190
191
192 #define TRACE_IOCG_PATH_LEN 1024
193 static DEFINE_SPINLOCK(trace_iocg_path_lock);
194 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
195
196 #define TRACE_IOCG_PATH(type, iocg, ...) \
197 do { \
198 unsigned long flags; \
199 if (trace_iocost_##type##_enabled()) { \
200 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
201 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
202 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
203 trace_iocost_##type(iocg, trace_iocg_path, \
204 ##__VA_ARGS__); \
205 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
206 } \
207 } while (0)
208
209 #else
210 #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
211 #endif
212
213 enum {
214 MILLION = 1000000,
215
216
217 MIN_PERIOD = USEC_PER_MSEC,
218 MAX_PERIOD = USEC_PER_SEC,
219
220
221
222
223
224
225 MARGIN_PCT = 50,
226 INUSE_MARGIN_PCT = 10,
227
228
229 WAITQ_TIMER_MARGIN_PCT = 5,
230
231
232
233
234
235
236 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
237
238
239
240
241
242
243
244
245
246 NR_USAGE_SLOTS = 3,
247 MIN_VALID_USAGES = 2,
248
249
250 HWEIGHT_WHOLE = 1 << 16,
251
252
253
254
255
256
257
258
259
260
261
262
263 VTIME_PER_SEC_SHIFT = 37,
264 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
265 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
266
267
268 VRATE_MIN_PPM = 10000,
269 VRATE_MAX_PPM = 100000000,
270
271 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 VRATE_CLAMP_ADJ_PCT = 4,
273
274
275 RQ_WAIT_BUSY_PCT = 5,
276
277
278 UNBUSY_THR_PCT = 75,
279
280
281 MAX_LAGGING_PERIODS = 10,
282
283
284
285
286
287 SURPLUS_SCALE_PCT = 125,
288 SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50,
289 SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33,
290
291
292 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
293
294
295
296
297
298
299 IOC_PAGE_SHIFT = 12,
300 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
301 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
302
303
304 LCOEF_RANDIO_PAGES = 4096,
305 };
306
307 enum ioc_running {
308 IOC_IDLE,
309 IOC_RUNNING,
310 IOC_STOP,
311 };
312
313
314 enum {
315 QOS_ENABLE,
316 QOS_CTRL,
317 NR_QOS_CTRL_PARAMS,
318 };
319
320
321 enum {
322 QOS_RPPM,
323 QOS_RLAT,
324 QOS_WPPM,
325 QOS_WLAT,
326 QOS_MIN,
327 QOS_MAX,
328 NR_QOS_PARAMS,
329 };
330
331
332 enum {
333 COST_CTRL,
334 COST_MODEL,
335 NR_COST_CTRL_PARAMS,
336 };
337
338
339 enum {
340 I_LCOEF_RBPS,
341 I_LCOEF_RSEQIOPS,
342 I_LCOEF_RRANDIOPS,
343 I_LCOEF_WBPS,
344 I_LCOEF_WSEQIOPS,
345 I_LCOEF_WRANDIOPS,
346 NR_I_LCOEFS,
347 };
348
349 enum {
350 LCOEF_RPAGE,
351 LCOEF_RSEQIO,
352 LCOEF_RRANDIO,
353 LCOEF_WPAGE,
354 LCOEF_WSEQIO,
355 LCOEF_WRANDIO,
356 NR_LCOEFS,
357 };
358
359 enum {
360 AUTOP_INVALID,
361 AUTOP_HDD,
362 AUTOP_SSD_QD1,
363 AUTOP_SSD_DFL,
364 AUTOP_SSD_FAST,
365 };
366
367 struct ioc_gq;
368
369 struct ioc_params {
370 u32 qos[NR_QOS_PARAMS];
371 u64 i_lcoefs[NR_I_LCOEFS];
372 u64 lcoefs[NR_LCOEFS];
373 u32 too_fast_vrate_pct;
374 u32 too_slow_vrate_pct;
375 };
376
377 struct ioc_missed {
378 u32 nr_met;
379 u32 nr_missed;
380 u32 last_met;
381 u32 last_missed;
382 };
383
384 struct ioc_pcpu_stat {
385 struct ioc_missed missed[2];
386
387 u64 rq_wait_ns;
388 u64 last_rq_wait_ns;
389 };
390
391
392 struct ioc {
393 struct rq_qos rqos;
394
395 bool enabled;
396
397 struct ioc_params params;
398 u32 period_us;
399 u32 margin_us;
400 u64 vrate_min;
401 u64 vrate_max;
402
403 spinlock_t lock;
404 struct timer_list timer;
405 struct list_head active_iocgs;
406 struct ioc_pcpu_stat __percpu *pcpu_stat;
407
408 enum ioc_running running;
409 atomic64_t vtime_rate;
410
411 seqcount_t period_seqcount;
412 u32 period_at;
413 u64 period_at_vtime;
414
415 atomic64_t cur_period;
416 int busy_level;
417
418 u64 inuse_margin_vtime;
419 bool weights_updated;
420 atomic_t hweight_gen;
421
422 u64 autop_too_fast_at;
423 u64 autop_too_slow_at;
424 int autop_idx;
425 bool user_qos_params:1;
426 bool user_cost_model:1;
427 };
428
429
430 struct ioc_gq {
431 struct blkg_policy_data pd;
432 struct ioc *ioc;
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449 u32 cfg_weight;
450 u32 weight;
451 u32 active;
452 u32 inuse;
453 u32 last_inuse;
454
455 sector_t cursor;
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470 atomic64_t vtime;
471 atomic64_t done_vtime;
472 u64 abs_vdebt;
473 u64 last_vtime;
474
475
476
477
478
479 atomic64_t active_period;
480 struct list_head active_list;
481
482
483 u64 child_active_sum;
484 u64 child_inuse_sum;
485 int hweight_gen;
486 u32 hweight_active;
487 u32 hweight_inuse;
488 bool has_surplus;
489
490 struct wait_queue_head waitq;
491 struct hrtimer waitq_timer;
492 struct hrtimer delay_timer;
493
494
495 int usage_idx;
496 u32 usages[NR_USAGE_SLOTS];
497
498
499 int level;
500 struct ioc_gq *ancestors[];
501 };
502
503
504 struct ioc_cgrp {
505 struct blkcg_policy_data cpd;
506 unsigned int dfl_weight;
507 };
508
509 struct ioc_now {
510 u64 now_ns;
511 u32 now;
512 u64 vnow;
513 u64 vrate;
514 };
515
516 struct iocg_wait {
517 struct wait_queue_entry wait;
518 struct bio *bio;
519 u64 abs_cost;
520 bool committed;
521 };
522
523 struct iocg_wake_ctx {
524 struct ioc_gq *iocg;
525 u32 hw_inuse;
526 s64 vbudget;
527 };
528
529 static const struct ioc_params autop[] = {
530 [AUTOP_HDD] = {
531 .qos = {
532 [QOS_RLAT] = 250000,
533 [QOS_WLAT] = 250000,
534 [QOS_MIN] = VRATE_MIN_PPM,
535 [QOS_MAX] = VRATE_MAX_PPM,
536 },
537 .i_lcoefs = {
538 [I_LCOEF_RBPS] = 174019176,
539 [I_LCOEF_RSEQIOPS] = 41708,
540 [I_LCOEF_RRANDIOPS] = 370,
541 [I_LCOEF_WBPS] = 178075866,
542 [I_LCOEF_WSEQIOPS] = 42705,
543 [I_LCOEF_WRANDIOPS] = 378,
544 },
545 },
546 [AUTOP_SSD_QD1] = {
547 .qos = {
548 [QOS_RLAT] = 25000,
549 [QOS_WLAT] = 25000,
550 [QOS_MIN] = VRATE_MIN_PPM,
551 [QOS_MAX] = VRATE_MAX_PPM,
552 },
553 .i_lcoefs = {
554 [I_LCOEF_RBPS] = 245855193,
555 [I_LCOEF_RSEQIOPS] = 61575,
556 [I_LCOEF_RRANDIOPS] = 6946,
557 [I_LCOEF_WBPS] = 141365009,
558 [I_LCOEF_WSEQIOPS] = 33716,
559 [I_LCOEF_WRANDIOPS] = 26796,
560 },
561 },
562 [AUTOP_SSD_DFL] = {
563 .qos = {
564 [QOS_RLAT] = 25000,
565 [QOS_WLAT] = 25000,
566 [QOS_MIN] = VRATE_MIN_PPM,
567 [QOS_MAX] = VRATE_MAX_PPM,
568 },
569 .i_lcoefs = {
570 [I_LCOEF_RBPS] = 488636629,
571 [I_LCOEF_RSEQIOPS] = 8932,
572 [I_LCOEF_RRANDIOPS] = 8518,
573 [I_LCOEF_WBPS] = 427891549,
574 [I_LCOEF_WSEQIOPS] = 28755,
575 [I_LCOEF_WRANDIOPS] = 21940,
576 },
577 .too_fast_vrate_pct = 500,
578 },
579 [AUTOP_SSD_FAST] = {
580 .qos = {
581 [QOS_RLAT] = 5000,
582 [QOS_WLAT] = 5000,
583 [QOS_MIN] = VRATE_MIN_PPM,
584 [QOS_MAX] = VRATE_MAX_PPM,
585 },
586 .i_lcoefs = {
587 [I_LCOEF_RBPS] = 3102524156LLU,
588 [I_LCOEF_RSEQIOPS] = 724816,
589 [I_LCOEF_RRANDIOPS] = 778122,
590 [I_LCOEF_WBPS] = 1742780862LLU,
591 [I_LCOEF_WSEQIOPS] = 425702,
592 [I_LCOEF_WRANDIOPS] = 443193,
593 },
594 .too_slow_vrate_pct = 10,
595 },
596 };
597
598
599
600
601
602 static u32 vrate_adj_pct[] =
603 { 0, 0, 0, 0,
604 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
605 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
606 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
607
608 static struct blkcg_policy blkcg_policy_iocost;
609
610
611 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
612 {
613 return container_of(rqos, struct ioc, rqos);
614 }
615
616 static struct ioc *q_to_ioc(struct request_queue *q)
617 {
618 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
619 }
620
621 static const char *q_name(struct request_queue *q)
622 {
623 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
624 return kobject_name(q->kobj.parent);
625 else
626 return "<unknown>";
627 }
628
629 static const char __maybe_unused *ioc_name(struct ioc *ioc)
630 {
631 return q_name(ioc->rqos.q);
632 }
633
634 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
635 {
636 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
637 }
638
639 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
640 {
641 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
642 }
643
644 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
645 {
646 return pd_to_blkg(&iocg->pd);
647 }
648
649 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
650 {
651 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
652 struct ioc_cgrp, cpd);
653 }
654
655
656
657
658
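/*
 * Cost scaling between the absolute (device-wide) and per-cgroup domains.
 * An IO's absolute cost is scaled by the inverse of the cgroup's
 * hierarchical in-use weight: at hw_inuse == HWEIGHT_WHOLE (100%) the two
 * match, while at 50% every IO charges twice as much vtime to the cgroup.
 * For example, abs_cost_to_cost(1000, HWEIGHT_WHOLE / 2) == 2000.
 */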
659 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
660 {
661 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
662 }
663
664
665
666
667 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
668 {
669 return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
670 }
671
672 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
673 {
674 bio->bi_iocost_cost = cost;
675 atomic64_add(cost, &iocg->vtime);
676 }
677
678 #define CREATE_TRACE_POINTS
679 #include <trace/events/iocost.h>
680
681
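/*
 * Derive the control period from the larger of the read/write latency
 * targets.  The multiplier scales with the percentile target (roughly 2x
 * at >= 90% down to 10x at 50%) and the result is clamped to
 * [MIN_PERIOD, MAX_PERIOD].  The inuse margin is a fixed percentage of
 * the period expressed in vtime.
 */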
682 static void ioc_refresh_period_us(struct ioc *ioc)
683 {
684 u32 ppm, lat, multi, period_us;
685
686 lockdep_assert_held(&ioc->lock);
687
688
689 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
690 ppm = ioc->params.qos[QOS_RPPM];
691 lat = ioc->params.qos[QOS_RLAT];
692 } else {
693 ppm = ioc->params.qos[QOS_WPPM];
694 lat = ioc->params.qos[QOS_WLAT];
695 }
696
697
698
699
700
701
702
703
704
705 if (ppm)
706 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
707 else
708 multi = 2;
709 period_us = multi * lat;
710 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
711
712
713 ioc->period_us = period_us;
714 ioc->margin_us = period_us * MARGIN_PCT / 100;
715 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
716 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
717 }
718
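/*
 * Pick the autotune parameter table: rotational devices map to AUTOP_HDD,
 * QD=1 SSDs to AUTOP_SSD_QD1 and everything else starts at AUTOP_SSD_DFL.
 * If the observed vrate stays above/below the table's too_fast/too_slow
 * thresholds for a full AUTOP_CYCLE_NSEC, step to the neighboring table.
 * User-supplied QoS or cost-model parameters pin the current selection.
 */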
719 static int ioc_autop_idx(struct ioc *ioc)
720 {
721 int idx = ioc->autop_idx;
722 const struct ioc_params *p = &autop[idx];
723 u32 vrate_pct;
724 u64 now_ns;
725
726
727 if (!blk_queue_nonrot(ioc->rqos.q))
728 return AUTOP_HDD;
729
730
731 if (blk_queue_depth(ioc->rqos.q) == 1)
732 return AUTOP_SSD_QD1;
733
734
735 if (idx < AUTOP_SSD_DFL)
736 return AUTOP_SSD_DFL;
737
738
739 if (ioc->user_qos_params || ioc->user_cost_model)
740 return idx;
741
742
743 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
744 VTIME_PER_USEC);
745 now_ns = ktime_get_ns();
746
747 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
748 if (!ioc->autop_too_fast_at)
749 ioc->autop_too_fast_at = now_ns;
750 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
751 return idx + 1;
752 } else {
753 ioc->autop_too_fast_at = 0;
754 }
755
756 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
757 if (!ioc->autop_too_slow_at)
758 ioc->autop_too_slow_at = now_ns;
759 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
760 return idx - 1;
761 } else {
762 ioc->autop_too_slow_at = 0;
763 }
764
765 return idx;
766 }
767
768
769
770
771
772
773
774
775
776
777
778
779
780
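/*
 * Convert the device's bps and sequential/random iops figures into linear
 * cost coefficients.  The per-page coefficient is the vtime one page's
 * worth of bandwidth takes (VTIME_PER_SEC / pages-per-second); the
 * seq/rand coefficients are the per-IO overheads left after subtracting
 * the page cost from 1/iops, so that roughly
 * cost(IO) = coef_{seq|rand}io + pages * coef_page.
 */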
781 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
782 u64 *page, u64 *seqio, u64 *randio)
783 {
784 u64 v;
785
786 *page = *seqio = *randio = 0;
787
788 if (bps)
789 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
790 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
791
792 if (seqiops) {
793 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
794 if (v > *page)
795 *seqio = v - *page;
796 }
797
798 if (randiops) {
799 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
800 if (v > *page)
801 *randio = v - *page;
802 }
803 }
804
805 static void ioc_refresh_lcoefs(struct ioc *ioc)
806 {
807 u64 *u = ioc->params.i_lcoefs;
808 u64 *c = ioc->params.lcoefs;
809
810 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
811 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
812 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
813 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
814 }
815
816 static bool ioc_refresh_params(struct ioc *ioc, bool force)
817 {
818 const struct ioc_params *p;
819 int idx;
820
821 lockdep_assert_held(&ioc->lock);
822
823 idx = ioc_autop_idx(ioc);
824 p = &autop[idx];
825
826 if (idx == ioc->autop_idx && !force)
827 return false;
828
829 if (idx != ioc->autop_idx)
830 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
831
832 ioc->autop_idx = idx;
833 ioc->autop_too_fast_at = 0;
834 ioc->autop_too_slow_at = 0;
835
836 if (!ioc->user_qos_params)
837 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
838 if (!ioc->user_cost_model)
839 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
840
841 ioc_refresh_period_us(ioc);
842 ioc_refresh_lcoefs(ioc);
843
844 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
845 VTIME_PER_USEC, MILLION);
846 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
847 VTIME_PER_USEC, MILLION);
848
849 return true;
850 }
851
852
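/*
 * Snapshot the current time and device vtime.  vnow is extrapolated from
 * the last period boundary, vnow = period_at_vtime + (now - period_at) *
 * vrate, read under period_seqcount so a concurrent period start retries
 * the calculation.
 */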
853 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
854 {
855 unsigned seq;
856
857 now->now_ns = ktime_get();
858 now->now = ktime_to_us(now->now_ns);
859 now->vrate = atomic64_read(&ioc->vtime_rate);
860
861
862
863
864
865
866
867
868
869 do {
870 seq = read_seqcount_begin(&ioc->period_seqcount);
871 now->vnow = ioc->period_at_vtime +
872 (now->now - ioc->period_at) * now->vrate;
873 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
874 }
875
876 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
877 {
878 lockdep_assert_held(&ioc->lock);
879 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
880
881 write_seqcount_begin(&ioc->period_seqcount);
882 ioc->period_at = now->now;
883 ioc->period_at_vtime = now->vnow;
884 write_seqcount_end(&ioc->period_seqcount);
885
886 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
887 add_timer(&ioc->timer);
888 }
889
890
891
892
893
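/*
 * Update @iocg's active/inuse weights and propagate the change up the
 * ancestor chain.  Each parent's child_active_sum/child_inuse_sum is
 * adjusted by the delta and the parent's own inuse is recalculated as its
 * weight scaled by the ratio of its children's inuse to active sums.
 * Callers must follow up with commit_active_weights() so that the hweight
 * generation is bumped.
 */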
894 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
895 {
896 struct ioc *ioc = iocg->ioc;
897 int lvl;
898
899 lockdep_assert_held(&ioc->lock);
900
901 inuse = min(active, inuse);
902
903 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
904 struct ioc_gq *parent = iocg->ancestors[lvl];
905 struct ioc_gq *child = iocg->ancestors[lvl + 1];
906 u32 parent_active = 0, parent_inuse = 0;
907
908
909 parent->child_active_sum += (s32)(active - child->active);
910 parent->child_inuse_sum += (s32)(inuse - child->inuse);
911
912 child->active = active;
913 child->inuse = inuse;
914
915
916
917
918
919
920 if (parent->child_active_sum) {
921 parent_active = parent->weight;
922 parent_inuse = DIV64_U64_ROUND_UP(
923 parent_active * parent->child_inuse_sum,
924 parent->child_active_sum);
925 }
926
927
928 if (parent_active == parent->active &&
929 parent_inuse == parent->inuse)
930 break;
931
932 active = parent_active;
933 inuse = parent_inuse;
934 }
935
936 ioc->weights_updated = true;
937 }
938
939 static void commit_active_weights(struct ioc *ioc)
940 {
941 lockdep_assert_held(&ioc->lock);
942
943 if (ioc->weights_updated) {
944
945 smp_wmb();
946 atomic_inc(&ioc->hweight_gen);
947 ioc->weights_updated = false;
948 }
949 }
950
951 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
952 {
953 __propagate_active_weight(iocg, active, inuse);
954 commit_active_weights(iocg->ioc);
955 }
956
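/*
 * Compute @iocg's hierarchical active and inuse fractions, where
 * HWEIGHT_WHOLE represents the whole device.  Walks from the root towards
 * the leaf, multiplying in each level's share (child weight over the sum
 * of sibling weights).  The result is cached and only recomputed when the
 * hweight generation has moved on since the last calculation.
 */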
957 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
958 {
959 struct ioc *ioc = iocg->ioc;
960 int lvl;
961 u32 hwa, hwi;
962 int ioc_gen;
963
964
965 ioc_gen = atomic_read(&ioc->hweight_gen);
966 if (ioc_gen == iocg->hweight_gen)
967 goto out;
968
969
970
971
972
973
974
975
976
977
978
979 smp_rmb();
980
981 hwa = hwi = HWEIGHT_WHOLE;
982 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
983 struct ioc_gq *parent = iocg->ancestors[lvl];
984 struct ioc_gq *child = iocg->ancestors[lvl + 1];
985 u32 active_sum = READ_ONCE(parent->child_active_sum);
986 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
987 u32 active = READ_ONCE(child->active);
988 u32 inuse = READ_ONCE(child->inuse);
989
990
991 if (!active_sum || !inuse_sum)
992 continue;
993
994 active_sum = max(active, active_sum);
995 hwa = hwa * active / active_sum;
996
997 inuse_sum = max(inuse, inuse_sum);
998 hwi = hwi * inuse / inuse_sum;
999 }
1000
1001 iocg->hweight_active = max_t(u32, hwa, 1);
1002 iocg->hweight_inuse = max_t(u32, hwi, 1);
1003 iocg->hweight_gen = ioc_gen;
1004 out:
1005 if (hw_activep)
1006 *hw_activep = iocg->hweight_active;
1007 if (hw_inusep)
1008 *hw_inusep = iocg->hweight_inuse;
1009 }
1010
1011 static void weight_updated(struct ioc_gq *iocg)
1012 {
1013 struct ioc *ioc = iocg->ioc;
1014 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1015 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1016 u32 weight;
1017
1018 lockdep_assert_held(&ioc->lock);
1019
1020 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1021 if (weight != iocg->weight && iocg->active)
1022 propagate_active_weight(iocg, weight,
1023 DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1024 iocg->weight = weight;
1025 }
1026
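/*
 * Called when @iocg issues an IO while not on the active list.  Puts the
 * iocg on the list, pulls a long-idle vtime back into the valid window
 * and kicks off the period timer if the controller was idle.  Fails when
 * the leaf-only constraint would be broken - the iocg has active children
 * or a non-root ancestor is already active - in which case the bio simply
 * bypasses cost control.
 */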
1027 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1028 {
1029 struct ioc *ioc = iocg->ioc;
1030 u64 last_period, cur_period, max_period_delta;
1031 u64 vtime, vmargin, vmin;
1032 int i;
1033
1034
1035
1036
1037
1038 if (!list_empty(&iocg->active_list)) {
1039 ioc_now(ioc, now);
1040 cur_period = atomic64_read(&ioc->cur_period);
1041 if (atomic64_read(&iocg->active_period) != cur_period)
1042 atomic64_set(&iocg->active_period, cur_period);
1043 return true;
1044 }
1045
1046
1047 if (iocg->child_active_sum)
1048 return false;
1049
1050 spin_lock_irq(&ioc->lock);
1051
1052 ioc_now(ioc, now);
1053
1054
1055 cur_period = atomic64_read(&ioc->cur_period);
1056 last_period = atomic64_read(&iocg->active_period);
1057 atomic64_set(&iocg->active_period, cur_period);
1058
1059
1060 if (!list_empty(&iocg->active_list))
1061 goto succeed_unlock;
1062 for (i = iocg->level - 1; i > 0; i--)
1063 if (!list_empty(&iocg->ancestors[i]->active_list))
1064 goto fail_unlock;
1065
1066 if (iocg->child_active_sum)
1067 goto fail_unlock;
1068
1069
1070
1071
1072
1073
1074
1075 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1076 vtime = atomic64_read(&iocg->vtime);
1077 vmargin = ioc->margin_us * now->vrate;
1078 vmin = now->vnow - vmargin;
1079
1080 if (last_period + max_period_delta < cur_period ||
1081 time_before64(vtime, vmin)) {
1082 atomic64_add(vmin - vtime, &iocg->vtime);
1083 atomic64_add(vmin - vtime, &iocg->done_vtime);
1084 vtime = vmin;
1085 }
1086
1087
1088
1089
1090
1091
1092 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1093 list_add(&iocg->active_list, &ioc->active_iocgs);
1094 propagate_active_weight(iocg, iocg->weight,
1095 iocg->last_inuse ?: iocg->weight);
1096
1097 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1098 last_period, cur_period, vtime);
1099
1100 iocg->last_vtime = vtime;
1101
1102 if (ioc->running == IOC_IDLE) {
1103 ioc->running = IOC_RUNNING;
1104 ioc_start_period(ioc, now);
1105 }
1106
1107 succeed_unlock:
1108 spin_unlock_irq(&ioc->lock);
1109 return true;
1110
1111 fail_unlock:
1112 spin_unlock_irq(&ioc->lock);
1113 return false;
1114 }
1115
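/*
 * Wait queue callback.  Each waiter carries the absolute cost of its bio
 * and the shared wake context tracks the remaining vtime budget.  Waiters
 * are charged and woken in order for as long as the budget stays
 * non-negative; the first waiter that would overdraw it stops the walk.
 */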
1116 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1117 int flags, void *key)
1118 {
1119 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1120 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1121 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1122
1123 ctx->vbudget -= cost;
1124
1125 if (ctx->vbudget < 0)
1126 return -1;
1127
1128 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1129
1130
1131
1132
1133
1134
1135 list_del_init(&wq_entry->entry);
1136 wait->committed = true;
1137
1138 default_wake_function(wq_entry, mode, flags, key);
1139 return 0;
1140 }
1141
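/*
 * Pay down vdebt from whatever budget has accrued, wake as many waiters
 * as the remaining budget covers and, if waiters remain, (re)arm
 * waitq_timer to fire roughly when the shortfall will have been earned
 * back at the current vrate.  Caller must hold iocg->waitq.lock.
 */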
1142 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1143 {
1144 struct ioc *ioc = iocg->ioc;
1145 struct iocg_wake_ctx ctx = { .iocg = iocg };
1146 u64 margin_ns = (u64)(ioc->period_us *
1147 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1148 u64 vdebt, vshortage, expires, oexpires;
1149 s64 vbudget;
1150 u32 hw_inuse;
1151
1152 lockdep_assert_held(&iocg->waitq.lock);
1153
1154 current_hweight(iocg, NULL, &hw_inuse);
1155 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1156
1157
1158 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1159 if (vdebt && vbudget > 0) {
1160 u64 delta = min_t(u64, vbudget, vdebt);
1161 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1162 iocg->abs_vdebt);
1163
1164 atomic64_add(delta, &iocg->vtime);
1165 atomic64_add(delta, &iocg->done_vtime);
1166 iocg->abs_vdebt -= abs_delta;
1167 }
1168
1169
1170
1171
1172
1173 ctx.hw_inuse = hw_inuse;
1174 ctx.vbudget = vbudget - vdebt;
1175 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1176 if (!waitqueue_active(&iocg->waitq))
1177 return;
1178 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1179 return;
1180
1181
1182 vshortage = -ctx.vbudget;
1183 expires = now->now_ns +
1184 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1185 expires += margin_ns / 4;
1186
1187
1188 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1189 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1190 abs(oexpires - expires) <= margin_ns / 4)
1191 return;
1192
1193 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1194 margin_ns / 4, HRTIMER_MODE_ABS);
1195 }
1196
1197 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1198 {
1199 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1200 struct ioc_now now;
1201 unsigned long flags;
1202
1203 ioc_now(iocg->ioc, &now);
1204
1205 spin_lock_irqsave(&iocg->waitq.lock, flags);
1206 iocg_kick_waitq(iocg, &now);
1207 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1208
1209 return HRTIMER_NORESTART;
1210 }
1211
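/*
 * For a cgroup carrying vdebt from IOs that could not be blocked,
 * translate the overage into blk-cgroup issue delay so the offending
 * cgroup is slowed down, and arm delay_timer to re-evaluate later.
 * Returns %true if a delay is in effect.  Caller must hold
 * iocg->waitq.lock.
 */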
1212 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1213 {
1214 struct ioc *ioc = iocg->ioc;
1215 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1216 u64 vtime = atomic64_read(&iocg->vtime);
1217 u64 vmargin = ioc->margin_us * now->vrate;
1218 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1219 u64 expires, oexpires;
1220 u32 hw_inuse;
1221
1222 lockdep_assert_held(&iocg->waitq.lock);
1223
1224
1225 current_hweight(iocg, NULL, &hw_inuse);
1226 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1227
1228
1229
1230
1231
1232
1233 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1234 blkcg_clear_delay(blkg);
1235 return false;
1236 }
1237 if (!atomic_read(&blkg->use_delay) &&
1238 time_before_eq64(vtime, now->vnow + vmargin))
1239 return false;
1240
1241
1242 if (cost) {
1243 u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1244 now->vrate);
1245 blkcg_add_delay(blkg, now->now_ns, cost_ns);
1246 }
1247 blkcg_use_delay(blkg);
1248
1249 expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1250 now->vrate) * NSEC_PER_USEC;
1251
1252
1253 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1254 if (hrtimer_is_queued(&iocg->delay_timer) &&
1255 abs(oexpires - expires) <= margin_ns / 4)
1256 return true;
1257
1258 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1259 margin_ns / 4, HRTIMER_MODE_ABS);
1260 return true;
1261 }
1262
1263 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1264 {
1265 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1266 struct ioc_now now;
1267 unsigned long flags;
1268
1269 spin_lock_irqsave(&iocg->waitq.lock, flags);
1270 ioc_now(iocg->ioc, &now);
1271 iocg_kick_delay(iocg, &now, 0);
1272 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1273
1274 return HRTIMER_NORESTART;
1275 }
1276
1277 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1278 {
1279 u32 nr_met[2] = { };
1280 u32 nr_missed[2] = { };
1281 u64 rq_wait_ns = 0;
1282 int cpu, rw;
1283
1284 for_each_online_cpu(cpu) {
1285 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1286 u64 this_rq_wait_ns;
1287
1288 for (rw = READ; rw <= WRITE; rw++) {
1289 u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1290 u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1291
1292 nr_met[rw] += this_met - stat->missed[rw].last_met;
1293 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1294 stat->missed[rw].last_met = this_met;
1295 stat->missed[rw].last_missed = this_missed;
1296 }
1297
1298 this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1299 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1300 stat->last_rq_wait_ns = this_rq_wait_ns;
1301 }
1302
1303 for (rw = READ; rw <= WRITE; rw++) {
1304 if (nr_met[rw] + nr_missed[rw])
1305 missed_ppm_ar[rw] =
1306 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1307 nr_met[rw] + nr_missed[rw]);
1308 else
1309 missed_ppm_ar[rw] = 0;
1310 }
1311
1312 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1313 ioc->period_us * NSEC_PER_USEC);
1314 }
1315
1316
1317 static bool iocg_is_idle(struct ioc_gq *iocg)
1318 {
1319 struct ioc *ioc = iocg->ioc;
1320
1321
1322 if (atomic64_read(&iocg->active_period) ==
1323 atomic64_read(&ioc->cur_period))
1324 return false;
1325
1326
1327 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1328 return false;
1329
1330 return true;
1331 }
1332
1333
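/*
 * Target hweight_inuse for an iocg that appears not to be using its full
 * share: the observed usage scaled up by SURPLUS_SCALE_PCT plus a fixed
 * cushion.  Returns 0 (no adjustment) when the resulting donation would
 * be smaller than SURPLUS_MIN_ADJ_DELTA.
 */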
1334 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1335 {
1336
1337 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1338 usage += SURPLUS_SCALE_ABS;
1339
1340
1341 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1342 return 0;
1343
1344 return usage;
1345 }
1346
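/*
 * Per-period housekeeping run from ioc->timer: kick waiters/delays and
 * deactivate idle iocgs, record per-iocg usage and shrink or grow inuse
 * weights so that surplus share is donated to cgroups that are short,
 * then adjust the global vtime_rate according to busy_level, which is
 * driven by the request-wait and latency-miss statistics.
 */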
1347 static void ioc_timer_fn(struct timer_list *timer)
1348 {
1349 struct ioc *ioc = container_of(timer, struct ioc, timer);
1350 struct ioc_gq *iocg, *tiocg;
1351 struct ioc_now now;
1352 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1353 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1354 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1355 u32 missed_ppm[2], rq_wait_pct;
1356 u64 period_vtime;
1357 int prev_busy_level, i;
1358
1359
1360 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1361
1362
1363 spin_lock_irq(&ioc->lock);
1364
1365 ioc_now(ioc, &now);
1366
1367 period_vtime = now.vnow - ioc->period_at_vtime;
1368 if (WARN_ON_ONCE(!period_vtime)) {
1369 spin_unlock_irq(&ioc->lock);
1370 return;
1371 }
1372
1373
1374
1375
1376
1377
1378
1379 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1380 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1381 !iocg_is_idle(iocg))
1382 continue;
1383
1384 spin_lock(&iocg->waitq.lock);
1385
1386 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1387
1388 iocg_kick_waitq(iocg, &now);
1389 iocg_kick_delay(iocg, &now, 0);
1390 } else if (iocg_is_idle(iocg)) {
1391
1392 iocg->last_inuse = iocg->inuse;
1393 __propagate_active_weight(iocg, 0, 0);
1394 list_del_init(&iocg->active_list);
1395 }
1396
1397 spin_unlock(&iocg->waitq.lock);
1398 }
1399 commit_active_weights(ioc);
1400
1401
1402 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1403 u64 vdone, vtime, vusage, vmargin, vmin;
1404 u32 hw_active, hw_inuse, usage;
1405
1406
1407
1408
1409
1410 vdone = atomic64_read(&iocg->done_vtime);
1411 vtime = atomic64_read(&iocg->vtime);
1412 current_hweight(iocg, &hw_active, &hw_inuse);
1413
1414
1415
1416
1417
1418
1419
1420 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1421 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1422 time_after64(vtime, vdone) &&
1423 time_after64(vtime, now.vnow -
1424 MAX_LAGGING_PERIODS * period_vtime) &&
1425 time_before64(vdone, now.vnow - period_vtime))
1426 nr_lagging++;
1427
1428 if (waitqueue_active(&iocg->waitq))
1429 vusage = now.vnow - iocg->last_vtime;
1430 else if (time_before64(iocg->last_vtime, vtime))
1431 vusage = vtime - iocg->last_vtime;
1432 else
1433 vusage = 0;
1434
1435 iocg->last_vtime += vusage;
1436
1437
1438
1439
1440
1441 vusage = max(vusage, vtime - vdone);
1442
1443
1444 if (vusage) {
1445 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1446 period_vtime);
1447 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1448 iocg->usages[iocg->usage_idx] = usage;
1449 } else {
1450 usage = 0;
1451 }
1452
1453
1454 vmargin = ioc->margin_us * now.vrate;
1455 vmin = now.vnow - vmargin;
1456
1457 iocg->has_surplus = false;
1458
1459 if (!waitqueue_active(&iocg->waitq) &&
1460 time_before64(vtime, vmin)) {
1461 u64 delta = vmin - vtime;
1462
1463
1464 atomic64_add(delta, &iocg->vtime);
1465 atomic64_add(delta, &iocg->done_vtime);
1466 iocg->last_vtime += delta;
1467
1468 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1469 iocg->has_surplus = true;
1470 nr_surpluses++;
1471 }
1472 } else if (hw_inuse < hw_active) {
1473 u32 new_hwi, new_inuse;
1474
1475
1476 if (waitqueue_active(&iocg->waitq)) {
1477 new_hwi = hw_active;
1478 } else {
1479 new_hwi = max(hw_inuse,
1480 usage * SURPLUS_SCALE_PCT / 100 +
1481 SURPLUS_SCALE_ABS);
1482 }
1483
1484 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1485 hw_inuse);
1486 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1487
1488 if (new_inuse > iocg->inuse) {
1489 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1490 iocg->inuse, new_inuse,
1491 hw_inuse, new_hwi);
1492 __propagate_active_weight(iocg, iocg->weight,
1493 new_inuse);
1494 }
1495 } else {
1496
1497 nr_shortages++;
1498 }
1499 }
1500
1501 if (!nr_shortages || !nr_surpluses)
1502 goto skip_surplus_transfers;
1503
1504
1505 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1506 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1507 int nr_valid = 0;
1508
1509 if (!iocg->has_surplus)
1510 continue;
1511
1512
1513 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1514 if (iocg->usages[i]) {
1515 usage = max(usage, iocg->usages[i]);
1516 nr_valid++;
1517 }
1518 }
1519 if (nr_valid < MIN_VALID_USAGES)
1520 continue;
1521
1522 current_hweight(iocg, &hw_active, &hw_inuse);
1523 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1524 if (!new_hwi)
1525 continue;
1526
1527 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1528 hw_inuse);
1529 if (new_inuse < iocg->inuse) {
1530 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1531 iocg->inuse, new_inuse,
1532 hw_inuse, new_hwi);
1533 __propagate_active_weight(iocg, iocg->weight, new_inuse);
1534 }
1535 }
1536 skip_surplus_transfers:
1537 commit_active_weights(ioc);
1538
1539
1540
1541
1542
1543
1544
1545 prev_busy_level = ioc->busy_level;
1546 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1547 missed_ppm[READ] > ppm_rthr ||
1548 missed_ppm[WRITE] > ppm_wthr) {
1549 ioc->busy_level = max(ioc->busy_level, 0);
1550 ioc->busy_level++;
1551 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1552 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1553 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1554
1555 if (nr_shortages && !nr_lagging) {
1556 ioc->busy_level = min(ioc->busy_level, 0);
1557
1558 if (!nr_surpluses)
1559 ioc->busy_level--;
1560 }
1561 } else {
1562 ioc->busy_level = 0;
1563 }
1564
1565 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1566
1567 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1568 u64 vrate = atomic64_read(&ioc->vtime_rate);
1569 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1570
1571
1572 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1573 vrate_min = VRATE_MIN;
1574
1575
1576
1577
1578
1579
1580 if (vrate < vrate_min) {
1581 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1582 100);
1583 vrate = min(vrate, vrate_min);
1584 } else if (vrate > vrate_max) {
1585 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1586 100);
1587 vrate = max(vrate, vrate_max);
1588 } else {
1589 int idx = min_t(int, abs(ioc->busy_level),
1590 ARRAY_SIZE(vrate_adj_pct) - 1);
1591 u32 adj_pct = vrate_adj_pct[idx];
1592
1593 if (ioc->busy_level > 0)
1594 adj_pct = 100 - adj_pct;
1595 else
1596 adj_pct = 100 + adj_pct;
1597
1598 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1599 vrate_min, vrate_max);
1600 }
1601
1602 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1603 nr_lagging, nr_shortages,
1604 nr_surpluses);
1605
1606 atomic64_set(&ioc->vtime_rate, vrate);
1607 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1608 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1609 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1610 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1611 missed_ppm, rq_wait_pct, nr_lagging,
1612 nr_shortages, nr_surpluses);
1613 }
1614
1615 ioc_refresh_params(ioc, false);
1616
1617
1618
1619
1620
1621 atomic64_inc(&ioc->cur_period);
1622
1623 if (ioc->running != IOC_STOP) {
1624 if (!list_empty(&ioc->active_iocgs)) {
1625 ioc_start_period(ioc, &now);
1626 } else {
1627 ioc->busy_level = 0;
1628 ioc->running = IOC_IDLE;
1629 }
1630 }
1631
1632 spin_unlock_irq(&ioc->lock);
1633 }
1634
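/*
 * Built-in linear cost model: cost = (seq or rand per-IO coefficient) +
 * pages * per-page coefficient.  An IO counts as random when it starts
 * more than LCOEF_RANDIO_PAGES away from the iocg's cursor (the end of
 * its previous IO).  Merges only pay the per-page portion.
 */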
1635 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1636 bool is_merge, u64 *costp)
1637 {
1638 struct ioc *ioc = iocg->ioc;
1639 u64 coef_seqio, coef_randio, coef_page;
1640 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1641 u64 seek_pages = 0;
1642 u64 cost = 0;
1643
1644 switch (bio_op(bio)) {
1645 case REQ_OP_READ:
1646 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1647 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1648 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1649 break;
1650 case REQ_OP_WRITE:
1651 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1652 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1653 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1654 break;
1655 default:
1656 goto out;
1657 }
1658
1659 if (iocg->cursor) {
1660 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1661 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1662 }
1663
1664 if (!is_merge) {
1665 if (seek_pages > LCOEF_RANDIO_PAGES) {
1666 cost += coef_randio;
1667 } else {
1668 cost += coef_seqio;
1669 }
1670 }
1671 cost += pages * coef_page;
1672 out:
1673 *costp = cost;
1674 }
1675
1676 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1677 {
1678 u64 cost;
1679
1680 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1681 return cost;
1682 }
1683
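/*
 * Main issue path.  Activate the iocg, compute the bio's cost and charge
 * it immediately if the cgroup's vtime is within budget.  Otherwise
 * either convert the overage into issue delay (root-level bios, or when
 * the issuer has a fatal signal pending) or put the issuer to sleep on
 * iocg->waitq until iocg_wake_fn() can commit the bio.
 */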
1684 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1685 {
1686 struct blkcg_gq *blkg = bio->bi_blkg;
1687 struct ioc *ioc = rqos_to_ioc(rqos);
1688 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1689 struct ioc_now now;
1690 struct iocg_wait wait;
1691 u32 hw_active, hw_inuse;
1692 u64 abs_cost, cost, vtime;
1693
1694
1695 if (!ioc->enabled || !iocg->level)
1696 return;
1697
1698
1699 if (!iocg_activate(iocg, &now))
1700 return;
1701
1702
1703 abs_cost = calc_vtime_cost(bio, iocg, false);
1704 if (!abs_cost)
1705 return;
1706
1707 iocg->cursor = bio_end_sector(bio);
1708
1709 vtime = atomic64_read(&iocg->vtime);
1710 current_hweight(iocg, &hw_active, &hw_inuse);
1711
1712 if (hw_inuse < hw_active &&
1713 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1714 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1715 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1716 spin_lock_irq(&ioc->lock);
1717 propagate_active_weight(iocg, iocg->weight, iocg->weight);
1718 spin_unlock_irq(&ioc->lock);
1719 current_hweight(iocg, &hw_active, &hw_inuse);
1720 }
1721
1722 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1723
1724
1725
1726
1727
1728
1729 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1730 time_before_eq64(vtime + cost, now.vnow)) {
1731 iocg_commit_bio(iocg, bio, cost);
1732 return;
1733 }
1734
1735
1736
1737
1738
1739
1740
1741 spin_lock_irq(&iocg->waitq.lock);
1742
1743 if (unlikely(list_empty(&iocg->active_list))) {
1744 spin_unlock_irq(&iocg->waitq.lock);
1745 iocg_commit_bio(iocg, bio, cost);
1746 return;
1747 }
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1767 iocg->abs_vdebt += abs_cost;
1768 if (iocg_kick_delay(iocg, &now, cost))
1769 blkcg_schedule_throttle(rqos->q,
1770 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1771 spin_unlock_irq(&iocg->waitq.lock);
1772 return;
1773 }
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1789 wait.wait.private = current;
1790 wait.bio = bio;
1791 wait.abs_cost = abs_cost;
1792 wait.committed = false;
1793
1794 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1795 iocg_kick_waitq(iocg, &now);
1796
1797 spin_unlock_irq(&iocg->waitq.lock);
1798
1799 while (true) {
1800 set_current_state(TASK_UNINTERRUPTIBLE);
1801 if (wait.committed)
1802 break;
1803 io_schedule();
1804 }
1805
1806
1807 finish_wait(&iocg->waitq, &wait.wait);
1808 }
1809
1810 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1811 struct bio *bio)
1812 {
1813 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1814 struct ioc *ioc = iocg->ioc;
1815 sector_t bio_end = bio_end_sector(bio);
1816 struct ioc_now now;
1817 u32 hw_inuse;
1818 u64 abs_cost, cost;
1819 unsigned long flags;
1820
1821
1822 if (!ioc->enabled || !iocg->level)
1823 return;
1824
1825 abs_cost = calc_vtime_cost(bio, iocg, true);
1826 if (!abs_cost)
1827 return;
1828
1829 ioc_now(ioc, &now);
1830 current_hweight(iocg, NULL, &hw_inuse);
1831 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1832
1833
1834 if (blk_rq_pos(rq) < bio_end &&
1835 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1836 iocg->cursor = bio_end;
1837
1838
1839
1840
1841
1842 if (rq->bio && rq->bio->bi_iocost_cost &&
1843 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1844 iocg_commit_bio(iocg, bio, cost);
1845 return;
1846 }
1847
1848
1849
1850
1851
1852
1853 spin_lock_irqsave(&iocg->waitq.lock, flags);
1854 if (likely(!list_empty(&iocg->active_list))) {
1855 iocg->abs_vdebt += abs_cost;
1856 iocg_kick_delay(iocg, &now, cost);
1857 } else {
1858 iocg_commit_bio(iocg, bio, cost);
1859 }
1860 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1861 }
1862
1863 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1864 {
1865 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1866
1867 if (iocg && bio->bi_iocost_cost)
1868 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1869 }
1870
1871 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1872 {
1873 struct ioc *ioc = rqos_to_ioc(rqos);
1874 u64 on_q_ns, rq_wait_ns;
1875 int pidx, rw;
1876
1877 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1878 return;
1879
1880 switch (req_op(rq) & REQ_OP_MASK) {
1881 case REQ_OP_READ:
1882 pidx = QOS_RLAT;
1883 rw = READ;
1884 break;
1885 case REQ_OP_WRITE:
1886 pidx = QOS_WLAT;
1887 rw = WRITE;
1888 break;
1889 default:
1890 return;
1891 }
1892
1893 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1894 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1895
1896 if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1897 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1898 else
1899 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1900
1901 this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1902 }
1903
1904 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1905 {
1906 struct ioc *ioc = rqos_to_ioc(rqos);
1907
1908 spin_lock_irq(&ioc->lock);
1909 ioc_refresh_params(ioc, false);
1910 spin_unlock_irq(&ioc->lock);
1911 }
1912
1913 static void ioc_rqos_exit(struct rq_qos *rqos)
1914 {
1915 struct ioc *ioc = rqos_to_ioc(rqos);
1916
1917 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1918
1919 spin_lock_irq(&ioc->lock);
1920 ioc->running = IOC_STOP;
1921 spin_unlock_irq(&ioc->lock);
1922
1923 del_timer_sync(&ioc->timer);
1924 free_percpu(ioc->pcpu_stat);
1925 kfree(ioc);
1926 }
1927
1928 static struct rq_qos_ops ioc_rqos_ops = {
1929 .throttle = ioc_rqos_throttle,
1930 .merge = ioc_rqos_merge,
1931 .done_bio = ioc_rqos_done_bio,
1932 .done = ioc_rqos_done,
1933 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1934 .exit = ioc_rqos_exit,
1935 };
1936
1937 static int blk_iocost_init(struct request_queue *q)
1938 {
1939 struct ioc *ioc;
1940 struct rq_qos *rqos;
1941 int ret;
1942
1943 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1944 if (!ioc)
1945 return -ENOMEM;
1946
1947 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1948 if (!ioc->pcpu_stat) {
1949 kfree(ioc);
1950 return -ENOMEM;
1951 }
1952
1953 rqos = &ioc->rqos;
1954 rqos->id = RQ_QOS_COST;
1955 rqos->ops = &ioc_rqos_ops;
1956 rqos->q = q;
1957
1958 spin_lock_init(&ioc->lock);
1959 timer_setup(&ioc->timer, ioc_timer_fn, 0);
1960 INIT_LIST_HEAD(&ioc->active_iocgs);
1961
1962 ioc->running = IOC_IDLE;
1963 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1964 seqcount_init(&ioc->period_seqcount);
1965 ioc->period_at = ktime_to_us(ktime_get());
1966 atomic64_set(&ioc->cur_period, 0);
1967 atomic_set(&ioc->hweight_gen, 0);
1968
1969 spin_lock_irq(&ioc->lock);
1970 ioc->autop_idx = AUTOP_INVALID;
1971 ioc_refresh_params(ioc, true);
1972 spin_unlock_irq(&ioc->lock);
1973
1974 rq_qos_add(q, rqos);
1975 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1976 if (ret) {
1977 rq_qos_del(q, rqos);
1978 free_percpu(ioc->pcpu_stat);
1979 kfree(ioc);
1980 return ret;
1981 }
1982 return 0;
1983 }
1984
1985 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1986 {
1987 struct ioc_cgrp *iocc;
1988
1989 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
1990 if (!iocc)
1991 return NULL;
1992
1993 iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1994 return &iocc->cpd;
1995 }
1996
1997 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1998 {
1999 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2000 }
2001
2002 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2003 struct blkcg *blkcg)
2004 {
2005 int levels = blkcg->css.cgroup->level + 1;
2006 struct ioc_gq *iocg;
2007
2008 iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
2009 gfp, q->node);
2010 if (!iocg)
2011 return NULL;
2012
2013 return &iocg->pd;
2014 }
2015
2016 static void ioc_pd_init(struct blkg_policy_data *pd)
2017 {
2018 struct ioc_gq *iocg = pd_to_iocg(pd);
2019 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2020 struct ioc *ioc = q_to_ioc(blkg->q);
2021 struct ioc_now now;
2022 struct blkcg_gq *tblkg;
2023 unsigned long flags;
2024
2025 ioc_now(ioc, &now);
2026
2027 iocg->ioc = ioc;
2028 atomic64_set(&iocg->vtime, now.vnow);
2029 atomic64_set(&iocg->done_vtime, now.vnow);
2030 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2031 INIT_LIST_HEAD(&iocg->active_list);
2032 iocg->hweight_active = HWEIGHT_WHOLE;
2033 iocg->hweight_inuse = HWEIGHT_WHOLE;
2034
2035 init_waitqueue_head(&iocg->waitq);
2036 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2037 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2038 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2039 iocg->delay_timer.function = iocg_delay_timer_fn;
2040
2041 iocg->level = blkg->blkcg->css.cgroup->level;
2042
2043 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2044 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2045 iocg->ancestors[tiocg->level] = tiocg;
2046 }
2047
2048 spin_lock_irqsave(&ioc->lock, flags);
2049 weight_updated(iocg);
2050 spin_unlock_irqrestore(&ioc->lock, flags);
2051 }
2052
2053 static void ioc_pd_free(struct blkg_policy_data *pd)
2054 {
2055 struct ioc_gq *iocg = pd_to_iocg(pd);
2056 struct ioc *ioc = iocg->ioc;
2057
2058 if (ioc) {
2059 spin_lock(&ioc->lock);
2060 if (!list_empty(&iocg->active_list)) {
2061 propagate_active_weight(iocg, 0, 0);
2062 list_del_init(&iocg->active_list);
2063 }
2064 spin_unlock(&ioc->lock);
2065
2066 hrtimer_cancel(&iocg->waitq_timer);
2067 hrtimer_cancel(&iocg->delay_timer);
2068 }
2069 kfree(iocg);
2070 }
2071
2072 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2073 int off)
2074 {
2075 const char *dname = blkg_dev_name(pd->blkg);
2076 struct ioc_gq *iocg = pd_to_iocg(pd);
2077
2078 if (dname && iocg->cfg_weight)
2079 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2080 return 0;
2081 }
2082
2083
2084 static int ioc_weight_show(struct seq_file *sf, void *v)
2085 {
2086 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2087 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2088
2089 seq_printf(sf, "default %u\n", iocc->dfl_weight);
2090 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2091 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2092 return 0;
2093 }
2094
2095 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2096 size_t nbytes, loff_t off)
2097 {
2098 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2099 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2100 struct blkg_conf_ctx ctx;
2101 struct ioc_gq *iocg;
2102 u32 v;
2103 int ret;
2104
2105 if (!strchr(buf, ':')) {
2106 struct blkcg_gq *blkg;
2107
2108 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2109 return -EINVAL;
2110
2111 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2112 return -EINVAL;
2113
2114 spin_lock(&blkcg->lock);
2115 iocc->dfl_weight = v;
2116 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2117 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2118
2119 if (iocg) {
2120 spin_lock_irq(&iocg->ioc->lock);
2121 weight_updated(iocg);
2122 spin_unlock_irq(&iocg->ioc->lock);
2123 }
2124 }
2125 spin_unlock(&blkcg->lock);
2126
2127 return nbytes;
2128 }
2129
2130 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2131 if (ret)
2132 return ret;
2133
2134 iocg = blkg_to_iocg(ctx.blkg);
2135
2136 if (!strncmp(ctx.body, "default", 7)) {
2137 v = 0;
2138 } else {
2139 if (!sscanf(ctx.body, "%u", &v))
2140 goto einval;
2141 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2142 goto einval;
2143 }
2144
2145 spin_lock(&iocg->ioc->lock);
2146 iocg->cfg_weight = v;
2147 weight_updated(iocg);
2148 spin_unlock(&iocg->ioc->lock);
2149
2150 blkg_conf_finish(&ctx);
2151 return nbytes;
2152
2153 einval:
2154 blkg_conf_finish(&ctx);
2155 return -EINVAL;
2156 }
2157
2158 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2159 int off)
2160 {
2161 const char *dname = blkg_dev_name(pd->blkg);
2162 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2163
2164 if (!dname)
2165 return 0;
2166
2167 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2168 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2169 ioc->params.qos[QOS_RPPM] / 10000,
2170 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2171 ioc->params.qos[QOS_RLAT],
2172 ioc->params.qos[QOS_WPPM] / 10000,
2173 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2174 ioc->params.qos[QOS_WLAT],
2175 ioc->params.qos[QOS_MIN] / 10000,
2176 ioc->params.qos[QOS_MIN] % 10000 / 100,
2177 ioc->params.qos[QOS_MAX] / 10000,
2178 ioc->params.qos[QOS_MAX] % 10000 / 100);
2179 return 0;
2180 }
2181
2182 static int ioc_qos_show(struct seq_file *sf, void *v)
2183 {
2184 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2185
2186 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2187 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2188 return 0;
2189 }
2190
2191 static const match_table_t qos_ctrl_tokens = {
2192 { QOS_ENABLE, "enable=%u" },
2193 { QOS_CTRL, "ctrl=%s" },
2194 { NR_QOS_CTRL_PARAMS, NULL },
2195 };
2196
2197 static const match_table_t qos_tokens = {
2198 { QOS_RPPM, "rpct=%s" },
2199 { QOS_RLAT, "rlat=%u" },
2200 { QOS_WPPM, "wpct=%s" },
2201 { QOS_WLAT, "wlat=%u" },
2202 { QOS_MIN, "min=%s" },
2203 { QOS_MAX, "max=%s" },
2204 { NR_QOS_PARAMS, NULL },
2205 };
2206
2207 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2208 size_t nbytes, loff_t off)
2209 {
2210 struct gendisk *disk;
2211 struct ioc *ioc;
2212 u32 qos[NR_QOS_PARAMS];
2213 bool enable, user;
2214 char *p;
2215 int ret;
2216
2217 disk = blkcg_conf_get_disk(&input);
2218 if (IS_ERR(disk))
2219 return PTR_ERR(disk);
2220
2221 ioc = q_to_ioc(disk->queue);
2222 if (!ioc) {
2223 ret = blk_iocost_init(disk->queue);
2224 if (ret)
2225 goto err;
2226 ioc = q_to_ioc(disk->queue);
2227 }
2228
2229 spin_lock_irq(&ioc->lock);
2230 memcpy(qos, ioc->params.qos, sizeof(qos));
2231 enable = ioc->enabled;
2232 user = ioc->user_qos_params;
2233 spin_unlock_irq(&ioc->lock);
2234
2235 while ((p = strsep(&input, " \t\n"))) {
2236 substring_t args[MAX_OPT_ARGS];
2237 char buf[32];
2238 int tok;
2239 s64 v;
2240
2241 if (!*p)
2242 continue;
2243
2244 switch (match_token(p, qos_ctrl_tokens, args)) {
2245 case QOS_ENABLE:
2246 match_u64(&args[0], &v);
2247 enable = v;
2248 continue;
2249 case QOS_CTRL:
2250 match_strlcpy(buf, &args[0], sizeof(buf));
2251 if (!strcmp(buf, "auto"))
2252 user = false;
2253 else if (!strcmp(buf, "user"))
2254 user = true;
2255 else
2256 goto einval;
2257 continue;
2258 }
2259
2260 tok = match_token(p, qos_tokens, args);
2261 switch (tok) {
2262 case QOS_RPPM:
2263 case QOS_WPPM:
2264 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2265 sizeof(buf))
2266 goto einval;
2267 if (cgroup_parse_float(buf, 2, &v))
2268 goto einval;
2269 if (v < 0 || v > 10000)
2270 goto einval;
2271 qos[tok] = v * 100;
2272 break;
2273 case QOS_RLAT:
2274 case QOS_WLAT:
2275 if (match_u64(&args[0], &v))
2276 goto einval;
2277 qos[tok] = v;
2278 break;
2279 case QOS_MIN:
2280 case QOS_MAX:
2281 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2282 sizeof(buf))
2283 goto einval;
2284 if (cgroup_parse_float(buf, 2, &v))
2285 goto einval;
2286 if (v < 0)
2287 goto einval;
2288 qos[tok] = clamp_t(s64, v * 100,
2289 VRATE_MIN_PPM, VRATE_MAX_PPM);
2290 break;
2291 default:
2292 goto einval;
2293 }
2294 user = true;
2295 }
2296
2297 if (qos[QOS_MIN] > qos[QOS_MAX])
2298 goto einval;
2299
2300 spin_lock_irq(&ioc->lock);
2301
2302 if (enable) {
2303 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2304 ioc->enabled = true;
2305 } else {
2306 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2307 ioc->enabled = false;
2308 }
2309
2310 if (user) {
2311 memcpy(ioc->params.qos, qos, sizeof(qos));
2312 ioc->user_qos_params = true;
2313 } else {
2314 ioc->user_qos_params = false;
2315 }
2316
2317 ioc_refresh_params(ioc, true);
2318 spin_unlock_irq(&ioc->lock);
2319
2320 put_disk_and_module(disk);
2321 return nbytes;
2322 einval:
2323 ret = -EINVAL;
2324 err:
2325 put_disk_and_module(disk);
2326 return ret;
2327 }
2328
2329 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2330 struct blkg_policy_data *pd, int off)
2331 {
2332 const char *dname = blkg_dev_name(pd->blkg);
2333 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2334 u64 *u = ioc->params.i_lcoefs;
2335
2336 if (!dname)
2337 return 0;
2338
2339 seq_printf(sf, "%s ctrl=%s model=linear "
2340 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2341 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2342 dname, ioc->user_cost_model ? "user" : "auto",
2343 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2344 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2345 return 0;
2346 }
2347
2348 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2349 {
2350 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2351
2352 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2353 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2354 return 0;
2355 }
2356
2357 static const match_table_t cost_ctrl_tokens = {
2358 { COST_CTRL, "ctrl=%s" },
2359 { COST_MODEL, "model=%s" },
2360 { NR_COST_CTRL_PARAMS, NULL },
2361 };
2362
2363 static const match_table_t i_lcoef_tokens = {
2364 { I_LCOEF_RBPS, "rbps=%u" },
2365 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2366 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2367 { I_LCOEF_WBPS, "wbps=%u" },
2368 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2369 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2370 { NR_I_LCOEFS, NULL },
2371 };
2372
2373 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2374 size_t nbytes, loff_t off)
2375 {
2376 struct gendisk *disk;
2377 struct ioc *ioc;
2378 u64 u[NR_I_LCOEFS];
2379 bool user;
2380 char *p;
2381 int ret;
2382
2383 disk = blkcg_conf_get_disk(&input);
2384 if (IS_ERR(disk))
2385 return PTR_ERR(disk);
2386
2387 ioc = q_to_ioc(disk->queue);
2388 if (!ioc) {
2389 ret = blk_iocost_init(disk->queue);
2390 if (ret)
2391 goto err;
2392 ioc = q_to_ioc(disk->queue);
2393 }
2394
2395 spin_lock_irq(&ioc->lock);
2396 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2397 user = ioc->user_cost_model;
2398 spin_unlock_irq(&ioc->lock);
2399
2400 while ((p = strsep(&input, " \t\n"))) {
2401 substring_t args[MAX_OPT_ARGS];
2402 char buf[32];
2403 int tok;
2404 u64 v;
2405
2406 if (!*p)
2407 continue;
2408
2409 switch (match_token(p, cost_ctrl_tokens, args)) {
2410 case COST_CTRL:
2411 match_strlcpy(buf, &args[0], sizeof(buf));
2412 if (!strcmp(buf, "auto"))
2413 user = false;
2414 else if (!strcmp(buf, "user"))
2415 user = true;
2416 else
2417 goto einval;
2418 continue;
2419 case COST_MODEL:
2420 match_strlcpy(buf, &args[0], sizeof(buf));
2421 if (strcmp(buf, "linear"))
2422 goto einval;
2423 continue;
2424 }
2425
2426 tok = match_token(p, i_lcoef_tokens, args);
2427 if (tok == NR_I_LCOEFS)
2428 goto einval;
2429 if (match_u64(&args[0], &v))
2430 goto einval;
2431 u[tok] = v;
2432 user = true;
2433 }
2434
2435 spin_lock_irq(&ioc->lock);
2436 if (user) {
2437 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2438 ioc->user_cost_model = true;
2439 } else {
2440 ioc->user_cost_model = false;
2441 }
2442 ioc_refresh_params(ioc, true);
2443 spin_unlock_irq(&ioc->lock);
2444
2445 put_disk_and_module(disk);
2446 return nbytes;
2447
2448 einval:
2449 ret = -EINVAL;
2450 err:
2451 put_disk_and_module(disk);
2452 return ret;
2453 }
2454
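/*
 * cgroup interface files.  "io.weight" appears in non-root cgroups and
 * takes either a plain/"default "-prefixed weight for the cgroup default
 * or "<maj>:<min> <weight>" per device.  "io.cost.qos" and
 * "io.cost.model" appear only in the root cgroup and take "<maj>:<min>"
 * followed by the key=value pairs defined in the token tables above,
 * e.g. (illustrative only)
 * "8:16 enable=1 ctrl=user rpct=95.00 rlat=75000 wpct=95.00 wlat=150000".
 */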
2455 static struct cftype ioc_files[] = {
2456 {
2457 .name = "weight",
2458 .flags = CFTYPE_NOT_ON_ROOT,
2459 .seq_show = ioc_weight_show,
2460 .write = ioc_weight_write,
2461 },
2462 {
2463 .name = "cost.qos",
2464 .flags = CFTYPE_ONLY_ON_ROOT,
2465 .seq_show = ioc_qos_show,
2466 .write = ioc_qos_write,
2467 },
2468 {
2469 .name = "cost.model",
2470 .flags = CFTYPE_ONLY_ON_ROOT,
2471 .seq_show = ioc_cost_model_show,
2472 .write = ioc_cost_model_write,
2473 },
2474 {}
2475 };
2476
2477 static struct blkcg_policy blkcg_policy_iocost = {
2478 .dfl_cftypes = ioc_files,
2479 .cpd_alloc_fn = ioc_cpd_alloc,
2480 .cpd_free_fn = ioc_cpd_free,
2481 .pd_alloc_fn = ioc_pd_alloc,
2482 .pd_init_fn = ioc_pd_init,
2483 .pd_free_fn = ioc_pd_free,
2484 };
2485
2486 static int __init ioc_init(void)
2487 {
2488 return blkcg_policy_register(&blkcg_policy_iocost);
2489 }
2490
2491 static void __exit ioc_exit(void)
2492 {
2493 return blkcg_policy_unregister(&blkcg_policy_iocost);
2494 }
2495
2496 module_init(ioc_init);
2497 module_exit(ioc_exit);