This source file includes the following definitions:
- blkcg_policy_enabled
- blkg_free
- __blkg_release
- blkg_release
- blkg_async_bio_workfn
- blkg_alloc
- blkg_lookup_slowpath
- blkg_create
- __blkg_lookup_create
- blkg_lookup_create
- blkg_destroy
- blkg_destroy_all
- blkcg_reset_stats
- blkg_dev_name
- blkcg_print_blkgs
- __blkg_prfill_u64
- __blkg_prfill_rwstat
- blkg_prfill_rwstat
- blkg_prfill_rwstat_field
- blkg_print_stat_bytes
- blkg_print_stat_ios
- blkg_prfill_rwstat_field_recursive
- blkg_print_stat_bytes_recursive
- blkg_print_stat_ios_recursive
- blkg_rwstat_recursive_sum
- blkg_lookup_check
- blkcg_conf_get_disk
- blkg_conf_prep
- blkg_conf_finish
- blkcg_print_stat
- blkcg_css_offline
- blkcg_destroy_blkgs
- blkcg_css_free
- blkcg_css_alloc
- blkcg_init_queue
- blkcg_drain_queue
- blkcg_exit_queue
- blkcg_can_attach
- blkcg_bind
- blkcg_exit
- blkcg_activate_policy
- blkcg_deactivate_policy
- blkcg_policy_register
- blkcg_policy_unregister
- __blkcg_punt_bio_submit
- blkcg_scale_delay
- blkcg_maybe_throttle_blkg
- blkcg_maybe_throttle_current
- blkcg_schedule_throttle
- blkcg_add_delay
- blkcg_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 #include <linux/ioprio.h>
19 #include <linux/kdev_t.h>
20 #include <linux/module.h>
21 #include <linux/sched/signal.h>
22 #include <linux/err.h>
23 #include <linux/blkdev.h>
24 #include <linux/backing-dev.h>
25 #include <linux/slab.h>
26 #include <linux/genhd.h>
27 #include <linux/delay.h>
28 #include <linux/atomic.h>
29 #include <linux/ctype.h>
30 #include <linux/blk-cgroup.h>
31 #include <linux/tracehook.h>
32 #include <linux/psi.h>
33 #include "blk.h"
34
35 #define MAX_KEY_LEN 100
36
37
38
39
40
41
42
43
44 static DEFINE_MUTEX(blkcg_pol_register_mutex);
45 static DEFINE_MUTEX(blkcg_pol_mutex);
46
47 struct blkcg blkcg_root;
48 EXPORT_SYMBOL_GPL(blkcg_root);
49
50 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
51 EXPORT_SYMBOL_GPL(blkcg_root_css);
52
53 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
54
55 static LIST_HEAD(all_blkcgs);
56
57 bool blkcg_debug_stats = false;
58 static struct workqueue_struct *blkcg_punt_bio_wq;
59
60 static bool blkcg_policy_enabled(struct request_queue *q,
61 const struct blkcg_policy *pol)
62 {
63 return pol && test_bit(pol->plid, q->blkcg_pols);
64 }
65
66 /**
67 * blkg_free - free a blkg
68 * @blkg: blkg to free
69 *
70 * Free @blkg which may be partially allocated.
71 */
72 static void blkg_free(struct blkcg_gq *blkg)
73 {
74 int i;
75
76 if (!blkg)
77 return;
78
79 for (i = 0; i < BLKCG_MAX_POLS; i++)
80 if (blkg->pd[i])
81 blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
82
83 blkg_rwstat_exit(&blkg->stat_ios);
84 blkg_rwstat_exit(&blkg->stat_bytes);
85 percpu_ref_exit(&blkg->refcnt);
86 kfree(blkg);
87 }
88
89 static void __blkg_release(struct rcu_head *rcu)
90 {
91 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
92
93 WARN_ON(!bio_list_empty(&blkg->async_bios));
94
95
96 css_put(&blkg->blkcg->css);
97 if (blkg->parent)
98 blkg_put(blkg->parent);
99
100 wb_congested_put(blkg->wb_congested);
101
102 blkg_free(blkg);
103 }
104
105 /*
106 * A group is RCU protected, but having an rcu lock does not mean that one
107 * can access all the fields of blkg and assume these are valid. For
108 * example, don't try to follow throtl_data and request queue links.
109 *
110 * Having a reference to blkg under an rcu allows accesses to only values
111 * local to groups like group stats and group rate limits.
112 */
113 static void blkg_release(struct percpu_ref *ref)
114 {
115 struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
116
117 call_rcu(&blkg->rcu_head, __blkg_release);
118 }
119
120 static void blkg_async_bio_workfn(struct work_struct *work)
121 {
122 struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
123 async_bio_work);
124 struct bio_list bios = BIO_EMPTY_LIST;
125 struct bio *bio;
126
127
128 spin_lock_bh(&blkg->async_bio_lock);
129 bio_list_merge(&bios, &blkg->async_bios);
130 bio_list_init(&blkg->async_bios);
131 spin_unlock_bh(&blkg->async_bio_lock);
132
133 while ((bio = bio_list_pop(&bios)))
134 submit_bio(bio);
135 }
136
137 /**
138 * blkg_alloc - allocate a blkg
139 * @blkcg: block cgroup the new blkg is associated with
140 * @q: request_queue the new blkg is associated with
141 * @gfp_mask: allocation mask to use
142 *
143 * Allocate a new blkg associating @blkcg and @q.
144 */
145 static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
146 gfp_t gfp_mask)
147 {
148 struct blkcg_gq *blkg;
149 int i;
150
151
152 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
153 if (!blkg)
154 return NULL;
155
156 if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
157 goto err_free;
158
159 if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
160 blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
161 goto err_free;
162
163 blkg->q = q;
164 INIT_LIST_HEAD(&blkg->q_node);
165 spin_lock_init(&blkg->async_bio_lock);
166 bio_list_init(&blkg->async_bios);
167 INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
168 blkg->blkcg = blkcg;
169
170 for (i = 0; i < BLKCG_MAX_POLS; i++) {
171 struct blkcg_policy *pol = blkcg_policy[i];
172 struct blkg_policy_data *pd;
173
174 if (!blkcg_policy_enabled(q, pol))
175 continue;
176
177
178 pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
179 if (!pd)
180 goto err_free;
181
182 blkg->pd[i] = pd;
183 pd->blkg = blkg;
184 pd->plid = i;
185 }
186
187 return blkg;
188
189 err_free:
190 blkg_free(blkg);
191 return NULL;
192 }
193
194 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
195 struct request_queue *q, bool update_hint)
196 {
197 struct blkcg_gq *blkg;
198
199 /*
200 * Hint didn't match. Look up from the radix tree. Note that the
201 * hint can only be updated under queue_lock as otherwise @blkg
202 * could have already been removed from blkg_tree. The caller is
203 * responsible for grabbing queue_lock if @update_hint.
204 */
205 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
206 if (blkg && blkg->q == q) {
207 if (update_hint) {
208 lockdep_assert_held(&q->queue_lock);
209 rcu_assign_pointer(blkcg->blkg_hint, blkg);
210 }
211 return blkg;
212 }
213
214 return NULL;
215 }
216 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
217
218 /*
219 * If @new_blkg is %NULL, this function tries to allocate a new one as
220 * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
221 */
222 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
223 struct request_queue *q,
224 struct blkcg_gq *new_blkg)
225 {
226 struct blkcg_gq *blkg;
227 struct bdi_writeback_congested *wb_congested;
228 int i, ret;
229
230 WARN_ON_ONCE(!rcu_read_lock_held());
231 lockdep_assert_held(&q->queue_lock);
232
233
234 if (blk_queue_dying(q)) {
235 ret = -ENODEV;
236 goto err_free_blkg;
237 }
238
239
240 if (!css_tryget_online(&blkcg->css)) {
241 ret = -ENODEV;
242 goto err_free_blkg;
243 }
244
245 wb_congested = wb_congested_get_create(q->backing_dev_info,
246 blkcg->css.id,
247 GFP_NOWAIT | __GFP_NOWARN);
248 if (!wb_congested) {
249 ret = -ENOMEM;
250 goto err_put_css;
251 }
252
253
254 if (!new_blkg) {
255 new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
256 if (unlikely(!new_blkg)) {
257 ret = -ENOMEM;
258 goto err_put_congested;
259 }
260 }
261 blkg = new_blkg;
262 blkg->wb_congested = wb_congested;
263
264
265 if (blkcg_parent(blkcg)) {
266 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
267 if (WARN_ON_ONCE(!blkg->parent)) {
268 ret = -ENODEV;
269 goto err_put_congested;
270 }
271 blkg_get(blkg->parent);
272 }
273
274
275 for (i = 0; i < BLKCG_MAX_POLS; i++) {
276 struct blkcg_policy *pol = blkcg_policy[i];
277
278 if (blkg->pd[i] && pol->pd_init_fn)
279 pol->pd_init_fn(blkg->pd[i]);
280 }
281
282
283 spin_lock(&blkcg->lock);
284 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
285 if (likely(!ret)) {
286 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
287 list_add(&blkg->q_node, &q->blkg_list);
288
289 for (i = 0; i < BLKCG_MAX_POLS; i++) {
290 struct blkcg_policy *pol = blkcg_policy[i];
291
292 if (blkg->pd[i] && pol->pd_online_fn)
293 pol->pd_online_fn(blkg->pd[i]);
294 }
295 }
296 blkg->online = true;
297 spin_unlock(&blkcg->lock);
298
299 if (!ret)
300 return blkg;
301
302
303 blkg_put(blkg);
304 return ERR_PTR(ret);
305
306 err_put_congested:
307 wb_congested_put(wb_congested);
308 err_put_css:
309 css_put(&blkcg->css);
310 err_free_blkg:
311 blkg_free(new_blkg);
312 return ERR_PTR(ret);
313 }
314
315 /**
316 * __blkg_lookup_create - lookup blkg, try to create one if not there
317 * @blkcg: blkcg of interest
318 * @q: request_queue of interest
319 *
320 * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
321 * create one. blkg creation is performed recursively from blkcg_root such
322 * that all non-root blkgs have access to the parent blkg. This function
323 * should be called under RCU read lock and @q->queue_lock.
324 *
325 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
326 * down from root.
327 */
328 struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
329 struct request_queue *q)
330 {
331 struct blkcg_gq *blkg;
332
333 WARN_ON_ONCE(!rcu_read_lock_held());
334 lockdep_assert_held(&q->queue_lock);
335
336 blkg = __blkg_lookup(blkcg, q, true);
337 if (blkg)
338 return blkg;
339
340
341
342
343
344
345 while (true) {
346 struct blkcg *pos = blkcg;
347 struct blkcg *parent = blkcg_parent(blkcg);
348 struct blkcg_gq *ret_blkg = q->root_blkg;
349
350 while (parent) {
351 blkg = __blkg_lookup(parent, q, false);
352 if (blkg) {
353
354 ret_blkg = blkg;
355 break;
356 }
357 pos = parent;
358 parent = blkcg_parent(parent);
359 }
360
361 blkg = blkg_create(pos, q, NULL);
362 if (IS_ERR(blkg))
363 return ret_blkg;
364 if (pos == blkcg)
365 return blkg;
366 }
367 }
368
369 /**
370 * blkg_lookup_create - find or create a blkg
371 * @blkcg: target block cgroup
372 * @q: target request_queue
373 *
374 * This looks up or creates the blkg representing the unique pair
375 * of the blkcg and the request_queue.
376 */
377 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
378 struct request_queue *q)
379 {
380 struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
381
382 if (unlikely(!blkg)) {
383 unsigned long flags;
384
385 spin_lock_irqsave(&q->queue_lock, flags);
386 blkg = __blkg_lookup_create(blkcg, q);
387 spin_unlock_irqrestore(&q->queue_lock, flags);
388 }
389
390 return blkg;
391 }
392
393 static void blkg_destroy(struct blkcg_gq *blkg)
394 {
395 struct blkcg *blkcg = blkg->blkcg;
396 struct blkcg_gq *parent = blkg->parent;
397 int i;
398
399 lockdep_assert_held(&blkg->q->queue_lock);
400 lockdep_assert_held(&blkcg->lock);
401
402
403 WARN_ON_ONCE(list_empty(&blkg->q_node));
404 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
405
406 for (i = 0; i < BLKCG_MAX_POLS; i++) {
407 struct blkcg_policy *pol = blkcg_policy[i];
408
409 if (blkg->pd[i] && pol->pd_offline_fn)
410 pol->pd_offline_fn(blkg->pd[i]);
411 }
412
413 if (parent) {
414 blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
415 blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
416 }
417
418 blkg->online = false;
419
420 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
421 list_del_init(&blkg->q_node);
422 hlist_del_init_rcu(&blkg->blkcg_node);
423
424
425
426
427
428
429 if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
430 rcu_assign_pointer(blkcg->blkg_hint, NULL);
431
432
433
434
435
436 percpu_ref_kill(&blkg->refcnt);
437 }
438
439 /**
440 * blkg_destroy_all - destroy all blkgs associated with a request_queue
441 * @q: request_queue of interest
442 *
443 * Destroy all blkgs associated with @q.
444 */
445 static void blkg_destroy_all(struct request_queue *q)
446 {
447 struct blkcg_gq *blkg, *n;
448
449 spin_lock_irq(&q->queue_lock);
450 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
451 struct blkcg *blkcg = blkg->blkcg;
452
453 spin_lock(&blkcg->lock);
454 blkg_destroy(blkg);
455 spin_unlock(&blkcg->lock);
456 }
457
458 q->root_blkg = NULL;
459 spin_unlock_irq(&q->queue_lock);
460 }
461
462 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
463 struct cftype *cftype, u64 val)
464 {
465 struct blkcg *blkcg = css_to_blkcg(css);
466 struct blkcg_gq *blkg;
467 int i;
468
469 mutex_lock(&blkcg_pol_mutex);
470 spin_lock_irq(&blkcg->lock);
471
472
473
474
475
476
477 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
478 blkg_rwstat_reset(&blkg->stat_bytes);
479 blkg_rwstat_reset(&blkg->stat_ios);
480
481 for (i = 0; i < BLKCG_MAX_POLS; i++) {
482 struct blkcg_policy *pol = blkcg_policy[i];
483
484 if (blkg->pd[i] && pol->pd_reset_stats_fn)
485 pol->pd_reset_stats_fn(blkg->pd[i]);
486 }
487 }
488
489 spin_unlock_irq(&blkcg->lock);
490 mutex_unlock(&blkcg_pol_mutex);
491 return 0;
492 }
493
494 const char *blkg_dev_name(struct blkcg_gq *blkg)
495 {
496
497 if (blkg->q->backing_dev_info->dev)
498 return dev_name(blkg->q->backing_dev_info->dev);
499 return NULL;
500 }
501
502 /**
503 * blkcg_print_blkgs - helper for printing per-blkg data
504 * @sf: seq_file to print to
505 * @blkcg: blkcg of interest
506 * @prfill: fill function to print out a blkg
507 * @pol: policy in question
508 * @data: data to be passed to @prfill
509 * @show_total: to print out sum of prfill return values or not
510 *
511 * This function invokes @prfill on each blkg of @blkcg if pd for the
512 * policy specified by @pol exists. @prfill is invoked with @sf, the
513 * policy data and @data. If @show_total is %true, the sum of the
514 * return values from @prfill is printed with a "Total" label at the
515 * end.
516 *
517 * This is to be used to construct print functions for
518 * blkcg_policy.cftypes->seq_show().
519 */
520 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
521 u64 (*prfill)(struct seq_file *,
522 struct blkg_policy_data *, int),
523 const struct blkcg_policy *pol, int data,
524 bool show_total)
525 {
526 struct blkcg_gq *blkg;
527 u64 total = 0;
528
529 rcu_read_lock();
530 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
531 spin_lock_irq(&blkg->q->queue_lock);
532 if (blkcg_policy_enabled(blkg->q, pol))
533 total += prfill(sf, blkg->pd[pol->plid], data);
534 spin_unlock_irq(&blkg->q->queue_lock);
535 }
536 rcu_read_unlock();
537
538 if (show_total)
539 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
540 }
541 EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
542
543
544
545
546
547
548
549
550
551 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
552 {
553 const char *dname = blkg_dev_name(pd->blkg);
554
555 if (!dname)
556 return 0;
557
558 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
559 return v;
560 }
561 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
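A minimal usage sketch of the two helpers above, assuming a hypothetical policy object example_pol and per-blkg struct example_pd (neither is part of this file): a policy-defined prfill callback prints one u64 per device, and a seq_show wrapper hands it to blkcg_print_blkgs().

struct example_pd {
	struct blkg_policy_data pd;	/* embedded core policy data */
	u64 nr_dispatched;		/* hypothetical per-blkg counter */
};

static u64 example_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
			  int off)
{
	struct example_pd *epd = container_of(pd, struct example_pd, pd);

	/* prints "<dev> <value>\n" and returns the value for the Total line */
	return __blkg_prfill_u64(sf, pd, epd->nr_dispatched);
}

static int example_print_dispatched(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), example_prfill,
			  &example_pol, 0, true);
	return 0;
}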
562
563
564
565
566
567
568
569
570
571 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
572 const struct blkg_rwstat_sample *rwstat)
573 {
574 static const char *rwstr[] = {
575 [BLKG_RWSTAT_READ] = "Read",
576 [BLKG_RWSTAT_WRITE] = "Write",
577 [BLKG_RWSTAT_SYNC] = "Sync",
578 [BLKG_RWSTAT_ASYNC] = "Async",
579 [BLKG_RWSTAT_DISCARD] = "Discard",
580 };
581 const char *dname = blkg_dev_name(pd->blkg);
582 u64 v;
583 int i;
584
585 if (!dname)
586 return 0;
587
588 for (i = 0; i < BLKG_RWSTAT_NR; i++)
589 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
590 rwstat->cnt[i]);
591
592 v = rwstat->cnt[BLKG_RWSTAT_READ] +
593 rwstat->cnt[BLKG_RWSTAT_WRITE] +
594 rwstat->cnt[BLKG_RWSTAT_DISCARD];
595 seq_printf(sf, "%s Total %llu\n", dname, v);
596 return v;
597 }
598 EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
599
600
601
602
603
604
605
606
607
608 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
609 int off)
610 {
611 struct blkg_rwstat_sample rwstat = { };
612
613 blkg_rwstat_read((void *)pd + off, &rwstat);
614 return __blkg_prfill_rwstat(sf, pd, &rwstat);
615 }
616 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
617
618 static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
619 struct blkg_policy_data *pd, int off)
620 {
621 struct blkg_rwstat_sample rwstat = { };
622
623 blkg_rwstat_read((void *)pd->blkg + off, &rwstat);
624 return __blkg_prfill_rwstat(sf, pd, &rwstat);
625 }
626
627
628
629
630
631
632
633
634
635 int blkg_print_stat_bytes(struct seq_file *sf, void *v)
636 {
637 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
638 blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
639 offsetof(struct blkcg_gq, stat_bytes), true);
640 return 0;
641 }
642 EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
643
644
645
646
647
648
649
650
651
652 int blkg_print_stat_ios(struct seq_file *sf, void *v)
653 {
654 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
655 blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
656 offsetof(struct blkcg_gq, stat_ios), true);
657 return 0;
658 }
659 EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
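For the plain byte and IO counters a policy does not need its own prfill at all: it can point its cftype entries straight at the two helpers above, stashing its blkcg_policy pointer in cftype->private (which is what seq_cft(sf)->private is cast back from). A sketch, again using the hypothetical example_pol:

static struct cftype example_legacy_files[] = {
	{
		.name = "io_service_bytes",
		.private = (unsigned long)&example_pol,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "io_serviced",
		.private = (unsigned long)&example_pol,
		.seq_show = blkg_print_stat_ios,
	},
	{ }	/* terminator */
};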
660
661 static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
662 struct blkg_policy_data *pd,
663 int off)
664 {
665 struct blkg_rwstat_sample rwstat;
666
667 blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat);
668 return __blkg_prfill_rwstat(sf, pd, &rwstat);
669 }
670
671
672
673
674
675
676 int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
677 {
678 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
679 blkg_prfill_rwstat_field_recursive,
680 (void *)seq_cft(sf)->private,
681 offsetof(struct blkcg_gq, stat_bytes), true);
682 return 0;
683 }
684 EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
685
686
687
688
689
690
691 int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
692 {
693 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
694 blkg_prfill_rwstat_field_recursive,
695 (void *)seq_cft(sf)->private,
696 offsetof(struct blkcg_gq, stat_ios), true);
697 return 0;
698 }
699 EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715 void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
716 int off, struct blkg_rwstat_sample *sum)
717 {
718 struct blkcg_gq *pos_blkg;
719 struct cgroup_subsys_state *pos_css;
720 unsigned int i;
721
722 lockdep_assert_held(&blkg->q->queue_lock);
723
724 rcu_read_lock();
725 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
726 struct blkg_rwstat *rwstat;
727
728 if (!pos_blkg->online)
729 continue;
730
731 if (pol)
732 rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
733 else
734 rwstat = (void *)pos_blkg + off;
735
736 for (i = 0; i < BLKG_RWSTAT_NR; i++)
737 sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i);
738 }
739 rcu_read_unlock();
740 }
741 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
742
743
744 static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
745 const struct blkcg_policy *pol,
746 struct request_queue *q)
747 {
748 WARN_ON_ONCE(!rcu_read_lock_held());
749 lockdep_assert_held(&q->queue_lock);
750
751 if (!blkcg_policy_enabled(q, pol))
752 return ERR_PTR(-EOPNOTSUPP);
753 return __blkg_lookup(blkcg, q, true /* update_hint */);
754 }
755
756 /**
757 * blkcg_conf_get_disk - parse and return the gendisk specified by input
758 * @inputp: input string pointer
759 *
760 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
761 * from @inputp and get and return the matching gendisk. *@inputp is
762 * updated to point past the device node prefix. Returns an ERR_PTR()
763 * value on error.
764 *
765 * Use this function iff blkg_conf_prep() can't be used for some reason.
766 */
767 struct gendisk *blkcg_conf_get_disk(char **inputp)
768 {
769 char *input = *inputp;
770 unsigned int major, minor;
771 struct gendisk *disk;
772 int key_len, part;
773
774 if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
775 return ERR_PTR(-EINVAL);
776
777 input += key_len;
778 if (!isspace(*input))
779 return ERR_PTR(-EINVAL);
780 input = skip_spaces(input);
781
782 disk = get_gendisk(MKDEV(major, minor), &part);
783 if (!disk)
784 return ERR_PTR(-ENODEV);
785 if (part) {
786 put_disk_and_module(disk);
787 return ERR_PTR(-ENODEV);
788 }
789
790 *inputp = input;
791 return disk;
792 }
793
794 /**
795 * blkg_conf_prep - parse and prepare for per-blkg config update
796 * @blkcg: target block cgroup
797 * @pol: target policy
798 * @input: input string
799 * @ctx: blkg_conf_ctx to be initialized
800 *
801 * Parse per-blkg config update from @input and initialize @ctx with the
802 * result. @ctx->blkg points to the blkg to be used for the update and
803 * @ctx->body to the string following MAJ:MIN. This function returns with
804 * RCU read lock and queue_lock held and must be paired with blkg_conf_finish().
805 */
806 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
807 char *input, struct blkg_conf_ctx *ctx)
808 __acquires(rcu) __acquires(&disk->queue->queue_lock)
809 {
810 struct gendisk *disk;
811 struct request_queue *q;
812 struct blkcg_gq *blkg;
813 int ret;
814
815 disk = blkcg_conf_get_disk(&input);
816 if (IS_ERR(disk))
817 return PTR_ERR(disk);
818
819 q = disk->queue;
820
821 rcu_read_lock();
822 spin_lock_irq(&q->queue_lock);
823
824 blkg = blkg_lookup_check(blkcg, pol, q);
825 if (IS_ERR(blkg)) {
826 ret = PTR_ERR(blkg);
827 goto fail_unlock;
828 }
829
830 if (blkg)
831 goto success;
832
833
834
835
836
837 while (true) {
838 struct blkcg *pos = blkcg;
839 struct blkcg *parent;
840 struct blkcg_gq *new_blkg;
841
842 parent = blkcg_parent(blkcg);
843 while (parent && !__blkg_lookup(parent, q, false)) {
844 pos = parent;
845 parent = blkcg_parent(parent);
846 }
847
848
849 spin_unlock_irq(&q->queue_lock);
850 rcu_read_unlock();
851
852 new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
853 if (unlikely(!new_blkg)) {
854 ret = -ENOMEM;
855 goto fail;
856 }
857
858 rcu_read_lock();
859 spin_lock_irq(&q->queue_lock);
860
861 blkg = blkg_lookup_check(pos, pol, q);
862 if (IS_ERR(blkg)) {
863 ret = PTR_ERR(blkg);
864 goto fail_unlock;
865 }
866
867 if (blkg) {
868 blkg_free(new_blkg);
869 } else {
870 blkg = blkg_create(pos, q, new_blkg);
871 if (IS_ERR(blkg)) {
872 ret = PTR_ERR(blkg);
873 goto fail_unlock;
874 }
875 }
876
877 if (pos == blkcg)
878 goto success;
879 }
880 success:
881 ctx->disk = disk;
882 ctx->blkg = blkg;
883 ctx->body = input;
884 return 0;
885
886 fail_unlock:
887 spin_unlock_irq(&q->queue_lock);
888 rcu_read_unlock();
889 fail:
890 put_disk_and_module(disk);
891
892
893
894
895
896
897 if (ret == -EBUSY) {
898 msleep(10);
899 ret = restart_syscall();
900 }
901 return ret;
902 }
903 EXPORT_SYMBOL_GPL(blkg_conf_prep);
904
905 /**
906 * blkg_conf_finish - finish up per-blkg config update
907 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
908 *
909 * Finish up after per-blkg config update. This function must be paired
910 * with blkg_conf_prep().
911 */
912 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
913 __releases(&ctx->disk->queue->queue_lock) __releases(rcu)
914 {
915 spin_unlock_irq(&ctx->disk->queue->queue_lock);
916 rcu_read_unlock();
917 put_disk_and_module(ctx->disk);
918 }
919 EXPORT_SYMBOL_GPL(blkg_conf_finish);
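A sketch of the blkg_conf_prep()/blkg_conf_finish() pairing in a policy's cftype ->write() handler; example_pol, struct example_pd and its limit field are illustrative, not part of this file. Because blkg_conf_prep() returns with the RCU read lock and queue_lock held, the per-blkg update needs no further locking before blkg_conf_finish() drops them.

static ssize_t example_set_limit(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct example_pd *epd;
	u64 limit;
	int ret;

	/* parses "MAJ:MIN", creates the blkg if needed, takes the locks */
	ret = blkg_conf_prep(blkcg, &example_pol, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &limit) != 1)
		goto out;

	epd = container_of(ctx.blkg->pd[example_pol.plid],
			   struct example_pd, pd);
	epd->limit = limit;
	ret = 0;
out:
	blkg_conf_finish(&ctx);		/* drops the locks, puts the disk */
	return ret ?: nbytes;
}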
920
921 static int blkcg_print_stat(struct seq_file *sf, void *v)
922 {
923 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
924 struct blkcg_gq *blkg;
925
926 rcu_read_lock();
927
928 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
929 const char *dname;
930 char *buf;
931 struct blkg_rwstat_sample rwstat;
932 u64 rbytes, wbytes, rios, wios, dbytes, dios;
933 size_t size = seq_get_buf(sf, &buf), off = 0;
934 int i;
935 bool has_stats = false;
936
937 spin_lock_irq(&blkg->q->queue_lock);
938
939 if (!blkg->online)
940 goto skip;
941
942 dname = blkg_dev_name(blkg);
943 if (!dname)
944 goto skip;
945
946
947
948
949
950
951
952 off += scnprintf(buf+off, size-off, "%s ", dname);
953
954 blkg_rwstat_recursive_sum(blkg, NULL,
955 offsetof(struct blkcg_gq, stat_bytes), &rwstat);
956 rbytes = rwstat.cnt[BLKG_RWSTAT_READ];
957 wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE];
958 dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD];
959
960 blkg_rwstat_recursive_sum(blkg, NULL,
961 offsetof(struct blkcg_gq, stat_ios), &rwstat);
962 rios = rwstat.cnt[BLKG_RWSTAT_READ];
963 wios = rwstat.cnt[BLKG_RWSTAT_WRITE];
964 dios = rwstat.cnt[BLKG_RWSTAT_DISCARD];
965
966 if (rbytes || wbytes || rios || wios) {
967 has_stats = true;
968 off += scnprintf(buf+off, size-off,
969 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
970 rbytes, wbytes, rios, wios,
971 dbytes, dios);
972 }
973
974 if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
975 has_stats = true;
976 off += scnprintf(buf+off, size-off,
977 " use_delay=%d delay_nsec=%llu",
978 atomic_read(&blkg->use_delay),
979 (unsigned long long)atomic64_read(&blkg->delay_nsec));
980 }
981
982 for (i = 0; i < BLKCG_MAX_POLS; i++) {
983 struct blkcg_policy *pol = blkcg_policy[i];
984 size_t written;
985
986 if (!blkg->pd[i] || !pol->pd_stat_fn)
987 continue;
988
989 written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
990 if (written)
991 has_stats = true;
992 off += written;
993 }
994
995 if (has_stats) {
996 if (off < size - 1) {
997 off += scnprintf(buf+off, size-off, "\n");
998 seq_commit(sf, off);
999 } else {
1000 seq_commit(sf, -1);
1001 }
1002 }
1003 skip:
1004 spin_unlock_irq(&blkg->q->queue_lock);
1005 }
1006
1007 rcu_read_unlock();
1008 return 0;
1009 }
1010
1011 static struct cftype blkcg_files[] = {
1012 {
1013 .name = "stat",
1014 .flags = CFTYPE_NOT_ON_ROOT,
1015 .seq_show = blkcg_print_stat,
1016 },
1017 { }
1018 };
1019
1020 static struct cftype blkcg_legacy_files[] = {
1021 {
1022 .name = "reset_stats",
1023 .write_u64 = blkcg_reset_stats,
1024 },
1025 { }
1026 };
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057 static void blkcg_css_offline(struct cgroup_subsys_state *css)
1058 {
1059 struct blkcg *blkcg = css_to_blkcg(css);
1060
1061
1062 wb_blkcg_offline(blkcg);
1063
1064
1065 blkcg_cgwb_put(blkcg);
1066 }
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079 void blkcg_destroy_blkgs(struct blkcg *blkcg)
1080 {
1081 spin_lock_irq(&blkcg->lock);
1082
1083 while (!hlist_empty(&blkcg->blkg_list)) {
1084 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
1085 struct blkcg_gq, blkcg_node);
1086 struct request_queue *q = blkg->q;
1087
1088 if (spin_trylock(&q->queue_lock)) {
1089 blkg_destroy(blkg);
1090 spin_unlock(&q->queue_lock);
1091 } else {
1092 spin_unlock_irq(&blkcg->lock);
1093 cpu_relax();
1094 spin_lock_irq(&blkcg->lock);
1095 }
1096 }
1097
1098 spin_unlock_irq(&blkcg->lock);
1099 }
1100
1101 static void blkcg_css_free(struct cgroup_subsys_state *css)
1102 {
1103 struct blkcg *blkcg = css_to_blkcg(css);
1104 int i;
1105
1106 mutex_lock(&blkcg_pol_mutex);
1107
1108 list_del(&blkcg->all_blkcgs_node);
1109
1110 for (i = 0; i < BLKCG_MAX_POLS; i++)
1111 if (blkcg->cpd[i])
1112 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1113
1114 mutex_unlock(&blkcg_pol_mutex);
1115
1116 kfree(blkcg);
1117 }
1118
1119 static struct cgroup_subsys_state *
1120 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
1121 {
1122 struct blkcg *blkcg;
1123 struct cgroup_subsys_state *ret;
1124 int i;
1125
1126 mutex_lock(&blkcg_pol_mutex);
1127
1128 if (!parent_css) {
1129 blkcg = &blkcg_root;
1130 } else {
1131 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1132 if (!blkcg) {
1133 ret = ERR_PTR(-ENOMEM);
1134 goto unlock;
1135 }
1136 }
1137
1138 for (i = 0; i < BLKCG_MAX_POLS; i++) {
1139 struct blkcg_policy *pol = blkcg_policy[i];
1140 struct blkcg_policy_data *cpd;
1141
1142
1143
1144
1145
1146
1147
1148 if (!pol || !pol->cpd_alloc_fn)
1149 continue;
1150
1151 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1152 if (!cpd) {
1153 ret = ERR_PTR(-ENOMEM);
1154 goto free_pd_blkcg;
1155 }
1156 blkcg->cpd[i] = cpd;
1157 cpd->blkcg = blkcg;
1158 cpd->plid = i;
1159 if (pol->cpd_init_fn)
1160 pol->cpd_init_fn(cpd);
1161 }
1162
1163 spin_lock_init(&blkcg->lock);
1164 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
1165 INIT_HLIST_HEAD(&blkcg->blkg_list);
1166 #ifdef CONFIG_CGROUP_WRITEBACK
1167 INIT_LIST_HEAD(&blkcg->cgwb_list);
1168 refcount_set(&blkcg->cgwb_refcnt, 1);
1169 #endif
1170 list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
1171
1172 mutex_unlock(&blkcg_pol_mutex);
1173 return &blkcg->css;
1174
1175 free_pd_blkcg:
1176 for (i--; i >= 0; i--)
1177 if (blkcg->cpd[i])
1178 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1179
1180 if (blkcg != &blkcg_root)
1181 kfree(blkcg);
1182 unlock:
1183 mutex_unlock(&blkcg_pol_mutex);
1184 return ret;
1185 }
1186
1187 /**
1188 * blkcg_init_queue - initialize blkcg part of request queue
1189 * @q: request_queue to initialize
1190 *
1191 * Called from the request_queue allocation path. Responsible for
1192 * initializing the blkcg part of new request_queue @q.
1193 *
1194 * RETURNS:
1195 * 0 on success, -errno on failure.
1196 */
1197 int blkcg_init_queue(struct request_queue *q)
1198 {
1199 struct blkcg_gq *new_blkg, *blkg;
1200 bool preloaded;
1201 int ret;
1202
1203 new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
1204 if (!new_blkg)
1205 return -ENOMEM;
1206
1207 preloaded = !radix_tree_preload(GFP_KERNEL);
1208
1209
1210 rcu_read_lock();
1211 spin_lock_irq(&q->queue_lock);
1212 blkg = blkg_create(&blkcg_root, q, new_blkg);
1213 if (IS_ERR(blkg))
1214 goto err_unlock;
1215 q->root_blkg = blkg;
1216 spin_unlock_irq(&q->queue_lock);
1217 rcu_read_unlock();
1218
1219 if (preloaded)
1220 radix_tree_preload_end();
1221
1222 ret = blk_iolatency_init(q);
1223 if (ret)
1224 goto err_destroy_all;
1225
1226 ret = blk_throtl_init(q);
1227 if (ret)
1228 goto err_destroy_all;
1229 return 0;
1230
1231 err_destroy_all:
1232 blkg_destroy_all(q);
1233 return ret;
1234 err_unlock:
1235 spin_unlock_irq(&q->queue_lock);
1236 rcu_read_unlock();
1237 if (preloaded)
1238 radix_tree_preload_end();
1239 return PTR_ERR(blkg);
1240 }
1241
1242
1243
1244
1245
1246
1247
1248 void blkcg_drain_queue(struct request_queue *q)
1249 {
1250 lockdep_assert_held(&q->queue_lock);
1251
1252
1253
1254
1255
1256 if (!q->root_blkg)
1257 return;
1258
1259 blk_throtl_drain(q);
1260 }
1261
1262
1263
1264
1265
1266
1267
1268 void blkcg_exit_queue(struct request_queue *q)
1269 {
1270 blkg_destroy_all(q);
1271 blk_throtl_exit(q);
1272 }
1273
1274
1275
1276
1277
1278
1279
1280 static int blkcg_can_attach(struct cgroup_taskset *tset)
1281 {
1282 struct task_struct *task;
1283 struct cgroup_subsys_state *dst_css;
1284 struct io_context *ioc;
1285 int ret = 0;
1286
1287
1288 cgroup_taskset_for_each(task, dst_css, tset) {
1289 task_lock(task);
1290 ioc = task->io_context;
1291 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1292 ret = -EINVAL;
1293 task_unlock(task);
1294 if (ret)
1295 break;
1296 }
1297 return ret;
1298 }
1299
1300 static void blkcg_bind(struct cgroup_subsys_state *root_css)
1301 {
1302 int i;
1303
1304 mutex_lock(&blkcg_pol_mutex);
1305
1306 for (i = 0; i < BLKCG_MAX_POLS; i++) {
1307 struct blkcg_policy *pol = blkcg_policy[i];
1308 struct blkcg *blkcg;
1309
1310 if (!pol || !pol->cpd_bind_fn)
1311 continue;
1312
1313 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
1314 if (blkcg->cpd[pol->plid])
1315 pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
1316 }
1317 mutex_unlock(&blkcg_pol_mutex);
1318 }
1319
1320 static void blkcg_exit(struct task_struct *tsk)
1321 {
1322 if (tsk->throttle_queue)
1323 blk_put_queue(tsk->throttle_queue);
1324 tsk->throttle_queue = NULL;
1325 }
1326
1327 struct cgroup_subsys io_cgrp_subsys = {
1328 .css_alloc = blkcg_css_alloc,
1329 .css_offline = blkcg_css_offline,
1330 .css_free = blkcg_css_free,
1331 .can_attach = blkcg_can_attach,
1332 .bind = blkcg_bind,
1333 .dfl_cftypes = blkcg_files,
1334 .legacy_cftypes = blkcg_legacy_files,
1335 .legacy_name = "blkio",
1336 .exit = blkcg_exit,
1337 #ifdef CONFIG_MEMCG
1338
1339
1340
1341
1342
1343 .depends_on = 1 << memory_cgrp_id,
1344 #endif
1345 };
1346 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
1347
1348 /**
1349 * blkcg_activate_policy - activate a blkcg policy on a request_queue
1350 * @q: request_queue of interest
1351 * @pol: blkcg policy to activate
1352 *
1353 * Activate @pol on @q. Requires %GFP_KERNEL context.
1354 *
1355 * If @q is an mq device it is frozen while the policy is activated so
1356 * that nobody is accessing the blkgs in the meantime. Every existing
1357 * blkg on @q gets policy data allocated and initialized for @pol,
1358 * falling back to a GFP_KERNEL preallocation outside queue_lock when
1359 * atomic allocation fails, before the policy bit is set in blkcg_pols.
1360 *
1361 * This function can be called anytime after policy registration.
1362 * Returns 0 on success, -errno on failure.
1363 */
1364 int blkcg_activate_policy(struct request_queue *q,
1365 const struct blkcg_policy *pol)
1366 {
1367 struct blkg_policy_data *pd_prealloc = NULL;
1368 struct blkcg_gq *blkg, *pinned_blkg = NULL;
1369 int ret;
1370
1371 if (blkcg_policy_enabled(q, pol))
1372 return 0;
1373
1374 if (queue_is_mq(q))
1375 blk_mq_freeze_queue(q);
1376 retry:
1377 spin_lock_irq(&q->queue_lock);
1378
1379
1380 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
1381 struct blkg_policy_data *pd;
1382
1383 if (blkg->pd[pol->plid])
1384 continue;
1385
1386
1387 if (blkg == pinned_blkg) {
1388 pd = pd_prealloc;
1389 pd_prealloc = NULL;
1390 } else {
1391 pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
1392 blkg->blkcg);
1393 }
1394
1395 if (!pd) {
1396
1397
1398
1399
1400 if (pinned_blkg)
1401 blkg_put(pinned_blkg);
1402 blkg_get(blkg);
1403 pinned_blkg = blkg;
1404
1405 spin_unlock_irq(&q->queue_lock);
1406
1407 if (pd_prealloc)
1408 pol->pd_free_fn(pd_prealloc);
1409 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
1410 blkg->blkcg);
1411 if (pd_prealloc)
1412 goto retry;
1413 else
1414 goto enomem;
1415 }
1416
1417 blkg->pd[pol->plid] = pd;
1418 pd->blkg = blkg;
1419 pd->plid = pol->plid;
1420 }
1421
1422
1423 if (pol->pd_init_fn)
1424 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
1425 pol->pd_init_fn(blkg->pd[pol->plid]);
1426
1427 __set_bit(pol->plid, q->blkcg_pols);
1428 ret = 0;
1429
1430 spin_unlock_irq(&q->queue_lock);
1431 out:
1432 if (queue_is_mq(q))
1433 blk_mq_unfreeze_queue(q);
1434 if (pinned_blkg)
1435 blkg_put(pinned_blkg);
1436 if (pd_prealloc)
1437 pol->pd_free_fn(pd_prealloc);
1438 return ret;
1439
1440 enomem:
1441
1442 spin_lock_irq(&q->queue_lock);
1443 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1444 if (blkg->pd[pol->plid]) {
1445 pol->pd_free_fn(blkg->pd[pol->plid]);
1446 blkg->pd[pol->plid] = NULL;
1447 }
1448 }
1449 spin_unlock_irq(&q->queue_lock);
1450 ret = -ENOMEM;
1451 goto out;
1452 }
1453 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1454
1455 /**
1456 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
1457 * @q: request_queue of interest
1458 * @pol: blkcg policy to deactivate
1459 *
1460 * Deactivate @pol on @q. Follows the same synchronization rules as
1461 * blkcg_activate_policy().
1462 */
1463 void blkcg_deactivate_policy(struct request_queue *q,
1464 const struct blkcg_policy *pol)
1465 {
1466 struct blkcg_gq *blkg;
1467
1468 if (!blkcg_policy_enabled(q, pol))
1469 return;
1470
1471 if (queue_is_mq(q))
1472 blk_mq_freeze_queue(q);
1473
1474 spin_lock_irq(&q->queue_lock);
1475
1476 __clear_bit(pol->plid, q->blkcg_pols);
1477
1478 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1479 if (blkg->pd[pol->plid]) {
1480 if (pol->pd_offline_fn)
1481 pol->pd_offline_fn(blkg->pd[pol->plid]);
1482 pol->pd_free_fn(blkg->pd[pol->plid]);
1483 blkg->pd[pol->plid] = NULL;
1484 }
1485 }
1486
1487 spin_unlock_irq(&q->queue_lock);
1488
1489 if (queue_is_mq(q))
1490 blk_mq_unfreeze_queue(q);
1491 }
1492 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
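A sketch of how a policy typically drives the two functions above from its own per-queue setup and teardown paths (example_init_queue()/example_exit_queue() and example_pol are illustrative names, not existing hooks):

static int example_init_queue(struct request_queue *q)
{
	/* allocates pd for every existing blkg on @q and sets the plid bit */
	return blkcg_activate_policy(q, &example_pol);
}

static void example_exit_queue(struct request_queue *q)
{
	/* offlines and frees every pd and clears the plid bit */
	blkcg_deactivate_policy(q, &example_pol);
}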
1493
1494 /**
1495 * blkcg_policy_register - register a blkcg policy
1496 * @pol: blkcg policy to register
1497 *
1498 * Register @pol with blkcg core. Might sleep and @pol may be modified on
1499 * successful registration. Returns 0 on success and -errno on failure.
1500 */
1501 int blkcg_policy_register(struct blkcg_policy *pol)
1502 {
1503 struct blkcg *blkcg;
1504 int i, ret;
1505
1506 mutex_lock(&blkcg_pol_register_mutex);
1507 mutex_lock(&blkcg_pol_mutex);
1508
1509
1510 ret = -ENOSPC;
1511 for (i = 0; i < BLKCG_MAX_POLS; i++)
1512 if (!blkcg_policy[i])
1513 break;
1514 if (i >= BLKCG_MAX_POLS) {
1515 pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
1516 goto err_unlock;
1517 }
1518
1519
1520 if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1521 (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1522 goto err_unlock;
1523
1524
1525 pol->plid = i;
1526 blkcg_policy[pol->plid] = pol;
1527
1528
1529 if (pol->cpd_alloc_fn) {
1530 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1531 struct blkcg_policy_data *cpd;
1532
1533 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1534 if (!cpd)
1535 goto err_free_cpds;
1536
1537 blkcg->cpd[pol->plid] = cpd;
1538 cpd->blkcg = blkcg;
1539 cpd->plid = pol->plid;
1540 if (pol->cpd_init_fn)
1541 pol->cpd_init_fn(cpd);
1542 }
1543 }
1544
1545 mutex_unlock(&blkcg_pol_mutex);
1546
1547
1548 if (pol->dfl_cftypes)
1549 WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1550 pol->dfl_cftypes));
1551 if (pol->legacy_cftypes)
1552 WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1553 pol->legacy_cftypes));
1554 mutex_unlock(&blkcg_pol_register_mutex);
1555 return 0;
1556
1557 err_free_cpds:
1558 if (pol->cpd_free_fn) {
1559 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1560 if (blkcg->cpd[pol->plid]) {
1561 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1562 blkcg->cpd[pol->plid] = NULL;
1563 }
1564 }
1565 }
1566 blkcg_policy[pol->plid] = NULL;
1567 err_unlock:
1568 mutex_unlock(&blkcg_pol_mutex);
1569 mutex_unlock(&blkcg_pol_register_mutex);
1570 return ret;
1571 }
1572 EXPORT_SYMBOL_GPL(blkcg_policy_register);
1573
1574 /**
1575 * blkcg_policy_unregister - unregister a blkcg policy
1576 * @pol: blkcg policy to unregister
1577 *
1578 * Undo blkcg_policy_register(@pol). Might sleep.
1579 */
1580 void blkcg_policy_unregister(struct blkcg_policy *pol)
1581 {
1582 struct blkcg *blkcg;
1583
1584 mutex_lock(&blkcg_pol_register_mutex);
1585
1586 if (WARN_ON(blkcg_policy[pol->plid] != pol))
1587 goto out_unlock;
1588
1589
1590 if (pol->dfl_cftypes)
1591 cgroup_rm_cftypes(pol->dfl_cftypes);
1592 if (pol->legacy_cftypes)
1593 cgroup_rm_cftypes(pol->legacy_cftypes);
1594
1595
1596 mutex_lock(&blkcg_pol_mutex);
1597
1598 if (pol->cpd_free_fn) {
1599 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1600 if (blkcg->cpd[pol->plid]) {
1601 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1602 blkcg->cpd[pol->plid] = NULL;
1603 }
1604 }
1605 }
1606 blkcg_policy[pol->plid] = NULL;
1607
1608 mutex_unlock(&blkcg_pol_mutex);
1609 out_unlock:
1610 mutex_unlock(&blkcg_pol_register_mutex);
1611 }
1612 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
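A minimal registration sketch tying the pieces together: a policy only has to supply a paired pd_alloc_fn/pd_free_fn (the check in blkcg_policy_register() rejects unpaired alloc/free hooks) and can register from module init. struct example_pd and example_legacy_files refer to the earlier sketches; all example_* names are hypothetical.

static struct blkg_policy_data *example_pd_alloc(gfp_t gfp,
						 struct request_queue *q,
						 struct blkcg *blkcg)
{
	struct example_pd *epd = kzalloc_node(sizeof(*epd), gfp, q->node);

	return epd ? &epd->pd : NULL;
}

static void example_pd_free(struct blkg_policy_data *pd)
{
	kfree(container_of(pd, struct example_pd, pd));
}

static struct blkcg_policy example_pol = {
	.pd_alloc_fn	= example_pd_alloc,
	.pd_free_fn	= example_pd_free,
	.legacy_cftypes	= example_legacy_files,	/* e.g. the table sketched earlier */
};

static int __init example_init(void)
{
	return blkcg_policy_register(&example_pol);	/* assigns example_pol.plid */
}

static void __exit example_exit(void)
{
	blkcg_policy_unregister(&example_pol);
}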
1613
1614 bool __blkcg_punt_bio_submit(struct bio *bio)
1615 {
1616 struct blkcg_gq *blkg = bio->bi_blkg;
1617
1618
1619 bio->bi_opf &= ~REQ_CGROUP_PUNT;
1620
1621
1622 if (!blkg->parent)
1623 return false;
1624
1625 spin_lock_bh(&blkg->async_bio_lock);
1626 bio_list_add(&blkg->async_bios, bio);
1627 spin_unlock_bh(&blkg->async_bio_lock);
1628
1629 queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
1630 return true;
1631 }
1632
1633
1634 /*
1635 * Scale the accumulated delay based on how long it has been since it was
1636 * last updated. Called both when adding delay and when checking whether
1637 * the current task needs to be throttled, so that old delay decays over time.
1638 */
1639 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1640 {
1641 u64 old = atomic64_read(&blkg->delay_start);
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656 if (time_before64(old + NSEC_PER_SEC, now) &&
1657 atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
1658 u64 cur = atomic64_read(&blkg->delay_nsec);
1659 u64 sub = min_t(u64, blkg->last_delay, now - old);
1660 int cur_use = atomic_read(&blkg->use_delay);
1661
1662
1663
1664
1665
1666 if (cur_use < blkg->last_use)
1667 sub = max_t(u64, sub, blkg->last_delay >> 1);
1668
1669
1670
1671
1672
1673
1674
1675 if (unlikely(cur < sub)) {
1676 atomic64_set(&blkg->delay_nsec, 0);
1677 blkg->last_delay = 0;
1678 } else {
1679 atomic64_sub(sub, &blkg->delay_nsec);
1680 blkg->last_delay = cur - sub;
1681 }
1682 blkg->last_use = cur_use;
1683 }
1684 }
1685
1686 /*
1687 * Walk up the hierarchy from @blkg, scale and collect the largest pending
1688 * delay, and sleep it off (as a PSI memstall if @use_memdelay). This is
1689 * called on return to user space so no locks that could cause priority
1690 * inversion are held while sleeping.
1691 */
1692 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1693 {
1694 unsigned long pflags;
1695 u64 now = ktime_to_ns(ktime_get());
1696 u64 exp;
1697 u64 delay_nsec = 0;
1698 int tok;
1699
1700 while (blkg->parent) {
1701 if (atomic_read(&blkg->use_delay)) {
1702 blkcg_scale_delay(blkg, now);
1703 delay_nsec = max_t(u64, delay_nsec,
1704 atomic64_read(&blkg->delay_nsec));
1705 }
1706 blkg = blkg->parent;
1707 }
1708
1709 if (!delay_nsec)
1710 return;
1711
1712
1713
1714
1715
1716
1717
1718
1719 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1720
1721 if (use_memdelay)
1722 psi_memstall_enter(&pflags);
1723
1724 exp = ktime_add_ns(now, delay_nsec);
1725 tok = io_schedule_prepare();
1726 do {
1727 __set_current_state(TASK_KILLABLE);
1728 if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1729 break;
1730 } while (!fatal_signal_pending(current));
1731 io_schedule_finish(tok);
1732
1733 if (use_memdelay)
1734 psi_memstall_leave(&pflags);
1735 }
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747 void blkcg_maybe_throttle_current(void)
1748 {
1749 struct request_queue *q = current->throttle_queue;
1750 struct cgroup_subsys_state *css;
1751 struct blkcg *blkcg;
1752 struct blkcg_gq *blkg;
1753 bool use_memdelay = current->use_memdelay;
1754
1755 if (!q)
1756 return;
1757
1758 current->throttle_queue = NULL;
1759 current->use_memdelay = false;
1760
1761 rcu_read_lock();
1762 css = kthread_blkcg();
1763 if (css)
1764 blkcg = css_to_blkcg(css);
1765 else
1766 blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
1767
1768 if (!blkcg)
1769 goto out;
1770 blkg = blkg_lookup(blkcg, q);
1771 if (!blkg)
1772 goto out;
1773 if (!blkg_tryget(blkg))
1774 goto out;
1775 rcu_read_unlock();
1776
1777 blkcg_maybe_throttle_blkg(blkg, use_memdelay);
1778 blkg_put(blkg);
1779 blk_put_queue(q);
1780 return;
1781 out:
1782 rcu_read_unlock();
1783 blk_put_queue(q);
1784 }
1785
1786 /**
1787 * blkcg_schedule_throttle - this task needs to check for throttling
1788 * @q: the request queue IO was submitted on
1789 * @use_memdelay: do we charge this to memory delay for PSI
1790 *
1791 * This is called by the IO controller when we know there's delay accumulated
1792 * for the blkg for this task. We do not pass the blkg because there are places
1793 * we call this that may not have that information, the swapping code for
1794 * instance will only have a request_queue at that point. This sets the
1795 * notify_resume for the task to check and see if it requires throttling before
1796 * returning to user space.
1797 *
1798 * We will only schedule once per syscall. You can call this over and over
1799 * again and it will only do the check once upon return to user space, and only
1800 * throttle once. If the task needs to be throttled again it'll need to be
1801 * re-set at the next time we see the task.
1802 */
1803 void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1804 {
1805 if (unlikely(current->flags & PF_KTHREAD))
1806 return;
1807
1808 if (!blk_get_queue(q))
1809 return;
1810
1811 if (current->throttle_queue)
1812 blk_put_queue(current->throttle_queue);
1813 current->throttle_queue = q;
1814 if (use_memdelay)
1815 current->use_memdelay = use_memdelay;
1816 set_notify_resume(current);
1817 }
1818
1819 /**
1820 * blkcg_add_delay - add delay to this blkg
1821 * @blkg: blkg of interest
1822 * @now: the current time in nanoseconds
1823 * @delta: how many nanoseconds of delay to add
1824 *
1825 * Charge @delta to the blkg's current delay accumulation. This is used to
1826 * throttle tasks if an IO controller thinks we need more throttling.
1827 */
1828 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1829 {
1830 blkcg_scale_delay(blkg, now);
1831 atomic64_add(delta, &blkg->delay_nsec);
1832 }
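A sketch of how an IO controller might use the two hooks above once it decides a group is over its budget; example_throttle_group() and penalty_ns are illustrative. The controller is also expected to raise blkg->use_delay (helpers for that live in blk-cgroup.h), since blkcg_maybe_throttle_blkg() only applies delay for groups with use_delay set.

static void example_throttle_group(struct blkcg_gq *blkg,
				   struct request_queue *q, u64 penalty_ns)
{
	u64 now = ktime_to_ns(ktime_get());

	/* accumulate the penalty on the group (it decays via blkcg_scale_delay) */
	blkcg_add_delay(blkg, now, penalty_ns);

	/* mark the current task so it sleeps the delay off on return to user space */
	blkcg_schedule_throttle(q, false);
}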
1833
1834 static int __init blkcg_init(void)
1835 {
1836 blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
1837 WQ_MEM_RECLAIM | WQ_FREEZABLE |
1838 WQ_UNBOUND | WQ_SYSFS, 0);
1839 if (!blkcg_punt_bio_wq)
1840 return -ENOMEM;
1841 return 0;
1842 }
1843 subsys_initcall(blkcg_init);
1844
1845 module_param(blkcg_debug_stats, bool, 0644);
1846 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");