/*
 * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
 * over multiple cachelines to avoid ping-pong between multiple submitters
 * or submitter and completer. Uses rolling wakeups to avoid falling off
 * the scaling cliff when we run out of tags and have to start putting
 * submitters to sleep.
 *
 * Uses active queue tracking to support fairer distribution of tags
 * between multiple submitters when a shared tag map is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/random.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

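/*
 * Check each bitmap word for a clear bit; any word with room means the
 * map still has free tags.
 */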
static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
{
	int i;

	for (i = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];
		int ret;

		ret = find_first_zero_bit(&bm->word, bm->depth);
		if (ret < bm->depth)
			return true;
	}

	return false;
}

bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
	if (!tags)
		return true;

	return bt_has_free_tags(&tags->bitmap_tags);
}

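/*
 * The wait queue index wraps with a simple mask, which relies on
 * BT_WAIT_QUEUES being a power of two. bt_index_atomic_inc() doesn't
 * retry a failed cmpxchg; a lost race only means the rotation skips a
 * step, which is harmless.
 */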
static inline int bt_index_inc(int index)
{
	return (index + 1) & (BT_WAIT_QUEUES - 1);
}

static inline void bt_index_atomic_inc(atomic_t *index)
{
	int old = atomic_read(index);
	int new = bt_index_inc(old);
	atomic_cmpxchg(index, old, new);
}

/*
 * If a previously inactive queue goes active, bump the active user count.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		atomic_inc(&hctx->tags->active_queues);

	return true;
}

/*
 * Wake up all waiters potentially sleeping on tags
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
	struct blk_mq_bitmap_tags *bt;
	int i, wake_index;

	/*
	 * Make sure all changes prior to this are visible from other CPUs.
	 */
	smp_mb();
	bt = &tags->bitmap_tags;
	wake_index = atomic_read(&bt->wake_index);
	for (i = 0; i < BT_WAIT_QUEUES; i++) {
		struct bt_wait_state *bs = &bt->bs[wake_index];

		if (waitqueue_active(&bs->wait))
			wake_up(&bs->wait);

		wake_index = bt_index_inc(wake_index);
	}

	if (include_reserve) {
		bt = &tags->breserved_tags;
		if (waitqueue_active(&bt->bs[0].wait))
			wake_up(&bt->bs[0].wait);
	}
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;

	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		return;

	atomic_dec(&tags->active_queues);

	blk_mq_tag_wakeup_all(tags, false);
}

/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 */
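/*
 * For example (illustrative numbers, not from the original comment): with
 * bt->depth == 256 and 3 active queues, each active hctx is allowed up to
 * max((256 + 3 - 1) / 3, 4) == 86 tags at a time.
 */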
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_bitmap_tags *bt)
{
	unsigned int depth, users;

	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
		return true;
	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		return true;

	/*
	 * Don't try dividing an ant
	 */
	if (bt->depth == 1)
		return true;

	users = atomic_read(&hctx->tags->active_queues);
	if (!users)
		return true;

	/*
	 * Allow at least some tags
	 */
	depth = max((bt->depth + users - 1) / users, 4U);
	return atomic_read(&hctx->nr_active) < depth;
}

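/*
 * Find and claim a free bit in a single bitmap word, starting the search
 * at @last_tag. Returns the bit number, or -1 if the word is exhausted.
 * Unless @nowrap is set (round-robin allocation), the search wraps back
 * to bit 0 once so offsets below the starting point are covered too.
 */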
static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag,
			 bool nowrap)
{
	int tag, org_last_tag = last_tag;

	while (1) {
		tag = find_next_zero_bit(&bm->word, bm->depth, last_tag);
		if (unlikely(tag >= bm->depth)) {
			/*
			 * We started with an offset, and we didn't reset the
			 * offset to 0 in a failure case, so start from 0 to
			 * exhaust the map.
			 */
			if (org_last_tag && last_tag && !nowrap) {
				last_tag = org_last_tag = 0;
				continue;
			}
			return -1;
		}

		if (!test_and_set_bit(tag, &bm->word))
			break;

		last_tag = tag + 1;
		if (last_tag >= bm->depth - 1)
			last_tag = 0;
	}

	return tag;
}

#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)

/*
 * Straightforward bitmap tag implementation, where each bit is a tag
 * (cleared == free, and set == busy). The small twist is using per-cpu
 * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
 * contexts. This enables us to drastically limit the space searched,
 * without dirtying an extra shared cacheline like we would if we stored
 * the cache value inside the shared blk_mq_bitmap_tags structure. On top
 * of that, each word of tags is in a separate cacheline. This means that
 * multiple users will tend to stick to different cachelines, at least
 * until the map is exhausted.
 */
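/*
 * TAG_TO_INDEX() and TAG_TO_BIT() (see blk-mq-tag.h) split a tag into a
 * word index and a bit offset within that word. E.g. with bits_per_word
 * == 5 (32 tags per word), tag 70 lands in word 70 >> 5 == 2 at bit
 * 70 & 31 == 6.
 */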
static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
		    unsigned int *tag_cache, struct blk_mq_tags *tags)
{
	unsigned int last_tag, org_last_tag;
	int index, i, tag;

	if (!hctx_may_queue(hctx, bt))
		return -1;

	last_tag = org_last_tag = *tag_cache;
	index = TAG_TO_INDEX(bt, last_tag);

	for (i = 0; i < bt->map_nr; i++) {
		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag),
				    BT_ALLOC_RR(tags));
		if (tag != -1) {
			tag += (index << bt->bits_per_word);
			goto done;
		}

		/*
		 * Jump to next index, and reset the last tag to be the
		 * first tag of that index
		 */
		index++;
		last_tag = (index << bt->bits_per_word);

		if (index >= bt->map_nr) {
			index = 0;
			last_tag = 0;
		}
	}

	*tag_cache = 0;
	return -1;

	/*
	 * Only update the cache from the allocation path if we ended
	 * up using the specific cached tag.
	 */
done:
	if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) {
		last_tag = tag + 1;
		if (last_tag >= bt->depth - 1)
			last_tag = 0;

		*tag_cache = last_tag;
	}

	return tag;
}

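/*
 * Pick a wait queue for this hctx, rotating wait_index so sleepers
 * spread across the BT_WAIT_QUEUES queues instead of piling onto one.
 * Reserved-tag waiters have no hctx and always use bs[0].
 */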
static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
					 struct blk_mq_hw_ctx *hctx)
{
	struct bt_wait_state *bs;
	int wait_index;

	if (!hctx)
		return &bt->bs[0];

	wait_index = atomic_read(&hctx->wait_index);
	bs = &bt->bs[wait_index];
	bt_index_atomic_inc(&hctx->wait_index);
	return bs;
}

static int bt_get(struct blk_mq_alloc_data *data,
		struct blk_mq_bitmap_tags *bt,
		struct blk_mq_hw_ctx *hctx,
		unsigned int *last_tag, struct blk_mq_tags *tags)
{
	struct bt_wait_state *bs;
	DEFINE_WAIT(wait);
	int tag;

	tag = __bt_get(hctx, bt, last_tag, tags);
	if (tag != -1)
		return tag;

	if (!gfpflags_allow_blocking(data->gfp))
		return -1;

	bs = bt_wait_ptr(bt, hctx);
	do {
		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);

		tag = __bt_get(hctx, bt, last_tag, tags);
		if (tag != -1)
			break;

		/*
		 * We're out of tags on this hardware queue, kick any
		 * pending IO submits before going to sleep waiting for
		 * some to complete. Note that hctx can be NULL here for
		 * reserved tag allocation.
		 */
		if (hctx)
			blk_mq_run_hw_queue(hctx, false);

		/*
		 * Retry tag allocation after running the hardware queue,
		 * as running the queue may also have found completions.
		 */
		tag = __bt_get(hctx, bt, last_tag, tags);
		if (tag != -1)
			break;

		blk_mq_put_ctx(data->ctx);

		io_schedule();

		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = data->q->mq_ops->map_queue(data->q,
				data->ctx->cpu);
		if (data->reserved) {
			bt = &data->hctx->tags->breserved_tags;
		} else {
			last_tag = &data->ctx->last_tag;
			hctx = data->hctx;
			bt = &hctx->tags->bitmap_tags;
		}
		finish_wait(&bs->wait, &wait);
		bs = bt_wait_ptr(bt, hctx);
	} while (1);

	finish_wait(&bs->wait, &wait);
	return tag;
}

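/*
 * Normal tags live above the reserved range, so successful allocations
 * from the bitmap are offset by nr_reserved_tags before being returned.
 */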
static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	int tag;

	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
			&data->ctx->last_tag, data->hctx->tags);
	if (tag >= 0)
		return tag + data->hctx->tags->nr_reserved_tags;

	return BLK_MQ_TAG_FAIL;
}

static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
{
	int tag, zero = 0;

	if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
		WARN_ON_ONCE(1);
		return BLK_MQ_TAG_FAIL;
	}

	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero,
		data->hctx->tags);
	if (tag < 0)
		return BLK_MQ_TAG_FAIL;

	return tag;
}

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	if (!data->reserved)
		return __blk_mq_get_tag(data);

	return __blk_mq_get_reserved_tag(data);
}

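/*
 * Find a wait queue with sleepers, starting from the current wake_index,
 * and nudge wake_index towards the queue we found so subsequent wakeups
 * keep rotating.
 */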
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
{
	int i, wake_index;

	wake_index = atomic_read(&bt->wake_index);
	for (i = 0; i < BT_WAIT_QUEUES; i++) {
		struct bt_wait_state *bs = &bt->bs[wake_index];

		if (waitqueue_active(&bs->wait)) {
			int o = atomic_read(&bt->wake_index);
			if (wake_index != o)
				atomic_cmpxchg(&bt->wake_index, o, wake_index);

			return bs;
		}

		wake_index = bt_index_inc(wake_index);
	}

	return NULL;
}

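/*
 * Free the tag bit, then apply the rolling wakeup: a wait queue is only
 * woken once wake_cnt frees have been accounted against it, which
 * batches wakeups when tags are scarce.
 */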
static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
{
	const int index = TAG_TO_INDEX(bt, tag);
	struct bt_wait_state *bs;
	int wait_cnt;

	clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);

	/* Ensure that the wait list checks occur after clear_bit(). */
	smp_mb();

	bs = bt_wake_ptr(bt);
	if (!bs)
		return;

	wait_cnt = atomic_dec_return(&bs->wait_cnt);
	if (unlikely(wait_cnt < 0))
		wait_cnt = atomic_inc_return(&bs->wait_cnt);
	if (wait_cnt == 0) {
		atomic_add(bt->wake_cnt, &bs->wait_cnt);
		bt_index_atomic_inc(&bt->wake_index);
		wake_up(&bs->wait);
	}
}

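/*
 * Release a tag back to the right map. Normal tags were handed out with
 * the nr_reserved_tags offset applied, so translate back before clearing
 * the bit. For FIFO allocation the freed tag also becomes the caller's
 * new last_tag hint.
 */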
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
		    unsigned int *last_tag)
{
	struct blk_mq_tags *tags = hctx->tags;

	if (tag >= tags->nr_reserved_tags) {
		const int real_tag = tag - tags->nr_reserved_tags;

		BUG_ON(real_tag >= tags->nr_tags);
		bt_clear_tag(&tags->bitmap_tags, real_tag);
		if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
			*last_tag = real_tag;
	} else {
		BUG_ON(tag >= tags->nr_reserved_tags);
		bt_clear_tag(&tags->breserved_tags, tag);
	}
}

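/*
 * The two iterators below walk all set (allocated) bits in a bitmap and
 * invoke the callback for the matching requests. @off translates
 * word-local bit numbers into queue-wide tag numbers.
 */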
static void bt_for_each(struct blk_mq_hw_ctx *hctx,
		struct blk_mq_bitmap_tags *bt, unsigned int off,
		busy_iter_fn *fn, void *data, bool reserved)
{
	struct request *rq;
	int bit, i;

	for (i = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];

		for (bit = find_first_bit(&bm->word, bm->depth);
		     bit < bm->depth;
		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
			rq = hctx->tags->rqs[off + bit];
			if (rq->q == hctx->queue)
				fn(hctx, rq, data, reserved);
		}

		off += (1 << bt->bits_per_word);
	}
}

static void bt_tags_for_each(struct blk_mq_tags *tags,
		struct blk_mq_bitmap_tags *bt, unsigned int off,
		busy_tag_iter_fn *fn, void *data, bool reserved)
{
	struct request *rq;
	int bit, i;

	if (!tags->rqs)
		return;
	for (i = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];

		for (bit = find_first_bit(&bm->word, bm->depth);
		     bit < bm->depth;
		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
			rq = tags->rqs[off + bit];
			fn(rq, data, reserved);
		}

		off += (1 << bt->bits_per_word);
	}
}

void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
		void *priv)
{
	if (tags->nr_reserved_tags)
		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
			false);
}
EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);

void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
		void *priv)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		struct blk_mq_tags *tags = hctx->tags;

		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check
		 */
		if (!blk_mq_hw_queue_mapped(hctx))
			continue;

		if (tags->nr_reserved_tags)
			bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
		bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
		      false);
	}
}

static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
{
	unsigned int i, used;

	for (i = 0, used = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];

		used += bitmap_weight(&bm->word, bm->depth);
	}

	return bt->depth - used;
}

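/*
 * Spread @depth tags over the map words and recompute the wakeup batch.
 * wake_cnt is capped at depth / BT_WAIT_QUEUES (but at least 1) so low
 * depths don't end up with wakeup batches larger than the tag space.
 */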
static void bt_update_count(struct blk_mq_bitmap_tags *bt,
			    unsigned int depth)
{
	unsigned int tags_per_word = 1U << bt->bits_per_word;
	unsigned int map_depth = depth;

	if (depth) {
		int i;

		for (i = 0; i < bt->map_nr; i++) {
			bt->map[i].depth = min(map_depth, tags_per_word);
			map_depth -= bt->map[i].depth;
		}
	}

	bt->wake_cnt = BT_WAIT_BATCH;
	if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
		bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);

	bt->depth = depth;
}

static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
			int node, bool reserved)
{
	int i;

	bt->bits_per_word = ilog2(BITS_PER_LONG);

	/*
	 * Depth can be zero for reserved tags; that's not a failure
	 * condition.
	 */
	if (depth) {
		unsigned int nr, tags_per_word;

		tags_per_word = (1 << bt->bits_per_word);

		/*
		 * If the tag space is small, shrink the number of tags
		 * per word so we spread over a few cachelines, at least.
		 * If there are fewer than 4 tags, just forget about it;
		 * it's not going to work optimally anyway.
		 */
		if (depth >= 4) {
			while (tags_per_word * 4 > depth) {
				bt->bits_per_word--;
				tags_per_word = (1 << bt->bits_per_word);
			}
		}

		nr = ALIGN(depth, tags_per_word) / tags_per_word;
		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
						GFP_KERNEL, node);
		if (!bt->map)
			return -ENOMEM;

		bt->map_nr = nr;
	}

	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
	if (!bt->bs) {
		kfree(bt->map);
		bt->map = NULL;
		return -ENOMEM;
	}

	bt_update_count(bt, depth);

	for (i = 0; i < BT_WAIT_QUEUES; i++) {
		init_waitqueue_head(&bt->bs[i].wait);
		atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
	}

	return 0;
}

static void bt_free(struct blk_mq_bitmap_tags *bt)
{
	kfree(bt->map);
	kfree(bt->bs);
}

static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
						   int node, int alloc_policy)
{
	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;

	tags->alloc_policy = alloc_policy;

	if (bt_alloc(&tags->bitmap_tags, depth, node, false))
		goto enomem;
	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
		goto enomem;

	return tags;
enomem:
	bt_free(&tags->bitmap_tags);
	kfree(tags);
	return NULL;
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
				     unsigned int reserved_tags,
				     int node, int alloc_policy)
{
	struct blk_mq_tags *tags;

	if (total_tags > BLK_MQ_TAG_MAX) {
		pr_err("blk-mq: tag depth too large\n");
		return NULL;
	}

	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
	if (!tags)
		return NULL;

	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
		kfree(tags);
		return NULL;
	}

	tags->nr_tags = total_tags;
	tags->nr_reserved_tags = reserved_tags;

	return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
}

void blk_mq_free_tags(struct blk_mq_tags *tags)
{
	bt_free(&tags->bitmap_tags);
	bt_free(&tags->breserved_tags);
	free_cpumask_var(tags->cpumask);
	kfree(tags);
}

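/*
 * Seed the per-ctx last_tag hint at a random offset within the normal
 * tag space, so different software queues start their search in
 * different parts of the map (and thus different cachelines).
 */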
void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
{
	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;

	*tag = prandom_u32() % depth;
}

int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
{
	tdepth -= tags->nr_reserved_tags;
	if (tdepth > tags->nr_tags)
		return -EINVAL;

	/*
	 * We don't need to (and can't) update reserved tags here; they
	 * remain static and should never need resizing.
	 */
	bt_update_count(&tags->bitmap_tags, tdepth);
	blk_mq_tag_wakeup_all(tags, false);
	return 0;
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function, which returns a tag with the
 * hardware context index in the upper bits and the per hardware queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	int hwq = 0;

	if (q->mq_ops) {
		hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
		hwq = hctx->queue_num;
	}

	return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);

ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
{
	char *orig_page = page;
	unsigned int free, res;

	if (!tags)
		return 0;

	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
			"bits_per_word=%u\n",
			tags->nr_tags, tags->nr_reserved_tags,
			tags->bitmap_tags.bits_per_word);

	free = bt_unused_tags(&tags->bitmap_tags);
	res = bt_unused_tags(&tags->breserved_tags);

	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
	page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));

	return page - orig_page;
}