1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
9#include "dm-bio-record.h"
10#include "dm-cache-metadata.h"
11
12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h>
14#include <linux/jiffies.h>
15#include <linux/init.h>
16#include <linux/mempool.h>
17#include <linux/module.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h>
20
21#define DM_MSG_PREFIX "cache"
22
23DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
24	"A percentage of time allocated for copying to and/or from cache");
25
26/*----------------------------------------------------------------*/
27
28/*
29 * Glossary:
30 *
31 * oblock: index of an origin block
32 * cblock: index of a cache block
33 * promotion: movement of a block from origin to cache
34 * demotion: movement of a block from cache to origin
35 * migration: movement of a block between the origin and cache device,
36 *	      either direction
37 */
38
39/*----------------------------------------------------------------*/
40
41static size_t bitset_size_in_bytes(unsigned nr_entries)
42{
43	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
44}
45
46static unsigned long *alloc_bitset(unsigned nr_entries)
47{
48	size_t s = bitset_size_in_bytes(nr_entries);
49	return vzalloc(s);
50}
51
52static void clear_bitset(void *bitset, unsigned nr_entries)
53{
54	size_t s = bitset_size_in_bytes(nr_entries);
55	memset(bitset, 0, s);
56}
57
58static void free_bitset(unsigned long *bits)
59{
60	vfree(bits);
61}
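/*
 * For example, on a 64-bit host alloc_bitset(1000) rounds up to
 * dm_div_up(1000, 64) = 16 longs, i.e. 128 bytes, which vzalloc()
 * returns already zeroed.
 */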
62
63/*----------------------------------------------------------------*/
64
65/*
66 * There are a couple of places where we let a bio run, but want to do some
67 * work before calling its endio function.  We do this by temporarily
68 * changing the endio fn.
69 */
70struct dm_hook_info {
71	bio_end_io_t *bi_end_io;
72	void *bi_private;
73};
74
75static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
76			bio_end_io_t *bi_end_io, void *bi_private)
77{
78	h->bi_end_io = bio->bi_end_io;
79	h->bi_private = bio->bi_private;
80
81	bio->bi_end_io = bi_end_io;
82	bio->bi_private = bi_private;
83}
84
85static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
86{
87	bio->bi_end_io = h->bi_end_io;
88	bio->bi_private = h->bi_private;
89
90	/*
91	 * Must bump bi_remaining to allow bio to complete with
92	 * restored bi_end_io.
93	 */
94	atomic_inc(&bio->bi_remaining);
95}
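/*
 * Illustrative sketch only (my_endio/my_context are made-up names; the
 * real users in this file are writethrough_endio() and overwrite_endio()).
 * The usual pattern is:
 *
 *	dm_hook_bio(&pb->hook_info, bio, my_endio, my_context);
 *	generic_make_request(bio);
 *
 *	static void my_endio(struct bio *bio, int err)
 *	{
 *		dm_unhook_bio(&pb->hook_info, bio);
 *		... do the extra work, then complete or re-issue the bio ...
 *	}
 */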
96
97/*----------------------------------------------------------------*/
98
99#define MIGRATION_POOL_SIZE 128
100#define COMMIT_PERIOD HZ
101#define MIGRATION_COUNT_WINDOW 10
102
103/*
104 * The block size of the device holding cache data must be
105 * between 32KB and 1GB.
106 */
107#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
108#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
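/*
 * With 512 byte sectors the limits above work out to 64 sectors (32KB)
 * and 2097152 sectors (1GB) respectively.
 */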
109
110/*
111 * FIXME: the cache is read/write for the time being.
112 */
113enum cache_metadata_mode {
114	CM_WRITE,		/* metadata may be changed */
115	CM_READ_ONLY,		/* metadata may not be changed */
116};
117
118enum cache_io_mode {
119	/*
120	 * Data is written to cached blocks only.  These blocks are marked
121	 * dirty.  If you lose the cache device you will lose data.
122	 * Potential performance increase for both reads and writes.
123	 */
124	CM_IO_WRITEBACK,
125
126	/*
127	 * Data is written to both cache and origin.  Blocks are never
128	 * dirty.  Potential performance benefit for reads only.
129	 */
130	CM_IO_WRITETHROUGH,
131
132	/*
133	 * A degraded mode useful for various cache coherency situations
134	 * (eg, rolling back snapshots).  Reads and writes always go to the
135	 * origin.  If a write goes to a cached oblock, then the cache
136	 * block is invalidated.
137	 */
138	CM_IO_PASSTHROUGH
139};
140
141struct cache_features {
142	enum cache_metadata_mode mode;
143	enum cache_io_mode io_mode;
144};
145
146struct cache_stats {
147	atomic_t read_hit;
148	atomic_t read_miss;
149	atomic_t write_hit;
150	atomic_t write_miss;
151	atomic_t demotion;
152	atomic_t promotion;
153	atomic_t copies_avoided;
154	atomic_t cache_cell_clash;
155	atomic_t commit_count;
156	atomic_t discard_count;
157};
158
159/*
160 * Defines a range of cblocks: begin to (end - 1) are in the range, with
161 * end being the one-past-the-end value.
162 */
163struct cblock_range {
164	dm_cblock_t begin;
165	dm_cblock_t end;
166};
167
168struct invalidation_request {
169	struct list_head list;
170	struct cblock_range *cblocks;
171
172	atomic_t complete;
173	int err;
174
175	wait_queue_head_t result_wait;
176};
177
178struct cache {
179	struct dm_target *ti;
180	struct dm_target_callbacks callbacks;
181
182	struct dm_cache_metadata *cmd;
183
184	/*
185	 * Metadata is written to this device.
186	 */
187	struct dm_dev *metadata_dev;
188
189	/*
190	 * The slower of the two data devices.  Typically a spindle.
191	 */
192	struct dm_dev *origin_dev;
193
194	/*
195	 * The faster of the two data devices.  Typically an SSD.
196	 */
197	struct dm_dev *cache_dev;
198
199	/*
200	 * Size of the origin device in _complete_ blocks and native sectors.
201	 */
202	dm_oblock_t origin_blocks;
203	sector_t origin_sectors;
204
205	/*
206	 * Size of the cache device in blocks.
207	 */
208	dm_cblock_t cache_size;
209
210	/*
211	 * Fields for converting from sectors to blocks.
212	 */
213	uint32_t sectors_per_block;
214	int sectors_per_block_shift;
215
216	spinlock_t lock;
217	struct bio_list deferred_bios;
218	struct bio_list deferred_flush_bios;
219	struct bio_list deferred_writethrough_bios;
220	struct list_head quiesced_migrations;
221	struct list_head completed_migrations;
222	struct list_head need_commit_migrations;
223	sector_t migration_threshold;
224	wait_queue_head_t migration_wait;
225	atomic_t nr_allocated_migrations;
226
227	/*
228	 * The number of in-flight migrations that are performing
229	 * background io, e.g. promotion, writeback.
230	 */
231	atomic_t nr_io_migrations;
232
233	wait_queue_head_t quiescing_wait;
234	atomic_t quiescing;
235	atomic_t quiescing_ack;
236
237	/*
238	 * cache_size entries, dirty if set
239	 */
240	atomic_t nr_dirty;
241	unsigned long *dirty_bitset;
242
243	/*
244	 * origin_blocks entries, discarded if set.
245	 */
246	dm_dblock_t discard_nr_blocks;
247	unsigned long *discard_bitset;
248	uint32_t discard_block_size; /* a power of 2 times sectors per block */
249
250	/*
251	 * Rather than reconstructing the table line for the status we just
252	 * save it and regurgitate.
253	 */
254	unsigned nr_ctr_args;
255	const char **ctr_args;
256
257	struct dm_kcopyd_client *copier;
258	struct workqueue_struct *wq;
259	struct work_struct worker;
260
261	struct delayed_work waker;
262	unsigned long last_commit_jiffies;
263
264	struct dm_bio_prison *prison;
265	struct dm_deferred_set *all_io_ds;
266
267	mempool_t *migration_pool;
268
269	struct dm_cache_policy *policy;
270	unsigned policy_nr_args;
271
272	bool need_tick_bio:1;
273	bool sized:1;
274	bool invalidate:1;
275	bool commit_requested:1;
276	bool loaded_mappings:1;
277	bool loaded_discards:1;
278
279	/*
280	 * Cache features such as write-through.
281	 */
282	struct cache_features features;
283
284	struct cache_stats stats;
285
286	/*
287	 * Invalidation fields.
288	 */
289	spinlock_t invalidation_lock;
290	struct list_head invalidation_requests;
291};
292
293struct per_bio_data {
294	bool tick:1;
295	unsigned req_nr:2;
296	struct dm_deferred_entry *all_io_entry;
297	struct dm_hook_info hook_info;
298
299	/*
300	 * writethrough fields.  These MUST remain at the end of this
301	 * structure and the 'cache' member must be the first of them, as
302	 * its offset is used to size the per-bio data (PB_DATA_SIZE_WB).
303	 */
304	struct cache *cache;
305	dm_cblock_t cblock;
306	struct dm_bio_details bio_details;
307};
308
309struct dm_cache_migration {
310	struct list_head list;
311	struct cache *cache;
312
313	unsigned long start_jiffies;
314	dm_oblock_t old_oblock;
315	dm_oblock_t new_oblock;
316	dm_cblock_t cblock;
317
318	bool err:1;
319	bool discard:1;
320	bool writeback:1;
321	bool demote:1;
322	bool promote:1;
323	bool requeue_holder:1;
324	bool invalidate:1;
325
326	struct dm_bio_prison_cell *old_ocell;
327	struct dm_bio_prison_cell *new_ocell;
328};
329
330/*
331 * Processing a bio in the worker thread may require these memory
332 * allocations.  We prealloc to avoid deadlocks (the same worker thread
333 * frees them back to the mempool).
334 */
335struct prealloc {
336	struct dm_cache_migration *mg;
337	struct dm_bio_prison_cell *cell1;
338	struct dm_bio_prison_cell *cell2;
339};
340
341static void wake_worker(struct cache *cache)
342{
343	queue_work(cache->wq, &cache->worker);
344}
345
346/*----------------------------------------------------------------*/
347
348static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
349{
350	/* FIXME: change to use a local slab. */
351	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
352}
353
354static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
355{
356	dm_bio_prison_free_cell(cache->prison, cell);
357}
358
359static struct dm_cache_migration *alloc_migration(struct cache *cache)
360{
361	struct dm_cache_migration *mg;
362
363	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
364	if (mg) {
365		mg->cache = cache;
366		atomic_inc(&mg->cache->nr_allocated_migrations);
367	}
368
369	return mg;
370}
371
372static void free_migration(struct dm_cache_migration *mg)
373{
374	if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
375		wake_up(&mg->cache->migration_wait);
376
377	mempool_free(mg, mg->cache->migration_pool);
378}
379
380static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
381{
382	if (!p->mg) {
383		p->mg = alloc_migration(cache);
384		if (!p->mg)
385			return -ENOMEM;
386	}
387
388	if (!p->cell1) {
389		p->cell1 = alloc_prison_cell(cache);
390		if (!p->cell1)
391			return -ENOMEM;
392	}
393
394	if (!p->cell2) {
395		p->cell2 = alloc_prison_cell(cache);
396		if (!p->cell2)
397			return -ENOMEM;
398	}
399
400	return 0;
401}
402
403static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
404{
405	if (p->cell2)
406		free_prison_cell(cache, p->cell2);
407
408	if (p->cell1)
409		free_prison_cell(cache, p->cell1);
410
411	if (p->mg)
412		free_migration(p->mg);
413}
414
415static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
416{
417	struct dm_cache_migration *mg = p->mg;
418
419	BUG_ON(!mg);
420	p->mg = NULL;
421
422	return mg;
423}
424
425/*
426 * The prealloc struct must contain at least one cell to hand out.  If it
427 * doesn't, this function will BUG() rather than return NULL.
428 */
429static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
430{
431	struct dm_bio_prison_cell *r = NULL;
432
433	if (p->cell1) {
434		r = p->cell1;
435		p->cell1 = NULL;
436
437	} else if (p->cell2) {
438		r = p->cell2;
439		p->cell2 = NULL;
440	} else
441		BUG();
442
443	return r;
444}
445
446/*
447 * You can't have more than two cells in a prealloc struct.  BUG() will be
448 * called if you try to overfill it.
449 */
450static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
451{
452	if (!p->cell2)
453		p->cell2 = cell;
454
455	else if (!p->cell1)
456		p->cell1 = cell;
457
458	else
459		BUG();
460}
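/*
 * A minimal sketch of how the prealloc protocol is used by the worker
 * (see process_deferred_bios() and process_bio() later in this file):
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	if (prealloc_data_structs(cache, &structs))
 *		... defer the work and try again later ...
 *
 *	cell = prealloc_get_cell(&structs);
 *	... if the cell ends up unused, hand it back ...
 *	prealloc_put_cell(&structs, cell);
 *
 *	prealloc_free_structs(cache, &structs);
 */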
461
462/*----------------------------------------------------------------*/
463
464static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
465{
466	key->virtual = 0;
467	key->dev = 0;
468	key->block_begin = from_oblock(begin);
469	key->block_end = from_oblock(end);
470}
471
472/*
473 * The caller hands in a preallocated cell, and a free function for it.
474 * The cell will be freed if there's an error, or if it wasn't used because
475 * a cell with that key already exists.
476 */
477typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
478
479static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
480			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
481			    cell_free_fn free_fn, void *free_context,
482			    struct dm_bio_prison_cell **cell_result)
483{
484	int r;
485	struct dm_cell_key key;
486
487	build_key(oblock_begin, oblock_end, &key);
488	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
489	if (r)
490		free_fn(free_context, cell_prealloc);
491
492	return r;
493}
494
495static int bio_detain(struct cache *cache, dm_oblock_t oblock,
496		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
497		      cell_free_fn free_fn, void *free_context,
498		      struct dm_bio_prison_cell **cell_result)
499{
500	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
501	return bio_detain_range(cache, oblock, end, bio,
502				cell_prealloc, free_fn, free_context, cell_result);
503}
504
505static int get_cell(struct cache *cache,
506		    dm_oblock_t oblock,
507		    struct prealloc *structs,
508		    struct dm_bio_prison_cell **cell_result)
509{
510	int r;
511	struct dm_cell_key key;
512	struct dm_bio_prison_cell *cell_prealloc;
513
514	cell_prealloc = prealloc_get_cell(structs);
515
516	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
517	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
518	if (r)
519		prealloc_put_cell(structs, cell_prealloc);
520
521	return r;
522}
523
524/*----------------------------------------------------------------*/
525
526static bool is_dirty(struct cache *cache, dm_cblock_t b)
527{
528	return test_bit(from_cblock(b), cache->dirty_bitset);
529}
530
531static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
532{
533	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
534		atomic_inc(&cache->nr_dirty);
535		policy_set_dirty(cache->policy, oblock);
536	}
537}
538
539static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
540{
541	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
542		policy_clear_dirty(cache->policy, oblock);
543		if (atomic_dec_return(&cache->nr_dirty) == 0)
544			dm_table_event(cache->ti->table);
545	}
546}
547
548/*----------------------------------------------------------------*/
549
550static bool block_size_is_power_of_two(struct cache *cache)
551{
552	return cache->sectors_per_block_shift >= 0;
553}
554
555/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
556#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
557__always_inline
558#endif
559static dm_block_t block_div(dm_block_t b, uint32_t n)
560{
561	do_div(b, n);
562
563	return b;
564}
565
566static dm_block_t oblocks_per_dblock(struct cache *cache)
567{
568	dm_block_t oblocks = cache->discard_block_size;
569
570	if (block_size_is_power_of_two(cache))
571		oblocks >>= cache->sectors_per_block_shift;
572	else
573		oblocks = block_div(oblocks, cache->sectors_per_block);
574
575	return oblocks;
576}
577
578static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
579{
580	return to_dblock(block_div(from_oblock(oblock),
581				   oblocks_per_dblock(cache)));
582}
583
584static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
585{
586	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
587}
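/*
 * A worked example with made-up numbers: if sectors_per_block = 128 and
 * discard_block_size = 1024 sectors, then oblocks_per_dblock() = 8, so
 * oblock 20 maps to dblock 2 and dblock 2 maps back to oblock 16 (the
 * first oblock covered by that discard block).
 */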
588
589static void set_discard(struct cache *cache, dm_dblock_t b)
590{
591	unsigned long flags;
592
593	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
594	atomic_inc(&cache->stats.discard_count);
595
596	spin_lock_irqsave(&cache->lock, flags);
597	set_bit(from_dblock(b), cache->discard_bitset);
598	spin_unlock_irqrestore(&cache->lock, flags);
599}
600
601static void clear_discard(struct cache *cache, dm_dblock_t b)
602{
603	unsigned long flags;
604
605	spin_lock_irqsave(&cache->lock, flags);
606	clear_bit(from_dblock(b), cache->discard_bitset);
607	spin_unlock_irqrestore(&cache->lock, flags);
608}
609
610static bool is_discarded(struct cache *cache, dm_dblock_t b)
611{
612	int r;
613	unsigned long flags;
614
615	spin_lock_irqsave(&cache->lock, flags);
616	r = test_bit(from_dblock(b), cache->discard_bitset);
617	spin_unlock_irqrestore(&cache->lock, flags);
618
619	return r;
620}
621
622static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
623{
624	int r;
625	unsigned long flags;
626
627	spin_lock_irqsave(&cache->lock, flags);
628	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
629		     cache->discard_bitset);
630	spin_unlock_irqrestore(&cache->lock, flags);
631
632	return r;
633}
634
635/*----------------------------------------------------------------*/
636
637static void load_stats(struct cache *cache)
638{
639	struct dm_cache_statistics stats;
640
641	dm_cache_metadata_get_stats(cache->cmd, &stats);
642	atomic_set(&cache->stats.read_hit, stats.read_hits);
643	atomic_set(&cache->stats.read_miss, stats.read_misses);
644	atomic_set(&cache->stats.write_hit, stats.write_hits);
645	atomic_set(&cache->stats.write_miss, stats.write_misses);
646}
647
648static void save_stats(struct cache *cache)
649{
650	struct dm_cache_statistics stats;
651
652	stats.read_hits = atomic_read(&cache->stats.read_hit);
653	stats.read_misses = atomic_read(&cache->stats.read_miss);
654	stats.write_hits = atomic_read(&cache->stats.write_hit);
655	stats.write_misses = atomic_read(&cache->stats.write_miss);
656
657	dm_cache_metadata_set_stats(cache->cmd, &stats);
658}
659
660/*----------------------------------------------------------------
661 * Per bio data
662 *--------------------------------------------------------------*/
663
664/*
665 * Only writethrough mode needs struct per_bio_data's writethrough fields.
666 */
667#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
668#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
669
670static bool writethrough_mode(struct cache_features *f)
671{
672	return f->io_mode == CM_IO_WRITETHROUGH;
673}
674
675static bool writeback_mode(struct cache_features *f)
676{
677	return f->io_mode == CM_IO_WRITEBACK;
678}
679
680static bool passthrough_mode(struct cache_features *f)
681{
682	return f->io_mode == CM_IO_PASSTHROUGH;
683}
684
685static size_t get_per_bio_data_size(struct cache *cache)
686{
687	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
688}
689
690static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
691{
692	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
693	BUG_ON(!pb);
694	return pb;
695}
696
697static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
698{
699	struct per_bio_data *pb = get_per_bio_data(bio, data_size);
700
701	pb->tick = false;
702	pb->req_nr = dm_bio_get_target_bio_nr(bio);
703	pb->all_io_entry = NULL;
704
705	return pb;
706}
707
708/*----------------------------------------------------------------
709 * Remapping
710 *--------------------------------------------------------------*/
711static void remap_to_origin(struct cache *cache, struct bio *bio)
712{
713	bio->bi_bdev = cache->origin_dev->bdev;
714}
715
716static void remap_to_cache(struct cache *cache, struct bio *bio,
717			   dm_cblock_t cblock)
718{
719	sector_t bi_sector = bio->bi_iter.bi_sector;
720	sector_t block = from_cblock(cblock);
721
722	bio->bi_bdev = cache->cache_dev->bdev;
723	if (!block_size_is_power_of_two(cache))
724		bio->bi_iter.bi_sector =
725			(block * cache->sectors_per_block) +
726			sector_div(bi_sector, cache->sectors_per_block);
727	else
728		bio->bi_iter.bi_sector =
729			(block << cache->sectors_per_block_shift) |
730			(bi_sector & (cache->sectors_per_block - 1));
731}
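/*
 * For illustration (values made up): with sectors_per_block = 128 (so
 * sectors_per_block_shift = 7), remapping a bio at sector 1000 to
 * cblock 3 keeps the offset within the block: (3 << 7) | (1000 & 127)
 * = 384 + 104 = sector 488 on the cache device.
 */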
732
733static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
734{
735	unsigned long flags;
736	size_t pb_data_size = get_per_bio_data_size(cache);
737	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
738
739	spin_lock_irqsave(&cache->lock, flags);
740	if (cache->need_tick_bio &&
741	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
742		pb->tick = true;
743		cache->need_tick_bio = false;
744	}
745	spin_unlock_irqrestore(&cache->lock, flags);
746}
747
748static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
749				  dm_oblock_t oblock)
750{
751	check_if_tick_bio_needed(cache, bio);
752	remap_to_origin(cache, bio);
753	if (bio_data_dir(bio) == WRITE)
754		clear_discard(cache, oblock_to_dblock(cache, oblock));
755}
756
757static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
758				 dm_oblock_t oblock, dm_cblock_t cblock)
759{
760	check_if_tick_bio_needed(cache, bio);
761	remap_to_cache(cache, bio, cblock);
762	if (bio_data_dir(bio) == WRITE) {
763		set_dirty(cache, oblock, cblock);
764		clear_discard(cache, oblock_to_dblock(cache, oblock));
765	}
766}
767
768static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
769{
770	sector_t block_nr = bio->bi_iter.bi_sector;
771
772	if (!block_size_is_power_of_two(cache))
773		(void) sector_div(block_nr, cache->sectors_per_block);
774	else
775		block_nr >>= cache->sectors_per_block_shift;
776
777	return to_oblock(block_nr);
778}
779
780static int bio_triggers_commit(struct cache *cache, struct bio *bio)
781{
782	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
783}
784
785/*
786 * You must increment the deferred set whilst the prison cell is held.  To
787 * encourage this, we ask for 'cell' to be passed in.
788 */
789static void inc_ds(struct cache *cache, struct bio *bio,
790		   struct dm_bio_prison_cell *cell)
791{
792	size_t pb_data_size = get_per_bio_data_size(cache);
793	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
794
795	BUG_ON(!cell);
796	BUG_ON(pb->all_io_entry);
797
798	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
799}
800
801static void issue(struct cache *cache, struct bio *bio)
802{
803	unsigned long flags;
804
805	if (!bio_triggers_commit(cache, bio)) {
806		generic_make_request(bio);
807		return;
808	}
809
810	/*
811	 * Batch together any bios that trigger commits and then issue a
812	 * single commit for them in do_worker().
813	 */
814	spin_lock_irqsave(&cache->lock, flags);
815	cache->commit_requested = true;
816	bio_list_add(&cache->deferred_flush_bios, bio);
817	spin_unlock_irqrestore(&cache->lock, flags);
818}
819
820static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
821{
822	inc_ds(cache, bio, cell);
823	issue(cache, bio);
824}
825
826static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
827{
828	unsigned long flags;
829
830	spin_lock_irqsave(&cache->lock, flags);
831	bio_list_add(&cache->deferred_writethrough_bios, bio);
832	spin_unlock_irqrestore(&cache->lock, flags);
833
834	wake_worker(cache);
835}
836
837static void writethrough_endio(struct bio *bio, int err)
838{
839	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
840
841	dm_unhook_bio(&pb->hook_info, bio);
842
843	if (err) {
844		bio_endio(bio, err);
845		return;
846	}
847
848	dm_bio_restore(&pb->bio_details, bio);
849	remap_to_cache(pb->cache, bio, pb->cblock);
850
851	/*
852	 * We can't issue this bio directly, since we're in interrupt
853	 * context.  So it gets put on a bio list for processing by the
854	 * worker thread.
855	 */
856	defer_writethrough_bio(pb->cache, bio);
857}
858
859/*
860 * When running in writethrough mode we need to send writes to clean blocks
861 * to both the cache and origin devices.  In future we'd like to clone the
862 * bio and send the copies in parallel, but for now we issue them in
863 * series as this is easier.
864 */
865static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
866				       dm_oblock_t oblock, dm_cblock_t cblock)
867{
868	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
869
870	pb->cache = cache;
871	pb->cblock = cblock;
872	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
873	dm_bio_record(&pb->bio_details, bio);
874
875	remap_to_origin_clear_discard(pb->cache, bio, oblock);
876}
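/*
 * So for a writethrough write the sequence is: remap to the origin and
 * issue; writethrough_endio() then restores the bio, remaps it to the
 * cache device and defers it; the worker finally re-issues it via
 * process_deferred_writethrough_bios().
 */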
877
878/*----------------------------------------------------------------
879 * Migration processing
880 *
881 * Migration covers moving data from the origin device to the cache, or
882 * vice versa.
883 *--------------------------------------------------------------*/
884static void inc_io_migrations(struct cache *cache)
885{
886	atomic_inc(&cache->nr_io_migrations);
887}
888
889static void dec_io_migrations(struct cache *cache)
890{
891	atomic_dec(&cache->nr_io_migrations);
892}
893
894static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
895			 bool holder)
896{
897	(holder ? dm_cell_release : dm_cell_release_no_holder)
898		(cache->prison, cell, &cache->deferred_bios);
899	free_prison_cell(cache, cell);
900}
901
902static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
903		       bool holder)
904{
905	unsigned long flags;
906
907	spin_lock_irqsave(&cache->lock, flags);
908	__cell_defer(cache, cell, holder);
909	spin_unlock_irqrestore(&cache->lock, flags);
910
911	wake_worker(cache);
912}
913
914static void free_io_migration(struct dm_cache_migration *mg)
915{
916	dec_io_migrations(mg->cache);
917	free_migration(mg);
918}
919
920static void migration_failure(struct dm_cache_migration *mg)
921{
922	struct cache *cache = mg->cache;
923
924	if (mg->writeback) {
925		DMWARN_LIMIT("writeback failed; couldn't copy block");
926		set_dirty(cache, mg->old_oblock, mg->cblock);
927		cell_defer(cache, mg->old_ocell, false);
928
929	} else if (mg->demote) {
930		DMWARN_LIMIT("demotion failed; couldn't copy block");
931		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
932
933		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
934		if (mg->promote)
935			cell_defer(cache, mg->new_ocell, true);
936	} else {
937		DMWARN_LIMIT("promotion failed; couldn't copy block");
938		policy_remove_mapping(cache->policy, mg->new_oblock);
939		cell_defer(cache, mg->new_ocell, true);
940	}
941
942	free_io_migration(mg);
943}
944
945static void migration_success_pre_commit(struct dm_cache_migration *mg)
946{
947	unsigned long flags;
948	struct cache *cache = mg->cache;
949
950	if (mg->writeback) {
951		clear_dirty(cache, mg->old_oblock, mg->cblock);
952		cell_defer(cache, mg->old_ocell, false);
953		free_io_migration(mg);
954		return;
955
956	} else if (mg->demote) {
957		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
958			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
959			policy_force_mapping(cache->policy, mg->new_oblock,
960					     mg->old_oblock);
961			if (mg->promote)
962				cell_defer(cache, mg->new_ocell, true);
963			free_io_migration(mg);
964			return;
965		}
966	} else {
967		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
968			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
969			policy_remove_mapping(cache->policy, mg->new_oblock);
970			free_io_migration(mg);
971			return;
972		}
973	}
974
975	spin_lock_irqsave(&cache->lock, flags);
976	list_add_tail(&mg->list, &cache->need_commit_migrations);
977	cache->commit_requested = true;
978	spin_unlock_irqrestore(&cache->lock, flags);
979}
980
981static void migration_success_post_commit(struct dm_cache_migration *mg)
982{
983	unsigned long flags;
984	struct cache *cache = mg->cache;
985
986	if (mg->writeback) {
987		DMWARN("writeback unexpectedly triggered commit");
988		return;
989
990	} else if (mg->demote) {
991		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
992
993		if (mg->promote) {
994			mg->demote = false;
995
996			spin_lock_irqsave(&cache->lock, flags);
997			list_add_tail(&mg->list, &cache->quiesced_migrations);
998			spin_unlock_irqrestore(&cache->lock, flags);
999
1000		} else {
1001			if (mg->invalidate)
1002				policy_remove_mapping(cache->policy, mg->old_oblock);
1003			free_io_migration(mg);
1004		}
1005
1006	} else {
1007		if (mg->requeue_holder) {
1008			clear_dirty(cache, mg->new_oblock, mg->cblock);
1009			cell_defer(cache, mg->new_ocell, true);
1010		} else {
1011			/*
1012			 * The block was promoted via an overwrite, so it's dirty.
1013			 */
1014			set_dirty(cache, mg->new_oblock, mg->cblock);
1015			bio_endio(mg->new_ocell->holder, 0);
1016			cell_defer(cache, mg->new_ocell, false);
1017		}
1018		free_io_migration(mg);
1019	}
1020}
1021
1022static void copy_complete(int read_err, unsigned long write_err, void *context)
1023{
1024	unsigned long flags;
1025	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1026	struct cache *cache = mg->cache;
1027
1028	if (read_err || write_err)
1029		mg->err = true;
1030
1031	spin_lock_irqsave(&cache->lock, flags);
1032	list_add_tail(&mg->list, &cache->completed_migrations);
1033	spin_unlock_irqrestore(&cache->lock, flags);
1034
1035	wake_worker(cache);
1036}
1037
1038static void issue_copy(struct dm_cache_migration *mg)
1039{
1040	int r;
1041	struct dm_io_region o_region, c_region;
1042	struct cache *cache = mg->cache;
1043	sector_t cblock = from_cblock(mg->cblock);
1044
1045	o_region.bdev = cache->origin_dev->bdev;
1046	o_region.count = cache->sectors_per_block;
1047
1048	c_region.bdev = cache->cache_dev->bdev;
1049	c_region.sector = cblock * cache->sectors_per_block;
1050	c_region.count = cache->sectors_per_block;
1051
1052	if (mg->writeback || mg->demote) {
1053		/* demote */
1054		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
1055		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
1056	} else {
1057		/* promote */
1058		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1059		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1060	}
1061
1062	if (r < 0) {
1063		DMERR_LIMIT("issuing migration failed");
1064		migration_failure(mg);
1065	}
1066}
1067
1068static void overwrite_endio(struct bio *bio, int err)
1069{
1070	struct dm_cache_migration *mg = bio->bi_private;
1071	struct cache *cache = mg->cache;
1072	size_t pb_data_size = get_per_bio_data_size(cache);
1073	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1074	unsigned long flags;
1075
1076	dm_unhook_bio(&pb->hook_info, bio);
1077
1078	if (err)
1079		mg->err = true;
1080
1081	mg->requeue_holder = false;
1082
1083	spin_lock_irqsave(&cache->lock, flags);
1084	list_add_tail(&mg->list, &cache->completed_migrations);
1085	spin_unlock_irqrestore(&cache->lock, flags);
1086
1087	wake_worker(cache);
1088}
1089
1090static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1091{
1092	size_t pb_data_size = get_per_bio_data_size(mg->cache);
1093	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1094
1095	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1096	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1097
1098	/*
1099	 * No need to inc_ds() here, since the cell will be held for the
1100	 * duration of the io.
1101	 */
1102	generic_make_request(bio);
1103}
1104
1105static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1106{
1107	return (bio_data_dir(bio) == WRITE) &&
1108		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1109}
1110
1111static void avoid_copy(struct dm_cache_migration *mg)
1112{
1113	atomic_inc(&mg->cache->stats.copies_avoided);
1114	migration_success_pre_commit(mg);
1115}
1116
1117static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1118				     dm_dblock_t *b, dm_dblock_t *e)
1119{
1120	sector_t sb = bio->bi_iter.bi_sector;
1121	sector_t se = bio_end_sector(bio);
1122
1123	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1124
1125	if (se - sb < cache->discard_block_size)
1126		*e = *b;
1127	else
1128		*e = to_dblock(block_div(se, cache->discard_block_size));
1129}
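/*
 * A worked example with made-up numbers: if discard_block_size = 2048
 * sectors and a discard bio covers sectors [1000, 9000), then *b = 1 and
 * *e = 4, i.e. only discard blocks 1-3 (sectors [2048, 8192)), which the
 * bio covers completely, get marked.
 */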
1130
1131static void issue_discard(struct dm_cache_migration *mg)
1132{
1133	dm_dblock_t b, e;
1134	struct bio *bio = mg->new_ocell->holder;
1135
1136	calc_discard_block_range(mg->cache, bio, &b, &e);
1137	while (b != e) {
1138		set_discard(mg->cache, b);
1139		b = to_dblock(from_dblock(b) + 1);
1140	}
1141
1142	bio_endio(bio, 0);
1143	cell_defer(mg->cache, mg->new_ocell, false);
1144	free_migration(mg);
1145}
1146
1147static void issue_copy_or_discard(struct dm_cache_migration *mg)
1148{
1149	bool avoid;
1150	struct cache *cache = mg->cache;
1151
1152	if (mg->discard) {
1153		issue_discard(mg);
1154		return;
1155	}
1156
1157	if (mg->writeback || mg->demote)
1158		avoid = !is_dirty(cache, mg->cblock) ||
1159			is_discarded_oblock(cache, mg->old_oblock);
1160	else {
1161		struct bio *bio = mg->new_ocell->holder;
1162
1163		avoid = is_discarded_oblock(cache, mg->new_oblock);
1164
1165		if (writeback_mode(&cache->features) &&
1166		    !avoid && bio_writes_complete_block(cache, bio)) {
1167			issue_overwrite(mg, bio);
1168			return;
1169		}
1170	}
1171
1172	avoid ? avoid_copy(mg) : issue_copy(mg);
1173}
1174
1175static void complete_migration(struct dm_cache_migration *mg)
1176{
1177	if (mg->err)
1178		migration_failure(mg);
1179	else
1180		migration_success_pre_commit(mg);
1181}
1182
1183static void process_migrations(struct cache *cache, struct list_head *head,
1184			       void (*fn)(struct dm_cache_migration *))
1185{
1186	unsigned long flags;
1187	struct list_head list;
1188	struct dm_cache_migration *mg, *tmp;
1189
1190	INIT_LIST_HEAD(&list);
1191	spin_lock_irqsave(&cache->lock, flags);
1192	list_splice_init(head, &list);
1193	spin_unlock_irqrestore(&cache->lock, flags);
1194
1195	list_for_each_entry_safe(mg, tmp, &list, list)
1196		fn(mg);
1197}
1198
1199static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1200{
1201	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1202}
1203
1204static void queue_quiesced_migration(struct dm_cache_migration *mg)
1205{
1206	unsigned long flags;
1207	struct cache *cache = mg->cache;
1208
1209	spin_lock_irqsave(&cache->lock, flags);
1210	__queue_quiesced_migration(mg);
1211	spin_unlock_irqrestore(&cache->lock, flags);
1212
1213	wake_worker(cache);
1214}
1215
1216static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1217{
1218	unsigned long flags;
1219	struct dm_cache_migration *mg, *tmp;
1220
1221	spin_lock_irqsave(&cache->lock, flags);
1222	list_for_each_entry_safe(mg, tmp, work, list)
1223		__queue_quiesced_migration(mg);
1224	spin_unlock_irqrestore(&cache->lock, flags);
1225
1226	wake_worker(cache);
1227}
1228
1229static void check_for_quiesced_migrations(struct cache *cache,
1230					  struct per_bio_data *pb)
1231{
1232	struct list_head work;
1233
1234	if (!pb->all_io_entry)
1235		return;
1236
1237	INIT_LIST_HEAD(&work);
1238	dm_deferred_entry_dec(pb->all_io_entry, &work);
1239
1240	if (!list_empty(&work))
1241		queue_quiesced_migrations(cache, &work);
1242}
1243
1244static void quiesce_migration(struct dm_cache_migration *mg)
1245{
1246	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1247		queue_quiesced_migration(mg);
1248}
1249
1250static void promote(struct cache *cache, struct prealloc *structs,
1251		    dm_oblock_t oblock, dm_cblock_t cblock,
1252		    struct dm_bio_prison_cell *cell)
1253{
1254	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1255
1256	mg->err = false;
1257	mg->discard = false;
1258	mg->writeback = false;
1259	mg->demote = false;
1260	mg->promote = true;
1261	mg->requeue_holder = true;
1262	mg->invalidate = false;
1263	mg->cache = cache;
1264	mg->new_oblock = oblock;
1265	mg->cblock = cblock;
1266	mg->old_ocell = NULL;
1267	mg->new_ocell = cell;
1268	mg->start_jiffies = jiffies;
1269
1270	inc_io_migrations(cache);
1271	quiesce_migration(mg);
1272}
1273
1274static void writeback(struct cache *cache, struct prealloc *structs,
1275		      dm_oblock_t oblock, dm_cblock_t cblock,
1276		      struct dm_bio_prison_cell *cell)
1277{
1278	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1279
1280	mg->err = false;
1281	mg->discard = false;
1282	mg->writeback = true;
1283	mg->demote = false;
1284	mg->promote = false;
1285	mg->requeue_holder = true;
1286	mg->invalidate = false;
1287	mg->cache = cache;
1288	mg->old_oblock = oblock;
1289	mg->cblock = cblock;
1290	mg->old_ocell = cell;
1291	mg->new_ocell = NULL;
1292	mg->start_jiffies = jiffies;
1293
1294	inc_io_migrations(cache);
1295	quiesce_migration(mg);
1296}
1297
1298static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1299				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1300				dm_cblock_t cblock,
1301				struct dm_bio_prison_cell *old_ocell,
1302				struct dm_bio_prison_cell *new_ocell)
1303{
1304	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1305
1306	mg->err = false;
1307	mg->discard = false;
1308	mg->writeback = false;
1309	mg->demote = true;
1310	mg->promote = true;
1311	mg->requeue_holder = true;
1312	mg->invalidate = false;
1313	mg->cache = cache;
1314	mg->old_oblock = old_oblock;
1315	mg->new_oblock = new_oblock;
1316	mg->cblock = cblock;
1317	mg->old_ocell = old_ocell;
1318	mg->new_ocell = new_ocell;
1319	mg->start_jiffies = jiffies;
1320
1321	inc_io_migrations(cache);
1322	quiesce_migration(mg);
1323}
1324
1325/*
1326 * Invalidate a cache entry.  No writeback occurs; any changes in the cache
1327 * block are thrown away.
1328 */
1329static void invalidate(struct cache *cache, struct prealloc *structs,
1330		       dm_oblock_t oblock, dm_cblock_t cblock,
1331		       struct dm_bio_prison_cell *cell)
1332{
1333	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1334
1335	mg->err = false;
1336	mg->discard = false;
1337	mg->writeback = false;
1338	mg->demote = true;
1339	mg->promote = false;
1340	mg->requeue_holder = true;
1341	mg->invalidate = true;
1342	mg->cache = cache;
1343	mg->old_oblock = oblock;
1344	mg->cblock = cblock;
1345	mg->old_ocell = cell;
1346	mg->new_ocell = NULL;
1347	mg->start_jiffies = jiffies;
1348
1349	inc_io_migrations(cache);
1350	quiesce_migration(mg);
1351}
1352
1353static void discard(struct cache *cache, struct prealloc *structs,
1354		    struct dm_bio_prison_cell *cell)
1355{
1356	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1357
1358	mg->err = false;
1359	mg->discard = true;
1360	mg->writeback = false;
1361	mg->demote = false;
1362	mg->promote = false;
1363	mg->requeue_holder = false;
1364	mg->invalidate = false;
1365	mg->cache = cache;
1366	mg->old_ocell = NULL;
1367	mg->new_ocell = cell;
1368	mg->start_jiffies = jiffies;
1369
1370	quiesce_migration(mg);
1371}
1372
1373/*----------------------------------------------------------------
1374 * bio processing
1375 *--------------------------------------------------------------*/
1376static void defer_bio(struct cache *cache, struct bio *bio)
1377{
1378	unsigned long flags;
1379
1380	spin_lock_irqsave(&cache->lock, flags);
1381	bio_list_add(&cache->deferred_bios, bio);
1382	spin_unlock_irqrestore(&cache->lock, flags);
1383
1384	wake_worker(cache);
1385}
1386
1387static void process_flush_bio(struct cache *cache, struct bio *bio)
1388{
1389	size_t pb_data_size = get_per_bio_data_size(cache);
1390	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1391
1392	BUG_ON(bio->bi_iter.bi_size);
1393	if (!pb->req_nr)
1394		remap_to_origin(cache, bio);
1395	else
1396		remap_to_cache(cache, bio, 0);
1397
1398	/*
1399	 * REQ_FLUSH is not directed at any particular block so we don't
1400	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
1401	 * by dm-core.
1402	 */
1403	issue(cache, bio);
1404}
1405
1406static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1407				struct bio *bio)
1408{
1409	int r;
1410	dm_dblock_t b, e;
1411	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1412
1413	calc_discard_block_range(cache, bio, &b, &e);
1414	if (b == e) {
1415		bio_endio(bio, 0);
1416		return;
1417	}
1418
1419	cell_prealloc = prealloc_get_cell(structs);
1420	r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1421			     (cell_free_fn) prealloc_put_cell,
1422			     structs, &new_ocell);
1423	if (r > 0)
1424		return;
1425
1426	discard(cache, structs, new_ocell);
1427}
1428
1429static bool spare_migration_bandwidth(struct cache *cache)
1430{
1431	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1432		cache->sectors_per_block;
1433	return current_volume < cache->migration_threshold;
1434}
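/*
 * For example (made-up numbers): with sectors_per_block = 128 and
 * migration_threshold = 2048, new background migrations are allowed
 * while fewer than 15 are in flight, i.e. at most 15 run concurrently.
 */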
1435
1436static void inc_hit_counter(struct cache *cache, struct bio *bio)
1437{
1438	atomic_inc(bio_data_dir(bio) == READ ?
1439		   &cache->stats.read_hit : &cache->stats.write_hit);
1440}
1441
1442static void inc_miss_counter(struct cache *cache, struct bio *bio)
1443{
1444	atomic_inc(bio_data_dir(bio) == READ ?
1445		   &cache->stats.read_miss : &cache->stats.write_miss);
1446}
1447
1448/*----------------------------------------------------------------*/
1449
1450struct old_oblock_lock {
1451	struct policy_locker locker;
1452	struct cache *cache;
1453	struct prealloc *structs;
1454	struct dm_bio_prison_cell *cell;
1455};
1456
1457static int null_locker(struct policy_locker *locker, dm_oblock_t b)
1458{
1459	/* This should never be called */
1460	BUG();
1461	return 0;
1462}
1463
1464static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
1465{
1466	struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
1467	struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
1468
1469	return bio_detain(l->cache, b, NULL, cell_prealloc,
1470			  (cell_free_fn) prealloc_put_cell,
1471			  l->structs, &l->cell);
1472}
1473
1474static void process_bio(struct cache *cache, struct prealloc *structs,
1475			struct bio *bio)
1476{
1477	int r;
1478	bool release_cell = true;
1479	dm_oblock_t block = get_bio_block(cache, bio);
1480	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1481	struct policy_result lookup_result;
1482	bool passthrough = passthrough_mode(&cache->features);
1483	bool discarded_block, can_migrate;
1484	struct old_oblock_lock ool;
1485
1486	/*
1487	 * Check to see if that block is currently migrating.
1488	 */
1489	cell_prealloc = prealloc_get_cell(structs);
1490	r = bio_detain(cache, block, bio, cell_prealloc,
1491		       (cell_free_fn) prealloc_put_cell,
1492		       structs, &new_ocell);
1493	if (r > 0)
1494		return;
1495
1496	discarded_block = is_discarded_oblock(cache, block);
1497	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1498
1499	ool.locker.fn = cell_locker;
1500	ool.cache = cache;
1501	ool.structs = structs;
1502	ool.cell = NULL;
1503	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1504		       bio, &ool.locker, &lookup_result);
1505
1506	if (r == -EWOULDBLOCK)
1507		/* migration has been denied */
1508		lookup_result.op = POLICY_MISS;
1509
1510	switch (lookup_result.op) {
1511	case POLICY_HIT:
1512		if (passthrough) {
1513			inc_miss_counter(cache, bio);
1514
1515			/*
1516			 * Passthrough always maps to the origin,
1517			 * invalidating any cache blocks that are written
1518			 * to.
1519			 */
1520
1521			if (bio_data_dir(bio) == WRITE) {
1522				atomic_inc(&cache->stats.demotion);
1523				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1524				release_cell = false;
1525
1526			} else {
1527				/* FIXME: factor out issue_origin() */
1528				remap_to_origin_clear_discard(cache, bio, block);
1529				inc_and_issue(cache, bio, new_ocell);
1530			}
1531		} else {
1532			inc_hit_counter(cache, bio);
1533
1534			if (bio_data_dir(bio) == WRITE &&
1535			    writethrough_mode(&cache->features) &&
1536			    !is_dirty(cache, lookup_result.cblock)) {
1537				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1538				inc_and_issue(cache, bio, new_ocell);
1539
1540			} else  {
1541				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1542				inc_and_issue(cache, bio, new_ocell);
1543			}
1544		}
1545
1546		break;
1547
1548	case POLICY_MISS:
1549		inc_miss_counter(cache, bio);
1550		remap_to_origin_clear_discard(cache, bio, block);
1551		inc_and_issue(cache, bio, new_ocell);
1552		break;
1553
1554	case POLICY_NEW:
1555		atomic_inc(&cache->stats.promotion);
1556		promote(cache, structs, block, lookup_result.cblock, new_ocell);
1557		release_cell = false;
1558		break;
1559
1560	case POLICY_REPLACE:
1561		atomic_inc(&cache->stats.demotion);
1562		atomic_inc(&cache->stats.promotion);
1563		demote_then_promote(cache, structs, lookup_result.old_oblock,
1564				    block, lookup_result.cblock,
1565				    ool.cell, new_ocell);
1566		release_cell = false;
1567		break;
1568
1569	default:
1570		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1571			    (unsigned) lookup_result.op);
1572		bio_io_error(bio);
1573	}
1574
1575	if (release_cell)
1576		cell_defer(cache, new_ocell, false);
1577}
1578
1579static int need_commit_due_to_time(struct cache *cache)
1580{
1581	return !time_in_range(jiffies, cache->last_commit_jiffies,
1582			      cache->last_commit_jiffies + COMMIT_PERIOD);
1583}
1584
1585static int commit_if_needed(struct cache *cache)
1586{
1587	int r = 0;
1588
1589	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1590	    dm_cache_changed_this_transaction(cache->cmd)) {
1591		atomic_inc(&cache->stats.commit_count);
1592		cache->commit_requested = false;
1593		r = dm_cache_commit(cache->cmd, false);
1594		cache->last_commit_jiffies = jiffies;
1595	}
1596
1597	return r;
1598}
1599
1600static void process_deferred_bios(struct cache *cache)
1601{
1602	unsigned long flags;
1603	struct bio_list bios;
1604	struct bio *bio;
1605	struct prealloc structs;
1606
1607	memset(&structs, 0, sizeof(structs));
1608	bio_list_init(&bios);
1609
1610	spin_lock_irqsave(&cache->lock, flags);
1611	bio_list_merge(&bios, &cache->deferred_bios);
1612	bio_list_init(&cache->deferred_bios);
1613	spin_unlock_irqrestore(&cache->lock, flags);
1614
1615	while (!bio_list_empty(&bios)) {
1616		/*
1617		 * If we've got no free migration structs, and processing
1618		 * this bio might require one, we pause until there are some
1619		 * prepared mappings to process.
1620		 */
1621		if (prealloc_data_structs(cache, &structs)) {
1622			spin_lock_irqsave(&cache->lock, flags);
1623			bio_list_merge(&cache->deferred_bios, &bios);
1624			spin_unlock_irqrestore(&cache->lock, flags);
1625			break;
1626		}
1627
1628		bio = bio_list_pop(&bios);
1629
1630		if (bio->bi_rw & REQ_FLUSH)
1631			process_flush_bio(cache, bio);
1632		else if (bio->bi_rw & REQ_DISCARD)
1633			process_discard_bio(cache, &structs, bio);
1634		else
1635			process_bio(cache, &structs, bio);
1636	}
1637
1638	prealloc_free_structs(cache, &structs);
1639}
1640
1641static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1642{
1643	unsigned long flags;
1644	struct bio_list bios;
1645	struct bio *bio;
1646
1647	bio_list_init(&bios);
1648
1649	spin_lock_irqsave(&cache->lock, flags);
1650	bio_list_merge(&bios, &cache->deferred_flush_bios);
1651	bio_list_init(&cache->deferred_flush_bios);
1652	spin_unlock_irqrestore(&cache->lock, flags);
1653
1654	/*
1655	 * These bios have already been through inc_ds()
1656	 */
1657	while ((bio = bio_list_pop(&bios)))
1658		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1659}
1660
1661static void process_deferred_writethrough_bios(struct cache *cache)
1662{
1663	unsigned long flags;
1664	struct bio_list bios;
1665	struct bio *bio;
1666
1667	bio_list_init(&bios);
1668
1669	spin_lock_irqsave(&cache->lock, flags);
1670	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1671	bio_list_init(&cache->deferred_writethrough_bios);
1672	spin_unlock_irqrestore(&cache->lock, flags);
1673
1674	/*
1675	 * These bios have already been through inc_ds()
1676	 */
1677	while ((bio = bio_list_pop(&bios)))
1678		generic_make_request(bio);
1679}
1680
1681static void writeback_some_dirty_blocks(struct cache *cache)
1682{
1683	int r = 0;
1684	dm_oblock_t oblock;
1685	dm_cblock_t cblock;
1686	struct prealloc structs;
1687	struct dm_bio_prison_cell *old_ocell;
1688
1689	memset(&structs, 0, sizeof(structs));
1690
1691	while (spare_migration_bandwidth(cache)) {
1692		if (prealloc_data_structs(cache, &structs))
1693			break;
1694
1695		r = policy_writeback_work(cache->policy, &oblock, &cblock);
1696		if (r)
1697			break;
1698
1699		r = get_cell(cache, oblock, &structs, &old_ocell);
1700		if (r) {
1701			policy_set_dirty(cache->policy, oblock);
1702			break;
1703		}
1704
1705		writeback(cache, &structs, oblock, cblock, old_ocell);
1706	}
1707
1708	prealloc_free_structs(cache, &structs);
1709}
1710
1711/*----------------------------------------------------------------
1712 * Invalidations.
1713 * Dropping something from the cache *without* writing back.
1714 *--------------------------------------------------------------*/
1715
1716static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1717{
1718	int r = 0;
1719	uint64_t begin = from_cblock(req->cblocks->begin);
1720	uint64_t end = from_cblock(req->cblocks->end);
1721
1722	while (begin != end) {
1723		r = policy_remove_cblock(cache->policy, to_cblock(begin));
1724		if (!r) {
1725			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1726			if (r)
1727				break;
1728
1729		} else if (r == -ENODATA) {
1730			/* harmless, already unmapped */
1731			r = 0;
1732
1733		} else {
1734			DMERR("policy_remove_cblock failed");
1735			break;
1736		}
1737
1738		begin++;
1739	}
1740
1741	cache->commit_requested = true;
1742
1743	req->err = r;
1744	atomic_set(&req->complete, 1);
1745
1746	wake_up(&req->result_wait);
1747}
1748
1749static void process_invalidation_requests(struct cache *cache)
1750{
1751	struct list_head list;
1752	struct invalidation_request *req, *tmp;
1753
1754	INIT_LIST_HEAD(&list);
1755	spin_lock(&cache->invalidation_lock);
1756	list_splice_init(&cache->invalidation_requests, &list);
1757	spin_unlock(&cache->invalidation_lock);
1758
1759	list_for_each_entry_safe (req, tmp, &list, list)
1760		process_invalidation_request(cache, req);
1761}
1762
1763/*----------------------------------------------------------------
1764 * Main worker loop
1765 *--------------------------------------------------------------*/
1766static bool is_quiescing(struct cache *cache)
1767{
1768	return atomic_read(&cache->quiescing);
1769}
1770
1771static void ack_quiescing(struct cache *cache)
1772{
1773	if (is_quiescing(cache)) {
1774		atomic_inc(&cache->quiescing_ack);
1775		wake_up(&cache->quiescing_wait);
1776	}
1777}
1778
1779static void wait_for_quiescing_ack(struct cache *cache)
1780{
1781	wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1782}
1783
1784static void start_quiescing(struct cache *cache)
1785{
1786	atomic_inc(&cache->quiescing);
1787	wait_for_quiescing_ack(cache);
1788}
1789
1790static void stop_quiescing(struct cache *cache)
1791{
1792	atomic_set(&cache->quiescing, 0);
1793	atomic_set(&cache->quiescing_ack, 0);
1794}
1795
1796static void wait_for_migrations(struct cache *cache)
1797{
1798	wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
1799}
1800
1801static void stop_worker(struct cache *cache)
1802{
1803	cancel_delayed_work(&cache->waker);
1804	flush_workqueue(cache->wq);
1805}
1806
1807static void requeue_deferred_io(struct cache *cache)
1808{
1809	struct bio *bio;
1810	struct bio_list bios;
1811
1812	bio_list_init(&bios);
1813	bio_list_merge(&bios, &cache->deferred_bios);
1814	bio_list_init(&cache->deferred_bios);
1815
1816	while ((bio = bio_list_pop(&bios)))
1817		bio_endio(bio, DM_ENDIO_REQUEUE);
1818}
1819
1820static int more_work(struct cache *cache)
1821{
1822	if (is_quiescing(cache))
1823		return !list_empty(&cache->quiesced_migrations) ||
1824			!list_empty(&cache->completed_migrations) ||
1825			!list_empty(&cache->need_commit_migrations);
1826	else
1827		return !bio_list_empty(&cache->deferred_bios) ||
1828			!bio_list_empty(&cache->deferred_flush_bios) ||
1829			!bio_list_empty(&cache->deferred_writethrough_bios) ||
1830			!list_empty(&cache->quiesced_migrations) ||
1831			!list_empty(&cache->completed_migrations) ||
1832			!list_empty(&cache->need_commit_migrations) ||
1833			cache->invalidate;
1834}
1835
1836static void do_worker(struct work_struct *ws)
1837{
1838	struct cache *cache = container_of(ws, struct cache, worker);
1839
1840	do {
1841		if (!is_quiescing(cache)) {
1842			writeback_some_dirty_blocks(cache);
1843			process_deferred_writethrough_bios(cache);
1844			process_deferred_bios(cache);
1845			process_invalidation_requests(cache);
1846		}
1847
1848		process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
1849		process_migrations(cache, &cache->completed_migrations, complete_migration);
1850
1851		if (commit_if_needed(cache)) {
1852			process_deferred_flush_bios(cache, false);
1853			process_migrations(cache, &cache->need_commit_migrations, migration_failure);
1854
1855			/*
1856			 * FIXME: rollback metadata or just go into a
1857			 * failure mode and error everything
1858			 */
1859		} else {
1860			process_deferred_flush_bios(cache, true);
1861			process_migrations(cache, &cache->need_commit_migrations,
1862					   migration_success_post_commit);
1863		}
1864
1865		ack_quiescing(cache);
1866
1867	} while (more_work(cache));
1868}
1869
1870/*
1871 * We want to commit periodically so that not too much
1872 * unwritten metadata builds up.
1873 */
1874static void do_waker(struct work_struct *ws)
1875{
1876	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1877	policy_tick(cache->policy);
1878	wake_worker(cache);
1879	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1880}
1881
1882/*----------------------------------------------------------------*/
1883
1884static int is_congested(struct dm_dev *dev, int bdi_bits)
1885{
1886	struct request_queue *q = bdev_get_queue(dev->bdev);
1887	return bdi_congested(&q->backing_dev_info, bdi_bits);
1888}
1889
1890static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1891{
1892	struct cache *cache = container_of(cb, struct cache, callbacks);
1893
1894	return is_congested(cache->origin_dev, bdi_bits) ||
1895		is_congested(cache->cache_dev, bdi_bits);
1896}
1897
1898/*----------------------------------------------------------------
1899 * Target methods
1900 *--------------------------------------------------------------*/
1901
1902/*
1903 * This function gets called on the error paths of the constructor, so we
1904 * have to cope with a partially initialised struct.
1905 */
1906static void destroy(struct cache *cache)
1907{
1908	unsigned i;
1909
1910	if (cache->migration_pool)
1911		mempool_destroy(cache->migration_pool);
1912
1913	if (cache->all_io_ds)
1914		dm_deferred_set_destroy(cache->all_io_ds);
1915
1916	if (cache->prison)
1917		dm_bio_prison_destroy(cache->prison);
1918
1919	if (cache->wq)
1920		destroy_workqueue(cache->wq);
1921
1922	if (cache->dirty_bitset)
1923		free_bitset(cache->dirty_bitset);
1924
1925	if (cache->discard_bitset)
1926		free_bitset(cache->discard_bitset);
1927
1928	if (cache->copier)
1929		dm_kcopyd_client_destroy(cache->copier);
1930
1931	if (cache->cmd)
1932		dm_cache_metadata_close(cache->cmd);
1933
1934	if (cache->metadata_dev)
1935		dm_put_device(cache->ti, cache->metadata_dev);
1936
1937	if (cache->origin_dev)
1938		dm_put_device(cache->ti, cache->origin_dev);
1939
1940	if (cache->cache_dev)
1941		dm_put_device(cache->ti, cache->cache_dev);
1942
1943	if (cache->policy)
1944		dm_cache_policy_destroy(cache->policy);
1945
1946	for (i = 0; i < cache->nr_ctr_args ; i++)
1947		kfree(cache->ctr_args[i]);
1948	kfree(cache->ctr_args);
1949
1950	kfree(cache);
1951}
1952
1953static void cache_dtr(struct dm_target *ti)
1954{
1955	struct cache *cache = ti->private;
1956
1957	destroy(cache);
1958}
1959
1960static sector_t get_dev_size(struct dm_dev *dev)
1961{
1962	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1963}
1964
1965/*----------------------------------------------------------------*/
1966
1967/*
1968 * Construct a cache device mapping.
1969 *
1970 * cache <metadata dev> <cache dev> <origin dev> <block size>
1971 *       <#feature args> [<feature arg>]*
1972 *       <policy> <#policy args> [<policy arg>]*
1973 *
1974 * metadata dev    : fast device holding the persistent metadata
1975 * cache dev	   : fast device holding cached data blocks
1976 * origin dev	   : slow device holding original data blocks
1977 * block size	   : cache unit size in sectors
1978 *
1979 * #feature args   : number of feature arguments passed
1980 * feature args    : writethrough or passthrough.  (The default is writeback.)
1981 *
1982 * policy	   : the replacement policy to use
1983 * #policy args    : an even number of policy arguments corresponding
1984 *		     to key/value pairs passed to the policy
1985 * policy args	   : key/value pairs passed to the policy
1986 *		     E.g. 'sequential_threshold 1024'
1987 *		     See cache-policies.txt for details.
1988 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *		     content from being different from origin block content.
 *		     Without this argument, the default behaviour is to write
 *		     back cache block contents later for performance reasons,
 *		     so they may differ from the corresponding origin blocks.
 *   passthrough   : a degraded mode useful for various cache coherency
 *		     situations (e.g. rolling back snapshots of the origin).
 *		     Reads and writes always go to the origin; a write to a
 *		     block that is currently cached invalidates that cache
 *		     block.  The cache must be completely clean to enter
 *		     passthrough mode.
1995 */
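
/*
 * For illustration only (device names and sizes below are made up), a
 * dmsetup table line for this target might look like:
 *
 *   0 4194304 cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow \
 *             512 1 writeback default 2 sequential_threshold 1024
 *
 * i.e. 512 sector (256KB) cache blocks, explicit writeback mode, and the
 * default policy with a single key/value pair.
 */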
1996struct cache_args {
1997	struct dm_target *ti;
1998
1999	struct dm_dev *metadata_dev;
2000
2001	struct dm_dev *cache_dev;
2002	sector_t cache_sectors;
2003
2004	struct dm_dev *origin_dev;
2005	sector_t origin_sectors;
2006
2007	uint32_t block_size;
2008
2009	const char *policy_name;
2010	int policy_argc;
2011	const char **policy_argv;
2012
2013	struct cache_features features;
2014};
2015
2016static void destroy_cache_args(struct cache_args *ca)
2017{
2018	if (ca->metadata_dev)
2019		dm_put_device(ca->ti, ca->metadata_dev);
2020
2021	if (ca->cache_dev)
2022		dm_put_device(ca->ti, ca->cache_dev);
2023
2024	if (ca->origin_dev)
2025		dm_put_device(ca->ti, ca->origin_dev);
2026
2027	kfree(ca);
2028}
2029
2030static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2031{
2032	if (!as->argc) {
2033		*error = "Insufficient args";
2034		return false;
2035	}
2036
2037	return true;
2038}
2039
2040static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2041			      char **error)
2042{
2043	int r;
2044	sector_t metadata_dev_size;
2045	char b[BDEVNAME_SIZE];
2046
2047	if (!at_least_one_arg(as, error))
2048		return -EINVAL;
2049
2050	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2051			  &ca->metadata_dev);
2052	if (r) {
2053		*error = "Error opening metadata device";
2054		return r;
2055	}
2056
2057	metadata_dev_size = get_dev_size(ca->metadata_dev);
2058	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(ca->metadata_dev->bdev, b),
		       (unsigned) DM_CACHE_METADATA_MAX_SECTORS_WARNING);
2061
2062	return 0;
2063}
2064
2065static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2066			   char **error)
2067{
2068	int r;
2069
2070	if (!at_least_one_arg(as, error))
2071		return -EINVAL;
2072
2073	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2074			  &ca->cache_dev);
2075	if (r) {
2076		*error = "Error opening cache device";
2077		return r;
2078	}
2079	ca->cache_sectors = get_dev_size(ca->cache_dev);
2080
2081	return 0;
2082}
2083
2084static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2085			    char **error)
2086{
2087	int r;
2088
2089	if (!at_least_one_arg(as, error))
2090		return -EINVAL;
2091
2092	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2093			  &ca->origin_dev);
2094	if (r) {
2095		*error = "Error opening origin device";
2096		return r;
2097	}
2098
2099	ca->origin_sectors = get_dev_size(ca->origin_dev);
2100	if (ca->ti->len > ca->origin_sectors) {
2101		*error = "Device size larger than cached device";
2102		return -EINVAL;
2103	}
2104
2105	return 0;
2106}
2107
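/*
 * A worked example of the block size checks below (numbers illustrative):
 * 512 sectors (256KB) is accepted; 96 sectors is rejected because,
 * although it is above the 64 sector (32KB) minimum, it is not a
 * multiple of DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
 */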
2108static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2109			    char **error)
2110{
2111	unsigned long block_size;
2112
2113	if (!at_least_one_arg(as, error))
2114		return -EINVAL;
2115
2116	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2117	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2118	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2119	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2120		*error = "Invalid data block size";
2121		return -EINVAL;
2122	}
2123
2124	if (block_size > ca->cache_sectors) {
2125		*error = "Data block size is larger than the cache device";
2126		return -EINVAL;
2127	}
2128
2129	ca->block_size = block_size;
2130
2131	return 0;
2132}
2133
2134static void init_features(struct cache_features *cf)
2135{
2136	cf->mode = CM_WRITE;
2137	cf->io_mode = CM_IO_WRITEBACK;
2138}
2139
2140static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2141			  char **error)
2142{
2143	static struct dm_arg _args[] = {
2144		{0, 1, "Invalid number of cache feature arguments"},
2145	};
2146
2147	int r;
2148	unsigned argc;
2149	const char *arg;
2150	struct cache_features *cf = &ca->features;
2151
2152	init_features(cf);
2153
2154	r = dm_read_arg_group(_args, as, &argc, error);
2155	if (r)
2156		return -EINVAL;
2157
2158	while (argc--) {
2159		arg = dm_shift_arg(as);
2160
2161		if (!strcasecmp(arg, "writeback"))
2162			cf->io_mode = CM_IO_WRITEBACK;
2163
2164		else if (!strcasecmp(arg, "writethrough"))
2165			cf->io_mode = CM_IO_WRITETHROUGH;
2166
2167		else if (!strcasecmp(arg, "passthrough"))
2168			cf->io_mode = CM_IO_PASSTHROUGH;
2169
2170		else {
2171			*error = "Unrecognised cache feature requested";
2172			return -EINVAL;
2173		}
2174	}
2175
2176	return 0;
2177}
2178
2179static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2180			char **error)
2181{
2182	static struct dm_arg _args[] = {
2183		{0, 1024, "Invalid number of policy arguments"},
2184	};
2185
2186	int r;
2187
2188	if (!at_least_one_arg(as, error))
2189		return -EINVAL;
2190
2191	ca->policy_name = dm_shift_arg(as);
2192
2193	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2194	if (r)
2195		return -EINVAL;
2196
2197	ca->policy_argv = (const char **)as->argv;
2198	dm_consume_args(as, ca->policy_argc);
2199
2200	return 0;
2201}
2202
2203static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2204			    char **error)
2205{
2206	int r;
2207	struct dm_arg_set as;
2208
2209	as.argc = argc;
2210	as.argv = argv;
2211
2212	r = parse_metadata_dev(ca, &as, error);
2213	if (r)
2214		return r;
2215
2216	r = parse_cache_dev(ca, &as, error);
2217	if (r)
2218		return r;
2219
2220	r = parse_origin_dev(ca, &as, error);
2221	if (r)
2222		return r;
2223
2224	r = parse_block_size(ca, &as, error);
2225	if (r)
2226		return r;
2227
2228	r = parse_features(ca, &as, error);
2229	if (r)
2230		return r;
2231
2232	r = parse_policy(ca, &as, error);
2233	if (r)
2234		return r;
2235
2236	return 0;
2237}
2238
2239/*----------------------------------------------------------------*/
2240
2241static struct kmem_cache *migration_cache;
2242
2243#define NOT_CORE_OPTION 1
2244
2245static int process_config_option(struct cache *cache, const char *key, const char *value)
2246{
2247	unsigned long tmp;
2248
2249	if (!strcasecmp(key, "migration_threshold")) {
2250		if (kstrtoul(value, 10, &tmp))
2251			return -EINVAL;
2252
2253		cache->migration_threshold = tmp;
2254		return 0;
2255	}
2256
2257	return NOT_CORE_OPTION;
2258}
2259
2260static int set_config_value(struct cache *cache, const char *key, const char *value)
2261{
2262	int r = process_config_option(cache, key, value);
2263
2264	if (r == NOT_CORE_OPTION)
2265		r = policy_set_config_value(cache->policy, key, value);
2266
2267	if (r)
2268		DMWARN("bad config value for %s: %s", key, value);
2269
2270	return r;
2271}
2272
2273static int set_config_values(struct cache *cache, int argc, const char **argv)
2274{
2275	int r = 0;
2276
2277	if (argc & 1) {
2278		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2279		return -EINVAL;
2280	}
2281
2282	while (argc) {
2283		r = set_config_value(cache, argv[0], argv[1]);
2284		if (r)
2285			break;
2286
2287		argc -= 2;
2288		argv += 2;
2289	}
2290
2291	return r;
2292}
2293
2294static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2295			       char **error)
2296{
2297	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2298							   cache->cache_size,
2299							   cache->origin_sectors,
2300							   cache->sectors_per_block);
2301	if (IS_ERR(p)) {
2302		*error = "Error creating cache's policy";
2303		return PTR_ERR(p);
2304	}
2305	cache->policy = p;
2306
2307	return 0;
2308}
2309
2310/*
 * We want the discard block size to be at least the cache block size
 * and to give no more than 2^14 discard blocks across the origin.
2313 */
2314#define MAX_DISCARD_BLOCKS (1 << 14)
2315
2316static bool too_many_discard_blocks(sector_t discard_block_size,
2317				    sector_t origin_size)
2318{
2319	(void) sector_div(origin_size, discard_block_size);
2320
2321	return origin_size > MAX_DISCARD_BLOCKS;
2322}
2323
2324static sector_t calculate_discard_block_size(sector_t cache_block_size,
2325					     sector_t origin_size)
2326{
2327	sector_t discard_block_size = cache_block_size;
2328
2329	if (origin_size)
2330		while (too_many_discard_blocks(discard_block_size, origin_size))
2331			discard_block_size *= 2;
2332
2333	return discard_block_size;
2334}
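
/*
 * For example (numbers illustrative): with 512 sector cache blocks and a
 * 2TiB origin (2^32 sectors), starting at 512 sectors would give 2^23
 * discard blocks, so the loop above keeps doubling until the discard
 * block size reaches 2^18 sectors (128MiB), leaving 2^14 discard blocks.
 */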
2335
2336static void set_cache_size(struct cache *cache, dm_cblock_t size)
2337{
2338	dm_block_t nr_blocks = from_cblock(size);
2339
2340	if (nr_blocks > (1 << 20) && cache->cache_size != size)
2341		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2342			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2343			     "Please consider increasing the cache block size to reduce the overall cache block count.",
2344			     (unsigned long long) nr_blocks);
2345
2346	cache->cache_size = size;
2347}
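
/*
 * E.g. (numbers illustrative): a 1TiB cache device with 64KiB cache
 * blocks is 2^24 mappings and will trigger the warning above, whereas
 * 1MiB cache blocks give 2^20 mappings, which will not.
 */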
2348
2349#define DEFAULT_MIGRATION_THRESHOLD 2048
2350
2351static int cache_create(struct cache_args *ca, struct cache **result)
2352{
2353	int r = 0;
2354	char **error = &ca->ti->error;
2355	struct cache *cache;
2356	struct dm_target *ti = ca->ti;
2357	dm_block_t origin_blocks;
2358	struct dm_cache_metadata *cmd;
2359	bool may_format = ca->features.mode == CM_WRITE;
2360
2361	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2362	if (!cache)
2363		return -ENOMEM;
2364
2365	cache->ti = ca->ti;
2366	ti->private = cache;
2367	ti->num_flush_bios = 2;
2368	ti->flush_supported = true;
2369
2370	ti->num_discard_bios = 1;
2371	ti->discards_supported = true;
2372	ti->discard_zeroes_data_unsupported = true;
2373	ti->split_discard_bios = false;
2374
2375	cache->features = ca->features;
2376	ti->per_bio_data_size = get_per_bio_data_size(cache);
2377
2378	cache->callbacks.congested_fn = cache_is_congested;
2379	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2380
2381	cache->metadata_dev = ca->metadata_dev;
2382	cache->origin_dev = ca->origin_dev;
2383	cache->cache_dev = ca->cache_dev;
2384
2385	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2386
2387	/* FIXME: factor out this whole section */
2388	origin_blocks = cache->origin_sectors = ca->origin_sectors;
2389	origin_blocks = block_div(origin_blocks, ca->block_size);
2390	cache->origin_blocks = to_oblock(origin_blocks);
2391
2392	cache->sectors_per_block = ca->block_size;
2393	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2394		r = -EINVAL;
2395		goto bad;
2396	}
2397
2398	if (ca->block_size & (ca->block_size - 1)) {
2399		dm_block_t cache_size = ca->cache_sectors;
2400
2401		cache->sectors_per_block_shift = -1;
2402		cache_size = block_div(cache_size, ca->block_size);
2403		set_cache_size(cache, to_cblock(cache_size));
2404	} else {
2405		cache->sectors_per_block_shift = __ffs(ca->block_size);
2406		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2407	}
2408
2409	r = create_cache_policy(cache, ca, error);
2410	if (r)
2411		goto bad;
2412
2413	cache->policy_nr_args = ca->policy_argc;
2414	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2415
2416	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2417	if (r) {
2418		*error = "Error setting cache policy's config values";
2419		goto bad;
2420	}
2421
2422	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2423				     ca->block_size, may_format,
2424				     dm_cache_policy_get_hint_size(cache->policy));
2425	if (IS_ERR(cmd)) {
2426		*error = "Error creating metadata object";
2427		r = PTR_ERR(cmd);
2428		goto bad;
2429	}
2430	cache->cmd = cmd;
2431
2432	if (passthrough_mode(&cache->features)) {
2433		bool all_clean;
2434
2435		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2436		if (r) {
2437			*error = "dm_cache_metadata_all_clean() failed";
2438			goto bad;
2439		}
2440
2441		if (!all_clean) {
2442			*error = "Cannot enter passthrough mode unless all blocks are clean";
2443			r = -EINVAL;
2444			goto bad;
2445		}
2446	}
2447
2448	spin_lock_init(&cache->lock);
2449	bio_list_init(&cache->deferred_bios);
2450	bio_list_init(&cache->deferred_flush_bios);
2451	bio_list_init(&cache->deferred_writethrough_bios);
2452	INIT_LIST_HEAD(&cache->quiesced_migrations);
2453	INIT_LIST_HEAD(&cache->completed_migrations);
2454	INIT_LIST_HEAD(&cache->need_commit_migrations);
2455	atomic_set(&cache->nr_allocated_migrations, 0);
2456	atomic_set(&cache->nr_io_migrations, 0);
2457	init_waitqueue_head(&cache->migration_wait);
2458
2459	init_waitqueue_head(&cache->quiescing_wait);
2460	atomic_set(&cache->quiescing, 0);
2461	atomic_set(&cache->quiescing_ack, 0);
2462
2463	r = -ENOMEM;
2464	atomic_set(&cache->nr_dirty, 0);
2465	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2466	if (!cache->dirty_bitset) {
2467		*error = "could not allocate dirty bitset";
2468		goto bad;
2469	}
2470	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2471
2472	cache->discard_block_size =
2473		calculate_discard_block_size(cache->sectors_per_block,
2474					     cache->origin_sectors);
2475	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2476							      cache->discard_block_size));
2477	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2478	if (!cache->discard_bitset) {
2479		*error = "could not allocate discard bitset";
2480		goto bad;
2481	}
2482	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2483
2484	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2485	if (IS_ERR(cache->copier)) {
2486		*error = "could not create kcopyd client";
2487		r = PTR_ERR(cache->copier);
2488		goto bad;
2489	}
2490
2491	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2492	if (!cache->wq) {
		*error = "could not create workqueue";
2494		goto bad;
2495	}
2496	INIT_WORK(&cache->worker, do_worker);
2497	INIT_DELAYED_WORK(&cache->waker, do_waker);
2498	cache->last_commit_jiffies = jiffies;
2499
2500	cache->prison = dm_bio_prison_create();
2501	if (!cache->prison) {
2502		*error = "could not create bio prison";
2503		goto bad;
2504	}
2505
2506	cache->all_io_ds = dm_deferred_set_create();
2507	if (!cache->all_io_ds) {
2508		*error = "could not create all_io deferred set";
2509		goto bad;
2510	}
2511
2512	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2513							 migration_cache);
2514	if (!cache->migration_pool) {
2515		*error = "Error creating cache's migration mempool";
2516		goto bad;
2517	}
2518
2519	cache->need_tick_bio = true;
2520	cache->sized = false;
2521	cache->invalidate = false;
2522	cache->commit_requested = false;
2523	cache->loaded_mappings = false;
2524	cache->loaded_discards = false;
2525
2526	load_stats(cache);
2527
2528	atomic_set(&cache->stats.demotion, 0);
2529	atomic_set(&cache->stats.promotion, 0);
2530	atomic_set(&cache->stats.copies_avoided, 0);
2531	atomic_set(&cache->stats.cache_cell_clash, 0);
2532	atomic_set(&cache->stats.commit_count, 0);
2533	atomic_set(&cache->stats.discard_count, 0);
2534
2535	spin_lock_init(&cache->invalidation_lock);
2536	INIT_LIST_HEAD(&cache->invalidation_requests);
2537
2538	*result = cache;
2539	return 0;
2540
2541bad:
2542	destroy(cache);
2543	return r;
2544}
2545
2546static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2547{
2548	unsigned i;
2549	const char **copy;
2550
2551	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2552	if (!copy)
2553		return -ENOMEM;
2554	for (i = 0; i < argc; i++) {
2555		copy[i] = kstrdup(argv[i], GFP_KERNEL);
2556		if (!copy[i]) {
2557			while (i--)
2558				kfree(copy[i]);
2559			kfree(copy);
2560			return -ENOMEM;
2561		}
2562	}
2563
2564	cache->nr_ctr_args = argc;
2565	cache->ctr_args = copy;
2566
2567	return 0;
2568}
2569
2570static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2571{
2572	int r = -EINVAL;
2573	struct cache_args *ca;
2574	struct cache *cache = NULL;
2575
2576	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2577	if (!ca) {
2578		ti->error = "Error allocating memory for cache";
2579		return -ENOMEM;
2580	}
2581	ca->ti = ti;
2582
2583	r = parse_cache_args(ca, argc, argv, &ti->error);
2584	if (r)
2585		goto out;
2586
2587	r = cache_create(ca, &cache);
2588	if (r)
2589		goto out;
2590
2591	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2592	if (r) {
2593		destroy(cache);
2594		goto out;
2595	}
2596
2597	ti->private = cache;
2598
2599out:
2600	destroy_cache_args(ca);
2601	return r;
2602}
2603
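/*
 * Fast-path mapping decision.  Returns DM_MAPIO_REMAPPED once the bio has
 * been remapped to the cache or origin device (with *cell held if the
 * block was detained), or DM_MAPIO_SUBMITTED if the bio was deferred to
 * the worker thread or completed/errored here.
 */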
2604static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
2605{
2606	int r;
2607	dm_oblock_t block = get_bio_block(cache, bio);
2608	size_t pb_data_size = get_per_bio_data_size(cache);
2609	bool can_migrate = false;
2610	bool discarded_block;
2611	struct policy_result lookup_result;
2612	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2613	struct old_oblock_lock ool;
2614
2615	ool.locker.fn = null_locker;
2616
2617	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2618		/*
2619		 * This can only occur if the io goes to a partial block at
2620		 * the end of the origin device.  We don't cache these.
2621		 * Just remap to the origin and carry on.
2622		 */
2623		remap_to_origin(cache, bio);
2624		return DM_MAPIO_REMAPPED;
2625	}
2626
2627	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2628		defer_bio(cache, bio);
2629		return DM_MAPIO_SUBMITTED;
2630	}
2631
2632	/*
2633	 * Check to see if that block is currently migrating.
2634	 */
2635	*cell = alloc_prison_cell(cache);
2636	if (!*cell) {
2637		defer_bio(cache, bio);
2638		return DM_MAPIO_SUBMITTED;
2639	}
2640
2641	r = bio_detain(cache, block, bio, *cell,
2642		       (cell_free_fn) free_prison_cell,
2643		       cache, cell);
2644	if (r) {
2645		if (r < 0)
2646			defer_bio(cache, bio);
2647
2648		return DM_MAPIO_SUBMITTED;
2649	}
2650
2651	discarded_block = is_discarded_oblock(cache, block);
2652
2653	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2654		       bio, &ool.locker, &lookup_result);
2655	if (r == -EWOULDBLOCK) {
2656		cell_defer(cache, *cell, true);
2657		return DM_MAPIO_SUBMITTED;
2658
2659	} else if (r) {
2660		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2661		cell_defer(cache, *cell, false);
2662		bio_io_error(bio);
2663		return DM_MAPIO_SUBMITTED;
2664	}
2665
2666	r = DM_MAPIO_REMAPPED;
2667	switch (lookup_result.op) {
2668	case POLICY_HIT:
2669		if (passthrough_mode(&cache->features)) {
2670			if (bio_data_dir(bio) == WRITE) {
2671				/*
2672				 * We need to invalidate this block, so
2673				 * defer for the worker thread.
2674				 */
2675				cell_defer(cache, *cell, true);
2676				r = DM_MAPIO_SUBMITTED;
2677
2678			} else {
2679				inc_miss_counter(cache, bio);
2680				remap_to_origin_clear_discard(cache, bio, block);
2681			}
2682
2683		} else {
2684			inc_hit_counter(cache, bio);
2685			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2686			    !is_dirty(cache, lookup_result.cblock))
2687				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2688			else
2689				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2690		}
2691		break;
2692
2693	case POLICY_MISS:
2694		inc_miss_counter(cache, bio);
2695		if (pb->req_nr != 0) {
2696			/*
2697			 * This is a duplicate writethrough io that is no
2698			 * longer needed because the block has been demoted.
2699			 */
2700			bio_endio(bio, 0);
2701			cell_defer(cache, *cell, false);
2702			r = DM_MAPIO_SUBMITTED;
2703
2704		} else
2705			remap_to_origin_clear_discard(cache, bio, block);
2706
2707		break;
2708
2709	default:
2710		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2711			    (unsigned) lookup_result.op);
2712		cell_defer(cache, *cell, false);
2713		bio_io_error(bio);
2714		r = DM_MAPIO_SUBMITTED;
2715	}
2716
2717	return r;
2718}
2719
2720static int cache_map(struct dm_target *ti, struct bio *bio)
2721{
2722	int r;
2723	struct dm_bio_prison_cell *cell = NULL;
2724	struct cache *cache = ti->private;
2725
2726	r = __cache_map(cache, bio, &cell);
2727	if (r == DM_MAPIO_REMAPPED && cell) {
2728		inc_ds(cache, bio, cell);
2729		cell_defer(cache, cell, false);
2730	}
2731
2732	return r;
2733}
2734
2735static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2736{
2737	struct cache *cache = ti->private;
2738	unsigned long flags;
2739	size_t pb_data_size = get_per_bio_data_size(cache);
2740	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2741
2742	if (pb->tick) {
2743		policy_tick(cache->policy);
2744
2745		spin_lock_irqsave(&cache->lock, flags);
2746		cache->need_tick_bio = true;
2747		spin_unlock_irqrestore(&cache->lock, flags);
2748	}
2749
2750	check_for_quiesced_migrations(cache, pb);
2751
2752	return 0;
2753}
2754
2755static int write_dirty_bitset(struct cache *cache)
2756{
	int r;
	unsigned i;
2758
2759	for (i = 0; i < from_cblock(cache->cache_size); i++) {
2760		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2761				       is_dirty(cache, to_cblock(i)));
2762		if (r)
2763			return r;
2764	}
2765
2766	return 0;
2767}
2768
2769static int write_discard_bitset(struct cache *cache)
2770{
	int r;
	unsigned i;
2772
2773	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2774					   cache->discard_nr_blocks);
2775	if (r) {
2776		DMERR("could not resize on-disk discard bitset");
2777		return r;
2778	}
2779
2780	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2781		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2782					 is_discarded(cache, to_dblock(i)));
2783		if (r)
2784			return r;
2785	}
2786
2787	return 0;
2788}
2789
2790/*
2791 * returns true on success
2792 */
2793static bool sync_metadata(struct cache *cache)
2794{
2795	int r1, r2, r3, r4;
2796
2797	r1 = write_dirty_bitset(cache);
2798	if (r1)
2799		DMERR("could not write dirty bitset");
2800
2801	r2 = write_discard_bitset(cache);
2802	if (r2)
2803		DMERR("could not write discard bitset");
2804
2805	save_stats(cache);
2806
2807	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
2808	if (r3)
2809		DMERR("could not write hints");
2810
2811	/*
2812	 * If writing the above metadata failed, we still commit, but don't
2813	 * set the clean shutdown flag.  This will effectively force every
2814	 * dirty bit to be set on reload.
2815	 */
2816	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2817	if (r4)
2818		DMERR("could not write cache metadata.  Data loss may occur.");
2819
2820	return !r1 && !r2 && !r3 && !r4;
2821}
2822
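/*
 * Quiesce in-flight I/O and migrations, stop the worker, requeue any
 * deferred bios and finally write the metadata out (see sync_metadata)
 * before the device is suspended.
 */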
2823static void cache_postsuspend(struct dm_target *ti)
2824{
2825	struct cache *cache = ti->private;
2826
2827	start_quiescing(cache);
2828	wait_for_migrations(cache);
2829	stop_worker(cache);
2830	requeue_deferred_io(cache);
2831	stop_quiescing(cache);
2832
2833	(void) sync_metadata(cache);
2834}
2835
2836static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2837			bool dirty, uint32_t hint, bool hint_valid)
2838{
2839	int r;
2840	struct cache *cache = context;
2841
2842	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2843	if (r)
2844		return r;
2845
2846	if (dirty)
2847		set_dirty(cache, oblock, cblock);
2848	else
2849		clear_dirty(cache, oblock, cblock);
2850
2851	return 0;
2852}
2853
2854/*
2855 * The discard block size in the on disk metadata is not
 * necessarily the same as we're currently using.  So we have to
2857 * be careful to only set the discarded attribute if we know it
2858 * covers a complete block of the new size.
2859 */
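
/*
 * E.g. (numbers illustrative): if the metadata was written with 1024
 * sector discard blocks and we are now using 2048 sector blocks, an old
 * range of blocks [3, 5) covers sectors [3072, 5120), which rounds
 * inwards to the empty new range [2, 2); nothing is marked discarded
 * because no new-size block is completely covered.
 */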
2860struct discard_load_info {
2861	struct cache *cache;
2862
2863	/*
2864	 * These blocks are sized using the on disk dblock size, rather
2865	 * than the current one.
2866	 */
2867	dm_block_t block_size;
2868	dm_block_t discard_begin, discard_end;
2869};
2870
2871static void discard_load_info_init(struct cache *cache,
2872				   struct discard_load_info *li)
2873{
2874	li->cache = cache;
2875	li->discard_begin = li->discard_end = 0;
2876}
2877
2878static void set_discard_range(struct discard_load_info *li)
2879{
2880	sector_t b, e;
2881
2882	if (li->discard_begin == li->discard_end)
2883		return;
2884
2885	/*
2886	 * Convert to sectors.
2887	 */
2888	b = li->discard_begin * li->block_size;
2889	e = li->discard_end * li->block_size;
2890
2891	/*
2892	 * Then convert back to the current dblock size.
2893	 */
2894	b = dm_sector_div_up(b, li->cache->discard_block_size);
2895	sector_div(e, li->cache->discard_block_size);
2896
2897	/*
2898	 * The origin may have shrunk, so we need to check we're still in
2899	 * bounds.
2900	 */
2901	if (e > from_dblock(li->cache->discard_nr_blocks))
2902		e = from_dblock(li->cache->discard_nr_blocks);
2903
2904	for (; b < e; b++)
2905		set_discard(li->cache, to_dblock(b));
2906}
2907
2908static int load_discard(void *context, sector_t discard_block_size,
2909			dm_dblock_t dblock, bool discard)
2910{
2911	struct discard_load_info *li = context;
2912
2913	li->block_size = discard_block_size;
2914
2915	if (discard) {
2916		if (from_dblock(dblock) == li->discard_end)
2917			/*
2918			 * We're already in a discard range, just extend it.
2919			 */
2920			li->discard_end = li->discard_end + 1ULL;
2921
2922		else {
2923			/*
2924			 * Emit the old range and start a new one.
2925			 */
2926			set_discard_range(li);
2927			li->discard_begin = from_dblock(dblock);
2928			li->discard_end = li->discard_begin + 1ULL;
2929		}
2930	} else {
2931		set_discard_range(li);
2932		li->discard_begin = li->discard_end = 0;
2933	}
2934
2935	return 0;
2936}
2937
2938static dm_cblock_t get_cache_dev_size(struct cache *cache)
2939{
2940	sector_t size = get_dev_size(cache->cache_dev);
2941	(void) sector_div(size, cache->sectors_per_block);
2942	return to_cblock(size);
2943}
2944
2945static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2946{
2947	if (from_cblock(new_size) > from_cblock(cache->cache_size))
2948		return true;
2949
2950	/*
2951	 * We can't drop a dirty block when shrinking the cache.
2952	 */
	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
		if (is_dirty(cache, new_size)) {
			DMERR("unable to shrink cache; cache block %llu is dirty",
			      (unsigned long long) from_cblock(new_size));
			return false;
		}
		new_size = to_cblock(from_cblock(new_size) + 1);
	}
2961
2962	return true;
2963}
2964
2965static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2966{
2967	int r;
2968
2969	r = dm_cache_resize(cache->cmd, new_size);
2970	if (r) {
2971		DMERR("could not resize cache metadata");
2972		return r;
2973	}
2974
2975	set_cache_size(cache, new_size);
2976
2977	return 0;
2978}
2979
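/*
 * Called before the device is resumed: pick up any change in the cache
 * device size, then (once per table load) populate the policy and the
 * in-core dirty/discard state from the on-disk metadata.
 */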
2980static int cache_preresume(struct dm_target *ti)
2981{
2982	int r = 0;
2983	struct cache *cache = ti->private;
2984	dm_cblock_t csize = get_cache_dev_size(cache);
2985
2986	/*
	 * Check to see if the cache device has been resized.
2988	 */
2989	if (!cache->sized) {
2990		r = resize_cache_dev(cache, csize);
2991		if (r)
2992			return r;
2993
2994		cache->sized = true;
2995
2996	} else if (csize != cache->cache_size) {
2997		if (!can_resize(cache, csize))
2998			return -EINVAL;
2999
3000		r = resize_cache_dev(cache, csize);
3001		if (r)
3002			return r;
3003	}
3004
3005	if (!cache->loaded_mappings) {
3006		r = dm_cache_load_mappings(cache->cmd, cache->policy,
3007					   load_mapping, cache);
3008		if (r) {
3009			DMERR("could not load cache mappings");
3010			return r;
3011		}
3012
3013		cache->loaded_mappings = true;
3014	}
3015
3016	if (!cache->loaded_discards) {
3017		struct discard_load_info li;
3018
3019		/*
3020		 * The discard bitset could have been resized, or the
3021		 * discard block size changed.  To be safe we start by
3022		 * setting every dblock to not discarded.
3023		 */
3024		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3025
3026		discard_load_info_init(cache, &li);
3027		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3028		if (r) {
3029			DMERR("could not load origin discards");
3030			return r;
3031		}
3032		set_discard_range(&li);
3033
3034		cache->loaded_discards = true;
3035	}
3036
3037	return r;
3038}
3039
3040static void cache_resume(struct dm_target *ti)
3041{
3042	struct cache *cache = ti->private;
3043
3044	cache->need_tick_bio = true;
3045	do_waker(&cache->waker.work);
3046}
3047
3048/*
3049 * Status format:
3050 *
3051 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3052 * <cache block size> <#used cache blocks>/<#total cache blocks>
3053 * <#read hits> <#read misses> <#write hits> <#write misses>
3054 * <#demotions> <#promotions> <#dirty>
3055 * <#features> <features>*
3056 * <#core args> <core args>
3057 * <policy name> <#policy args> <policy args>*
3058 */
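
/*
 * A made-up example of the INFO output (every value illustrative):
 *
 *   64 126/4096 512 1893/65536 843 1034 212 300 7 9 153 1 writeback \
 *   2 migration_threshold 2048 mq 2 sequential_threshold 512
 *
 * i.e. metadata block size and used/total metadata blocks, cache block
 * size and resident/total cache blocks, the read/write hit and miss
 * counters, demotions, promotions, the dirty count, one feature arg,
 * the two core args, and finally the policy name with its own key/value
 * pairs.
 */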
3059static void cache_status(struct dm_target *ti, status_type_t type,
3060			 unsigned status_flags, char *result, unsigned maxlen)
3061{
3062	int r = 0;
3063	unsigned i;
3064	ssize_t sz = 0;
3065	dm_block_t nr_free_blocks_metadata = 0;
3066	dm_block_t nr_blocks_metadata = 0;
3067	char buf[BDEVNAME_SIZE];
3068	struct cache *cache = ti->private;
3069	dm_cblock_t residency;
3070
3071	switch (type) {
3072	case STATUSTYPE_INFO:
3073		/* Commit to ensure statistics aren't out-of-date */
3074		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
3075			r = dm_cache_commit(cache->cmd, false);
3076			if (r)
3077				DMERR("could not commit metadata for accurate status");
3078		}
3079
3080		r = dm_cache_get_free_metadata_block_count(cache->cmd,
3081							   &nr_free_blocks_metadata);
3082		if (r) {
3083			DMERR("could not get metadata free block count");
3084			goto err;
3085		}
3086
3087		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3088		if (r) {
3089			DMERR("could not get metadata device size");
3090			goto err;
3091		}
3092
3093		residency = policy_residency(cache->policy);
3094
3095		DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
3096		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3097		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3098		       (unsigned long long)nr_blocks_metadata,
3099		       cache->sectors_per_block,
3100		       (unsigned long long) from_cblock(residency),
3101		       (unsigned long long) from_cblock(cache->cache_size),
3102		       (unsigned) atomic_read(&cache->stats.read_hit),
3103		       (unsigned) atomic_read(&cache->stats.read_miss),
3104		       (unsigned) atomic_read(&cache->stats.write_hit),
3105		       (unsigned) atomic_read(&cache->stats.write_miss),
3106		       (unsigned) atomic_read(&cache->stats.demotion),
3107		       (unsigned) atomic_read(&cache->stats.promotion),
3108		       (unsigned long) atomic_read(&cache->nr_dirty));
3109
3110		if (writethrough_mode(&cache->features))
3111			DMEMIT("1 writethrough ");
3112
3113		else if (passthrough_mode(&cache->features))
3114			DMEMIT("1 passthrough ");
3115
3116		else if (writeback_mode(&cache->features))
3117			DMEMIT("1 writeback ");
3118
3119		else {
3120			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
3121			goto err;
3122		}
3123
3124		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3125
3126		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3127		if (sz < maxlen) {
3128			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
3129			if (r)
3130				DMERR("policy_emit_config_values returned %d", r);
3131		}
3132
3133		break;
3134
3135	case STATUSTYPE_TABLE:
3136		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3137		DMEMIT("%s ", buf);
3138		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3139		DMEMIT("%s ", buf);
3140		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3141		DMEMIT("%s", buf);
3142
3143		for (i = 0; i < cache->nr_ctr_args - 1; i++)
3144			DMEMIT(" %s", cache->ctr_args[i]);
3145		if (cache->nr_ctr_args)
3146			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3147	}
3148
3149	return;
3150
3151err:
3152	DMEMIT("Error");
3153}
3154
3155/*
3156 * A cache block range can take two forms:
3157 *
 * i) A single cblock, e.g. '3456'
 * ii) A begin and end cblock with a hyphen between, e.g. 123-234
3160 */
3161static int parse_cblock_range(struct cache *cache, const char *str,
3162			      struct cblock_range *result)
3163{
3164	char dummy;
3165	uint64_t b, e;
3166	int r;
3167
3168	/*
3169	 * Try and parse form (ii) first.
3170	 */
3171	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3172	if (r < 0)
3173		return r;
3174
3175	if (r == 2) {
3176		result->begin = to_cblock(b);
3177		result->end = to_cblock(e);
3178		return 0;
3179	}
3180
3181	/*
3182	 * That didn't work, try form (i).
3183	 */
3184	r = sscanf(str, "%llu%c", &b, &dummy);
3185	if (r < 0)
3186		return r;
3187
3188	if (r == 1) {
3189		result->begin = to_cblock(b);
3190		result->end = to_cblock(from_cblock(result->begin) + 1u);
3191		return 0;
3192	}
3193
3194	DMERR("invalid cblock range '%s'", str);
3195	return -EINVAL;
3196}
3197
3198static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3199{
3200	uint64_t b = from_cblock(range->begin);
3201	uint64_t e = from_cblock(range->end);
3202	uint64_t n = from_cblock(cache->cache_size);
3203
3204	if (b >= n) {
3205		DMERR("begin cblock out of range: %llu >= %llu", b, n);
3206		return -EINVAL;
3207	}
3208
3209	if (e > n) {
3210		DMERR("end cblock out of range: %llu > %llu", e, n);
3211		return -EINVAL;
3212	}
3213
3214	if (b >= e) {
3215		DMERR("invalid cblock range: %llu >= %llu", b, e);
3216		return -EINVAL;
3217	}
3218
3219	return 0;
3220}
3221
3222static int request_invalidation(struct cache *cache, struct cblock_range *range)
3223{
3224	struct invalidation_request req;
3225
3226	INIT_LIST_HEAD(&req.list);
3227	req.cblocks = range;
3228	atomic_set(&req.complete, 0);
3229	req.err = 0;
3230	init_waitqueue_head(&req.result_wait);
3231
3232	spin_lock(&cache->invalidation_lock);
3233	list_add(&req.list, &cache->invalidation_requests);
3234	spin_unlock(&cache->invalidation_lock);
3235	wake_worker(cache);
3236
3237	wait_event(req.result_wait, atomic_read(&req.complete));
3238	return req.err;
3239}
3240
3241static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3242					      const char **cblock_ranges)
3243{
3244	int r = 0;
3245	unsigned i;
3246	struct cblock_range range;
3247
3248	if (!passthrough_mode(&cache->features)) {
3249		DMERR("cache has to be in passthrough mode for invalidation");
3250		return -EPERM;
3251	}
3252
3253	for (i = 0; i < count; i++) {
3254		r = parse_cblock_range(cache, cblock_ranges[i], &range);
3255		if (r)
3256			break;
3257
3258		r = validate_cblock_range(cache, &range);
3259		if (r)
3260			break;
3261
3262		/*
		 * Pass the begin and end cache blocks to the worker and wake it.
3264		 */
3265		r = request_invalidation(cache, &range);
3266		if (r)
3267			break;
3268	}
3269
3270	return r;
3271}
3272
3273/*
3274 * Supports
3275 *	"<key> <value>"
3276 * and
 *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3278 *
3279 * The key migration_threshold is supported by the cache target core.
3280 */
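
/*
 * E.g. (device name illustrative):
 *
 *   dmsetup message my_cache 0 migration_threshold 4096
 *   dmsetup message my_cache 0 invalidate_cblocks 3456 7890-8899
 *
 * (the latter only while the cache is in passthrough mode).
 */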
3281static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3282{
3283	struct cache *cache = ti->private;
3284
3285	if (!argc)
3286		return -EINVAL;
3287
3288	if (!strcasecmp(argv[0], "invalidate_cblocks"))
3289		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3290
3291	if (argc != 2)
3292		return -EINVAL;
3293
3294	return set_config_value(cache, argv[0], argv[1]);
3295}
3296
3297static int cache_iterate_devices(struct dm_target *ti,
3298				 iterate_devices_callout_fn fn, void *data)
3299{
3300	int r = 0;
3301	struct cache *cache = ti->private;
3302
3303	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3304	if (!r)
3305		r = fn(ti, cache->origin_dev, 0, ti->len, data);
3306
3307	return r;
3308}
3309
3310/*
3311 * We assume I/O is going to the origin (which is the volume
3312 * more likely to have restrictions e.g. by being striped).
3313 * (Looking up the exact location of the data would be expensive
3314 * and could always be out of date by the time the bio is submitted.)
3315 */
3316static int cache_bvec_merge(struct dm_target *ti,
3317			    struct bvec_merge_data *bvm,
3318			    struct bio_vec *biovec, int max_size)
3319{
3320	struct cache *cache = ti->private;
3321	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
3322
3323	if (!q->merge_bvec_fn)
3324		return max_size;
3325
3326	bvm->bi_bdev = cache->origin_dev->bdev;
3327	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3328}
3329
3330static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3331{
3332	/*
3333	 * FIXME: these limits may be incompatible with the cache device
3334	 */
3335	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3336					    cache->origin_sectors);
3337	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3338}
3339
3340static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3341{
3342	struct cache *cache = ti->private;
3343	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3344
3345	/*
3346	 * If the system-determined stacked limits are compatible with the
 * cache's blocksize (io_opt is a factor), do not override them.
3348	 */
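	/*
	 * E.g. (illustrative): a stacked io_opt of 2048 sectors (1MB) over
	 * 512 sector cache blocks is left alone; an io_opt of 384 sectors
	 * (192KB) is overridden below.
	 */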
3349	if (io_opt_sectors < cache->sectors_per_block ||
3350	    do_div(io_opt_sectors, cache->sectors_per_block)) {
3351		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3352		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3353	}
3354	set_discard_limits(cache, limits);
3355}
3356
3357/*----------------------------------------------------------------*/
3358
3359static struct target_type cache_target = {
3360	.name = "cache",
3361	.version = {1, 6, 0},
3362	.module = THIS_MODULE,
3363	.ctr = cache_ctr,
3364	.dtr = cache_dtr,
3365	.map = cache_map,
3366	.end_io = cache_end_io,
3367	.postsuspend = cache_postsuspend,
3368	.preresume = cache_preresume,
3369	.resume = cache_resume,
3370	.status = cache_status,
3371	.message = cache_message,
3372	.iterate_devices = cache_iterate_devices,
3373	.merge = cache_bvec_merge,
3374	.io_hints = cache_io_hints,
3375};
3376
3377static int __init dm_cache_init(void)
3378{
3379	int r;
3380
3381	r = dm_register_target(&cache_target);
3382	if (r) {
3383		DMERR("cache target registration failed: %d", r);
3384		return r;
3385	}
3386
3387	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3388	if (!migration_cache) {
3389		dm_unregister_target(&cache_target);
3390		return -ENOMEM;
3391	}
3392
3393	return 0;
3394}
3395
3396static void __exit dm_cache_exit(void)
3397{
3398	dm_unregister_target(&cache_target);
3399	kmem_cache_destroy(migration_cache);
3400}
3401
3402module_init(dm_cache_init);
3403module_exit(dm_cache_exit);
3404
3405MODULE_DESCRIPTION(DM_NAME " cache target");
3406MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3407MODULE_LICENSE("GPL");
3408