1/*
2 * NVM Express device driver
3 * Copyright (c) 2011-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/bitops.h>
16#include <linux/blkdev.h>
17#include <linux/blk-mq.h>
18#include <linux/cpu.h>
19#include <linux/delay.h>
20#include <linux/errno.h>
21#include <linux/fs.h>
22#include <linux/genhd.h>
23#include <linux/hdreg.h>
24#include <linux/idr.h>
25#include <linux/init.h>
26#include <linux/interrupt.h>
27#include <linux/io.h>
28#include <linux/kdev_t.h>
29#include <linux/kthread.h>
30#include <linux/kernel.h>
31#include <linux/list_sort.h>
32#include <linux/mm.h>
33#include <linux/module.h>
34#include <linux/moduleparam.h>
35#include <linux/pci.h>
36#include <linux/poison.h>
37#include <linux/ptrace.h>
38#include <linux/sched.h>
39#include <linux/slab.h>
40#include <linux/t10-pi.h>
41#include <linux/types.h>
42#include <linux/pr.h>
43#include <scsi/sg.h>
44#include <linux/io-64-nonatomic-lo-hi.h>
45#include <asm/unaligned.h>
46
47#include <uapi/linux/nvme_ioctl.h>
48#include "nvme.h"
49
50#define NVME_MINORS		(1U << MINORBITS)
51#define NVME_Q_DEPTH		1024
52#define NVME_AQ_DEPTH		256
53#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
54#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
55#define ADMIN_TIMEOUT		(admin_timeout * HZ)
56#define SHUTDOWN_TIMEOUT	(shutdown_timeout * HZ)
57
58static unsigned char admin_timeout = 60;
59module_param(admin_timeout, byte, 0644);
60MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
61
62unsigned char nvme_io_timeout = 30;
63module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
64MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
65
66static unsigned char shutdown_timeout = 5;
67module_param(shutdown_timeout, byte, 0644);
68MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
69
70static int nvme_major;
71module_param(nvme_major, int, 0);
72
73static int nvme_char_major;
74module_param(nvme_char_major, int, 0);
75
76static int use_threaded_interrupts;
77module_param(use_threaded_interrupts, int, 0);
78
79static bool use_cmb_sqes = true;
80module_param(use_cmb_sqes, bool, 0644);
81MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
82
83static DEFINE_SPINLOCK(dev_list_lock);
84static LIST_HEAD(dev_list);
85static struct task_struct *nvme_thread;
86static struct workqueue_struct *nvme_workq;
87static wait_queue_head_t nvme_kthread_wait;
88
89static struct class *nvme_class;
90
91static int __nvme_reset(struct nvme_dev *dev);
92static int nvme_reset(struct nvme_dev *dev);
93static void nvme_process_cq(struct nvme_queue *nvmeq);
94static void nvme_dead_ctrl(struct nvme_dev *dev);
95
96struct async_cmd_info {
97	struct kthread_work work;
98	struct kthread_worker *worker;
99	struct request *req;
100	u32 result;
101	int status;
102	void *ctx;
103};
104
105/*
106 * An NVM Express queue.  Each device has at least two (one for admin
107 * commands and one for I/O commands).
108 */
109struct nvme_queue {
110	struct device *q_dmadev;
111	struct nvme_dev *dev;
112	char irqname[24];	/* nvme4294967295-65535\0 */
113	spinlock_t q_lock;
114	struct nvme_command *sq_cmds;
115	struct nvme_command __iomem *sq_cmds_io;
116	volatile struct nvme_completion *cqes;
117	struct blk_mq_tags **tags;
118	dma_addr_t sq_dma_addr;
119	dma_addr_t cq_dma_addr;
120	u32 __iomem *q_db;
121	u16 q_depth;
122	s16 cq_vector;
123	u16 sq_head;
124	u16 sq_tail;
125	u16 cq_head;
126	u16 qid;
127	u8 cq_phase;
128	u8 cqe_seen;
129	struct async_cmd_info cmdinfo;
130};
131
132/*
133 * Check we didn't inadvertently grow the command struct
134 */
135static inline void _nvme_check_size(void)
136{
137	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
138	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
139	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
140	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
141	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
142	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
143	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
144	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
145	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
146	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
147	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
148	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
149}
150
151typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
152						struct nvme_completion *);
153
154struct nvme_cmd_info {
155	nvme_completion_fn fn;
156	void *ctx;
157	int aborted;
158	struct nvme_queue *nvmeq;
159	struct nvme_iod iod[0];
160};
161
162/*
163 * Max size of iod being embedded in the request payload
164 */
165#define NVME_INT_PAGES		2
166#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
167#define NVME_INT_MASK		0x01
168
169/*
170 * Will slightly overestimate the number of pages needed.  This is OK
171 * as it only leads to a small amount of wasted memory for the lifetime of
172 * the I/O.
173 */
174static int nvme_npages(unsigned size, struct nvme_dev *dev)
175{
176	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
177	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
178}
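
/*
 * Worked example (illustrative, assuming 4K device pages and a 4K
 * PAGE_SIZE): an 8K transfer gives nprps = DIV_ROUND_UP(8192 + 4096,
 * 4096) = 3 PRP entries, and DIV_ROUND_UP(3 * 8, 4096 - 8) = 1, so a
 * single PRP-list page (and one __le64 list pointer) is reserved.
 */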
179
180static unsigned int nvme_cmd_size(struct nvme_dev *dev)
181{
182	unsigned int ret = sizeof(struct nvme_cmd_info);
183
184	ret += sizeof(struct nvme_iod);
185	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
186	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
187
188	return ret;
189}
190
191static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
192				unsigned int hctx_idx)
193{
194	struct nvme_dev *dev = data;
195	struct nvme_queue *nvmeq = dev->queues[0];
196
197	WARN_ON(hctx_idx != 0);
198	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
199	WARN_ON(nvmeq->tags);
200
201	hctx->driver_data = nvmeq;
202	nvmeq->tags = &dev->admin_tagset.tags[0];
203	return 0;
204}
205
206static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
207{
208	struct nvme_queue *nvmeq = hctx->driver_data;
209
210	nvmeq->tags = NULL;
211}
212
213static int nvme_admin_init_request(void *data, struct request *req,
214				unsigned int hctx_idx, unsigned int rq_idx,
215				unsigned int numa_node)
216{
217	struct nvme_dev *dev = data;
218	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
219	struct nvme_queue *nvmeq = dev->queues[0];
220
221	BUG_ON(!nvmeq);
222	cmd->nvmeq = nvmeq;
223	return 0;
224}
225
226static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
227			  unsigned int hctx_idx)
228{
229	struct nvme_dev *dev = data;
230	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
231
232	if (!nvmeq->tags)
233		nvmeq->tags = &dev->tagset.tags[hctx_idx];
234
235	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
236	hctx->driver_data = nvmeq;
237	return 0;
238}
239
240static int nvme_init_request(void *data, struct request *req,
241				unsigned int hctx_idx, unsigned int rq_idx,
242				unsigned int numa_node)
243{
244	struct nvme_dev *dev = data;
245	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
246	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
247
248	BUG_ON(!nvmeq);
249	cmd->nvmeq = nvmeq;
250	return 0;
251}
252
253static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
254				nvme_completion_fn handler)
255{
256	cmd->fn = handler;
257	cmd->ctx = ctx;
258	cmd->aborted = 0;
259	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
260}
261
262static void *iod_get_private(struct nvme_iod *iod)
263{
264	return (void *) (iod->private & ~0x1UL);
265}
266
267/*
268 * If bit 0 is set, the iod is embedded in the request payload.
269 */
270static bool iod_should_kfree(struct nvme_iod *iod)
271{
272	return (iod->private & NVME_INT_MASK) == 0;
273}
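
/*
 * Illustration: nvme_alloc_iod() below tags the private field with
 * NVME_INT_MASK only when the iod embedded in the request PDU is used:
 *
 *	iod->private = (unsigned long) rq | NVME_INT_MASK;	(embedded)
 *	iod->private = (unsigned long) rq;			(kmalloc'ed)
 *
 * iod_get_private() masks the bit off to recover the request pointer,
 * and iod_should_kfree() frees only the kmalloc'ed variant.
 */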
274
275/* Special values must be less than 0x1000 */
276#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
277#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
278#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
279#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
280
281static void special_completion(struct nvme_queue *nvmeq, void *ctx,
282						struct nvme_completion *cqe)
283{
284	if (ctx == CMD_CTX_CANCELLED)
285		return;
286	if (ctx == CMD_CTX_COMPLETED) {
287		dev_warn(nvmeq->q_dmadev,
288				"completed id %d twice on queue %d\n",
289				cqe->command_id, le16_to_cpup(&cqe->sq_id));
290		return;
291	}
292	if (ctx == CMD_CTX_INVALID) {
293		dev_warn(nvmeq->q_dmadev,
294				"invalid id %d completed on queue %d\n",
295				cqe->command_id, le16_to_cpup(&cqe->sq_id));
296		return;
297	}
298	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
299}
300
301static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
302{
303	void *ctx;
304
305	if (fn)
306		*fn = cmd->fn;
307	ctx = cmd->ctx;
308	cmd->fn = special_completion;
309	cmd->ctx = CMD_CTX_CANCELLED;
310	return ctx;
311}
312
313static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
314						struct nvme_completion *cqe)
315{
316	u32 result = le32_to_cpup(&cqe->result);
317	u16 status = le16_to_cpup(&cqe->status) >> 1;
318
319	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
320		++nvmeq->dev->event_limit;
321	if (status != NVME_SC_SUCCESS)
322		return;
323
324	switch (result & 0xff07) {
325	case NVME_AER_NOTICE_NS_CHANGED:
326		dev_info(nvmeq->q_dmadev, "rescanning\n");
327		schedule_work(&nvmeq->dev->scan_work);
328	default:
329		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
330	}
331}
332
333static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
334						struct nvme_completion *cqe)
335{
336	struct request *req = ctx;
337
338	u16 status = le16_to_cpup(&cqe->status) >> 1;
339	u32 result = le32_to_cpup(&cqe->result);
340
341	blk_mq_free_request(req);
342
343	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
344	++nvmeq->dev->abort_limit;
345}
346
347static void async_completion(struct nvme_queue *nvmeq, void *ctx,
348						struct nvme_completion *cqe)
349{
350	struct async_cmd_info *cmdinfo = ctx;
351	cmdinfo->result = le32_to_cpup(&cqe->result);
352	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
353	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
354	blk_mq_free_request(cmdinfo->req);
355}
356
357static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
358				  unsigned int tag)
359{
360	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
361
362	return blk_mq_rq_to_pdu(req);
363}
364
365/*
366 * Called with local interrupts disabled and the q_lock held.  May not sleep.
367 */
368static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
369						nvme_completion_fn *fn)
370{
371	struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
372	void *ctx;
373	if (tag >= nvmeq->q_depth) {
374		*fn = special_completion;
375		return CMD_CTX_INVALID;
376	}
377	if (fn)
378		*fn = cmd->fn;
379	ctx = cmd->ctx;
380	cmd->fn = special_completion;
381	cmd->ctx = CMD_CTX_COMPLETED;
382	return ctx;
383}
384
385/**
386 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
387 * @nvmeq: The queue to use
388 * @cmd: The command to send
389 *
390 * Safe to use from interrupt context
391 */
392static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
393						struct nvme_command *cmd)
394{
395	u16 tail = nvmeq->sq_tail;
396
397	if (nvmeq->sq_cmds_io)
398		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
399	else
400		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
401
402	if (++tail == nvmeq->q_depth)
403		tail = 0;
404	writel(tail, nvmeq->q_db);
405	nvmeq->sq_tail = tail;
406}
407
408static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
409{
410	unsigned long flags;
411	spin_lock_irqsave(&nvmeq->q_lock, flags);
412	__nvme_submit_cmd(nvmeq, cmd);
413	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
414}
415
416static __le64 **iod_list(struct nvme_iod *iod)
417{
418	return ((void *)iod) + iod->offset;
419}
420
421static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
422			    unsigned nseg, unsigned long private)
423{
424	iod->private = private;
425	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
426	iod->npages = -1;
427	iod->length = nbytes;
428	iod->nents = 0;
429}
430
431static struct nvme_iod *
432__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
433		 unsigned long priv, gfp_t gfp)
434{
435	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
436				sizeof(__le64 *) * nvme_npages(bytes, dev) +
437				sizeof(struct scatterlist) * nseg, gfp);
438
439	if (iod)
440		iod_init(iod, bytes, nseg, priv);
441
442	return iod;
443}
444
445static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
446			               gfp_t gfp)
447{
448	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
449                                                sizeof(struct nvme_dsm_range);
450	struct nvme_iod *iod;
451
452	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
453	    size <= NVME_INT_BYTES(dev)) {
454		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
455
456		iod = cmd->iod;
457		iod_init(iod, size, rq->nr_phys_segments,
458				(unsigned long) rq | NVME_INT_MASK);
459		return iod;
460	}
461
462	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
463				(unsigned long) rq, gfp);
464}
465
466static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
467{
468	const int last_prp = dev->page_size / 8 - 1;
469	int i;
470	__le64 **list = iod_list(iod);
471	dma_addr_t prp_dma = iod->first_dma;
472
473	if (iod->npages == 0)
474		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
475	for (i = 0; i < iod->npages; i++) {
476		__le64 *prp_list = list[i];
477		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
478		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
479		prp_dma = next_prp_dma;
480	}
481
482	if (iod_should_kfree(iod))
483		kfree(iod);
484}
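
/*
 * Note (illustrative): when several PRP-list pages are needed,
 * nvme_setup_prps() chains them by storing the DMA address of page
 * i + 1 in the last slot of page i.  The loop above reads that slot
 * (prp_list[last_prp]) before freeing page i, so it can free the next
 * page on the following iteration.
 */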
485
486static int nvme_error_status(u16 status)
487{
488	switch (status & 0x7ff) {
489	case NVME_SC_SUCCESS:
490		return 0;
491	case NVME_SC_CAP_EXCEEDED:
492		return -ENOSPC;
493	default:
494		return -EIO;
495	}
496}
497
498#ifdef CONFIG_BLK_DEV_INTEGRITY
499static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
500{
501	if (be32_to_cpu(pi->ref_tag) == v)
502		pi->ref_tag = cpu_to_be32(p);
503}
504
505static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
506{
507	if (be32_to_cpu(pi->ref_tag) == p)
508		pi->ref_tag = cpu_to_be32(v);
509}
510
511/**
512 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
513 *
514 * The virtual start sector is the one that was originally submitted by the
515 * block layer.	Due to partitioning, MD/DM cloning, etc. the actual physical
516 * start sector may be different. Remap protection information to match the
517 * physical LBA on writes, and back to the original seed on reads.
518 *
519 * Type 0 and 3 do not have a ref tag, so no remapping required.
520 */
521static void nvme_dif_remap(struct request *req,
522			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
523{
524	struct nvme_ns *ns = req->rq_disk->private_data;
525	struct bio_integrity_payload *bip;
526	struct t10_pi_tuple *pi;
527	void *p, *pmap;
528	u32 i, nlb, ts, phys, virt;
529
530	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
531		return;
532
533	bip = bio_integrity(req->bio);
534	if (!bip)
535		return;
536
537	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
538
539	p = pmap;
540	virt = bip_get_seed(bip);
541	phys = nvme_block_nr(ns, blk_rq_pos(req));
542	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
543	ts = ns->disk->queue->integrity.tuple_size;
544
545	for (i = 0; i < nlb; i++, virt++, phys++) {
546		pi = (struct t10_pi_tuple *)p;
547		dif_swap(phys, virt, pi);
548		p += ts;
549	}
550	kunmap_atomic(pmap);
551}
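
/*
 * Example (illustrative): for a Type 1/2 namespace with 512-byte LBAs,
 * a write to the start of a partition beginning at LBA 2048 typically
 * carries a bip seed of 0, so nvme_dif_prep() rewrites ref tags
 * 0, 1, 2, ... to 2048, 2049, 2050, ... before submission, and
 * nvme_dif_complete() applies the reverse mapping when a read returns.
 */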
552
553static void nvme_init_integrity(struct nvme_ns *ns)
554{
555	struct blk_integrity integrity;
556
557	switch (ns->pi_type) {
558	case NVME_NS_DPS_PI_TYPE3:
559		integrity.profile = &t10_pi_type3_crc;
560		break;
561	case NVME_NS_DPS_PI_TYPE1:
562	case NVME_NS_DPS_PI_TYPE2:
563		integrity.profile = &t10_pi_type1_crc;
564		break;
565	default:
566		integrity.profile = NULL;
567		break;
568	}
569	integrity.tuple_size = ns->ms;
570	blk_integrity_register(ns->disk, &integrity);
571	blk_queue_max_integrity_segments(ns->queue, 1);
572}
573#else /* CONFIG_BLK_DEV_INTEGRITY */
574static void nvme_dif_remap(struct request *req,
575			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
576{
577}
578static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
579{
580}
581static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
582{
583}
584static void nvme_init_integrity(struct nvme_ns *ns)
585{
586}
587#endif
588
589static void req_completion(struct nvme_queue *nvmeq, void *ctx,
590						struct nvme_completion *cqe)
591{
592	struct nvme_iod *iod = ctx;
593	struct request *req = iod_get_private(iod);
594	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
595	u16 status = le16_to_cpup(&cqe->status) >> 1;
596	bool requeue = false;
597	int error = 0;
598
599	if (unlikely(status)) {
600		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
601		    && (jiffies - req->start_time) < req->timeout) {
602			unsigned long flags;
603
604			requeue = true;
605			blk_mq_requeue_request(req);
606			spin_lock_irqsave(req->q->queue_lock, flags);
607			if (!blk_queue_stopped(req->q))
608				blk_mq_kick_requeue_list(req->q);
609			spin_unlock_irqrestore(req->q->queue_lock, flags);
610			goto release_iod;
611		}
612
613		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
614			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
615				error = -EINTR;
616			else
617				error = status;
618		} else {
619			error = nvme_error_status(status);
620		}
621	}
622
623	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
624		u32 result = le32_to_cpup(&cqe->result);
625		req->special = (void *)(uintptr_t)result;
626	}
627
628	if (cmd_rq->aborted)
629		dev_warn(nvmeq->dev->dev,
630			"completing aborted command with status:%04x\n",
631			error);
632
633release_iod:
634	if (iod->nents) {
635		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
636			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
637		if (blk_integrity_rq(req)) {
638			if (!rq_data_dir(req))
639				nvme_dif_remap(req, nvme_dif_complete);
640			dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
641				rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
642		}
643	}
644	nvme_free_iod(nvmeq->dev, iod);
645
646	if (likely(!requeue))
647		blk_mq_complete_request(req, error);
648}
649
650/* length is in bytes.  gfp flags indicate whether we may sleep. */
651static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
652		int total_len, gfp_t gfp)
653{
654	struct dma_pool *pool;
655	int length = total_len;
656	struct scatterlist *sg = iod->sg;
657	int dma_len = sg_dma_len(sg);
658	u64 dma_addr = sg_dma_address(sg);
659	u32 page_size = dev->page_size;
660	int offset = dma_addr & (page_size - 1);
661	__le64 *prp_list;
662	__le64 **list = iod_list(iod);
663	dma_addr_t prp_dma;
664	int nprps, i;
665
666	length -= (page_size - offset);
667	if (length <= 0)
668		return total_len;
669
670	dma_len -= (page_size - offset);
671	if (dma_len) {
672		dma_addr += (page_size - offset);
673	} else {
674		sg = sg_next(sg);
675		dma_addr = sg_dma_address(sg);
676		dma_len = sg_dma_len(sg);
677	}
678
679	if (length <= page_size) {
680		iod->first_dma = dma_addr;
681		return total_len;
682	}
683
684	nprps = DIV_ROUND_UP(length, page_size);
685	if (nprps <= (256 / 8)) {
686		pool = dev->prp_small_pool;
687		iod->npages = 0;
688	} else {
689		pool = dev->prp_page_pool;
690		iod->npages = 1;
691	}
692
693	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
694	if (!prp_list) {
695		iod->first_dma = dma_addr;
696		iod->npages = -1;
697		return (total_len - length) + page_size;
698	}
699	list[0] = prp_list;
700	iod->first_dma = prp_dma;
701	i = 0;
702	for (;;) {
703		if (i == page_size >> 3) {
704			__le64 *old_prp_list = prp_list;
705			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
706			if (!prp_list)
707				return total_len - length;
708			list[iod->npages++] = prp_list;
709			prp_list[0] = old_prp_list[i - 1];
710			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
711			i = 1;
712		}
713		prp_list[i++] = cpu_to_le64(dma_addr);
714		dma_len -= page_size;
715		dma_addr += page_size;
716		length -= page_size;
717		if (length <= 0)
718			break;
719		if (dma_len > 0)
720			continue;
721		BUG_ON(dma_len < 0);
722		sg = sg_next(sg);
723		dma_addr = sg_dma_address(sg);
724		dma_len = sg_dma_len(sg);
725	}
726
727	return total_len;
728}
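
/*
 * Worked example (illustrative, 4K device pages, page-aligned buffer):
 * an 8K request needs two PRP entries, so the second page's address is
 * simply left in iod->first_dma and used as PRP2.  A 16K request needs
 * four entries; the first page becomes PRP1, the remaining three are
 * written into a PRP list allocated from prp_small_pool, and that
 * list's DMA address ends up in iod->first_dma (PRP2).
 */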
729
730static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
731		struct nvme_iod *iod)
732{
733	struct nvme_command cmnd;
734
735	memcpy(&cmnd, req->cmd, sizeof(cmnd));
736	cmnd.rw.command_id = req->tag;
737	if (req->nr_phys_segments) {
738		cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
739		cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
740	}
741
742	__nvme_submit_cmd(nvmeq, &cmnd);
743}
744
745/*
746 * We reuse the small pool to allocate the 16-byte range here as it is not
747 * worth having a special pool for these or additional cases to handle freeing
748 * the iod.
749 */
750static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
751		struct request *req, struct nvme_iod *iod)
752{
753	struct nvme_dsm_range *range =
754				(struct nvme_dsm_range *)iod_list(iod)[0];
755	struct nvme_command cmnd;
756
757	range->cattr = cpu_to_le32(0);
758	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
759	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
760
761	memset(&cmnd, 0, sizeof(cmnd));
762	cmnd.dsm.opcode = nvme_cmd_dsm;
763	cmnd.dsm.command_id = req->tag;
764	cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
765	cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
766	cmnd.dsm.nr = 0;
767	cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
768
769	__nvme_submit_cmd(nvmeq, &cmnd);
770}
771
772static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
773								int cmdid)
774{
775	struct nvme_command cmnd;
776
777	memset(&cmnd, 0, sizeof(cmnd));
778	cmnd.common.opcode = nvme_cmd_flush;
779	cmnd.common.command_id = cmdid;
780	cmnd.common.nsid = cpu_to_le32(ns->ns_id);
781
782	__nvme_submit_cmd(nvmeq, &cmnd);
783}
784
785static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
786							struct nvme_ns *ns)
787{
788	struct request *req = iod_get_private(iod);
789	struct nvme_command cmnd;
790	u16 control = 0;
791	u32 dsmgmt = 0;
792
793	if (req->cmd_flags & REQ_FUA)
794		control |= NVME_RW_FUA;
795	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
796		control |= NVME_RW_LR;
797
798	if (req->cmd_flags & REQ_RAHEAD)
799		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
800
801	memset(&cmnd, 0, sizeof(cmnd));
802	cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
803	cmnd.rw.command_id = req->tag;
804	cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
805	cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
806	cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
807	cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
808	cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
809
810	if (ns->ms) {
811		switch (ns->pi_type) {
812		case NVME_NS_DPS_PI_TYPE3:
813			control |= NVME_RW_PRINFO_PRCHK_GUARD;
814			break;
815		case NVME_NS_DPS_PI_TYPE1:
816		case NVME_NS_DPS_PI_TYPE2:
817			control |= NVME_RW_PRINFO_PRCHK_GUARD |
818					NVME_RW_PRINFO_PRCHK_REF;
819			cmnd.rw.reftag = cpu_to_le32(
820					nvme_block_nr(ns, blk_rq_pos(req)));
821			break;
822		}
823		if (blk_integrity_rq(req))
824			cmnd.rw.metadata =
825				cpu_to_le64(sg_dma_address(iod->meta_sg));
826		else
827			control |= NVME_RW_PRINFO_PRACT;
828	}
829
830	cmnd.rw.control = cpu_to_le16(control);
831	cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);
832
833	__nvme_submit_cmd(nvmeq, &cmnd);
834
835	return 0;
836}
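
/*
 * Note (illustrative): cmnd.rw.length is the NVMe zero-based "number of
 * logical blocks" field, e.g. a 4K write on a 512-byte LBA format is
 * submitted with length = 7.
 */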
837
838/*
839 * NOTE: ns is NULL when called on the admin queue.
840 */
841static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
842			 const struct blk_mq_queue_data *bd)
843{
844	struct nvme_ns *ns = hctx->queue->queuedata;
845	struct nvme_queue *nvmeq = hctx->driver_data;
846	struct nvme_dev *dev = nvmeq->dev;
847	struct request *req = bd->rq;
848	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
849	struct nvme_iod *iod;
850	enum dma_data_direction dma_dir;
851
852	/*
853	 * If formatted with metadata, require that the block layer provide a buffer
854	 * unless this namespace is formatted such that the metadata can be
855	 * stripped/generated by the controller with PRACT=1.
856	 */
857	if (ns && ns->ms && !blk_integrity_rq(req)) {
858		if (!(ns->pi_type && ns->ms == 8) &&
859					req->cmd_type != REQ_TYPE_DRV_PRIV) {
860			blk_mq_complete_request(req, -EFAULT);
861			return BLK_MQ_RQ_QUEUE_OK;
862		}
863	}
864
865	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
866	if (!iod)
867		return BLK_MQ_RQ_QUEUE_BUSY;
868
869	if (req->cmd_flags & REQ_DISCARD) {
870		void *range;
871		/*
872		 * We reuse the small pool to allocate the 16-byte range here
873		 * as it is not worth having a special pool for these or
874		 * additional cases to handle freeing the iod.
875		 */
876		range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
877						&iod->first_dma);
878		if (!range)
879			goto retry_cmd;
880		iod_list(iod)[0] = (__le64 *)range;
881		iod->npages = 0;
882	} else if (req->nr_phys_segments) {
883		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
884
885		sg_init_table(iod->sg, req->nr_phys_segments);
886		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
887		if (!iod->nents)
888			goto error_cmd;
889
890		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
891			goto retry_cmd;
892
893		if (blk_rq_bytes(req) !=
894                    nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
895			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
896			goto retry_cmd;
897		}
898		if (blk_integrity_rq(req)) {
899			if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) {
900				dma_unmap_sg(dev->dev, iod->sg, iod->nents,
901						dma_dir);
902				goto error_cmd;
903			}
904
905			sg_init_table(iod->meta_sg, 1);
906			if (blk_rq_map_integrity_sg(
907					req->q, req->bio, iod->meta_sg) != 1) {
908				dma_unmap_sg(dev->dev, iod->sg, iod->nents,
909						dma_dir);
910				goto error_cmd;
911			}
912
913			if (rq_data_dir(req))
914				nvme_dif_remap(req, nvme_dif_prep);
915
916			if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) {
917				dma_unmap_sg(dev->dev, iod->sg, iod->nents,
918						dma_dir);
919				goto error_cmd;
920			}
921		}
922	}
923
924	nvme_set_info(cmd, iod, req_completion);
925	spin_lock_irq(&nvmeq->q_lock);
926	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
927		nvme_submit_priv(nvmeq, req, iod);
928	else if (req->cmd_flags & REQ_DISCARD)
929		nvme_submit_discard(nvmeq, ns, req, iod);
930	else if (req->cmd_flags & REQ_FLUSH)
931		nvme_submit_flush(nvmeq, ns, req->tag);
932	else
933		nvme_submit_iod(nvmeq, iod, ns);
934
935	nvme_process_cq(nvmeq);
936	spin_unlock_irq(&nvmeq->q_lock);
937	return BLK_MQ_RQ_QUEUE_OK;
938
939 error_cmd:
940	nvme_free_iod(dev, iod);
941	return BLK_MQ_RQ_QUEUE_ERROR;
942 retry_cmd:
943	nvme_free_iod(dev, iod);
944	return BLK_MQ_RQ_QUEUE_BUSY;
945}
946
947static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
948{
949	u16 head, phase;
950
951	head = nvmeq->cq_head;
952	phase = nvmeq->cq_phase;
953
954	for (;;) {
955		void *ctx;
956		nvme_completion_fn fn;
957		struct nvme_completion cqe = nvmeq->cqes[head];
958		if ((le16_to_cpu(cqe.status) & 1) != phase)
959			break;
960		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
961		if (++head == nvmeq->q_depth) {
962			head = 0;
963			phase = !phase;
964		}
965		if (tag && *tag == cqe.command_id)
966			*tag = -1;
967		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
968		fn(nvmeq, ctx, &cqe);
969	}
970
971	/* If the controller ignores the cq head doorbell and continuously
972	 * writes to the queue, it is theoretically possible to wrap around
973	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
974	 * requires that 0.1% of your interrupts are handled, so this isn't
975	 * a big problem.
976	 */
977	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
978		return;
979
980	if (likely(nvmeq->cq_vector >= 0))
981		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
982	nvmeq->cq_head = head;
983	nvmeq->cq_phase = phase;
984
985	nvmeq->cqe_seen = 1;
986}
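
/*
 * Note (illustrative): the phase tag is how new entries are recognised.
 * cq_phase starts at 1 and the controller posts its first pass through
 * the ring with the phase bit set, so a not-yet-written (phase 0) entry
 * stops the loop above.  Each time the head wraps, cq_phase flips and
 * the controller's next pass is posted with the opposite phase bit.
 */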
987
988static void nvme_process_cq(struct nvme_queue *nvmeq)
989{
990	__nvme_process_cq(nvmeq, NULL);
991}
992
993static irqreturn_t nvme_irq(int irq, void *data)
994{
995	irqreturn_t result;
996	struct nvme_queue *nvmeq = data;
997	spin_lock(&nvmeq->q_lock);
998	nvme_process_cq(nvmeq);
999	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
1000	nvmeq->cqe_seen = 0;
1001	spin_unlock(&nvmeq->q_lock);
1002	return result;
1003}
1004
1005static irqreturn_t nvme_irq_check(int irq, void *data)
1006{
1007	struct nvme_queue *nvmeq = data;
1008	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
1009	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
1010		return IRQ_NONE;
1011	return IRQ_WAKE_THREAD;
1012}
1013
1014static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1015{
1016	struct nvme_queue *nvmeq = hctx->driver_data;
1017
1018	if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
1019	    nvmeq->cq_phase) {
1020		spin_lock_irq(&nvmeq->q_lock);
1021		__nvme_process_cq(nvmeq, &tag);
1022		spin_unlock_irq(&nvmeq->q_lock);
1023
1024		if (tag == -1)
1025			return 1;
1026	}
1027
1028	return 0;
1029}
1030
1031/*
1032 * Returns 0 on success.  If the result is negative, it's a Linux error code;
1033 * if the result is positive, it's an NVM Express status code
1034 */
1035int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1036		void *buffer, void __user *ubuffer, unsigned bufflen,
1037		u32 *result, unsigned timeout)
1038{
1039	bool write = cmd->common.opcode & 1;
1040	struct bio *bio = NULL;
1041	struct request *req;
1042	int ret;
1043
1044	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
1045	if (IS_ERR(req))
1046		return PTR_ERR(req);
1047
1048	req->cmd_type = REQ_TYPE_DRV_PRIV;
1049	req->cmd_flags |= REQ_FAILFAST_DRIVER;
1050	req->__data_len = 0;
1051	req->__sector = (sector_t) -1;
1052	req->bio = req->biotail = NULL;
1053
1054	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
1055
1056	req->cmd = (unsigned char *)cmd;
1057	req->cmd_len = sizeof(struct nvme_command);
1058	req->special = (void *)0;
1059
1060	if (buffer && bufflen) {
1061		ret = blk_rq_map_kern(q, req, buffer, bufflen,
1062				      __GFP_DIRECT_RECLAIM);
1063		if (ret)
1064			goto out;
1065	} else if (ubuffer && bufflen) {
1066		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
1067				      __GFP_DIRECT_RECLAIM);
1068		if (ret)
1069			goto out;
1070		bio = req->bio;
1071	}
1072
1073	blk_execute_rq(req->q, NULL, req, 0);
1074	if (bio)
1075		blk_rq_unmap_user(bio);
1076	if (result)
1077		*result = (u32)(uintptr_t)req->special;
1078	ret = req->errors;
1079 out:
1080	blk_mq_free_request(req);
1081	return ret;
1082}
1083
1084int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1085		void *buffer, unsigned bufflen)
1086{
1087	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
1088}
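
/*
 * Usage sketch (illustrative): callers distinguish transport errors
 * from device status by the sign of the return value, e.g.
 *
 *	ret = nvme_submit_sync_cmd(dev->admin_q, &c, buf, len);
 *	if (ret < 0)
 *		return ret;	(Linux errno, e.g. from mapping the buffer)
 *	else if (ret)
 *		return -EIO;	(non-zero NVM Express status code)
 *
 * nvme_identify_ctrl() below is a real caller of the kernel-buffer
 * variant; nvme_submit_io() uses the user-buffer path.
 */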
1089
1090static int nvme_submit_async_admin_req(struct nvme_dev *dev)
1091{
1092	struct nvme_queue *nvmeq = dev->queues[0];
1093	struct nvme_command c;
1094	struct nvme_cmd_info *cmd_info;
1095	struct request *req;
1096
1097	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
1098	if (IS_ERR(req))
1099		return PTR_ERR(req);
1100
1101	req->cmd_flags |= REQ_NO_TIMEOUT;
1102	cmd_info = blk_mq_rq_to_pdu(req);
1103	nvme_set_info(cmd_info, NULL, async_req_completion);
1104
1105	memset(&c, 0, sizeof(c));
1106	c.common.opcode = nvme_admin_async_event;
1107	c.common.command_id = req->tag;
1108
1109	blk_mq_free_request(req);
1110	__nvme_submit_cmd(nvmeq, &c);
1111	return 0;
1112}
1113
1114static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
1115			struct nvme_command *cmd,
1116			struct async_cmd_info *cmdinfo, unsigned timeout)
1117{
1118	struct nvme_queue *nvmeq = dev->queues[0];
1119	struct request *req;
1120	struct nvme_cmd_info *cmd_rq;
1121
1122	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
1123	if (IS_ERR(req))
1124		return PTR_ERR(req);
1125
1126	req->timeout = timeout;
1127	cmd_rq = blk_mq_rq_to_pdu(req);
1128	cmdinfo->req = req;
1129	nvme_set_info(cmd_rq, cmdinfo, async_completion);
1130	cmdinfo->status = -EINTR;
1131
1132	cmd->common.command_id = req->tag;
1133
1134	nvme_submit_cmd(nvmeq, cmd);
1135	return 0;
1136}
1137
1138static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1139{
1140	struct nvme_command c;
1141
1142	memset(&c, 0, sizeof(c));
1143	c.delete_queue.opcode = opcode;
1144	c.delete_queue.qid = cpu_to_le16(id);
1145
1146	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
1147}
1148
1149static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1150						struct nvme_queue *nvmeq)
1151{
1152	struct nvme_command c;
1153	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
1154
1155	/*
1156	 * Note: we (ab)use the fact that the prp fields survive if no data
1157	 * is attached to the request.
1158	 */
1159	memset(&c, 0, sizeof(c));
1160	c.create_cq.opcode = nvme_admin_create_cq;
1161	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
1162	c.create_cq.cqid = cpu_to_le16(qid);
1163	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1164	c.create_cq.cq_flags = cpu_to_le16(flags);
1165	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
1166
1167	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
1168}
1169
1170static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1171						struct nvme_queue *nvmeq)
1172{
1173	struct nvme_command c;
1174	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
1175
1176	/*
1177	 * Note: we (ab)use the fact that the prp fields survive if no data
1178	 * is attached to the request.
1179	 */
1180	memset(&c, 0, sizeof(c));
1181	c.create_sq.opcode = nvme_admin_create_sq;
1182	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
1183	c.create_sq.sqid = cpu_to_le16(qid);
1184	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1185	c.create_sq.sq_flags = cpu_to_le16(flags);
1186	c.create_sq.cqid = cpu_to_le16(qid);
1187
1188	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
1189}
1190
1191static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
1192{
1193	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
1194}
1195
1196static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
1197{
1198	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
1199}
1200
1201int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
1202{
1203	struct nvme_command c = { };
1204	int error;
1205
1206	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1207	c.identify.opcode = nvme_admin_identify;
1208	c.identify.cns = cpu_to_le32(1);
1209
1210	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1211	if (!*id)
1212		return -ENOMEM;
1213
1214	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1215			sizeof(struct nvme_id_ctrl));
1216	if (error)
1217		kfree(*id);
1218	return error;
1219}
1220
1221int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
1222		struct nvme_id_ns **id)
1223{
1224	struct nvme_command c = { };
1225	int error;
1226
1227	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1228	c.identify.opcode = nvme_admin_identify;
1229	c.identify.nsid = cpu_to_le32(nsid);
1230
1231	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
1232	if (!*id)
1233		return -ENOMEM;
1234
1235	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1236			sizeof(struct nvme_id_ns));
1237	if (error)
1238		kfree(*id);
1239	return error;
1240}
1241
1242int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
1243					dma_addr_t dma_addr, u32 *result)
1244{
1245	struct nvme_command c;
1246
1247	memset(&c, 0, sizeof(c));
1248	c.features.opcode = nvme_admin_get_features;
1249	c.features.nsid = cpu_to_le32(nsid);
1250	c.features.prp1 = cpu_to_le64(dma_addr);
1251	c.features.fid = cpu_to_le32(fid);
1252
1253	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
1254			result, 0);
1255}
1256
1257int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
1258					dma_addr_t dma_addr, u32 *result)
1259{
1260	struct nvme_command c;
1261
1262	memset(&c, 0, sizeof(c));
1263	c.features.opcode = nvme_admin_set_features;
1264	c.features.prp1 = cpu_to_le64(dma_addr);
1265	c.features.fid = cpu_to_le32(fid);
1266	c.features.dword11 = cpu_to_le32(dword11);
1267
1268	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
1269			result, 0);
1270}
1271
1272int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
1273{
1274	struct nvme_command c = { };
1275	int error;
1276
1277	c.common.opcode = nvme_admin_get_log_page;
1278	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
1279	c.common.cdw10[0] = cpu_to_le32(
1280			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
1281			 NVME_LOG_SMART);
1282
1283	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
1284	if (!*log)
1285		return -ENOMEM;
1286
1287	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
1288			sizeof(struct nvme_smart_log));
1289	if (error)
1290		kfree(*log);
1291	return error;
1292}
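
/*
 * Example (illustrative): cdw10 above packs the zero-based dword count
 * into bits 27:16 and the log page identifier into bits 7:0, so for
 * the 512-byte SMART log it evaluates to ((128 - 1) << 16) |
 * NVME_LOG_SMART = 0x007f0002 (assuming NVME_LOG_SMART == 0x02).
 */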
1293
1294/**
1295 * nvme_abort_req - Attempt aborting a request
1296 *
1297 * Schedule controller reset if the command was already aborted once before and
1298 * still hasn't been returned to the driver, or if this is the admin queue.
1299 */
1300static void nvme_abort_req(struct request *req)
1301{
1302	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
1303	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
1304	struct nvme_dev *dev = nvmeq->dev;
1305	struct request *abort_req;
1306	struct nvme_cmd_info *abort_cmd;
1307	struct nvme_command cmd;
1308
1309	if (!nvmeq->qid || cmd_rq->aborted) {
1310		spin_lock(&dev_list_lock);
1311		if (!__nvme_reset(dev)) {
1312			dev_warn(dev->dev,
1313				 "I/O %d QID %d timeout, reset controller\n",
1314				 req->tag, nvmeq->qid);
1315		}
1316		spin_unlock(&dev_list_lock);
1317		return;
1318	}
1319
1320	if (!dev->abort_limit)
1321		return;
1322
1323	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
1324									false);
1325	if (IS_ERR(abort_req))
1326		return;
1327
1328	abort_cmd = blk_mq_rq_to_pdu(abort_req);
1329	nvme_set_info(abort_cmd, abort_req, abort_completion);
1330
1331	memset(&cmd, 0, sizeof(cmd));
1332	cmd.abort.opcode = nvme_admin_abort_cmd;
1333	cmd.abort.cid = req->tag;
1334	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
1335	cmd.abort.command_id = abort_req->tag;
1336
1337	--dev->abort_limit;
1338	cmd_rq->aborted = 1;
1339
1340	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
1341							nvmeq->qid);
1342	nvme_submit_cmd(dev->queues[0], &cmd);
1343}
1344
1345static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
1346{
1347	struct nvme_queue *nvmeq = data;
1348	void *ctx;
1349	nvme_completion_fn fn;
1350	struct nvme_cmd_info *cmd;
1351	struct nvme_completion cqe;
1352
1353	if (!blk_mq_request_started(req))
1354		return;
1355
1356	cmd = blk_mq_rq_to_pdu(req);
1357
1358	if (cmd->ctx == CMD_CTX_CANCELLED)
1359		return;
1360
1361	if (blk_queue_dying(req->q))
1362		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
1363	else
1364		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
1365
1366
1367	dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
1368						req->tag, nvmeq->qid);
1369	ctx = cancel_cmd_info(cmd, &fn);
1370	fn(nvmeq, ctx, &cqe);
1371}
1372
1373static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1374{
1375	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
1376	struct nvme_queue *nvmeq = cmd->nvmeq;
1377
1378	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
1379							nvmeq->qid);
1380	spin_lock_irq(&nvmeq->q_lock);
1381	nvme_abort_req(req);
1382	spin_unlock_irq(&nvmeq->q_lock);
1383
1384	/*
1385	 * The aborted request will be completed once the controller responds.
1386	 * Reset the timer; if the timeout fires a second time, a controller
1387	 * reset is scheduled instead, as the device is then presumed faulty.
1388	 */
1389	return BLK_EH_RESET_TIMER;
1390}
1391
1392static void nvme_free_queue(struct nvme_queue *nvmeq)
1393{
1394	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1395				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1396	if (nvmeq->sq_cmds)
1397		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1398					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1399	kfree(nvmeq);
1400}
1401
1402static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1403{
1404	int i;
1405
1406	for (i = dev->queue_count - 1; i >= lowest; i--) {
1407		struct nvme_queue *nvmeq = dev->queues[i];
1408		dev->queue_count--;
1409		dev->queues[i] = NULL;
1410		nvme_free_queue(nvmeq);
1411	}
1412}
1413
1414/**
1415 * nvme_suspend_queue - put queue into suspended state
1416 * @nvmeq: queue to suspend
1417 */
1418static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1419{
1420	int vector;
1421
1422	spin_lock_irq(&nvmeq->q_lock);
1423	if (nvmeq->cq_vector == -1) {
1424		spin_unlock_irq(&nvmeq->q_lock);
1425		return 1;
1426	}
1427	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
1428	nvmeq->dev->online_queues--;
1429	nvmeq->cq_vector = -1;
1430	spin_unlock_irq(&nvmeq->q_lock);
1431
1432	if (!nvmeq->qid && nvmeq->dev->admin_q)
1433		blk_mq_freeze_queue_start(nvmeq->dev->admin_q);
1434
1435	irq_set_affinity_hint(vector, NULL);
1436	free_irq(vector, nvmeq);
1437
1438	return 0;
1439}
1440
1441static void nvme_clear_queue(struct nvme_queue *nvmeq)
1442{
1443	spin_lock_irq(&nvmeq->q_lock);
1444	if (nvmeq->tags && *nvmeq->tags)
1445		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
1446	spin_unlock_irq(&nvmeq->q_lock);
1447}
1448
1449static void nvme_disable_queue(struct nvme_dev *dev, int qid)
1450{
1451	struct nvme_queue *nvmeq = dev->queues[qid];
1452
1453	if (!nvmeq)
1454		return;
1455	if (nvme_suspend_queue(nvmeq))
1456		return;
1457
1458	/* Don't tell the adapter to delete the admin queue.
1459	 * Don't tell a removed adapter to delete IO queues. */
1460	if (qid && readl(&dev->bar->csts) != -1) {
1461		adapter_delete_sq(dev, qid);
1462		adapter_delete_cq(dev, qid);
1463	}
1464
1465	spin_lock_irq(&nvmeq->q_lock);
1466	nvme_process_cq(nvmeq);
1467	spin_unlock_irq(&nvmeq->q_lock);
1468}
1469
1470static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
1471				int entry_size)
1472{
1473	int q_depth = dev->q_depth;
1474	unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);
1475
1476	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
1477		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
1478		mem_per_q = round_down(mem_per_q, dev->page_size);
1479		q_depth = div_u64(mem_per_q, entry_size);
1480
1481		/*
1482		 * Ensure the reduced q_depth does not fall below a threshold
1483		 * at which it would be better to map the queues in system
1484		 * memory with the original depth.
1485		 */
1486		if (q_depth < 64)
1487			return -ENOMEM;
1488	}
1489
1490	return q_depth;
1491}
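
/*
 * Worked example (illustrative): with 1024-entry queues of 64-byte
 * SQEs and 4K pages, each queue wants 64K of CMB, so eight I/O queues
 * need 512K.  If the CMB is only 256K, each queue is given
 * 256K / 8 = 32K and the depth is reduced to 32768 / 64 = 512, which
 * is still above the 64-entry cutoff.
 */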
1492
1493static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1494				int qid, int depth)
1495{
1496	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
1497		unsigned offset = (qid - 1) *
1498					roundup(SQ_SIZE(depth), dev->page_size);
1499		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
1500		nvmeq->sq_cmds_io = dev->cmb + offset;
1501	} else {
1502		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
1503					&nvmeq->sq_dma_addr, GFP_KERNEL);
1504		if (!nvmeq->sq_cmds)
1505			return -ENOMEM;
1506	}
1507
1508	return 0;
1509}
1510
1511static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1512							int depth)
1513{
1514	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
1515	if (!nvmeq)
1516		return NULL;
1517
1518	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
1519					  &nvmeq->cq_dma_addr, GFP_KERNEL);
1520	if (!nvmeq->cqes)
1521		goto free_nvmeq;
1522
1523	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
1524		goto free_cqdma;
1525
1526	nvmeq->q_dmadev = dev->dev;
1527	nvmeq->dev = dev;
1528	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
1529			dev->instance, qid);
1530	spin_lock_init(&nvmeq->q_lock);
1531	nvmeq->cq_head = 0;
1532	nvmeq->cq_phase = 1;
1533	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1534	nvmeq->q_depth = depth;
1535	nvmeq->qid = qid;
1536	nvmeq->cq_vector = -1;
1537	dev->queues[qid] = nvmeq;
1538
1539	/* make sure queue descriptor is set before queue count, for kthread */
1540	mb();
1541	dev->queue_count++;
1542
1543	return nvmeq;
1544
1545 free_cqdma:
1546	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
1547							nvmeq->cq_dma_addr);
1548 free_nvmeq:
1549	kfree(nvmeq);
1550	return NULL;
1551}
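
/*
 * Note (illustrative): doorbells come in pairs.  With the default
 * stride, queue qid's submission-queue tail doorbell is dev->dbs[qid * 2]
 * and its completion-queue head doorbell is dev->dbs[qid * 2 + 1]; a
 * larger CAP.DSTRD spaces them further apart, hence the scaling by
 * dev->db_stride above and in __nvme_process_cq().
 */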
1552
1553static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1554							const char *name)
1555{
1556	if (use_threaded_interrupts)
1557		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
1558					nvme_irq_check, nvme_irq, IRQF_SHARED,
1559					name, nvmeq);
1560	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
1561				IRQF_SHARED, name, nvmeq);
1562}
1563
1564static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
1565{
1566	struct nvme_dev *dev = nvmeq->dev;
1567
1568	spin_lock_irq(&nvmeq->q_lock);
1569	nvmeq->sq_tail = 0;
1570	nvmeq->cq_head = 0;
1571	nvmeq->cq_phase = 1;
1572	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1573	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
1574	dev->online_queues++;
1575	spin_unlock_irq(&nvmeq->q_lock);
1576}
1577
1578static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
1579{
1580	struct nvme_dev *dev = nvmeq->dev;
1581	int result;
1582
1583	nvmeq->cq_vector = qid - 1;
1584	result = adapter_alloc_cq(dev, qid, nvmeq);
1585	if (result < 0)
1586		return result;
1587
1588	result = adapter_alloc_sq(dev, qid, nvmeq);
1589	if (result < 0)
1590		goto release_cq;
1591
1592	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
1593	if (result < 0)
1594		goto release_sq;
1595
1596	nvme_init_queue(nvmeq, qid);
1597	return result;
1598
1599 release_sq:
1600	adapter_delete_sq(dev, qid);
1601 release_cq:
1602	adapter_delete_cq(dev, qid);
1603	return result;
1604}
1605
1606static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
1607{
1608	unsigned long timeout;
1609	u32 bit = enabled ? NVME_CSTS_RDY : 0;
1610
1611	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1612
1613	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
1614		msleep(100);
1615		if (fatal_signal_pending(current))
1616			return -EINTR;
1617		if (time_after(jiffies, timeout)) {
1618			dev_err(dev->dev,
1619				"Device not ready; aborting %s\n", enabled ?
1620						"initialisation" : "reset");
1621			return -ENODEV;
1622		}
1623	}
1624
1625	return 0;
1626}
1627
1628/*
1629 * If the device has been passed off to us in an enabled state, just clear
1630 * the enabled bit.  The spec says we should set the 'shutdown notification
1631 * bits', but doing so may cause the device to complete commands to the
1632 * admin queue ... and we don't know what memory that might be pointing at!
1633 */
1634static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
1635{
1636	dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1637	dev->ctrl_config &= ~NVME_CC_ENABLE;
1638	writel(dev->ctrl_config, &dev->bar->cc);
1639
1640	return nvme_wait_ready(dev, cap, false);
1641}
1642
1643static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
1644{
1645	dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1646	dev->ctrl_config |= NVME_CC_ENABLE;
1647	writel(dev->ctrl_config, &dev->bar->cc);
1648
1649	return nvme_wait_ready(dev, cap, true);
1650}
1651
1652static int nvme_shutdown_ctrl(struct nvme_dev *dev)
1653{
1654	unsigned long timeout;
1655
1656	dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1657	dev->ctrl_config |= NVME_CC_SHN_NORMAL;
1658
1659	writel(dev->ctrl_config, &dev->bar->cc);
1660
1661	timeout = SHUTDOWN_TIMEOUT + jiffies;
1662	while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
1663							NVME_CSTS_SHST_CMPLT) {
1664		msleep(100);
1665		if (fatal_signal_pending(current))
1666			return -EINTR;
1667		if (time_after(jiffies, timeout)) {
1668			dev_err(dev->dev,
1669				"Device shutdown incomplete; abort shutdown\n");
1670			return -ENODEV;
1671		}
1672	}
1673
1674	return 0;
1675}
1676
1677static struct blk_mq_ops nvme_mq_admin_ops = {
1678	.queue_rq	= nvme_queue_rq,
1679	.map_queue	= blk_mq_map_queue,
1680	.init_hctx	= nvme_admin_init_hctx,
1681	.exit_hctx      = nvme_admin_exit_hctx,
1682	.init_request	= nvme_admin_init_request,
1683	.timeout	= nvme_timeout,
1684};
1685
1686static struct blk_mq_ops nvme_mq_ops = {
1687	.queue_rq	= nvme_queue_rq,
1688	.map_queue	= blk_mq_map_queue,
1689	.init_hctx	= nvme_init_hctx,
1690	.init_request	= nvme_init_request,
1691	.timeout	= nvme_timeout,
1692	.poll		= nvme_poll,
1693};
1694
1695static void nvme_dev_remove_admin(struct nvme_dev *dev)
1696{
1697	if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
1698		blk_cleanup_queue(dev->admin_q);
1699		blk_mq_free_tag_set(&dev->admin_tagset);
1700	}
1701}
1702
1703static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1704{
1705	if (!dev->admin_q) {
1706		dev->admin_tagset.ops = &nvme_mq_admin_ops;
1707		dev->admin_tagset.nr_hw_queues = 1;
1708		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
1709		dev->admin_tagset.reserved_tags = 1;
1710		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1711		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
1712		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
1713		dev->admin_tagset.driver_data = dev;
1714
1715		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
1716			return -ENOMEM;
1717
1718		dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
1719		if (IS_ERR(dev->admin_q)) {
1720			blk_mq_free_tag_set(&dev->admin_tagset);
1721			return -ENOMEM;
1722		}
1723		if (!blk_get_queue(dev->admin_q)) {
1724			nvme_dev_remove_admin(dev);
1725			dev->admin_q = NULL;
1726			return -ENODEV;
1727		}
1728	} else
1729		blk_mq_unfreeze_queue(dev->admin_q);
1730
1731	return 0;
1732}
1733
1734static int nvme_configure_admin_queue(struct nvme_dev *dev)
1735{
1736	int result;
1737	u32 aqa;
1738	u64 cap = lo_hi_readq(&dev->bar->cap);
1739	struct nvme_queue *nvmeq;
1740	/*
1741	 * default to a 4K page size, with the intention to update this
1742	 * path in the future to accommodate architectures with differing
1743	 * kernel and IO page sizes.
1744	 */
1745	unsigned page_shift = 12;
1746	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
1747
1748	if (page_shift < dev_page_min) {
1749		dev_err(dev->dev,
1750				"Minimum device page size (%u) too large for "
1751				"host (%u)\n", 1 << dev_page_min,
1752				1 << page_shift);
1753		return -ENODEV;
1754	}
1755
1756	dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
1757						NVME_CAP_NSSRC(cap) : 0;
1758
1759	if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO))
1760		writel(NVME_CSTS_NSSRO, &dev->bar->csts);
1761
1762	result = nvme_disable_ctrl(dev, cap);
1763	if (result < 0)
1764		return result;
1765
1766	nvmeq = dev->queues[0];
1767	if (!nvmeq) {
1768		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
1769		if (!nvmeq)
1770			return -ENOMEM;
1771	}
1772
1773	aqa = nvmeq->q_depth - 1;
1774	aqa |= aqa << 16;
1775
1776	dev->page_size = 1 << page_shift;
1777
1778	dev->ctrl_config = NVME_CC_CSS_NVM;
1779	dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1780	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
1781	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1782
1783	writel(aqa, &dev->bar->aqa);
1784	lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
1785	lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
1786
1787	result = nvme_enable_ctrl(dev, cap);
1788	if (result)
1789		goto free_nvmeq;
1790
1791	nvmeq->cq_vector = 0;
1792	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
1793	if (result) {
1794		nvmeq->cq_vector = -1;
1795		goto free_nvmeq;
1796	}
1797
1798	return result;
1799
1800 free_nvmeq:
1801	nvme_free_queues(dev, 0);
1802	return result;
1803}
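
/*
 * Example (illustrative): with NVME_AQ_DEPTH == 256, the aqa value
 * written above is 255 | (255 << 16) = 0x00ff00ff, i.e. a zero-based
 * admin submission queue size in ASQS (bits 11:0) and completion queue
 * size in ACQS (bits 27:16).
 */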
1804
1805static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1806{
1807	struct nvme_dev *dev = ns->dev;
1808	struct nvme_user_io io;
1809	struct nvme_command c;
1810	unsigned length, meta_len;
1811	int status, write;
1812	dma_addr_t meta_dma = 0;
1813	void *meta = NULL;
1814	void __user *metadata;
1815
1816	if (copy_from_user(&io, uio, sizeof(io)))
1817		return -EFAULT;
1818
1819	switch (io.opcode) {
1820	case nvme_cmd_write:
1821	case nvme_cmd_read:
1822	case nvme_cmd_compare:
1823		break;
1824	default:
1825		return -EINVAL;
1826	}
1827
1828	length = (io.nblocks + 1) << ns->lba_shift;
1829	meta_len = (io.nblocks + 1) * ns->ms;
1830	metadata = (void __user *)(uintptr_t)io.metadata;
1831	write = io.opcode & 1;
1832
1833	if (ns->ext) {
1834		length += meta_len;
1835		meta_len = 0;
1836	}
1837	if (meta_len) {
1838		if (((io.metadata & 3) || !io.metadata) && !ns->ext)
1839			return -EINVAL;
1840
1841		meta = dma_alloc_coherent(dev->dev, meta_len,
1842						&meta_dma, GFP_KERNEL);
1843
1844		if (!meta) {
1845			status = -ENOMEM;
1846			goto unmap;
1847		}
1848		if (write) {
1849			if (copy_from_user(meta, metadata, meta_len)) {
1850				status = -EFAULT;
1851				goto unmap;
1852			}
1853		}
1854	}
1855
1856	memset(&c, 0, sizeof(c));
1857	c.rw.opcode = io.opcode;
1858	c.rw.flags = io.flags;
1859	c.rw.nsid = cpu_to_le32(ns->ns_id);
1860	c.rw.slba = cpu_to_le64(io.slba);
1861	c.rw.length = cpu_to_le16(io.nblocks);
1862	c.rw.control = cpu_to_le16(io.control);
1863	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1864	c.rw.reftag = cpu_to_le32(io.reftag);
1865	c.rw.apptag = cpu_to_le16(io.apptag);
1866	c.rw.appmask = cpu_to_le16(io.appmask);
1867	c.rw.metadata = cpu_to_le64(meta_dma);
1868
1869	status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
1870			(void __user *)(uintptr_t)io.addr, length, NULL, 0);
1871 unmap:
1872	if (meta) {
1873		if (status == NVME_SC_SUCCESS && !write) {
1874			if (copy_to_user(metadata, meta, meta_len))
1875				status = -EFAULT;
1876		}
1877		dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
1878	}
1879	return status;
1880}
1881
1882static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
1883			struct nvme_passthru_cmd __user *ucmd)
1884{
1885	struct nvme_passthru_cmd cmd;
1886	struct nvme_command c;
1887	unsigned timeout = 0;
1888	int status;
1889
1890	if (!capable(CAP_SYS_ADMIN))
1891		return -EACCES;
1892	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1893		return -EFAULT;
1894
1895	memset(&c, 0, sizeof(c));
1896	c.common.opcode = cmd.opcode;
1897	c.common.flags = cmd.flags;
1898	c.common.nsid = cpu_to_le32(cmd.nsid);
1899	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1900	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1901	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1902	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1903	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1904	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1905	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1906	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1907
1908	if (cmd.timeout_ms)
1909		timeout = msecs_to_jiffies(cmd.timeout_ms);
1910
1911	status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
1912			NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1913			&cmd.result, timeout);
1914	if (status >= 0) {
1915		if (put_user(cmd.result, &ucmd->result))
1916			return -EFAULT;
1917	}
1918
1919	return status;
1920}
1921
1922static int nvme_subsys_reset(struct nvme_dev *dev)
1923{
1924	if (!dev->subsystem)
1925		return -ENOTTY;
1926
1927	writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
1928	return 0;
1929}
1930
1931static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1932							unsigned long arg)
1933{
1934	struct nvme_ns *ns = bdev->bd_disk->private_data;
1935
1936	switch (cmd) {
1937	case NVME_IOCTL_ID:
1938		force_successful_syscall_return();
1939		return ns->ns_id;
1940	case NVME_IOCTL_ADMIN_CMD:
1941		return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
1942	case NVME_IOCTL_IO_CMD:
1943		return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
1944	case NVME_IOCTL_SUBMIT_IO:
1945		return nvme_submit_io(ns, (void __user *)arg);
1946	case SG_GET_VERSION_NUM:
1947		return nvme_sg_get_version_num((void __user *)arg);
1948	case SG_IO:
1949		return nvme_sg_io(ns, (void __user *)arg);
1950	default:
1951		return -ENOTTY;
1952	}
1953}
1954
1955#ifdef CONFIG_COMPAT
1956static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1957					unsigned int cmd, unsigned long arg)
1958{
1959	switch (cmd) {
1960	case SG_IO:
1961		return -ENOIOCTLCMD;
1962	}
1963	return nvme_ioctl(bdev, mode, cmd, arg);
1964}
1965#else
1966#define nvme_compat_ioctl	NULL
1967#endif
1968
1969static void nvme_free_dev(struct kref *kref);
1970static void nvme_free_ns(struct kref *kref)
1971{
1972	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
1973
1974	if (ns->type == NVME_NS_LIGHTNVM)
1975		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
1976
1977	spin_lock(&dev_list_lock);
1978	ns->disk->private_data = NULL;
1979	spin_unlock(&dev_list_lock);
1980
1981	kref_put(&ns->dev->kref, nvme_free_dev);
1982	put_disk(ns->disk);
1983	kfree(ns);
1984}
1985
1986static int nvme_open(struct block_device *bdev, fmode_t mode)
1987{
1988	int ret = 0;
1989	struct nvme_ns *ns;
1990
1991	spin_lock(&dev_list_lock);
1992	ns = bdev->bd_disk->private_data;
1993	if (!ns)
1994		ret = -ENXIO;
1995	else if (!kref_get_unless_zero(&ns->kref))
1996		ret = -ENXIO;
1997	spin_unlock(&dev_list_lock);
1998
1999	return ret;
2000}
2001
2002static void nvme_release(struct gendisk *disk, fmode_t mode)
2003{
2004	struct nvme_ns *ns = disk->private_data;
2005	kref_put(&ns->kref, nvme_free_ns);
2006}
2007
2008static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
2009{
	/*
	 * Fake geometry: 64 heads * 32 sectors/track = 2048 sectors per
	 * cylinder, i.e. 1 MiB per cylinder with 512-byte sectors.
	 */
2011	geo->heads = 1 << 6;
2012	geo->sectors = 1 << 5;
2013	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
2014	return 0;
2015}
2016
2017static void nvme_config_discard(struct nvme_ns *ns)
2018{
2019	u32 logical_block_size = queue_logical_block_size(ns->queue);
2020	ns->queue->limits.discard_zeroes_data = 0;
2021	ns->queue->limits.discard_alignment = logical_block_size;
2022	ns->queue->limits.discard_granularity = logical_block_size;
2023	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
2024	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
2025}
2026
2027static int nvme_revalidate_disk(struct gendisk *disk)
2028{
2029	struct nvme_ns *ns = disk->private_data;
2030	struct nvme_dev *dev = ns->dev;
2031	struct nvme_id_ns *id;
2032	u8 lbaf, pi_type;
2033	u16 old_ms;
2034	unsigned short bs;
2035
2036	if (nvme_identify_ns(dev, ns->ns_id, &id)) {
2037		dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__,
2038						dev->instance, ns->ns_id);
2039		return -ENODEV;
2040	}
2041	if (id->ncap == 0) {
2042		kfree(id);
2043		return -ENODEV;
2044	}
2045
2046	if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
2047		if (nvme_nvm_register(ns->queue, disk->disk_name)) {
2048			dev_warn(dev->dev,
2049				"%s: LightNVM init failure\n", __func__);
2050			kfree(id);
2051			return -ENODEV;
2052		}
2053		ns->type = NVME_NS_LIGHTNVM;
2054	}
2055
2056	old_ms = ns->ms;
2057	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
2058	ns->lba_shift = id->lbaf[lbaf].ds;
2059	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
2060	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
2061
	/*
	 * If the namespace does not report an LBA data size, fall back to a
	 * 512 byte block size so the block layer has something usable before
	 * reads/writes fail on the zero-capacity disk.
	 */
2066	if (ns->lba_shift == 0)
2067		ns->lba_shift = 9;
2068	bs = 1 << ns->lba_shift;
2069
	/* XXX: PI requires the metadata size to equal the T10 PI tuple size */
2071	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
2072					id->dps & NVME_NS_DPS_PI_MASK : 0;
2073
2074	blk_mq_freeze_queue(disk->queue);
2075	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
2076				ns->ms != old_ms ||
2077				bs != queue_logical_block_size(disk->queue) ||
2078				(ns->ms && ns->ext)))
2079		blk_integrity_unregister(disk);
2080
2081	ns->pi_type = pi_type;
2082	blk_queue_logical_block_size(ns->queue, bs);
2083
2084	if (ns->ms && !ns->ext)
2085		nvme_init_integrity(ns);
2086
2087	if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
2088						!blk_get_integrity(disk)) ||
2089						ns->type == NVME_NS_LIGHTNVM)
2090		set_capacity(disk, 0);
2091	else
2092		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
2093
2094	if (dev->oncs & NVME_CTRL_ONCS_DSM)
2095		nvme_config_discard(ns);
2096	blk_mq_unfreeze_queue(disk->queue);
2097
2098	kfree(id);
2099	return 0;
2100}
2101
2102static char nvme_pr_type(enum pr_type type)
2103{
2104	switch (type) {
2105	case PR_WRITE_EXCLUSIVE:
2106		return 1;
2107	case PR_EXCLUSIVE_ACCESS:
2108		return 2;
2109	case PR_WRITE_EXCLUSIVE_REG_ONLY:
2110		return 3;
2111	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
2112		return 4;
2113	case PR_WRITE_EXCLUSIVE_ALL_REGS:
2114		return 5;
2115	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
2116		return 6;
2117	default:
2118		return 0;
2119	}
}
2121
2122static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
2123				u64 key, u64 sa_key, u8 op)
2124{
2125	struct nvme_ns *ns = bdev->bd_disk->private_data;
2126	struct nvme_command c;
2127	u8 data[16] = { 0, };
2128
2129	put_unaligned_le64(key, &data[0]);
2130	put_unaligned_le64(sa_key, &data[8]);
2131
2132	memset(&c, 0, sizeof(c));
2133	c.common.opcode = op;
2134	c.common.nsid = cpu_to_le32(ns->ns_id);
2135	c.common.cdw10[0] = cpu_to_le32(cdw10);
2136
2137	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
2138}
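
/*
 * For reference when reading the helpers below (per the NVMe reservation
 * commands): cdw10 bits 2:0 carry the action (register/unregister/replace for
 * Reservation Register, acquire/preempt/preempt-and-abort for Acquire,
 * release/clear for Release), bit 3 is IEKEY (ignore existing key), bits 15:8
 * carry the reservation type for Acquire/Release, and bits 31:30 carry CPTPL
 * for Register.  The 16 byte payload built above holds the current key at
 * offset 0 and the new/preempt key at offset 8.
 */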
2139
2140static int nvme_pr_register(struct block_device *bdev, u64 old,
2141		u64 new, unsigned flags)
2142{
2143	u32 cdw10;
2144
2145	if (flags & ~PR_FL_IGNORE_KEY)
2146		return -EOPNOTSUPP;
2147
2148	cdw10 = old ? 2 : 0;
2149	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
2150	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
2151	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
2152}
2153
2154static int nvme_pr_reserve(struct block_device *bdev, u64 key,
2155		enum pr_type type, unsigned flags)
2156{
2157	u32 cdw10;
2158
2159	if (flags & ~PR_FL_IGNORE_KEY)
2160		return -EOPNOTSUPP;
2161
2162	cdw10 = nvme_pr_type(type) << 8;
2163	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
2164	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
2165}
2166
2167static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
2168		enum pr_type type, bool abort)
2169{
	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2171	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
2172}
2173
2174static int nvme_pr_clear(struct block_device *bdev, u64 key)
2175{
	u32 cdw10 = 1 | (key ? 1 << 3 : 0);

	/* Clear is an action of Reservation Release (RRELA = 1), not Register */
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2178}
2179
2180static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2181{
	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
2183	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2184}
2185
2186static const struct pr_ops nvme_pr_ops = {
2187	.pr_register	= nvme_pr_register,
2188	.pr_reserve	= nvme_pr_reserve,
2189	.pr_release	= nvme_pr_release,
2190	.pr_preempt	= nvme_pr_preempt,
2191	.pr_clear	= nvme_pr_clear,
2192};
2193
2194static const struct block_device_operations nvme_fops = {
2195	.owner		= THIS_MODULE,
2196	.ioctl		= nvme_ioctl,
2197	.compat_ioctl	= nvme_compat_ioctl,
2198	.open		= nvme_open,
2199	.release	= nvme_release,
2200	.getgeo		= nvme_getgeo,
2201	.revalidate_disk= nvme_revalidate_disk,
2202	.pr_ops		= &nvme_pr_ops,
2203};
2204
2205static int nvme_kthread(void *data)
2206{
2207	struct nvme_dev *dev, *next;
2208
2209	while (!kthread_should_stop()) {
2210		set_current_state(TASK_INTERRUPTIBLE);
2211		spin_lock(&dev_list_lock);
2212		list_for_each_entry_safe(dev, next, &dev_list, node) {
2213			int i;
2214			u32 csts = readl(&dev->bar->csts);
2215
2216			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
2217							csts & NVME_CSTS_CFS) {
2218				if (!__nvme_reset(dev)) {
2219					dev_warn(dev->dev,
2220						"Failed status: %x, reset controller\n",
2221						readl(&dev->bar->csts));
2222				}
2223				continue;
2224			}
2225			for (i = 0; i < dev->queue_count; i++) {
2226				struct nvme_queue *nvmeq = dev->queues[i];
2227				if (!nvmeq)
2228					continue;
2229				spin_lock_irq(&nvmeq->q_lock);
2230				nvme_process_cq(nvmeq);
2231
2232				while ((i == 0) && (dev->event_limit > 0)) {
2233					if (nvme_submit_async_admin_req(dev))
2234						break;
2235					dev->event_limit--;
2236				}
2237				spin_unlock_irq(&nvmeq->q_lock);
2238			}
2239		}
2240		spin_unlock(&dev_list_lock);
2241		schedule_timeout(round_jiffies_relative(HZ));
2242	}
2243	return 0;
2244}
2245
2246static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
2247{
2248	struct nvme_ns *ns;
2249	struct gendisk *disk;
2250	int node = dev_to_node(dev->dev);
2251
2252	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2253	if (!ns)
2254		return;
2255
2256	ns->queue = blk_mq_init_queue(&dev->tagset);
2257	if (IS_ERR(ns->queue))
2258		goto out_free_ns;
2259	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
2260	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2261	ns->dev = dev;
2262	ns->queue->queuedata = ns;
2263
2264	disk = alloc_disk_node(0, node);
2265	if (!disk)
2266		goto out_free_queue;
2267
2268	kref_init(&ns->kref);
2269	ns->ns_id = nsid;
2270	ns->disk = disk;
2271	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
2272	list_add_tail(&ns->list, &dev->namespaces);
2273
2274	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2275	if (dev->max_hw_sectors) {
2276		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
2277		blk_queue_max_segments(ns->queue,
2278			(dev->max_hw_sectors / (dev->page_size >> 9)) + 1);
2279	}
2280	if (dev->stripe_size)
2281		blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
2282	if (dev->vwc & NVME_CTRL_VWC_PRESENT)
2283		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
2284	blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
2285
2286	disk->major = nvme_major;
2287	disk->first_minor = 0;
2288	disk->fops = &nvme_fops;
2289	disk->private_data = ns;
2290	disk->queue = ns->queue;
2291	disk->driverfs_dev = dev->device;
2292	disk->flags = GENHD_FL_EXT_DEVT;
2293	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
2294
	/*
	 * Initialize capacity to 0 until we establish the namespace format and
	 * set up integrity extensions if necessary. The revalidate_disk after
	 * add_disk allows the driver to register with integrity if the format
	 * requires it.
	 */
2301	set_capacity(disk, 0);
2302	if (nvme_revalidate_disk(ns->disk))
2303		goto out_free_disk;
2304
2305	kref_get(&dev->kref);
2306	if (ns->type != NVME_NS_LIGHTNVM) {
2307		add_disk(ns->disk);
2308		if (ns->ms) {
2309			struct block_device *bd = bdget_disk(ns->disk, 0);
2310			if (!bd)
2311				return;
2312			if (blkdev_get(bd, FMODE_READ, NULL)) {
2313				bdput(bd);
2314				return;
2315			}
2316			blkdev_reread_part(bd);
2317			blkdev_put(bd, FMODE_READ);
2318		}
2319	}
2320	return;
2321 out_free_disk:
2322	kfree(disk);
2323	list_del(&ns->list);
2324 out_free_queue:
2325	blk_cleanup_queue(ns->queue);
2326 out_free_ns:
2327	kfree(ns);
2328}
2329
/*
 * Create I/O queues.  Failing to create an I/O queue is not an issue,
 * we can continue with fewer than the desired number of queues, and
 * even a controller without I/O queues can still be used to issue
 * admin commands.  This might be useful to upgrade a buggy firmware
 * for example.
 */
2337static void nvme_create_io_queues(struct nvme_dev *dev)
2338{
2339	unsigned i;
2340
2341	for (i = dev->queue_count; i <= dev->max_qid; i++)
2342		if (!nvme_alloc_queue(dev, i, dev->q_depth))
2343			break;
2344
2345	for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
2346		if (nvme_create_queue(dev->queues[i], i)) {
2347			nvme_free_queues(dev, i);
2348			break;
2349		}
2350}
2351
2352static int set_queue_count(struct nvme_dev *dev, int count)
2353{
2354	int status;
2355	u32 result;
2356	u32 q_count = (count - 1) | ((count - 1) << 16);
2357
2358	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
2359								&result);
2360	if (status < 0)
2361		return status;
2362	if (status > 0) {
2363		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
2364		return 0;
2365	}
2366	return min(result & 0xffff, result >> 16) + 1;
2367}
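
/*
 * Worked example for the encoding above: asking for 8 queue pairs sets
 * q_count to 0x00070007 (the requested counts are zero-based).  If the
 * controller completes with result 0x001f0003, it granted 4 submission and
 * 32 completion queues and the function returns min(3, 31) + 1 = 4.
 */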
2368
2369static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
2370{
2371	u64 szu, size, offset;
2372	u32 cmbloc;
2373	resource_size_t bar_size;
2374	struct pci_dev *pdev = to_pci_dev(dev->dev);
2375	void __iomem *cmb;
2376	dma_addr_t dma_addr;
2377
2378	if (!use_cmb_sqes)
2379		return NULL;
2380
2381	dev->cmbsz = readl(&dev->bar->cmbsz);
2382	if (!(NVME_CMB_SZ(dev->cmbsz)))
2383		return NULL;
2384
2385	cmbloc = readl(&dev->bar->cmbloc);
2386
2387	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
2388	size = szu * NVME_CMB_SZ(dev->cmbsz);
2389	offset = szu * NVME_CMB_OFST(cmbloc);
2390	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));
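
	/*
	 * Worked example for the values computed above: CMBSZ.SZU = 0 selects
	 * 4 KiB units, so CMBSZ.SZ = 1024 and CMBLOC.OFST = 0 describe a 4 MiB
	 * CMB starting at the beginning of the BAR selected by CMBLOC.BIR.
	 */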
2391
2392	if (offset > bar_size)
2393		return NULL;
2394
2395	/*
2396	 * Controllers may support a CMB size larger than their BAR,
2397	 * for example, due to being behind a bridge. Reduce the CMB to
2398	 * the reported size of the BAR
2399	 */
2400	if (size > bar_size - offset)
2401		size = bar_size - offset;
2402
2403	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
2404	cmb = ioremap_wc(dma_addr, size);
2405	if (!cmb)
2406		return NULL;
2407
2408	dev->cmb_dma_addr = dma_addr;
2409	dev->cmb_size = size;
2410	return cmb;
2411}
2412
2413static inline void nvme_release_cmb(struct nvme_dev *dev)
2414{
2415	if (dev->cmb) {
2416		iounmap(dev->cmb);
2417		dev->cmb = NULL;
2418	}
2419}
2420
2421static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
2422{
2423	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
2424}
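
/*
 * Worked example for db_bar_size(): the doorbells start at offset 4096 and
 * each queue pair (admin + I/O) needs an 8 byte SQ tail/CQ head doorbell
 * pair, scaled by the stride derived from CAP.DSTRD.  With db_stride = 1 and
 * 32 I/O queues that is 4096 + 33 * 8 = 4360 bytes of BAR space.
 */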
2425
2426static int nvme_setup_io_queues(struct nvme_dev *dev)
2427{
2428	struct nvme_queue *adminq = dev->queues[0];
2429	struct pci_dev *pdev = to_pci_dev(dev->dev);
2430	int result, i, vecs, nr_io_queues, size;
2431
2432	nr_io_queues = num_possible_cpus();
2433	result = set_queue_count(dev, nr_io_queues);
2434	if (result <= 0)
2435		return result;
2436	if (result < nr_io_queues)
2437		nr_io_queues = result;
2438
2439	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
2440		result = nvme_cmb_qdepth(dev, nr_io_queues,
2441				sizeof(struct nvme_command));
2442		if (result > 0)
2443			dev->q_depth = result;
2444		else
2445			nvme_release_cmb(dev);
2446	}
2447
2448	size = db_bar_size(dev, nr_io_queues);
2449	if (size > 8192) {
2450		iounmap(dev->bar);
2451		do {
2452			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
2453			if (dev->bar)
2454				break;
2455			if (!--nr_io_queues)
2456				return -ENOMEM;
2457			size = db_bar_size(dev, nr_io_queues);
2458		} while (1);
2459		dev->dbs = ((void __iomem *)dev->bar) + 4096;
2460		adminq->q_db = dev->dbs;
2461	}
2462
2463	/* Deregister the admin queue's interrupt */
2464	free_irq(dev->entry[0].vector, adminq);
2465
	/*
	 * If we enabled MSI-X early because the device doesn't support INTx,
	 * disable it again before setting up the full range of vectors we
	 * need.
	 */
2470	if (!pdev->irq)
2471		pci_disable_msix(pdev);
2472
2473	for (i = 0; i < nr_io_queues; i++)
2474		dev->entry[i].entry = i;
2475	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
2476	if (vecs < 0) {
2477		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
2478		if (vecs < 0) {
2479			vecs = 1;
2480		} else {
2481			for (i = 0; i < vecs; i++)
2482				dev->entry[i].vector = i + pdev->irq;
2483		}
2484	}
2485
2486	/*
2487	 * Should investigate if there's a performance win from allocating
2488	 * more queues than interrupt vectors; it might allow the submission
2489	 * path to scale better, even if the receive path is limited by the
2490	 * number of interrupts.
2491	 */
2492	nr_io_queues = vecs;
2493	dev->max_qid = nr_io_queues;
2494
2495	result = queue_request_irq(dev, adminq, adminq->irqname);
2496	if (result) {
2497		adminq->cq_vector = -1;
2498		goto free_queues;
2499	}
2500
2501	/* Free previously allocated queues that are no longer usable */
2502	nvme_free_queues(dev, nr_io_queues + 1);
2503	nvme_create_io_queues(dev);
2504
2505	return 0;
2506
2507 free_queues:
2508	nvme_free_queues(dev, 1);
2509	return result;
2510}
2511
2512static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
2513{
2514	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
2515	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
2516
2517	return nsa->ns_id - nsb->ns_id;
2518}
2519
2520static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid)
2521{
2522	struct nvme_ns *ns;
2523
2524	list_for_each_entry(ns, &dev->namespaces, list) {
2525		if (ns->ns_id == nsid)
2526			return ns;
2527		if (ns->ns_id > nsid)
2528			break;
2529	}
2530	return NULL;
2531}
2532
2533static inline bool nvme_io_incapable(struct nvme_dev *dev)
2534{
2535	return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS ||
2536							dev->online_queues < 2);
2537}
2538
2539static void nvme_ns_remove(struct nvme_ns *ns)
2540{
2541	bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue);
2542
2543	if (kill) {
2544		blk_set_queue_dying(ns->queue);
2545
		/*
		 * The controller was shut down first if we got here through
		 * device removal. The shutdown may requeue outstanding
		 * requests. These need to be aborted immediately so
		 * del_gendisk doesn't block indefinitely for their completion.
		 */
2552		blk_mq_abort_requeue_list(ns->queue);
2553	}
2554	if (ns->disk->flags & GENHD_FL_UP)
2555		del_gendisk(ns->disk);
2556	if (kill || !blk_queue_dying(ns->queue)) {
2557		blk_mq_abort_requeue_list(ns->queue);
2558		blk_cleanup_queue(ns->queue);
2559	}
2560	list_del_init(&ns->list);
2561	kref_put(&ns->kref, nvme_free_ns);
2562}
2563
2564static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
2565{
2566	struct nvme_ns *ns, *next;
2567	unsigned i;
2568
2569	for (i = 1; i <= nn; i++) {
2570		ns = nvme_find_ns(dev, i);
2571		if (ns) {
2572			if (revalidate_disk(ns->disk))
2573				nvme_ns_remove(ns);
2574		} else
2575			nvme_alloc_ns(dev, i);
2576	}
2577	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
2578		if (ns->ns_id > nn)
2579			nvme_ns_remove(ns);
2580	}
2581	list_sort(NULL, &dev->namespaces, ns_cmp);
2582}
2583
2584static void nvme_set_irq_hints(struct nvme_dev *dev)
2585{
2586	struct nvme_queue *nvmeq;
2587	int i;
2588
2589	for (i = 0; i < dev->online_queues; i++) {
2590		nvmeq = dev->queues[i];
2591
2592		if (!nvmeq->tags || !(*nvmeq->tags))
2593			continue;
2594
2595		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
2596					blk_mq_tags_cpumask(*nvmeq->tags));
2597	}
2598}
2599
2600static void nvme_dev_scan(struct work_struct *work)
2601{
2602	struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
2603	struct nvme_id_ctrl *ctrl;
2604
2605	if (!dev->tagset.tags)
2606		return;
2607	if (nvme_identify_ctrl(dev, &ctrl))
2608		return;
2609	nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
2610	kfree(ctrl);
2611	nvme_set_irq_hints(dev);
2612}
2613
/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Controller.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
2620static int nvme_dev_add(struct nvme_dev *dev)
2621{
2622	struct pci_dev *pdev = to_pci_dev(dev->dev);
2623	int res;
2624	struct nvme_id_ctrl *ctrl;
2625	int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12;
2626
2627	res = nvme_identify_ctrl(dev, &ctrl);
2628	if (res) {
2629		dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
2630		return -EIO;
2631	}
2632
2633	dev->oncs = le16_to_cpup(&ctrl->oncs);
2634	dev->abort_limit = ctrl->acl + 1;
2635	dev->vwc = ctrl->vwc;
2636	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
2637	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
2638	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
2639	if (ctrl->mdts)
2640		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
2641	else
2642		dev->max_hw_sectors = UINT_MAX;
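
	/*
	 * Worked example for the MDTS limit above: with a 4 KiB minimum page
	 * size, shift is 12, so MDTS = 5 (2^5 * 4 KiB = 128 KiB) gives
	 * max_hw_sectors = 1 << (5 + 12 - 9) = 256 512-byte sectors.
	 */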
2643	if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
2644			(pdev->device == 0x0953) && ctrl->vs[3]) {
2645		unsigned int max_hw_sectors;
2646
2647		dev->stripe_size = 1 << (ctrl->vs[3] + shift);
2648		max_hw_sectors = dev->stripe_size >> (shift - 9);
2649		if (dev->max_hw_sectors) {
2650			dev->max_hw_sectors = min(max_hw_sectors,
2651							dev->max_hw_sectors);
2652		} else
2653			dev->max_hw_sectors = max_hw_sectors;
2654	}
2655	kfree(ctrl);
2656
2657	if (!dev->tagset.tags) {
2658		dev->tagset.ops = &nvme_mq_ops;
2659		dev->tagset.nr_hw_queues = dev->online_queues - 1;
2660		dev->tagset.timeout = NVME_IO_TIMEOUT;
2661		dev->tagset.numa_node = dev_to_node(dev->dev);
2662		dev->tagset.queue_depth =
2663				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
2664		dev->tagset.cmd_size = nvme_cmd_size(dev);
2665		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2666		dev->tagset.driver_data = dev;
2667
2668		if (blk_mq_alloc_tag_set(&dev->tagset))
2669			return 0;
2670	}
2671	schedule_work(&dev->scan_work);
2672	return 0;
2673}
2674
2675static int nvme_dev_map(struct nvme_dev *dev)
2676{
2677	u64 cap;
2678	int bars, result = -ENOMEM;
2679	struct pci_dev *pdev = to_pci_dev(dev->dev);
2680
2681	if (pci_enable_device_mem(pdev))
2682		return result;
2683
2684	dev->entry[0].vector = pdev->irq;
2685	pci_set_master(pdev);
2686	bars = pci_select_bars(pdev, IORESOURCE_MEM);
2687	if (!bars)
2688		goto disable_pci;
2689
2690	if (pci_request_selected_regions(pdev, bars, "nvme"))
2691		goto disable_pci;
2692
2693	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
2694	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
2695		goto disable;
2696
2697	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
2698	if (!dev->bar)
2699		goto disable;
2700
2701	if (readl(&dev->bar->csts) == -1) {
2702		result = -ENODEV;
2703		goto unmap;
2704	}
2705
	/*
	 * Some devices don't advertise INTx interrupts; pre-enable a single
	 * MSI-X vector for setup. We'll adjust this later.
	 */
2710	if (!pdev->irq) {
2711		result = pci_enable_msix(pdev, dev->entry, 1);
2712		if (result < 0)
2713			goto unmap;
2714	}
2715
2716	cap = lo_hi_readq(&dev->bar->cap);
2717	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
2718	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
2719	dev->dbs = ((void __iomem *)dev->bar) + 4096;
2720
2721	/*
2722	 * Temporary fix for the Apple controller found in the MacBook8,1 and
2723	 * some MacBook7,1 to avoid controller resets and data loss.
2724	 */
2725	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
2726		dev->q_depth = 2;
2727		dev_warn(dev->dev, "detected Apple NVMe controller, set "
2728			"queue depth=%u to work around controller resets\n",
2729			dev->q_depth);
2730	}
2731
2732	if (readl(&dev->bar->vs) >= NVME_VS(1, 2))
2733		dev->cmb = nvme_map_cmb(dev);
2734
2735	return 0;
2736
2737 unmap:
2738	iounmap(dev->bar);
2739	dev->bar = NULL;
2740 disable:
2741	pci_release_regions(pdev);
2742 disable_pci:
2743	pci_disable_device(pdev);
2744	return result;
2745}
2746
2747static void nvme_dev_unmap(struct nvme_dev *dev)
2748{
2749	struct pci_dev *pdev = to_pci_dev(dev->dev);
2750
2751	if (pdev->msi_enabled)
2752		pci_disable_msi(pdev);
2753	else if (pdev->msix_enabled)
2754		pci_disable_msix(pdev);
2755
2756	if (dev->bar) {
2757		iounmap(dev->bar);
2758		dev->bar = NULL;
2759		pci_release_regions(pdev);
2760	}
2761
2762	if (pci_is_enabled(pdev))
2763		pci_disable_device(pdev);
2764}
2765
2766struct nvme_delq_ctx {
2767	struct task_struct *waiter;
2768	struct kthread_worker *worker;
2769	atomic_t refcount;
2770};
2771
2772static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
2773{
2774	dq->waiter = current;
2775	mb();
2776
2777	for (;;) {
2778		set_current_state(TASK_KILLABLE);
2779		if (!atomic_read(&dq->refcount))
2780			break;
2781		if (!schedule_timeout(ADMIN_TIMEOUT) ||
2782					fatal_signal_pending(current)) {
2783			/*
2784			 * Disable the controller first since we can't trust it
2785			 * at this point, but leave the admin queue enabled
2786			 * until all queue deletion requests are flushed.
2787			 * FIXME: This may take a while if there are more h/w
2788			 * queues than admin tags.
2789			 */
2790			set_current_state(TASK_RUNNING);
2791			nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap));
2792			nvme_clear_queue(dev->queues[0]);
2793			flush_kthread_worker(dq->worker);
2794			nvme_disable_queue(dev, 0);
2795			return;
2796		}
2797	}
2798	set_current_state(TASK_RUNNING);
2799}
2800
2801static void nvme_put_dq(struct nvme_delq_ctx *dq)
2802{
2803	atomic_dec(&dq->refcount);
2804	if (dq->waiter)
2805		wake_up_process(dq->waiter);
2806}
2807
2808static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
2809{
2810	atomic_inc(&dq->refcount);
2811	return dq;
2812}
2813
2814static void nvme_del_queue_end(struct nvme_queue *nvmeq)
2815{
2816	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
2817	nvme_put_dq(dq);
2818
2819	spin_lock_irq(&nvmeq->q_lock);
2820	nvme_process_cq(nvmeq);
2821	spin_unlock_irq(&nvmeq->q_lock);
2822}
2823
2824static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
2825						kthread_work_func_t fn)
2826{
2827	struct nvme_command c;
2828
2829	memset(&c, 0, sizeof(c));
2830	c.delete_queue.opcode = opcode;
2831	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2832
2833	init_kthread_work(&nvmeq->cmdinfo.work, fn);
2834	return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
2835								ADMIN_TIMEOUT);
2836}
2837
2838static void nvme_del_cq_work_handler(struct kthread_work *work)
2839{
2840	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2841							cmdinfo.work);
2842	nvme_del_queue_end(nvmeq);
2843}
2844
2845static int nvme_delete_cq(struct nvme_queue *nvmeq)
2846{
2847	return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
2848						nvme_del_cq_work_handler);
2849}
2850
2851static void nvme_del_sq_work_handler(struct kthread_work *work)
2852{
2853	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2854							cmdinfo.work);
2855	int status = nvmeq->cmdinfo.status;
2856
2857	if (!status)
2858		status = nvme_delete_cq(nvmeq);
2859	if (status)
2860		nvme_del_queue_end(nvmeq);
2861}
2862
2863static int nvme_delete_sq(struct nvme_queue *nvmeq)
2864{
2865	return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
2866						nvme_del_sq_work_handler);
2867}
2868
2869static void nvme_del_queue_start(struct kthread_work *work)
2870{
2871	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2872							cmdinfo.work);
2873	if (nvme_delete_sq(nvmeq))
2874		nvme_del_queue_end(nvmeq);
2875}
2876
2877static void nvme_disable_io_queues(struct nvme_dev *dev)
2878{
2879	int i;
2880	DEFINE_KTHREAD_WORKER_ONSTACK(worker);
2881	struct nvme_delq_ctx dq;
2882	struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
2883					&worker, "nvme%d", dev->instance);
2884
2885	if (IS_ERR(kworker_task)) {
2886		dev_err(dev->dev,
2887			"Failed to create queue del task\n");
2888		for (i = dev->queue_count - 1; i > 0; i--)
2889			nvme_disable_queue(dev, i);
2890		return;
2891	}
2892
2893	dq.waiter = NULL;
2894	atomic_set(&dq.refcount, 0);
2895	dq.worker = &worker;
2896	for (i = dev->queue_count - 1; i > 0; i--) {
2897		struct nvme_queue *nvmeq = dev->queues[i];
2898
2899		if (nvme_suspend_queue(nvmeq))
2900			continue;
2901		nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
2902		nvmeq->cmdinfo.worker = dq.worker;
2903		init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
2904		queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
2905	}
2906	nvme_wait_dq(&dq, dev);
2907	kthread_stop(kworker_task);
2908}
2909
/*
 * Remove the node from the device list and check
 * whether we need to stop the nvme_thread.
 */
2914static void nvme_dev_list_remove(struct nvme_dev *dev)
2915{
2916	struct task_struct *tmp = NULL;
2917
2918	spin_lock(&dev_list_lock);
2919	list_del_init(&dev->node);
2920	if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
2921		tmp = nvme_thread;
2922		nvme_thread = NULL;
2923	}
2924	spin_unlock(&dev_list_lock);
2925
2926	if (tmp)
2927		kthread_stop(tmp);
2928}
2929
2930static void nvme_freeze_queues(struct nvme_dev *dev)
2931{
2932	struct nvme_ns *ns;
2933
2934	list_for_each_entry(ns, &dev->namespaces, list) {
2935		blk_mq_freeze_queue_start(ns->queue);
2936
2937		spin_lock_irq(ns->queue->queue_lock);
2938		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
2939		spin_unlock_irq(ns->queue->queue_lock);
2940
2941		blk_mq_cancel_requeue_work(ns->queue);
2942		blk_mq_stop_hw_queues(ns->queue);
2943	}
2944}
2945
2946static void nvme_unfreeze_queues(struct nvme_dev *dev)
2947{
2948	struct nvme_ns *ns;
2949
2950	list_for_each_entry(ns, &dev->namespaces, list) {
2951		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
2952		blk_mq_unfreeze_queue(ns->queue);
2953		blk_mq_start_stopped_hw_queues(ns->queue, true);
2954		blk_mq_kick_requeue_list(ns->queue);
2955	}
2956}
2957
2958static void nvme_dev_shutdown(struct nvme_dev *dev)
2959{
2960	int i;
2961	u32 csts = -1;
2962
2963	nvme_dev_list_remove(dev);
2964
2965	if (dev->bar) {
2966		nvme_freeze_queues(dev);
2967		csts = readl(&dev->bar->csts);
2968	}
2969	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
2970		for (i = dev->queue_count - 1; i >= 0; i--) {
2971			struct nvme_queue *nvmeq = dev->queues[i];
2972			nvme_suspend_queue(nvmeq);
2973		}
2974	} else {
2975		nvme_disable_io_queues(dev);
2976		nvme_shutdown_ctrl(dev);
2977		nvme_disable_queue(dev, 0);
2978	}
2979	nvme_dev_unmap(dev);
2980
2981	for (i = dev->queue_count - 1; i >= 0; i--)
2982		nvme_clear_queue(dev->queues[i]);
2983}
2984
2985static void nvme_dev_remove(struct nvme_dev *dev)
2986{
2987	struct nvme_ns *ns, *next;
2988
2989	if (nvme_io_incapable(dev)) {
2990		/*
2991		 * If the device is not capable of IO (surprise hot-removal,
2992		 * for example), we need to quiesce prior to deleting the
2993		 * namespaces. This will end outstanding requests and prevent
2994		 * attempts to sync dirty data.
2995		 */
2996		nvme_dev_shutdown(dev);
2997	}
2998	list_for_each_entry_safe(ns, next, &dev->namespaces, list)
2999		nvme_ns_remove(ns);
3000}
3001
3002static int nvme_setup_prp_pools(struct nvme_dev *dev)
3003{
3004	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
3005						PAGE_SIZE, PAGE_SIZE, 0);
3006	if (!dev->prp_page_pool)
3007		return -ENOMEM;
3008
	/*
	 * Optimisation for I/Os between 4k and 128k: a 256 byte PRP list
	 * holds 32 entries, enough to describe up to 128k with 4k pages.
	 */
3010	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
3011						256, 256, 0);
3012	if (!dev->prp_small_pool) {
3013		dma_pool_destroy(dev->prp_page_pool);
3014		return -ENOMEM;
3015	}
3016	return 0;
3017}
3018
3019static void nvme_release_prp_pools(struct nvme_dev *dev)
3020{
3021	dma_pool_destroy(dev->prp_page_pool);
3022	dma_pool_destroy(dev->prp_small_pool);
3023}
3024
3025static DEFINE_IDA(nvme_instance_ida);
3026
3027static int nvme_set_instance(struct nvme_dev *dev)
3028{
3029	int instance, error;
3030
3031	do {
3032		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
3033			return -ENODEV;
3034
3035		spin_lock(&dev_list_lock);
3036		error = ida_get_new(&nvme_instance_ida, &instance);
3037		spin_unlock(&dev_list_lock);
3038	} while (error == -EAGAIN);
3039
3040	if (error)
3041		return -ENODEV;
3042
3043	dev->instance = instance;
3044	return 0;
3045}
3046
3047static void nvme_release_instance(struct nvme_dev *dev)
3048{
3049	spin_lock(&dev_list_lock);
3050	ida_remove(&nvme_instance_ida, dev->instance);
3051	spin_unlock(&dev_list_lock);
3052}
3053
3054static void nvme_free_dev(struct kref *kref)
3055{
3056	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
3057
3058	put_device(dev->dev);
3059	put_device(dev->device);
3060	nvme_release_instance(dev);
3061	if (dev->tagset.tags)
3062		blk_mq_free_tag_set(&dev->tagset);
3063	if (dev->admin_q)
3064		blk_put_queue(dev->admin_q);
3065	kfree(dev->queues);
3066	kfree(dev->entry);
3067	kfree(dev);
3068}
3069
3070static int nvme_dev_open(struct inode *inode, struct file *f)
3071{
3072	struct nvme_dev *dev;
3073	int instance = iminor(inode);
3074	int ret = -ENODEV;
3075
3076	spin_lock(&dev_list_lock);
3077	list_for_each_entry(dev, &dev_list, node) {
3078		if (dev->instance == instance) {
3079			if (!dev->admin_q) {
3080				ret = -EWOULDBLOCK;
3081				break;
3082			}
3083			if (!kref_get_unless_zero(&dev->kref))
3084				break;
3085			f->private_data = dev;
3086			ret = 0;
3087			break;
3088		}
3089	}
3090	spin_unlock(&dev_list_lock);
3091
3092	return ret;
3093}
3094
3095static int nvme_dev_release(struct inode *inode, struct file *f)
3096{
3097	struct nvme_dev *dev = f->private_data;
3098	kref_put(&dev->kref, nvme_free_dev);
3099	return 0;
3100}
3101
3102static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
3103{
3104	struct nvme_dev *dev = f->private_data;
3105	struct nvme_ns *ns;
3106
3107	switch (cmd) {
3108	case NVME_IOCTL_ADMIN_CMD:
3109		return nvme_user_cmd(dev, NULL, (void __user *)arg);
3110	case NVME_IOCTL_IO_CMD:
3111		if (list_empty(&dev->namespaces))
3112			return -ENOTTY;
3113		ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
3114		return nvme_user_cmd(dev, ns, (void __user *)arg);
3115	case NVME_IOCTL_RESET:
3116		dev_warn(dev->dev, "resetting controller\n");
3117		return nvme_reset(dev);
3118	case NVME_IOCTL_SUBSYS_RESET:
3119		return nvme_subsys_reset(dev);
3120	default:
3121		return -ENOTTY;
3122	}
3123}
3124
3125static const struct file_operations nvme_dev_fops = {
3126	.owner		= THIS_MODULE,
3127	.open		= nvme_dev_open,
3128	.release	= nvme_dev_release,
3129	.unlocked_ioctl	= nvme_dev_ioctl,
3130	.compat_ioctl	= nvme_dev_ioctl,
3131};
3132
3133static void nvme_probe_work(struct work_struct *work)
3134{
3135	struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
3136	bool start_thread = false;
3137	int result;
3138
3139	result = nvme_dev_map(dev);
3140	if (result)
3141		goto out;
3142
3143	result = nvme_configure_admin_queue(dev);
3144	if (result)
3145		goto unmap;
3146
3147	spin_lock(&dev_list_lock);
3148	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
3149		start_thread = true;
3150		nvme_thread = NULL;
3151	}
3152	list_add(&dev->node, &dev_list);
3153	spin_unlock(&dev_list_lock);
3154
3155	if (start_thread) {
3156		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
3157		wake_up_all(&nvme_kthread_wait);
3158	} else
3159		wait_event_killable(nvme_kthread_wait, nvme_thread);
3160
3161	if (IS_ERR_OR_NULL(nvme_thread)) {
3162		result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
3163		goto disable;
3164	}
3165
3166	nvme_init_queue(dev->queues[0], 0);
3167	result = nvme_alloc_admin_tags(dev);
3168	if (result)
3169		goto disable;
3170
3171	result = nvme_setup_io_queues(dev);
3172	if (result)
3173		goto free_tags;
3174
3175	dev->event_limit = 1;
3176
3177	/*
3178	 * Keep the controller around but remove all namespaces if we don't have
3179	 * any working I/O queue.
3180	 */
3181	if (dev->online_queues < 2) {
3182		dev_warn(dev->dev, "IO queues not created\n");
3183		nvme_dev_remove(dev);
3184	} else {
3185		nvme_unfreeze_queues(dev);
3186		nvme_dev_add(dev);
3187	}
3188
3189	return;
3190
3191 free_tags:
3192	nvme_dev_remove_admin(dev);
3193	blk_put_queue(dev->admin_q);
3194	dev->admin_q = NULL;
3195	dev->queues[0]->tags = NULL;
3196 disable:
3197	nvme_disable_queue(dev, 0);
3198	nvme_dev_list_remove(dev);
3199 unmap:
3200	nvme_dev_unmap(dev);
3201 out:
3202	if (!work_busy(&dev->reset_work))
3203		nvme_dead_ctrl(dev);
3204}
3205
3206static int nvme_remove_dead_ctrl(void *arg)
3207{
3208	struct nvme_dev *dev = (struct nvme_dev *)arg;
3209	struct pci_dev *pdev = to_pci_dev(dev->dev);
3210
3211	if (pci_get_drvdata(pdev))
3212		pci_stop_and_remove_bus_device_locked(pdev);
3213	kref_put(&dev->kref, nvme_free_dev);
3214	return 0;
3215}
3216
3217static void nvme_dead_ctrl(struct nvme_dev *dev)
3218{
3219	dev_warn(dev->dev, "Device failed to resume\n");
3220	kref_get(&dev->kref);
3221	if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
3222						dev->instance))) {
3223		dev_err(dev->dev,
3224			"Failed to start controller remove task\n");
3225		kref_put(&dev->kref, nvme_free_dev);
3226	}
3227}
3228
3229static void nvme_reset_work(struct work_struct *ws)
3230{
3231	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
3232	bool in_probe = work_busy(&dev->probe_work);
3233
3234	nvme_dev_shutdown(dev);
3235
	/*
	 * Synchronize with device probe so that work will see failure status
	 * and exit gracefully without trying to schedule another reset.
	 */
3238	flush_work(&dev->probe_work);
3239
	/*
	 * Fail this device if reset occurred during probe to avoid
	 * infinite initialization loops.
	 */
3242	if (in_probe) {
3243		nvme_dead_ctrl(dev);
3244		return;
3245	}
	/*
	 * Schedule device resume asynchronously so the reset work is available
	 * to clean up errors that may occur during reinitialization.
	 */
3248	schedule_work(&dev->probe_work);
3249}
3250
3251static int __nvme_reset(struct nvme_dev *dev)
3252{
3253	if (work_pending(&dev->reset_work))
3254		return -EBUSY;
3255	list_del_init(&dev->node);
3256	queue_work(nvme_workq, &dev->reset_work);
3257	return 0;
3258}
3259
3260static int nvme_reset(struct nvme_dev *dev)
3261{
3262	int ret;
3263
3264	if (!dev->admin_q || blk_queue_dying(dev->admin_q))
3265		return -ENODEV;
3266
3267	spin_lock(&dev_list_lock);
3268	ret = __nvme_reset(dev);
3269	spin_unlock(&dev_list_lock);
3270
3271	if (!ret) {
3272		flush_work(&dev->reset_work);
3273		flush_work(&dev->probe_work);
3274		return 0;
3275	}
3276
3277	return ret;
3278}
3279
3280static ssize_t nvme_sysfs_reset(struct device *dev,
3281				struct device_attribute *attr, const char *buf,
3282				size_t count)
3283{
3284	struct nvme_dev *ndev = dev_get_drvdata(dev);
3285	int ret;
3286
3287	ret = nvme_reset(ndev);
3288	if (ret < 0)
3289		return ret;
3290
3291	return count;
3292}
3293static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
3294
3295static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
3296{
3297	int node, result = -ENOMEM;
3298	struct nvme_dev *dev;
3299
3300	node = dev_to_node(&pdev->dev);
3301	if (node == NUMA_NO_NODE)
3302		set_dev_node(&pdev->dev, 0);
3303
3304	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
3305	if (!dev)
3306		return -ENOMEM;
3307	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
3308							GFP_KERNEL, node);
3309	if (!dev->entry)
3310		goto free;
3311	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
3312							GFP_KERNEL, node);
3313	if (!dev->queues)
3314		goto free;
3315
3316	INIT_LIST_HEAD(&dev->namespaces);
3317	INIT_WORK(&dev->reset_work, nvme_reset_work);
3318	dev->dev = get_device(&pdev->dev);
3319	pci_set_drvdata(pdev, dev);
3320	result = nvme_set_instance(dev);
3321	if (result)
3322		goto put_pci;
3323
3324	result = nvme_setup_prp_pools(dev);
3325	if (result)
3326		goto release;
3327
3328	kref_init(&dev->kref);
3329	dev->device = device_create(nvme_class, &pdev->dev,
3330				MKDEV(nvme_char_major, dev->instance),
3331				dev, "nvme%d", dev->instance);
3332	if (IS_ERR(dev->device)) {
3333		result = PTR_ERR(dev->device);
3334		goto release_pools;
3335	}
3336	get_device(dev->device);
3337	dev_set_drvdata(dev->device, dev);
3338
3339	result = device_create_file(dev->device, &dev_attr_reset_controller);
3340	if (result)
3341		goto put_dev;
3342
3343	INIT_LIST_HEAD(&dev->node);
3344	INIT_WORK(&dev->scan_work, nvme_dev_scan);
3345	INIT_WORK(&dev->probe_work, nvme_probe_work);
3346	schedule_work(&dev->probe_work);
3347	return 0;
3348
3349 put_dev:
3350	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
3351	put_device(dev->device);
3352 release_pools:
3353	nvme_release_prp_pools(dev);
3354 release:
3355	nvme_release_instance(dev);
3356 put_pci:
3357	put_device(dev->dev);
3358 free:
3359	kfree(dev->queues);
3360	kfree(dev->entry);
3361	kfree(dev);
3362	return result;
3363}
3364
3365static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
3366{
3367	struct nvme_dev *dev = pci_get_drvdata(pdev);
3368
3369	if (prepare)
3370		nvme_dev_shutdown(dev);
3371	else
3372		schedule_work(&dev->probe_work);
3373}
3374
3375static void nvme_shutdown(struct pci_dev *pdev)
3376{
3377	struct nvme_dev *dev = pci_get_drvdata(pdev);
3378	nvme_dev_shutdown(dev);
3379}
3380
3381static void nvme_remove(struct pci_dev *pdev)
3382{
3383	struct nvme_dev *dev = pci_get_drvdata(pdev);
3384
3385	spin_lock(&dev_list_lock);
3386	list_del_init(&dev->node);
3387	spin_unlock(&dev_list_lock);
3388
3389	pci_set_drvdata(pdev, NULL);
3390	flush_work(&dev->probe_work);
3391	flush_work(&dev->reset_work);
3392	flush_work(&dev->scan_work);
3393	device_remove_file(dev->device, &dev_attr_reset_controller);
3394	nvme_dev_remove(dev);
3395	nvme_dev_shutdown(dev);
3396	nvme_dev_remove_admin(dev);
3397	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
3398	nvme_free_queues(dev, 0);
3399	nvme_release_cmb(dev);
3400	nvme_release_prp_pools(dev);
3401	kref_put(&dev->kref, nvme_free_dev);
3402}
3403
3404/* These functions are yet to be implemented */
3405#define nvme_error_detected NULL
3406#define nvme_dump_registers NULL
3407#define nvme_link_reset NULL
3408#define nvme_slot_reset NULL
3409#define nvme_error_resume NULL
3410
3411#ifdef CONFIG_PM_SLEEP
3412static int nvme_suspend(struct device *dev)
3413{
3414	struct pci_dev *pdev = to_pci_dev(dev);
3415	struct nvme_dev *ndev = pci_get_drvdata(pdev);
3416
3417	nvme_dev_shutdown(ndev);
3418	return 0;
3419}
3420
3421static int nvme_resume(struct device *dev)
3422{
3423	struct pci_dev *pdev = to_pci_dev(dev);
3424	struct nvme_dev *ndev = pci_get_drvdata(pdev);
3425
3426	schedule_work(&ndev->probe_work);
3427	return 0;
3428}
3429#endif
3430
3431static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
3432
3433static const struct pci_error_handlers nvme_err_handler = {
3434	.error_detected	= nvme_error_detected,
3435	.mmio_enabled	= nvme_dump_registers,
3436	.link_reset	= nvme_link_reset,
3437	.slot_reset	= nvme_slot_reset,
3438	.resume		= nvme_error_resume,
3439	.reset_notify	= nvme_reset_notify,
3440};
3441
3442/* Move to pci_ids.h later */
3443#define PCI_CLASS_STORAGE_EXPRESS	0x010802
3444
3445static const struct pci_device_id nvme_id_table[] = {
3446	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
3447	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
3448	{ 0, }
3449};
3450MODULE_DEVICE_TABLE(pci, nvme_id_table);
3451
3452static struct pci_driver nvme_driver = {
3453	.name		= "nvme",
3454	.id_table	= nvme_id_table,
3455	.probe		= nvme_probe,
3456	.remove		= nvme_remove,
3457	.shutdown	= nvme_shutdown,
3458	.driver		= {
3459		.pm	= &nvme_dev_pm_ops,
3460	},
3461	.err_handler	= &nvme_err_handler,
3462};
3463
3464static int __init nvme_init(void)
3465{
3466	int result;
3467
3468	init_waitqueue_head(&nvme_kthread_wait);
3469
3470	nvme_workq = create_singlethread_workqueue("nvme");
3471	if (!nvme_workq)
3472		return -ENOMEM;
3473
3474	result = register_blkdev(nvme_major, "nvme");
3475	if (result < 0)
3476		goto kill_workq;
3477	else if (result > 0)
3478		nvme_major = result;
3479
3480	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
3481							&nvme_dev_fops);
3482	if (result < 0)
3483		goto unregister_blkdev;
3484	else if (result > 0)
3485		nvme_char_major = result;
3486
3487	nvme_class = class_create(THIS_MODULE, "nvme");
3488	if (IS_ERR(nvme_class)) {
3489		result = PTR_ERR(nvme_class);
3490		goto unregister_chrdev;
3491	}
3492
3493	result = pci_register_driver(&nvme_driver);
3494	if (result)
3495		goto destroy_class;
3496	return 0;
3497
3498 destroy_class:
3499	class_destroy(nvme_class);
3500 unregister_chrdev:
3501	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
3502 unregister_blkdev:
3503	unregister_blkdev(nvme_major, "nvme");
3504 kill_workq:
3505	destroy_workqueue(nvme_workq);
3506	return result;
3507}
3508
3509static void __exit nvme_exit(void)
3510{
3511	pci_unregister_driver(&nvme_driver);
3512	unregister_blkdev(nvme_major, "nvme");
3513	destroy_workqueue(nvme_workq);
3514	class_destroy(nvme_class);
3515	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
3516	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
3517	_nvme_check_size();
3518}
3519
3520MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
3521MODULE_LICENSE("GPL");
3522MODULE_VERSION("1.0");
3523module_init(nvme_init);
3524module_exit(nvme_exit);
3525