drivers/nvme/host/tcp.c


DEFINITIONS

This source file includes the following definitions.
  1. to_tcp_ctrl
  2. nvme_tcp_queue_id
  3. nvme_tcp_tagset
  4. nvme_tcp_hdgst_len
  5. nvme_tcp_ddgst_len
  6. nvme_tcp_inline_data_size
  7. nvme_tcp_async_req
  8. nvme_tcp_has_inline_data
  9. nvme_tcp_req_cur_page
  10. nvme_tcp_req_cur_offset
  11. nvme_tcp_req_cur_length
  12. nvme_tcp_req_offset
  13. nvme_tcp_pdu_data_left
  14. nvme_tcp_pdu_last_send
  15. nvme_tcp_init_iter
  16. nvme_tcp_advance_req
  17. nvme_tcp_queue_request
  18. nvme_tcp_fetch_request
  19. nvme_tcp_ddgst_final
  20. nvme_tcp_ddgst_update
  21. nvme_tcp_hdgst
  22. nvme_tcp_verify_hdgst
  23. nvme_tcp_check_ddgst
  24. nvme_tcp_exit_request
  25. nvme_tcp_init_request
  26. nvme_tcp_init_hctx
  27. nvme_tcp_init_admin_hctx
  28. nvme_tcp_recv_state
  29. nvme_tcp_init_recv_ctx
  30. nvme_tcp_error_recovery
  31. nvme_tcp_process_nvme_cqe
  32. nvme_tcp_handle_c2h_data
  33. nvme_tcp_handle_comp
  34. nvme_tcp_setup_h2c_data_pdu
  35. nvme_tcp_handle_r2t
  36. nvme_tcp_recv_pdu
  37. nvme_tcp_end_request
  38. nvme_tcp_recv_data
  39. nvme_tcp_recv_ddgst
  40. nvme_tcp_recv_skb
  41. nvme_tcp_data_ready
  42. nvme_tcp_write_space
  43. nvme_tcp_state_change
  44. nvme_tcp_done_send_req
  45. nvme_tcp_fail_request
  46. nvme_tcp_try_send_data
  47. nvme_tcp_try_send_cmd_pdu
  48. nvme_tcp_try_send_data_pdu
  49. nvme_tcp_try_send_ddgst
  50. nvme_tcp_try_send
  51. nvme_tcp_try_recv
  52. nvme_tcp_io_work
  53. nvme_tcp_free_crypto
  54. nvme_tcp_alloc_crypto
  55. nvme_tcp_free_async_req
  56. nvme_tcp_alloc_async_req
  57. nvme_tcp_free_queue
  58. nvme_tcp_init_connection
  59. nvme_tcp_alloc_queue
  60. nvme_tcp_restore_sock_calls
  61. __nvme_tcp_stop_queue
  62. nvme_tcp_stop_queue
  63. nvme_tcp_start_queue
  64. nvme_tcp_alloc_tagset
  65. nvme_tcp_free_admin_queue
  66. nvme_tcp_free_io_queues
  67. nvme_tcp_stop_io_queues
  68. nvme_tcp_start_io_queues
  69. nvme_tcp_alloc_admin_queue
  70. __nvme_tcp_alloc_io_queues
  71. nvme_tcp_nr_io_queues
  72. nvme_tcp_set_io_queues
  73. nvme_tcp_alloc_io_queues
  74. nvme_tcp_destroy_io_queues
  75. nvme_tcp_configure_io_queues
  76. nvme_tcp_destroy_admin_queue
  77. nvme_tcp_configure_admin_queue
  78. nvme_tcp_teardown_admin_queue
  79. nvme_tcp_teardown_io_queues
  80. nvme_tcp_reconnect_or_remove
  81. nvme_tcp_setup_ctrl
  82. nvme_tcp_reconnect_ctrl_work
  83. nvme_tcp_error_recovery_work
  84. nvme_tcp_teardown_ctrl
  85. nvme_tcp_delete_ctrl
  86. nvme_reset_ctrl_work
  87. nvme_tcp_free_ctrl
  88. nvme_tcp_set_sg_null
  89. nvme_tcp_set_sg_inline
  90. nvme_tcp_set_sg_host_data
  91. nvme_tcp_submit_async_event
  92. nvme_tcp_timeout
  93. nvme_tcp_map_data
  94. nvme_tcp_setup_cmd_pdu
  95. nvme_tcp_queue_rq
  96. nvme_tcp_map_queues
  97. nvme_tcp_poll
  98. nvme_tcp_existing_controller
  99. nvme_tcp_create_ctrl
  100. nvme_tcp_init_module
  101. nvme_tcp_cleanup_module

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * NVMe over Fabrics TCP host.
   4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5  */
   6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7 #include <linux/module.h>
   8 #include <linux/init.h>
   9 #include <linux/slab.h>
  10 #include <linux/err.h>
  11 #include <linux/nvme-tcp.h>
  12 #include <net/sock.h>
  13 #include <net/tcp.h>
  14 #include <linux/blk-mq.h>
  15 #include <crypto/hash.h>
  16 #include <net/busy_poll.h>
  17 
  18 #include "nvme.h"
  19 #include "fabrics.h"
  20 
  21 struct nvme_tcp_queue;
  22 
  23 enum nvme_tcp_send_state {
  24         NVME_TCP_SEND_CMD_PDU = 0,
  25         NVME_TCP_SEND_H2C_PDU,
  26         NVME_TCP_SEND_DATA,
  27         NVME_TCP_SEND_DDGST,
  28 };
  29 
  30 struct nvme_tcp_request {
  31         struct nvme_request     req;
  32         void                    *pdu;
  33         struct nvme_tcp_queue   *queue;
  34         u32                     data_len;
  35         u32                     pdu_len;
  36         u32                     pdu_sent;
  37         u16                     ttag;
  38         struct list_head        entry;
  39         __le32                  ddgst;
  40 
  41         struct bio              *curr_bio;
  42         struct iov_iter         iter;
  43 
  44         /* send state */
  45         size_t                  offset;
  46         size_t                  data_sent;
  47         enum nvme_tcp_send_state state;
  48 };
  49 
  50 enum nvme_tcp_queue_flags {
  51         NVME_TCP_Q_ALLOCATED    = 0,
  52         NVME_TCP_Q_LIVE         = 1,
  53 };
  54 
  55 enum nvme_tcp_recv_state {
  56         NVME_TCP_RECV_PDU = 0,
  57         NVME_TCP_RECV_DATA,
  58         NVME_TCP_RECV_DDGST,
  59 };
  60 
  61 struct nvme_tcp_ctrl;
  62 struct nvme_tcp_queue {
  63         struct socket           *sock;
  64         struct work_struct      io_work;
  65         int                     io_cpu;
  66 
  67         spinlock_t              lock;
  68         struct list_head        send_list;
  69 
  70         /* recv state */
  71         void                    *pdu;
  72         int                     pdu_remaining;
  73         int                     pdu_offset;
  74         size_t                  data_remaining;
  75         size_t                  ddgst_remaining;
  76         unsigned int            nr_cqe;
  77 
  78         /* send state */
  79         struct nvme_tcp_request *request;
  80 
  81         int                     queue_size;
  82         size_t                  cmnd_capsule_len;
  83         struct nvme_tcp_ctrl    *ctrl;
  84         unsigned long           flags;
  85         bool                    rd_enabled;
  86 
  87         bool                    hdr_digest;
  88         bool                    data_digest;
  89         struct ahash_request    *rcv_hash;
  90         struct ahash_request    *snd_hash;
  91         __le32                  exp_ddgst;
  92         __le32                  recv_ddgst;
  93 
  94         struct page_frag_cache  pf_cache;
  95 
  96         void (*state_change)(struct sock *);
  97         void (*data_ready)(struct sock *);
  98         void (*write_space)(struct sock *);
  99 };
 100 
 101 struct nvme_tcp_ctrl {
 102         /* read only in the hot path */
 103         struct nvme_tcp_queue   *queues;
 104         struct blk_mq_tag_set   tag_set;
 105 
 106         /* other member variables */
 107         struct list_head        list;
 108         struct blk_mq_tag_set   admin_tag_set;
 109         struct sockaddr_storage addr;
 110         struct sockaddr_storage src_addr;
 111         struct nvme_ctrl        ctrl;
 112 
 113         struct work_struct      err_work;
 114         struct delayed_work     connect_work;
 115         struct nvme_tcp_request async_req;
 116         u32                     io_queues[HCTX_MAX_TYPES];
 117 };
 118 
 119 static LIST_HEAD(nvme_tcp_ctrl_list);
 120 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
 121 static struct workqueue_struct *nvme_tcp_wq;
 122 static struct blk_mq_ops nvme_tcp_mq_ops;
 123 static struct blk_mq_ops nvme_tcp_admin_mq_ops;
 124 
 125 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
 126 {
 127         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
 128 }
 129 
 130 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
 131 {
 132         return queue - queue->ctrl->queues;
 133 }
 134 
 135 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
 136 {
 137         u32 queue_idx = nvme_tcp_queue_id(queue);
 138 
 139         if (queue_idx == 0)
 140                 return queue->ctrl->admin_tag_set.tags[queue_idx];
 141         return queue->ctrl->tag_set.tags[queue_idx - 1];
 142 }
 143 
 144 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
 145 {
 146         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 147 }
 148 
 149 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
 150 {
 151         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 152 }
 153 
 154 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
 155 {
 156         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 157 }
 158 
 159 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
 160 {
 161         return req == &req->queue->ctrl->async_req;
 162 }
 163 
 164 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 165 {
 166         struct request *rq;
 167 
 168         if (unlikely(nvme_tcp_async_req(req)))
 169                 return false; /* async events don't have a request */
 170 
 171         rq = blk_mq_rq_from_pdu(req);
 172 
 173         return rq_data_dir(rq) == WRITE && req->data_len &&
 174                 req->data_len <= nvme_tcp_inline_data_size(req->queue);
 175 }
 176 
 177 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
 178 {
 179         return req->iter.bvec->bv_page;
 180 }
 181 
 182 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
 183 {
 184         return req->iter.bvec->bv_offset + req->iter.iov_offset;
 185 }
 186 
 187 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 188 {
 189         return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
 190                         req->pdu_len - req->pdu_sent);
 191 }
 192 
 193 static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
 194 {
 195         return req->iter.iov_offset;
 196 }
 197 
 198 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 199 {
 200         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
 201                         req->pdu_len - req->pdu_sent : 0;
 202 }
 203 
 204 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 205                 int len)
 206 {
 207         return nvme_tcp_pdu_data_left(req) <= len;
 208 }
 209 
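      /*
       * Set up req->iter as a bvec iterator over either the request's
       * special payload (e.g. a discard) or the current bio, starting
       * at the bio's current completion offset.
       */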
 210 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 211                 unsigned int dir)
 212 {
 213         struct request *rq = blk_mq_rq_from_pdu(req);
 214         struct bio_vec *vec;
 215         unsigned int size;
 216         int nsegs;
 217         size_t offset;
 218 
 219         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
 220                 vec = &rq->special_vec;
 221                 nsegs = 1;
 222                 size = blk_rq_payload_bytes(rq);
 223                 offset = 0;
 224         } else {
 225                 struct bio *bio = req->curr_bio;
 226 
 227                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 228                 nsegs = bio_segments(bio);
 229                 size = bio->bi_iter.bi_size;
 230                 offset = bio->bi_iter.bi_bvec_done;
 231         }
 232 
 233         iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
 234         req->iter.iov_offset = offset;
 235 }
 236 
 237 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 238                 int len)
 239 {
 240         req->data_sent += len;
 241         req->pdu_sent += len;
 242         iov_iter_advance(&req->iter, len);
 243         if (!iov_iter_count(&req->iter) &&
 244             req->data_sent < req->data_len) {
 245                 req->curr_bio = req->curr_bio->bi_next;
 246                 nvme_tcp_init_iter(req, WRITE);
 247         }
 248 }
 249 
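      /*
       * Add the request to the queue's send list under the queue lock
       * and schedule io_work on the queue's CPU to push it out.
       */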
 250 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
 251 {
 252         struct nvme_tcp_queue *queue = req->queue;
 253 
 254         spin_lock(&queue->lock);
 255         list_add_tail(&req->entry, &queue->send_list);
 256         spin_unlock(&queue->lock);
 257 
 258         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 259 }
 260 
 261 static inline struct nvme_tcp_request *
 262 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
 263 {
 264         struct nvme_tcp_request *req;
 265 
 266         spin_lock(&queue->lock);
 267         req = list_first_entry_or_null(&queue->send_list,
 268                         struct nvme_tcp_request, entry);
 269         if (req)
 270                 list_del(&req->entry);
 271         spin_unlock(&queue->lock);
 272 
 273         return req;
 274 }
 275 
 276 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
 277                 __le32 *dgst)
 278 {
 279         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
 280         crypto_ahash_final(hash);
 281 }
 282 
 283 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
 284                 struct page *page, off_t off, size_t len)
 285 {
 286         struct scatterlist sg;
 287 
 288         sg_init_marker(&sg, 1);
 289         sg_set_page(&sg, page, len, off);
 290         ahash_request_set_crypt(hash, &sg, NULL, len);
 291         crypto_ahash_update(hash);
 292 }
 293 
 294 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
 295                 void *pdu, size_t len)
 296 {
 297         struct scatterlist sg;
 298 
 299         sg_init_one(&sg, pdu, len);
 300         ahash_request_set_crypt(hash, &sg, pdu + len, len);
 301         crypto_ahash_digest(hash);
 302 }
 303 
 304 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
 305                 void *pdu, size_t pdu_len)
 306 {
 307         struct nvme_tcp_hdr *hdr = pdu;
 308         __le32 recv_digest;
 309         __le32 exp_digest;
 310 
 311         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 312                 dev_err(queue->ctrl->ctrl.device,
 313                         "queue %d: header digest flag is cleared\n",
 314                         nvme_tcp_queue_id(queue));
 315                 return -EPROTO;
 316         }
 317 
 318         recv_digest = *(__le32 *)(pdu + hdr->hlen);
 319         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
 320         exp_digest = *(__le32 *)(pdu + hdr->hlen);
 321         if (recv_digest != exp_digest) {
 322                 dev_err(queue->ctrl->ctrl.device,
 323                         "header digest error: recv %#x expected %#x\n",
 324                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
 325                 return -EIO;
 326         }
 327 
 328         return 0;
 329 }
 330 
 331 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
 332 {
 333         struct nvme_tcp_hdr *hdr = pdu;
 334         u8 digest_len = nvme_tcp_hdgst_len(queue);
 335         u32 len;
 336 
 337         len = le32_to_cpu(hdr->plen) - hdr->hlen -
 338                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
 339 
 340         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 341                 dev_err(queue->ctrl->ctrl.device,
 342                         "queue %d: data digest flag is cleared\n",
  343                         nvme_tcp_queue_id(queue));
 344                 return -EPROTO;
 345         }
 346         crypto_ahash_init(queue->rcv_hash);
 347 
 348         return 0;
 349 }
 350 
 351 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 352                 struct request *rq, unsigned int hctx_idx)
 353 {
 354         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 355 
 356         page_frag_free(req->pdu);
 357 }
 358 
 359 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 360                 struct request *rq, unsigned int hctx_idx,
 361                 unsigned int numa_node)
 362 {
 363         struct nvme_tcp_ctrl *ctrl = set->driver_data;
 364         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 365         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 366         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 367         u8 hdgst = nvme_tcp_hdgst_len(queue);
 368 
 369         req->pdu = page_frag_alloc(&queue->pf_cache,
 370                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 371                 GFP_KERNEL | __GFP_ZERO);
 372         if (!req->pdu)
 373                 return -ENOMEM;
 374 
 375         req->queue = queue;
 376         nvme_req(rq)->ctrl = &ctrl->ctrl;
 377 
 378         return 0;
 379 }
 380 
 381 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 382                 unsigned int hctx_idx)
 383 {
 384         struct nvme_tcp_ctrl *ctrl = data;
 385         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
 386 
 387         hctx->driver_data = queue;
 388         return 0;
 389 }
 390 
 391 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 392                 unsigned int hctx_idx)
 393 {
 394         struct nvme_tcp_ctrl *ctrl = data;
 395         struct nvme_tcp_queue *queue = &ctrl->queues[0];
 396 
 397         hctx->driver_data = queue;
 398         return 0;
 399 }
 400 
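      /*
       * The receive state is derived from the remaining byte counters:
       * a pending PDU header takes precedence, then a pending data
       * digest, otherwise the queue is receiving C2H data.
       */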
 401 static enum nvme_tcp_recv_state
 402 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
 403 {
 404         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
 405                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
 406                 NVME_TCP_RECV_DATA;
 407 }
 408 
 409 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 410 {
 411         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
 412                                 nvme_tcp_hdgst_len(queue);
 413         queue->pdu_offset = 0;
 414         queue->data_remaining = -1;
 415         queue->ddgst_remaining = 0;
 416 }
 417 
 418 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 419 {
 420         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 421                 return;
 422 
 423         queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 424 }
 425 
 426 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 427                 struct nvme_completion *cqe)
 428 {
 429         struct request *rq;
 430 
 431         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
 432         if (!rq) {
 433                 dev_err(queue->ctrl->ctrl.device,
 434                         "queue %d tag 0x%x not found\n",
 435                         nvme_tcp_queue_id(queue), cqe->command_id);
 436                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 437                 return -EINVAL;
 438         }
 439 
 440         nvme_end_request(rq, cqe->status, cqe->result);
 441         queue->nr_cqe++;
 442 
 443         return 0;
 444 }
 445 
 446 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
 447                 struct nvme_tcp_data_pdu *pdu)
 448 {
 449         struct request *rq;
 450 
 451         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 452         if (!rq) {
 453                 dev_err(queue->ctrl->ctrl.device,
 454                         "queue %d tag %#x not found\n",
 455                         nvme_tcp_queue_id(queue), pdu->command_id);
 456                 return -ENOENT;
 457         }
 458 
 459         if (!blk_rq_payload_bytes(rq)) {
 460                 dev_err(queue->ctrl->ctrl.device,
 461                         "queue %d tag %#x unexpected data\n",
 462                         nvme_tcp_queue_id(queue), rq->tag);
 463                 return -EIO;
 464         }
 465 
 466         queue->data_remaining = le32_to_cpu(pdu->data_length);
 467 
 468         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
 469             unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
 470                 dev_err(queue->ctrl->ctrl.device,
 471                         "queue %d tag %#x SUCCESS set but not last PDU\n",
 472                         nvme_tcp_queue_id(queue), rq->tag);
 473                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 474                 return -EPROTO;
 475         }
 476 
 477         return 0;
 478 }
 479 
 480 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
 481                 struct nvme_tcp_rsp_pdu *pdu)
 482 {
 483         struct nvme_completion *cqe = &pdu->cqe;
 484         int ret = 0;
 485 
 486         /*
 487          * AEN requests are special as they don't time out and can
 488          * survive any kind of queue freeze and often don't respond to
 489          * aborts.  We don't even bother to allocate a struct request
 490          * for them but rather special case them here.
 491          */
 492         if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
 493             cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
 494                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
 495                                 &cqe->result);
 496         else
 497                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
 498 
 499         return ret;
 500 }
 501 
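      /*
       * Build the H2CData PDU used to answer a controller R2T, after
       * checking that the requested offset and length stay within the
       * data still owed for this request.
       */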
 502 static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
 503                 struct nvme_tcp_r2t_pdu *pdu)
 504 {
 505         struct nvme_tcp_data_pdu *data = req->pdu;
 506         struct nvme_tcp_queue *queue = req->queue;
 507         struct request *rq = blk_mq_rq_from_pdu(req);
 508         u8 hdgst = nvme_tcp_hdgst_len(queue);
 509         u8 ddgst = nvme_tcp_ddgst_len(queue);
 510 
 511         req->pdu_len = le32_to_cpu(pdu->r2t_length);
 512         req->pdu_sent = 0;
 513 
 514         if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
 515                 dev_err(queue->ctrl->ctrl.device,
 516                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
 517                         rq->tag, req->pdu_len, req->data_len,
 518                         req->data_sent);
 519                 return -EPROTO;
 520         }
 521 
 522         if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
 523                 dev_err(queue->ctrl->ctrl.device,
 524                         "req %d unexpected r2t offset %u (expected %zu)\n",
 525                         rq->tag, le32_to_cpu(pdu->r2t_offset),
 526                         req->data_sent);
 527                 return -EPROTO;
 528         }
 529 
 530         memset(data, 0, sizeof(*data));
 531         data->hdr.type = nvme_tcp_h2c_data;
 532         data->hdr.flags = NVME_TCP_F_DATA_LAST;
 533         if (queue->hdr_digest)
 534                 data->hdr.flags |= NVME_TCP_F_HDGST;
 535         if (queue->data_digest)
 536                 data->hdr.flags |= NVME_TCP_F_DDGST;
 537         data->hdr.hlen = sizeof(*data);
 538         data->hdr.pdo = data->hdr.hlen + hdgst;
 539         data->hdr.plen =
 540                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
 541         data->ttag = pdu->ttag;
 542         data->command_id = rq->tag;
 543         data->data_offset = cpu_to_le32(req->data_sent);
 544         data->data_length = cpu_to_le32(req->pdu_len);
 545         return 0;
 546 }
 547 
 548 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
 549                 struct nvme_tcp_r2t_pdu *pdu)
 550 {
 551         struct nvme_tcp_request *req;
 552         struct request *rq;
 553         int ret;
 554 
 555         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 556         if (!rq) {
 557                 dev_err(queue->ctrl->ctrl.device,
 558                         "queue %d tag %#x not found\n",
 559                         nvme_tcp_queue_id(queue), pdu->command_id);
 560                 return -ENOENT;
 561         }
 562         req = blk_mq_rq_to_pdu(rq);
 563 
 564         ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
 565         if (unlikely(ret))
 566                 return ret;
 567 
 568         req->state = NVME_TCP_SEND_H2C_PDU;
 569         req->offset = 0;
 570 
 571         nvme_tcp_queue_request(req);
 572 
 573         return 0;
 574 }
 575 
 576 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 577                 unsigned int *offset, size_t *len)
 578 {
 579         struct nvme_tcp_hdr *hdr;
 580         char *pdu = queue->pdu;
 581         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 582         int ret;
 583 
 584         ret = skb_copy_bits(skb, *offset,
 585                 &pdu[queue->pdu_offset], rcv_len);
 586         if (unlikely(ret))
 587                 return ret;
 588 
 589         queue->pdu_remaining -= rcv_len;
 590         queue->pdu_offset += rcv_len;
 591         *offset += rcv_len;
 592         *len -= rcv_len;
 593         if (queue->pdu_remaining)
 594                 return 0;
 595 
 596         hdr = queue->pdu;
 597         if (queue->hdr_digest) {
 598                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
 599                 if (unlikely(ret))
 600                         return ret;
 601         }
 602 
 603 
 604         if (queue->data_digest) {
 605                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
 606                 if (unlikely(ret))
 607                         return ret;
 608         }
 609 
 610         switch (hdr->type) {
 611         case nvme_tcp_c2h_data:
 612                 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 613         case nvme_tcp_rsp:
 614                 nvme_tcp_init_recv_ctx(queue);
 615                 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 616         case nvme_tcp_r2t:
 617                 nvme_tcp_init_recv_ctx(queue);
 618                 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 619         default:
 620                 dev_err(queue->ctrl->ctrl.device,
 621                         "unsupported pdu type (%d)\n", hdr->type);
 622                 return -EINVAL;
 623         }
 624 }
 625 
 626 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 627 {
 628         union nvme_result res = {};
 629 
 630         nvme_end_request(rq, cpu_to_le16(status << 1), res);
 631 }
 632 
 633 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 634                               unsigned int *offset, size_t *len)
 635 {
 636         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 637         struct nvme_tcp_request *req;
 638         struct request *rq;
 639 
 640         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 641         if (!rq) {
 642                 dev_err(queue->ctrl->ctrl.device,
 643                         "queue %d tag %#x not found\n",
 644                         nvme_tcp_queue_id(queue), pdu->command_id);
 645                 return -ENOENT;
 646         }
 647         req = blk_mq_rq_to_pdu(rq);
 648 
 649         while (true) {
 650                 int recv_len, ret;
 651 
 652                 recv_len = min_t(size_t, *len, queue->data_remaining);
 653                 if (!recv_len)
 654                         break;
 655 
 656                 if (!iov_iter_count(&req->iter)) {
 657                         req->curr_bio = req->curr_bio->bi_next;
 658 
 659                         /*
  660                          * If we don't have any bios it means that the controller
  661                          * sent more data than we requested, hence error.
 662                          */
 663                         if (!req->curr_bio) {
 664                                 dev_err(queue->ctrl->ctrl.device,
  665                                         "queue %d no space in request %#x\n",
 666                                         nvme_tcp_queue_id(queue), rq->tag);
 667                                 nvme_tcp_init_recv_ctx(queue);
 668                                 return -EIO;
 669                         }
 670                         nvme_tcp_init_iter(req, READ);
 671                 }
 672 
 673                 /* we can read only from what is left in this bio */
 674                 recv_len = min_t(size_t, recv_len,
 675                                 iov_iter_count(&req->iter));
 676 
 677                 if (queue->data_digest)
 678                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
 679                                 &req->iter, recv_len, queue->rcv_hash);
 680                 else
 681                         ret = skb_copy_datagram_iter(skb, *offset,
 682                                         &req->iter, recv_len);
 683                 if (ret) {
 684                         dev_err(queue->ctrl->ctrl.device,
  685                                 "queue %d failed to copy request %#x data\n",
 686                                 nvme_tcp_queue_id(queue), rq->tag);
 687                         return ret;
 688                 }
 689 
 690                 *len -= recv_len;
 691                 *offset += recv_len;
 692                 queue->data_remaining -= recv_len;
 693         }
 694 
 695         if (!queue->data_remaining) {
 696                 if (queue->data_digest) {
 697                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 698                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 699                 } else {
 700                         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 701                                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 702                                 queue->nr_cqe++;
 703                         }
 704                         nvme_tcp_init_recv_ctx(queue);
 705                 }
 706         }
 707 
 708         return 0;
 709 }
 710 
 711 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 712                 struct sk_buff *skb, unsigned int *offset, size_t *len)
 713 {
 714         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 715         char *ddgst = (char *)&queue->recv_ddgst;
 716         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
 717         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
 718         int ret;
 719 
 720         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
 721         if (unlikely(ret))
 722                 return ret;
 723 
 724         queue->ddgst_remaining -= recv_len;
 725         *offset += recv_len;
 726         *len -= recv_len;
 727         if (queue->ddgst_remaining)
 728                 return 0;
 729 
 730         if (queue->recv_ddgst != queue->exp_ddgst) {
 731                 dev_err(queue->ctrl->ctrl.device,
 732                         "data digest error: recv %#x expected %#x\n",
 733                         le32_to_cpu(queue->recv_ddgst),
 734                         le32_to_cpu(queue->exp_ddgst));
 735                 return -EIO;
 736         }
 737 
 738         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 739                 struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
 740                                                 pdu->command_id);
 741 
 742                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 743                 queue->nr_cqe++;
 744         }
 745 
 746         nvme_tcp_init_recv_ctx(queue);
 747         return 0;
 748 }
 749 
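      /*
       * ->read_sock() callback (invoked from nvme_tcp_try_recv): run the
       * receive state machine (PDU header, data, data digest) over the
       * skb until the bytes are consumed; on error, disable further
       * reads and trigger error recovery.
       */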
 750 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 751                              unsigned int offset, size_t len)
 752 {
 753         struct nvme_tcp_queue *queue = desc->arg.data;
 754         size_t consumed = len;
 755         int result;
 756 
 757         while (len) {
 758                 switch (nvme_tcp_recv_state(queue)) {
 759                 case NVME_TCP_RECV_PDU:
 760                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
 761                         break;
 762                 case NVME_TCP_RECV_DATA:
 763                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
 764                         break;
 765                 case NVME_TCP_RECV_DDGST:
 766                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
 767                         break;
 768                 default:
 769                         result = -EFAULT;
 770                 }
 771                 if (result) {
 772                         dev_err(queue->ctrl->ctrl.device,
  773                                 "receive failed: %d\n", result);
 774                         queue->rd_enabled = false;
 775                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 776                         return result;
 777                 }
 778         }
 779 
 780         return consumed;
 781 }
 782 
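      /*
       * Socket callbacks installed in nvme_tcp_alloc_queue(): kick the
       * per-queue io_work when data arrives or write space opens up,
       * and start error recovery when the socket moves to a closing
       * state.
       */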
 783 static void nvme_tcp_data_ready(struct sock *sk)
 784 {
 785         struct nvme_tcp_queue *queue;
 786 
 787         read_lock(&sk->sk_callback_lock);
 788         queue = sk->sk_user_data;
 789         if (likely(queue && queue->rd_enabled))
 790                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 791         read_unlock(&sk->sk_callback_lock);
 792 }
 793 
 794 static void nvme_tcp_write_space(struct sock *sk)
 795 {
 796         struct nvme_tcp_queue *queue;
 797 
 798         read_lock_bh(&sk->sk_callback_lock);
 799         queue = sk->sk_user_data;
 800         if (likely(queue && sk_stream_is_writeable(sk))) {
 801                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 802                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 803         }
 804         read_unlock_bh(&sk->sk_callback_lock);
 805 }
 806 
 807 static void nvme_tcp_state_change(struct sock *sk)
 808 {
 809         struct nvme_tcp_queue *queue;
 810 
 811         read_lock(&sk->sk_callback_lock);
 812         queue = sk->sk_user_data;
 813         if (!queue)
 814                 goto done;
 815 
 816         switch (sk->sk_state) {
 817         case TCP_CLOSE:
 818         case TCP_CLOSE_WAIT:
 819         case TCP_LAST_ACK:
 820         case TCP_FIN_WAIT1:
 821         case TCP_FIN_WAIT2:
 822                 /* fallthrough */
 823                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 824                 break;
 825         default:
 826                 dev_info(queue->ctrl->ctrl.device,
 827                         "queue %d socket state %d\n",
 828                         nvme_tcp_queue_id(queue), sk->sk_state);
 829         }
 830 
 831         queue->state_change(sk);
 832 done:
 833         read_unlock(&sk->sk_callback_lock);
 834 }
 835 
 836 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 837 {
 838         queue->request = NULL;
 839 }
 840 
 841 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 842 {
 843         nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
 844 }
 845 
 846 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 847 {
 848         struct nvme_tcp_queue *queue = req->queue;
 849 
 850         while (true) {
 851                 struct page *page = nvme_tcp_req_cur_page(req);
 852                 size_t offset = nvme_tcp_req_cur_offset(req);
 853                 size_t len = nvme_tcp_req_cur_length(req);
 854                 bool last = nvme_tcp_pdu_last_send(req, len);
 855                 int ret, flags = MSG_DONTWAIT;
 856 
 857                 if (last && !queue->data_digest)
 858                         flags |= MSG_EOR;
 859                 else
 860                         flags |= MSG_MORE;
 861 
 862                 /* can't zcopy slab pages */
 863                 if (unlikely(PageSlab(page))) {
 864                         ret = sock_no_sendpage(queue->sock, page, offset, len,
 865                                         flags);
 866                 } else {
 867                         ret = kernel_sendpage(queue->sock, page, offset, len,
 868                                         flags);
 869                 }
 870                 if (ret <= 0)
 871                         return ret;
 872 
 873                 nvme_tcp_advance_req(req, ret);
 874                 if (queue->data_digest)
 875                         nvme_tcp_ddgst_update(queue->snd_hash, page,
 876                                         offset, ret);
 877 
  878                 /* fully successful last write */
 879                 if (last && ret == len) {
 880                         if (queue->data_digest) {
 881                                 nvme_tcp_ddgst_final(queue->snd_hash,
 882                                         &req->ddgst);
 883                                 req->state = NVME_TCP_SEND_DDGST;
 884                                 req->offset = 0;
 885                         } else {
 886                                 nvme_tcp_done_send_req(queue);
 887                         }
 888                         return 1;
 889                 }
 890         }
 891         return -EAGAIN;
 892 }
 893 
 894 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 895 {
 896         struct nvme_tcp_queue *queue = req->queue;
 897         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 898         bool inline_data = nvme_tcp_has_inline_data(req);
 899         int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
 900         u8 hdgst = nvme_tcp_hdgst_len(queue);
 901         int len = sizeof(*pdu) + hdgst - req->offset;
 902         int ret;
 903 
 904         if (queue->hdr_digest && !req->offset)
 905                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 906 
 907         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
  908                         offset_in_page(pdu) + req->offset, len, flags);
 909         if (unlikely(ret <= 0))
 910                 return ret;
 911 
 912         len -= ret;
 913         if (!len) {
 914                 if (inline_data) {
 915                         req->state = NVME_TCP_SEND_DATA;
 916                         if (queue->data_digest)
 917                                 crypto_ahash_init(queue->snd_hash);
 918                         nvme_tcp_init_iter(req, WRITE);
 919                 } else {
 920                         nvme_tcp_done_send_req(queue);
 921                 }
 922                 return 1;
 923         }
 924         req->offset += ret;
 925 
 926         return -EAGAIN;
 927 }
 928 
 929 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 930 {
 931         struct nvme_tcp_queue *queue = req->queue;
 932         struct nvme_tcp_data_pdu *pdu = req->pdu;
 933         u8 hdgst = nvme_tcp_hdgst_len(queue);
 934         int len = sizeof(*pdu) - req->offset + hdgst;
 935         int ret;
 936 
 937         if (queue->hdr_digest && !req->offset)
 938                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 939 
 940         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
 941                         offset_in_page(pdu) + req->offset, len,
 942                         MSG_DONTWAIT | MSG_MORE);
 943         if (unlikely(ret <= 0))
 944                 return ret;
 945 
 946         len -= ret;
 947         if (!len) {
 948                 req->state = NVME_TCP_SEND_DATA;
 949                 if (queue->data_digest)
 950                         crypto_ahash_init(queue->snd_hash);
 951                 if (!req->data_sent)
 952                         nvme_tcp_init_iter(req, WRITE);
 953                 return 1;
 954         }
 955         req->offset += ret;
 956 
 957         return -EAGAIN;
 958 }
 959 
 960 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
 961 {
 962         struct nvme_tcp_queue *queue = req->queue;
 963         int ret;
 964         struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
 965         struct kvec iov = {
 966                 .iov_base = &req->ddgst + req->offset,
 967                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
 968         };
 969 
 970         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
 971         if (unlikely(ret <= 0))
 972                 return ret;
 973 
 974         if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
 975                 nvme_tcp_done_send_req(queue);
 976                 return 1;
 977         }
 978 
 979         req->offset += ret;
 980         return -EAGAIN;
 981 }
 982 
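      /*
       * Advance the current request through the send state machine:
       * command PDU, optional H2CData PDU, data, and finally the data
       * digest. Returns a positive value when a send step completed,
       * 0 when there is nothing to send or the socket would block,
       * or a negative error.
       */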
 983 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
 984 {
 985         struct nvme_tcp_request *req;
 986         int ret = 1;
 987 
 988         if (!queue->request) {
 989                 queue->request = nvme_tcp_fetch_request(queue);
 990                 if (!queue->request)
 991                         return 0;
 992         }
 993         req = queue->request;
 994 
 995         if (req->state == NVME_TCP_SEND_CMD_PDU) {
 996                 ret = nvme_tcp_try_send_cmd_pdu(req);
 997                 if (ret <= 0)
 998                         goto done;
 999                 if (!nvme_tcp_has_inline_data(req))
1000                         return ret;
1001         }
1002 
1003         if (req->state == NVME_TCP_SEND_H2C_PDU) {
1004                 ret = nvme_tcp_try_send_data_pdu(req);
1005                 if (ret <= 0)
1006                         goto done;
1007         }
1008 
1009         if (req->state == NVME_TCP_SEND_DATA) {
1010                 ret = nvme_tcp_try_send_data(req);
1011                 if (ret <= 0)
1012                         goto done;
1013         }
1014 
1015         if (req->state == NVME_TCP_SEND_DDGST)
1016                 ret = nvme_tcp_try_send_ddgst(req);
1017 done:
1018         if (ret == -EAGAIN)
1019                 ret = 0;
1020         return ret;
1021 }
1022 
1023 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1024 {
1025         struct socket *sock = queue->sock;
1026         struct sock *sk = sock->sk;
1027         read_descriptor_t rd_desc;
1028         int consumed;
1029 
1030         rd_desc.arg.data = queue;
1031         rd_desc.count = 1;
1032         lock_sock(sk);
1033         queue->nr_cqe = 0;
1034         consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1035         release_sock(sk);
1036         return consumed;
1037 }
1038 
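      /*
       * Per-queue I/O work: alternate between sending queued requests
       * and receiving from the socket for roughly one millisecond,
       * returning early if neither direction made progress and
       * re-arming itself if the quota expired with work still pending.
       */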
1039 static void nvme_tcp_io_work(struct work_struct *w)
1040 {
1041         struct nvme_tcp_queue *queue =
1042                 container_of(w, struct nvme_tcp_queue, io_work);
1043         unsigned long deadline = jiffies + msecs_to_jiffies(1);
1044 
1045         do {
1046                 bool pending = false;
1047                 int result;
1048 
1049                 result = nvme_tcp_try_send(queue);
1050                 if (result > 0) {
1051                         pending = true;
1052                 } else if (unlikely(result < 0)) {
1053                         dev_err(queue->ctrl->ctrl.device,
1054                                 "failed to send request %d\n", result);
1055 
1056                         /*
1057                          * Fail the request unless peer closed the connection,
1058                          * in which case error recovery flow will complete all.
1059                          */
1060                         if ((result != -EPIPE) && (result != -ECONNRESET))
1061                                 nvme_tcp_fail_request(queue->request);
1062                         nvme_tcp_done_send_req(queue);
1063                         return;
1064                 }
1065 
1066                 result = nvme_tcp_try_recv(queue);
1067                 if (result > 0)
1068                         pending = true;
1069 
1070                 if (!pending)
1071                         return;
1072 
1073         } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1074 
1075         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1076 }
1077 
1078 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1079 {
1080         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1081 
1082         ahash_request_free(queue->rcv_hash);
1083         ahash_request_free(queue->snd_hash);
1084         crypto_free_ahash(tfm);
1085 }
1086 
1087 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1088 {
1089         struct crypto_ahash *tfm;
1090 
1091         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1092         if (IS_ERR(tfm))
1093                 return PTR_ERR(tfm);
1094 
1095         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1096         if (!queue->snd_hash)
1097                 goto free_tfm;
1098         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1099 
1100         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1101         if (!queue->rcv_hash)
1102                 goto free_snd_hash;
1103         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1104 
1105         return 0;
1106 free_snd_hash:
1107         ahash_request_free(queue->snd_hash);
1108 free_tfm:
1109         crypto_free_ahash(tfm);
1110         return -ENOMEM;
1111 }
1112 
1113 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1114 {
1115         struct nvme_tcp_request *async = &ctrl->async_req;
1116 
1117         page_frag_free(async->pdu);
1118 }
1119 
1120 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1121 {
1122         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1123         struct nvme_tcp_request *async = &ctrl->async_req;
1124         u8 hdgst = nvme_tcp_hdgst_len(queue);
1125 
1126         async->pdu = page_frag_alloc(&queue->pf_cache,
1127                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1128                 GFP_KERNEL | __GFP_ZERO);
1129         if (!async->pdu)
1130                 return -ENOMEM;
1131 
1132         async->queue = &ctrl->queues[0];
1133         return 0;
1134 }
1135 
1136 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1137 {
1138         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1139         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1140 
1141         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1142                 return;
1143 
1144         if (queue->hdr_digest || queue->data_digest)
1145                 nvme_tcp_free_crypto(queue);
1146 
1147         sock_release(queue->sock);
1148         kfree(queue->pdu);
1149 }
1150 
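      /*
       * Perform the NVMe/TCP initialize-connection handshake: send an
       * ICReq PDU and validate the controller's ICResp (PDU type and
       * length, PFV, digest negotiation matching the host settings,
       * and a zero CPDA).
       */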
1151 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1152 {
1153         struct nvme_tcp_icreq_pdu *icreq;
1154         struct nvme_tcp_icresp_pdu *icresp;
1155         struct msghdr msg = {};
1156         struct kvec iov;
1157         bool ctrl_hdgst, ctrl_ddgst;
1158         int ret;
1159 
1160         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1161         if (!icreq)
1162                 return -ENOMEM;
1163 
1164         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1165         if (!icresp) {
1166                 ret = -ENOMEM;
1167                 goto free_icreq;
1168         }
1169 
1170         icreq->hdr.type = nvme_tcp_icreq;
1171         icreq->hdr.hlen = sizeof(*icreq);
1172         icreq->hdr.pdo = 0;
1173         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1174         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1175         icreq->maxr2t = 0; /* single inflight r2t supported */
1176         icreq->hpda = 0; /* no alignment constraint */
1177         if (queue->hdr_digest)
1178                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1179         if (queue->data_digest)
1180                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1181 
1182         iov.iov_base = icreq;
1183         iov.iov_len = sizeof(*icreq);
1184         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1185         if (ret < 0)
1186                 goto free_icresp;
1187 
1188         memset(&msg, 0, sizeof(msg));
1189         iov.iov_base = icresp;
1190         iov.iov_len = sizeof(*icresp);
1191         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1192                         iov.iov_len, msg.msg_flags);
1193         if (ret < 0)
1194                 goto free_icresp;
1195 
1196         ret = -EINVAL;
1197         if (icresp->hdr.type != nvme_tcp_icresp) {
1198                 pr_err("queue %d: bad type returned %d\n",
1199                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1200                 goto free_icresp;
1201         }
1202 
1203         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1204                 pr_err("queue %d: bad pdu length returned %d\n",
1205                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1206                 goto free_icresp;
1207         }
1208 
1209         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1210                 pr_err("queue %d: bad pfv returned %d\n",
1211                         nvme_tcp_queue_id(queue), icresp->pfv);
1212                 goto free_icresp;
1213         }
1214 
1215         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1216         if ((queue->data_digest && !ctrl_ddgst) ||
1217             (!queue->data_digest && ctrl_ddgst)) {
1218                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1219                         nvme_tcp_queue_id(queue),
1220                         queue->data_digest ? "enabled" : "disabled",
1221                         ctrl_ddgst ? "enabled" : "disabled");
1222                 goto free_icresp;
1223         }
1224 
1225         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1226         if ((queue->hdr_digest && !ctrl_hdgst) ||
1227             (!queue->hdr_digest && ctrl_hdgst)) {
1228                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1229                         nvme_tcp_queue_id(queue),
1230                         queue->hdr_digest ? "enabled" : "disabled",
1231                         ctrl_hdgst ? "enabled" : "disabled");
1232                 goto free_icresp;
1233         }
1234 
1235         if (icresp->cpda != 0) {
1236                 pr_err("queue %d: unsupported cpda returned %d\n",
1237                         nvme_tcp_queue_id(queue), icresp->cpda);
1238                 goto free_icresp;
1239         }
1240 
1241         ret = 0;
1242 free_icresp:
1243         kfree(icresp);
1244 free_icreq:
1245         kfree(icreq);
1246         return ret;
1247 }
1248 
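      /*
       * Allocate and connect a single queue: create the TCP socket,
       * apply socket options (TCP_SYNCNT, TCP_NODELAY, SO_LINGER and
       * optional IP_TOS), optionally bind the source address, connect
       * to the controller, run the ICReq/ICResp handshake and install
       * the socket callbacks.
       */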
1249 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1250                 int qid, size_t queue_size)
1251 {
1252         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1253         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1254         struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1255         int ret, opt, rcv_pdu_size, n;
1256 
1257         queue->ctrl = ctrl;
1258         INIT_LIST_HEAD(&queue->send_list);
1259         spin_lock_init(&queue->lock);
1260         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1261         queue->queue_size = queue_size;
1262 
1263         if (qid > 0)
1264                 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1265         else
1266                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1267                                                 NVME_TCP_ADMIN_CCSZ;
1268 
1269         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1270                         IPPROTO_TCP, &queue->sock);
1271         if (ret) {
1272                 dev_err(nctrl->device,
1273                         "failed to create socket: %d\n", ret);
1274                 return ret;
1275         }
1276 
1277         /* Single syn retry */
1278         opt = 1;
1279         ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
1280                         (char *)&opt, sizeof(opt));
1281         if (ret) {
1282                 dev_err(nctrl->device,
1283                         "failed to set TCP_SYNCNT sock opt %d\n", ret);
1284                 goto err_sock;
1285         }
1286 
1287         /* Set TCP no delay */
1288         opt = 1;
1289         ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
1290                         TCP_NODELAY, (char *)&opt, sizeof(opt));
1291         if (ret) {
1292                 dev_err(nctrl->device,
1293                         "failed to set TCP_NODELAY sock opt %d\n", ret);
1294                 goto err_sock;
1295         }
1296 
1297         /*
1298          * Cleanup whatever is sitting in the TCP transmit queue on socket
1299          * close. This is done to prevent stale data from being sent should
1300          * the network connection be restored before TCP times out.
1301          */
1302         ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
1303                         (char *)&sol, sizeof(sol));
1304         if (ret) {
1305                 dev_err(nctrl->device,
1306                         "failed to set SO_LINGER sock opt %d\n", ret);
1307                 goto err_sock;
1308         }
1309 
1310         /* Set socket type of service */
1311         if (nctrl->opts->tos >= 0) {
1312                 opt = nctrl->opts->tos;
1313                 ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
1314                                 (char *)&opt, sizeof(opt));
1315                 if (ret) {
1316                         dev_err(nctrl->device,
1317                                 "failed to set IP_TOS sock opt %d\n", ret);
1318                         goto err_sock;
1319                 }
1320         }
1321 
1322         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1323         if (!qid)
1324                 n = 0;
1325         else
1326                 n = (qid - 1) % num_online_cpus();
1327         queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1328         queue->request = NULL;
1329         queue->data_remaining = 0;
1330         queue->ddgst_remaining = 0;
1331         queue->pdu_remaining = 0;
1332         queue->pdu_offset = 0;
1333         sk_set_memalloc(queue->sock->sk);
1334 
1335         if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1336                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1337                         sizeof(ctrl->src_addr));
1338                 if (ret) {
1339                         dev_err(nctrl->device,
1340                                 "failed to bind queue %d socket %d\n",
1341                                 qid, ret);
1342                         goto err_sock;
1343                 }
1344         }
1345 
1346         queue->hdr_digest = nctrl->opts->hdr_digest;
1347         queue->data_digest = nctrl->opts->data_digest;
1348         if (queue->hdr_digest || queue->data_digest) {
1349                 ret = nvme_tcp_alloc_crypto(queue);
1350                 if (ret) {
1351                         dev_err(nctrl->device,
1352                                 "failed to allocate queue %d crypto\n", qid);
1353                         goto err_sock;
1354                 }
1355         }
1356 
1357         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1358                         nvme_tcp_hdgst_len(queue);
1359         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1360         if (!queue->pdu) {
1361                 ret = -ENOMEM;
1362                 goto err_crypto;
1363         }
1364 
1365         dev_dbg(nctrl->device, "connecting queue %d\n",
1366                         nvme_tcp_queue_id(queue));
1367 
1368         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1369                 sizeof(ctrl->addr), 0);
1370         if (ret) {
1371                 dev_err(nctrl->device,
1372                         "failed to connect socket: %d\n", ret);
1373                 goto err_rcv_pdu;
1374         }
1375 
1376         ret = nvme_tcp_init_connection(queue);
1377         if (ret)
1378                 goto err_init_connect;
1379 
1380         queue->rd_enabled = true;
1381         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1382         nvme_tcp_init_recv_ctx(queue);
1383 
1384         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1385         queue->sock->sk->sk_user_data = queue;
1386         queue->state_change = queue->sock->sk->sk_state_change;
1387         queue->data_ready = queue->sock->sk->sk_data_ready;
1388         queue->write_space = queue->sock->sk->sk_write_space;
1389         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1390         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1391         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1392 #ifdef CONFIG_NET_RX_BUSY_POLL
1393         queue->sock->sk->sk_ll_usec = 1;
1394 #endif
1395         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1396 
1397         return 0;
1398 
1399 err_init_connect:
1400         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1401 err_rcv_pdu:
1402         kfree(queue->pdu);
1403 err_crypto:
1404         if (queue->hdr_digest || queue->data_digest)
1405                 nvme_tcp_free_crypto(queue);
1406 err_sock:
1407         sock_release(queue->sock);
1408         queue->sock = NULL;
1409         return ret;
1410 }
1411 
1412 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1413 {
1414         struct socket *sock = queue->sock;
1415 
1416         write_lock_bh(&sock->sk->sk_callback_lock);
1417         sock->sk->sk_user_data  = NULL;
1418         sock->sk->sk_data_ready = queue->data_ready;
1419         sock->sk->sk_state_change = queue->state_change;
1420         sock->sk->sk_write_space  = queue->write_space;
1421         write_unlock_bh(&sock->sk->sk_callback_lock);
1422 }
1423 
1424 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1425 {
1426         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1427         nvme_tcp_restore_sock_calls(queue);
1428         cancel_work_sync(&queue->io_work);
1429 }
1430 
1431 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1432 {
1433         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1434         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1435 
1436         if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1437                 return;
1438 
1439         __nvme_tcp_stop_queue(queue);
1440 }
1441 
1442 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1443 {
1444         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1445         int ret;
1446 
1447         if (idx)
1448                 ret = nvmf_connect_io_queue(nctrl, idx, false);
1449         else
1450                 ret = nvmf_connect_admin_queue(nctrl);
1451 
1452         if (!ret) {
1453                 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1454         } else {
1455                 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1456                         __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1457                 dev_err(nctrl->device,
1458                         "failed to connect queue: %d ret=%d\n", idx, ret);
1459         }
1460         return ret;
1461 }
1462 
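     /*
      * Tag set setup: the admin set uses NVME_AQ_MQ_TAG_DEPTH with two
      * reserved tags (fabrics connect + keep-alive) on a single hw queue;
      * the I/O set uses sqsize + 1 with one reserved tag (fabrics connect)
      * across queue_count - 1 hw queues.  cmd_size reserves a per-request
      * struct nvme_tcp_request behind each block layer request.
      */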
1463 static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1464                 bool admin)
1465 {
1466         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1467         struct blk_mq_tag_set *set;
1468         int ret;
1469 
1470         if (admin) {
1471                 set = &ctrl->admin_tag_set;
1472                 memset(set, 0, sizeof(*set));
1473                 set->ops = &nvme_tcp_admin_mq_ops;
1474                 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1475                 set->reserved_tags = 2; /* connect + keep-alive */
1476                 set->numa_node = NUMA_NO_NODE;
1477                 set->cmd_size = sizeof(struct nvme_tcp_request);
1478                 set->driver_data = ctrl;
1479                 set->nr_hw_queues = 1;
1480                 set->timeout = ADMIN_TIMEOUT;
1481         } else {
1482                 set = &ctrl->tag_set;
1483                 memset(set, 0, sizeof(*set));
1484                 set->ops = &nvme_tcp_mq_ops;
1485                 set->queue_depth = nctrl->sqsize + 1;
1486                 set->reserved_tags = 1; /* fabric connect */
1487                 set->numa_node = NUMA_NO_NODE;
1488                 set->flags = BLK_MQ_F_SHOULD_MERGE;
1489                 set->cmd_size = sizeof(struct nvme_tcp_request);
1490                 set->driver_data = ctrl;
1491                 set->nr_hw_queues = nctrl->queue_count - 1;
1492                 set->timeout = NVME_IO_TIMEOUT;
1493                 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1494         }
1495 
1496         ret = blk_mq_alloc_tag_set(set);
1497         if (ret)
1498                 return ERR_PTR(ret);
1499 
1500         return set;
1501 }
1502 
1503 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1504 {
1505         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1506                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1507                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1508         }
1509 
1510         nvme_tcp_free_queue(ctrl, 0);
1511 }
1512 
1513 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1514 {
1515         int i;
1516 
1517         for (i = 1; i < ctrl->queue_count; i++)
1518                 nvme_tcp_free_queue(ctrl, i);
1519 }
1520 
1521 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1522 {
1523         int i;
1524 
1525         for (i = 1; i < ctrl->queue_count; i++)
1526                 nvme_tcp_stop_queue(ctrl, i);
1527 }
1528 
1529 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1530 {
1531         int i, ret = 0;
1532 
1533         for (i = 1; i < ctrl->queue_count; i++) {
1534                 ret = nvme_tcp_start_queue(ctrl, i);
1535                 if (ret)
1536                         goto out_stop_queues;
1537         }
1538 
1539         return 0;
1540 
1541 out_stop_queues:
1542         for (i--; i >= 1; i--)
1543                 nvme_tcp_stop_queue(ctrl, i);
1544         return ret;
1545 }
1546 
1547 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1548 {
1549         int ret;
1550 
1551         ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1552         if (ret)
1553                 return ret;
1554 
1555         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1556         if (ret)
1557                 goto out_free_queue;
1558 
1559         return 0;
1560 
1561 out_free_queue:
1562         nvme_tcp_free_queue(ctrl, 0);
1563         return ret;
1564 }
1565 
1566 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1567 {
1568         int i, ret;
1569 
1570         for (i = 1; i < ctrl->queue_count; i++) {
1571                 ret = nvme_tcp_alloc_queue(ctrl, i,
1572                                 ctrl->sqsize + 1);
1573                 if (ret)
1574                         goto out_free_queues;
1575         }
1576 
1577         return 0;
1578 
1579 out_free_queues:
1580         for (i--; i >= 1; i--)
1581                 nvme_tcp_free_queue(ctrl, i);
1582 
1583         return ret;
1584 }
1585 
1586 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1587 {
1588         unsigned int nr_io_queues;
1589 
1590         nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1591         nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1592         nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1593 
1594         return nr_io_queues;
1595 }
1596 
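     /*
      * Distribute the granted queues among the hctx types.  For example,
      * assuming at least four online CPUs and a controller that grants the
      * full request: nr_io_queues=4, nr_write_queues=4, nr_poll_queues=2
      * makes nvme_tcp_nr_io_queues() ask for 10, and the split below
      * yields HCTX_TYPE_READ=4, HCTX_TYPE_DEFAULT=4 (writes),
      * HCTX_TYPE_POLL=2.
      */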
1597 static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1598                 unsigned int nr_io_queues)
1599 {
1600         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1601         struct nvmf_ctrl_options *opts = nctrl->opts;
1602 
1603         if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1604                 /*
1605                  * separate read/write queues
1606                  * hand out dedicated default queues only after we have
1607                  * sufficient read queues.
1608                  */
1609                 ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1610                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1611                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1612                         min(opts->nr_write_queues, nr_io_queues);
1613                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1614         } else {
1615                 /*
1616                  * shared read/write queues
1617                  * either no write queues were requested, or we don't have
1618                  * sufficient queue count to have dedicated default queues.
1619                  */
1620                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1621                         min(opts->nr_io_queues, nr_io_queues);
1622                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1623         }
1624 
1625         if (opts->nr_poll_queues && nr_io_queues) {
1626                 /* map dedicated poll queues only if we have queues left */
1627                 ctrl->io_queues[HCTX_TYPE_POLL] =
1628                         min(opts->nr_poll_queues, nr_io_queues);
1629         }
1630 }
1631 
1632 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1633 {
1634         unsigned int nr_io_queues;
1635         int ret;
1636 
1637         nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1638         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1639         if (ret)
1640                 return ret;
1641 
1642         ctrl->queue_count = nr_io_queues + 1;
1643         if (ctrl->queue_count < 2)
1644                 return 0;
1645 
1646         dev_info(ctrl->device,
1647                 "creating %d I/O queues.\n", nr_io_queues);
1648 
1649         nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1650 
1651         return __nvme_tcp_alloc_io_queues(ctrl);
1652 }
1653 
1654 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1655 {
1656         nvme_tcp_stop_io_queues(ctrl);
1657         if (remove) {
1658                 blk_cleanup_queue(ctrl->connect_q);
1659                 blk_mq_free_tag_set(ctrl->tagset);
1660         }
1661         nvme_tcp_free_io_queues(ctrl);
1662 }
1663 
1664 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1665 {
1666         int ret;
1667 
1668         ret = nvme_tcp_alloc_io_queues(ctrl);
1669         if (ret)
1670                 return ret;
1671 
1672         if (new) {
1673                 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1674                 if (IS_ERR(ctrl->tagset)) {
1675                         ret = PTR_ERR(ctrl->tagset);
1676                         goto out_free_io_queues;
1677                 }
1678 
1679                 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1680                 if (IS_ERR(ctrl->connect_q)) {
1681                         ret = PTR_ERR(ctrl->connect_q);
1682                         goto out_free_tag_set;
1683                 }
1684         } else {
1685                 blk_mq_update_nr_hw_queues(ctrl->tagset,
1686                         ctrl->queue_count - 1);
1687         }
1688 
1689         ret = nvme_tcp_start_io_queues(ctrl);
1690         if (ret)
1691                 goto out_cleanup_connect_q;
1692 
1693         return 0;
1694 
1695 out_cleanup_connect_q:
1696         if (new)
1697                 blk_cleanup_queue(ctrl->connect_q);
1698 out_free_tag_set:
1699         if (new)
1700                 blk_mq_free_tag_set(ctrl->tagset);
1701 out_free_io_queues:
1702         nvme_tcp_free_io_queues(ctrl);
1703         return ret;
1704 }
1705 
1706 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1707 {
1708         nvme_tcp_stop_queue(ctrl, 0);
1709         if (remove) {
1710                 blk_cleanup_queue(ctrl->admin_q);
1711                 blk_cleanup_queue(ctrl->fabrics_q);
1712                 blk_mq_free_tag_set(ctrl->admin_tagset);
1713         }
1714         nvme_tcp_free_admin_queue(ctrl);
1715 }
1716 
1717 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1718 {
1719         int error;
1720 
1721         error = nvme_tcp_alloc_admin_queue(ctrl);
1722         if (error)
1723                 return error;
1724 
1725         if (new) {
1726                 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1727                 if (IS_ERR(ctrl->admin_tagset)) {
1728                         error = PTR_ERR(ctrl->admin_tagset);
1729                         goto out_free_queue;
1730                 }
1731 
1732                 ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1733                 if (IS_ERR(ctrl->fabrics_q)) {
1734                         error = PTR_ERR(ctrl->fabrics_q);
1735                         goto out_free_tagset;
1736                 }
1737 
1738                 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1739                 if (IS_ERR(ctrl->admin_q)) {
1740                         error = PTR_ERR(ctrl->admin_q);
1741                         goto out_cleanup_fabrics_q;
1742                 }
1743         }
1744 
1745         error = nvme_tcp_start_queue(ctrl, 0);
1746         if (error)
1747                 goto out_cleanup_queue;
1748 
1749         error = nvme_enable_ctrl(ctrl);
1750         if (error)
1751                 goto out_stop_queue;
1752 
1753         blk_mq_unquiesce_queue(ctrl->admin_q);
1754 
1755         error = nvme_init_identify(ctrl);
1756         if (error)
1757                 goto out_stop_queue;
1758 
1759         return 0;
1760 
1761 out_stop_queue:
1762         nvme_tcp_stop_queue(ctrl, 0);
1763 out_cleanup_queue:
1764         if (new)
1765                 blk_cleanup_queue(ctrl->admin_q);
1766 out_cleanup_fabrics_q:
1767         if (new)
1768                 blk_cleanup_queue(ctrl->fabrics_q);
1769 out_free_tagset:
1770         if (new)
1771                 blk_mq_free_tag_set(ctrl->admin_tagset);
1772 out_free_queue:
1773         nvme_tcp_free_admin_queue(ctrl);
1774         return error;
1775 }
1776 
1777 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1778                 bool remove)
1779 {
1780         blk_mq_quiesce_queue(ctrl->admin_q);
1781         nvme_tcp_stop_queue(ctrl, 0);
1782         if (ctrl->admin_tagset) {
1783                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
1784                         nvme_cancel_request, ctrl);
1785                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
1786         }
1787         if (remove)
1788                 blk_mq_unquiesce_queue(ctrl->admin_q);
1789         nvme_tcp_destroy_admin_queue(ctrl, remove);
1790 }
1791 
1792 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1793                 bool remove)
1794 {
1795         if (ctrl->queue_count <= 1)
1796                 return;
1797         nvme_stop_queues(ctrl);
1798         nvme_tcp_stop_io_queues(ctrl);
1799         if (ctrl->tagset) {
1800                 blk_mq_tagset_busy_iter(ctrl->tagset,
1801                         nvme_cancel_request, ctrl);
1802                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
1803         }
1804         if (remove)
1805                 nvme_start_queues(ctrl);
1806         nvme_tcp_destroy_io_queues(ctrl, remove);
1807 }
1808 
1809 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1810 {
1811         /* If we are resetting/deleting then do nothing */
1812         if (ctrl->state != NVME_CTRL_CONNECTING) {
1813                 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1814                         ctrl->state == NVME_CTRL_LIVE);
1815                 return;
1816         }
1817 
1818         if (nvmf_should_reconnect(ctrl)) {
1819                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1820                         ctrl->opts->reconnect_delay);
1821                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1822                                 ctrl->opts->reconnect_delay * HZ);
1823         } else {
1824                 dev_info(ctrl->device, "Removing controller...\n");
1825                 nvme_delete_ctrl(ctrl);
1826         }
1827 }
1828 
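     /*
      * Bring the controller up (initial connect, reset or reconnect):
      * admin queue first, then validation of the negotiated parameters
      * (icdoff, queue sizes), then the I/O queues, and finally the
      * transition to LIVE.  'new' selects between allocating fresh tag
      * sets/queues and reusing the existing ones.
      */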
1829 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1830 {
1831         struct nvmf_ctrl_options *opts = ctrl->opts;
1832         int ret;
1833 
1834         ret = nvme_tcp_configure_admin_queue(ctrl, new);
1835         if (ret)
1836                 return ret;
1837 
1838         if (ctrl->icdoff) {
                     ret = -EOPNOTSUPP;
1839                 dev_err(ctrl->device, "icdoff is not supported!\n");
1840                 goto destroy_admin;
1841         }
1842 
1843         if (opts->queue_size > ctrl->sqsize + 1)
1844                 dev_warn(ctrl->device,
1845                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
1846                         opts->queue_size, ctrl->sqsize + 1);
1847 
1848         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1849                 dev_warn(ctrl->device,
1850                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
1851                         ctrl->sqsize + 1, ctrl->maxcmd);
1852                 ctrl->sqsize = ctrl->maxcmd - 1;
1853         }
1854 
1855         if (ctrl->queue_count > 1) {
1856                 ret = nvme_tcp_configure_io_queues(ctrl, new);
1857                 if (ret)
1858                         goto destroy_admin;
1859         }
1860 
1861         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1862                 /* state change failure is ok if we're in DELETING state */
1863                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1864                 ret = -EINVAL;
1865                 goto destroy_io;
1866         }
1867 
1868         nvme_start_ctrl(ctrl);
1869         return 0;
1870 
1871 destroy_io:
1872         if (ctrl->queue_count > 1)
1873                 nvme_tcp_destroy_io_queues(ctrl, new);
1874 destroy_admin:
1875         nvme_tcp_stop_queue(ctrl, 0);
1876         nvme_tcp_destroy_admin_queue(ctrl, new);
1877         return ret;
1878 }
1879 
1880 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1881 {
1882         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1883                         struct nvme_tcp_ctrl, connect_work);
1884         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1885 
1886         ++ctrl->nr_reconnects;
1887 
1888         if (nvme_tcp_setup_ctrl(ctrl, false))
1889                 goto requeue;
1890 
1891         dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
1892                         ctrl->nr_reconnects);
1893 
1894         ctrl->nr_reconnects = 0;
1895 
1896         return;
1897 
1898 requeue:
1899         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
1900                         ctrl->nr_reconnects);
1901         nvme_tcp_reconnect_or_remove(ctrl);
1902 }
1903 
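     /*
      * Error recovery path: stop keep-alive, tear down I/O and admin
      * queues (failing any in-flight requests), move to CONNECTING and
      * let nvme_tcp_reconnect_or_remove() schedule a delayed reconnect or
      * delete the controller.
      */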
1904 static void nvme_tcp_error_recovery_work(struct work_struct *work)
1905 {
1906         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
1907                                 struct nvme_tcp_ctrl, err_work);
1908         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1909 
1910         nvme_stop_keep_alive(ctrl);
1911         nvme_tcp_teardown_io_queues(ctrl, false);
1912         /* unquiesce so that pending requests fail fast */
1913         nvme_start_queues(ctrl);
1914         nvme_tcp_teardown_admin_queue(ctrl, false);
1915         blk_mq_unquiesce_queue(ctrl->admin_q);
1916 
1917         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1918                 /* state change failure is ok if we're in DELETING state */
1919                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1920                 return;
1921         }
1922 
1923         nvme_tcp_reconnect_or_remove(ctrl);
1924 }
1925 
1926 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
1927 {
1928         cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
1929         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
1930 
1931         nvme_tcp_teardown_io_queues(ctrl, shutdown);
1932         blk_mq_quiesce_queue(ctrl->admin_q);
1933         if (shutdown)
1934                 nvme_shutdown_ctrl(ctrl);
1935         else
1936                 nvme_disable_ctrl(ctrl);
1937         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
1938 }
1939 
1940 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
1941 {
1942         nvme_tcp_teardown_ctrl(ctrl, true);
1943 }
1944 
1945 static void nvme_reset_ctrl_work(struct work_struct *work)
1946 {
1947         struct nvme_ctrl *ctrl =
1948                 container_of(work, struct nvme_ctrl, reset_work);
1949 
1950         nvme_stop_ctrl(ctrl);
1951         nvme_tcp_teardown_ctrl(ctrl, false);
1952 
1953         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1954                 /* state change failure is ok if we're in DELETING state */
1955                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1956                 return;
1957         }
1958 
1959         if (nvme_tcp_setup_ctrl(ctrl, false))
1960                 goto out_fail;
1961 
1962         return;
1963 
1964 out_fail:
1965         ++ctrl->nr_reconnects;
1966         nvme_tcp_reconnect_or_remove(ctrl);
1967 }
1968 
1969 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
1970 {
1971         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1972 
1973         if (list_empty(&ctrl->list))
1974                 goto free_ctrl;
1975 
1976         mutex_lock(&nvme_tcp_ctrl_mutex);
1977         list_del(&ctrl->list);
1978         mutex_unlock(&nvme_tcp_ctrl_mutex);
1979 
1980         nvmf_free_options(nctrl->opts);
1981 free_ctrl:
1982         kfree(ctrl->queues);
1983         kfree(ctrl);
1984 }
1985 
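     /*
      * SGL helpers for the three data layouts NVMe/TCP uses: no data at
      * all, data carried inline in the command capsule (an offset
      * descriptor based at the controller's icdoff), or data kept on the
      * host and transferred via transport-specific data PDUs.
      */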
1986 static void nvme_tcp_set_sg_null(struct nvme_command *c)
1987 {
1988         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1989 
1990         sg->addr = 0;
1991         sg->length = 0;
1992         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1993                         NVME_SGL_FMT_TRANSPORT_A;
1994 }
1995 
1996 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
1997                 struct nvme_command *c, u32 data_len)
1998 {
1999         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2000 
2001         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2002         sg->length = cpu_to_le32(data_len);
2003         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2004 }
2005 
2006 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2007                 u32 data_len)
2008 {
2009         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2010 
2011         sg->addr = 0;
2012         sg->length = cpu_to_le32(data_len);
2013         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2014                         NVME_SGL_FMT_TRANSPORT_A;
2015 }
2016 
2017 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2018 {
2019         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2020         struct nvme_tcp_queue *queue = &ctrl->queues[0];
2021         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2022         struct nvme_command *cmd = &pdu->cmd;
2023         u8 hdgst = nvme_tcp_hdgst_len(queue);
2024 
2025         memset(pdu, 0, sizeof(*pdu));
2026         pdu->hdr.type = nvme_tcp_cmd;
2027         if (queue->hdr_digest)
2028                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2029         pdu->hdr.hlen = sizeof(*pdu);
2030         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2031 
2032         cmd->common.opcode = nvme_admin_async_event;
2033         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2034         cmd->common.flags |= NVME_CMD_SGL_METABUF;
2035         nvme_tcp_set_sg_null(cmd);
2036 
2037         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2038         ctrl->async_req.offset = 0;
2039         ctrl->async_req.curr_bio = NULL;
2040         ctrl->async_req.data_len = 0;
2041 
2042         nvme_tcp_queue_request(&ctrl->async_req);
2043 }
2044 
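     /*
      * Request timeout policy: if a reset is already scheduled just rearm
      * the timer; if the controller is not LIVE (connecting or already in
      * error recovery) tear the queues down synchronously and complete
      * the request; otherwise trigger error recovery and rearm, letting
      * the teardown fail the request.
      */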
2045 static enum blk_eh_timer_return
2046 nvme_tcp_timeout(struct request *rq, bool reserved)
2047 {
2048         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2049         struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
2050         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2051 
2052         /*
2053          * Restart the timer if a controller reset is already scheduled. Any
2054          * timed out commands would be handled before entering the connecting
2055          * state.
2056          */
2057         if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
2058                 return BLK_EH_RESET_TIMER;
2059 
2060         dev_warn(ctrl->ctrl.device,
2061                 "queue %d: timeout request %#x type %d\n",
2062                 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2063 
2064         if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
2065                 /*
2066                  * Tear down immediately if the controller times out while starting
2067                  * or if error recovery has already started. All outstanding
2068                  * requests are completed on shutdown, so we return BLK_EH_DONE.
2069                  */
2070                 flush_work(&ctrl->err_work);
2071                 nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
2072                 nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
2073                 return BLK_EH_DONE;
2074         }
2075 
2076         dev_warn(ctrl->ctrl.device, "starting error recovery\n");
2077         nvme_tcp_error_recovery(&ctrl->ctrl);
2078 
2079         return BLK_EH_RESET_TIMER;
2080 }
2081 
2082 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2083                         struct request *rq)
2084 {
2085         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2086         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2087         struct nvme_command *c = &pdu->cmd;
2088 
2089         c->common.flags |= NVME_CMD_SGL_METABUF;
2090 
2091         if (!blk_rq_nr_phys_segments(rq))
2092                 nvme_tcp_set_sg_null(c);
2093         else if (rq_data_dir(rq) == WRITE &&
2094             req->data_len <= nvme_tcp_inline_data_size(queue))
2095                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
2096         else
2097                 nvme_tcp_set_sg_host_data(c, req->data_len);
2098 
2099         return 0;
2100 }
2101 
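     /*
      * Build the command PDU.  plen is the total on-the-wire length:
      * header + header digest + inline data + data digest.  As a worked
      * example (assuming the usual 8-byte common header and 64-byte NVMe
      * command, i.e. hlen = 72): a 4KB inline write with both digests
      * enabled gets pdo = 72 + 4 = 76 and plen = 72 + 4 + 4096 + 4 = 4176.
      */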
2102 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2103                 struct request *rq)
2104 {
2105         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2106         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2107         struct nvme_tcp_queue *queue = req->queue;
2108         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2109         blk_status_t ret;
2110 
2111         ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
2112         if (ret)
2113                 return ret;
2114 
2115         req->state = NVME_TCP_SEND_CMD_PDU;
2116         req->offset = 0;
2117         req->data_sent = 0;
2118         req->pdu_len = 0;
2119         req->pdu_sent = 0;
2120         req->data_len = blk_rq_nr_phys_segments(rq) ?
2121                                 blk_rq_payload_bytes(rq) : 0;
2122         req->curr_bio = rq->bio;
2123 
2124         if (rq_data_dir(rq) == WRITE &&
2125             req->data_len <= nvme_tcp_inline_data_size(queue))
2126                 req->pdu_len = req->data_len;
2127         else if (req->curr_bio)
2128                 nvme_tcp_init_iter(req, READ);
2129 
2130         pdu->hdr.type = nvme_tcp_cmd;
2131         pdu->hdr.flags = 0;
2132         if (queue->hdr_digest)
2133                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2134         if (queue->data_digest && req->pdu_len) {
2135                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2136                 ddgst = nvme_tcp_ddgst_len(queue);
2137         }
2138         pdu->hdr.hlen = sizeof(*pdu);
2139         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2140         pdu->hdr.plen =
2141                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2142 
2143         ret = nvme_tcp_map_data(queue, rq);
2144         if (unlikely(ret)) {
2145                 nvme_cleanup_cmd(rq);
2146                 dev_err(queue->ctrl->ctrl.device,
2147                         "Failed to map data (%d)\n", ret);
2148                 return ret;
2149         }
2150 
2151         return 0;
2152 }
2153 
2154 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2155                 const struct blk_mq_queue_data *bd)
2156 {
2157         struct nvme_ns *ns = hctx->queue->queuedata;
2158         struct nvme_tcp_queue *queue = hctx->driver_data;
2159         struct request *rq = bd->rq;
2160         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2161         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2162         blk_status_t ret;
2163 
2164         if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2165                 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2166 
2167         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2168         if (unlikely(ret))
2169                 return ret;
2170 
2171         blk_mq_start_request(rq);
2172 
2173         nvme_tcp_queue_request(req);
2174 
2175         return BLK_STS_OK;
2176 }
2177 
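     /*
      * blk-mq map layout: with separate read/write queues the default
      * (write) queues occupy offsets [0, DEFAULT), the read queues start
      * at offset DEFAULT, and any poll queues follow at DEFAULT + READ;
      * with shared queues both the default and read maps point at the
      * same first DEFAULT queues.
      */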
2178 static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2179 {
2180         struct nvme_tcp_ctrl *ctrl = set->driver_data;
2181         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2182 
2183         if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2184                 /* separate read/write queues */
2185                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2186                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2187                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2188                 set->map[HCTX_TYPE_READ].nr_queues =
2189                         ctrl->io_queues[HCTX_TYPE_READ];
2190                 set->map[HCTX_TYPE_READ].queue_offset =
2191                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2192         } else {
2193                 /* shared read/write queues */
2194                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2195                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2196                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2197                 set->map[HCTX_TYPE_READ].nr_queues =
2198                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2199                 set->map[HCTX_TYPE_READ].queue_offset = 0;
2200         }
2201         blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2202         blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2203 
2204         if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2205                 /* map dedicated poll queues only if we have queues left */
2206                 set->map[HCTX_TYPE_POLL].nr_queues =
2207                                 ctrl->io_queues[HCTX_TYPE_POLL];
2208                 set->map[HCTX_TYPE_POLL].queue_offset =
2209                         ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2210                         ctrl->io_queues[HCTX_TYPE_READ];
2211                 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2212         }
2213 
2214         dev_info(ctrl->ctrl.device,
2215                 "mapped %d/%d/%d default/read/poll queues.\n",
2216                 ctrl->io_queues[HCTX_TYPE_DEFAULT],
2217                 ctrl->io_queues[HCTX_TYPE_READ],
2218                 ctrl->io_queues[HCTX_TYPE_POLL]);
2219 
2220         return 0;
2221 }
2222 
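     /*
      * Polled completion path: busy-poll the socket while its receive
      * queue is still empty, then run the receive path and report the
      * number of completions it accounted in nr_cqe.
      */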
2223 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2224 {
2225         struct nvme_tcp_queue *queue = hctx->driver_data;
2226         struct sock *sk = queue->sock->sk;
2227 
2228         if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2229                 sk_busy_loop(sk, true);
2230         nvme_tcp_try_recv(queue);
2231         return queue->nr_cqe;
2232 }
2233 
2234 static struct blk_mq_ops nvme_tcp_mq_ops = {
2235         .queue_rq       = nvme_tcp_queue_rq,
2236         .complete       = nvme_complete_rq,
2237         .init_request   = nvme_tcp_init_request,
2238         .exit_request   = nvme_tcp_exit_request,
2239         .init_hctx      = nvme_tcp_init_hctx,
2240         .timeout        = nvme_tcp_timeout,
2241         .map_queues     = nvme_tcp_map_queues,
2242         .poll           = nvme_tcp_poll,
2243 };
2244 
2245 static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2246         .queue_rq       = nvme_tcp_queue_rq,
2247         .complete       = nvme_complete_rq,
2248         .init_request   = nvme_tcp_init_request,
2249         .exit_request   = nvme_tcp_exit_request,
2250         .init_hctx      = nvme_tcp_init_admin_hctx,
2251         .timeout        = nvme_tcp_timeout,
2252 };
2253 
2254 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2255         .name                   = "tcp",
2256         .module                 = THIS_MODULE,
2257         .flags                  = NVME_F_FABRICS,
2258         .reg_read32             = nvmf_reg_read32,
2259         .reg_read64             = nvmf_reg_read64,
2260         .reg_write32            = nvmf_reg_write32,
2261         .free_ctrl              = nvme_tcp_free_ctrl,
2262         .submit_async_event     = nvme_tcp_submit_async_event,
2263         .delete_ctrl            = nvme_tcp_delete_ctrl,
2264         .get_address            = nvmf_get_address,
2265 };
2266 
2267 static bool
2268 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2269 {
2270         struct nvme_tcp_ctrl *ctrl;
2271         bool found = false;
2272 
2273         mutex_lock(&nvme_tcp_ctrl_mutex);
2274         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2275                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2276                 if (found)
2277                         break;
2278         }
2279         mutex_unlock(&nvme_tcp_ctrl_mutex);
2280 
2281         return found;
2282 }
2283 
2284 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2285                 struct nvmf_ctrl_options *opts)
2286 {
2287         struct nvme_tcp_ctrl *ctrl;
2288         int ret;
2289 
2290         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2291         if (!ctrl)
2292                 return ERR_PTR(-ENOMEM);
2293 
2294         INIT_LIST_HEAD(&ctrl->list);
2295         ctrl->ctrl.opts = opts;
2296         ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2297                                 opts->nr_poll_queues + 1;
2298         ctrl->ctrl.sqsize = opts->queue_size - 1;
2299         ctrl->ctrl.kato = opts->kato;
2300 
2301         INIT_DELAYED_WORK(&ctrl->connect_work,
2302                         nvme_tcp_reconnect_ctrl_work);
2303         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2304         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2305 
2306         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2307                 opts->trsvcid =
2308                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2309                 if (!opts->trsvcid) {
2310                         ret = -ENOMEM;
2311                         goto out_free_ctrl;
2312                 }
2313                 opts->mask |= NVMF_OPT_TRSVCID;
2314         }
2315 
2316         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2317                         opts->traddr, opts->trsvcid, &ctrl->addr);
2318         if (ret) {
2319                 pr_err("malformed address passed: %s:%s\n",
2320                         opts->traddr, opts->trsvcid);
2321                 goto out_free_ctrl;
2322         }
2323 
2324         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2325                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2326                         opts->host_traddr, NULL, &ctrl->src_addr);
2327                 if (ret) {
2328                         pr_err("malformed src address passed: %s\n",
2329                                opts->host_traddr);
2330                         goto out_free_ctrl;
2331                 }
2332         }
2333 
2334         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2335                 ret = -EALREADY;
2336                 goto out_free_ctrl;
2337         }
2338 
2339         ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2340                                 GFP_KERNEL);
2341         if (!ctrl->queues) {
2342                 ret = -ENOMEM;
2343                 goto out_free_ctrl;
2344         }
2345 
2346         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2347         if (ret)
2348                 goto out_kfree_queues;
2349 
2350         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2351                 WARN_ON_ONCE(1);
2352                 ret = -EINTR;
2353                 goto out_uninit_ctrl;
2354         }
2355 
2356         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2357         if (ret)
2358                 goto out_uninit_ctrl;
2359 
2360         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2361                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2362 
2363         nvme_get_ctrl(&ctrl->ctrl);
2364 
2365         mutex_lock(&nvme_tcp_ctrl_mutex);
2366         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2367         mutex_unlock(&nvme_tcp_ctrl_mutex);
2368 
2369         return &ctrl->ctrl;
2370 
2371 out_uninit_ctrl:
2372         nvme_uninit_ctrl(&ctrl->ctrl);
2373         nvme_put_ctrl(&ctrl->ctrl);
2374         if (ret > 0)
2375                 ret = -EIO;
2376         return ERR_PTR(ret);
2377 out_kfree_queues:
2378         kfree(ctrl->queues);
2379 out_free_ctrl:
2380         kfree(ctrl);
2381         return ERR_PTR(ret);
2382 }
2383 
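     /*
      * Userspace reaches this transport through the generic fabrics
      * interface, typically via nvme-cli; something like the following
      * (exact option spellings depend on the nvme-cli version, and the
      * address/NQN are placeholders):
      *
      *   nvme connect -t tcp -a 192.168.0.10 -s 4420 -n <subsystem-nqn> \
      *           --hdr-digest --data-digest --nr-poll-queues=2
      *
      * If no trsvcid is given, nvme_tcp_create_ctrl() falls back to the
      * discovery port, NVME_TCP_DISC_PORT (8009).
      */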
2384 static struct nvmf_transport_ops nvme_tcp_transport = {
2385         .name           = "tcp",
2386         .module         = THIS_MODULE,
2387         .required_opts  = NVMF_OPT_TRADDR,
2388         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2389                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2390                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2391                           NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2392                           NVMF_OPT_TOS,
2393         .create_ctrl    = nvme_tcp_create_ctrl,
2394 };
2395 
2396 static int __init nvme_tcp_init_module(void)
2397 {
             int ret;

2398         nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2399                         WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2400         if (!nvme_tcp_wq)
2401                 return -ENOMEM;
2402 
2403         ret = nvmf_register_transport(&nvme_tcp_transport);
             if (ret) {
                     destroy_workqueue(nvme_tcp_wq);
                     return ret;
             }

2404         return 0;
2405 }
2406 
2407 static void __exit nvme_tcp_cleanup_module(void)
2408 {
2409         struct nvme_tcp_ctrl *ctrl;
2410 
2411         nvmf_unregister_transport(&nvme_tcp_transport);
2412 
2413         mutex_lock(&nvme_tcp_ctrl_mutex);
2414         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2415                 nvme_delete_ctrl(&ctrl->ctrl);
2416         mutex_unlock(&nvme_tcp_ctrl_mutex);
2417         flush_workqueue(nvme_delete_wq);
2418 
2419         destroy_workqueue(nvme_tcp_wq);
2420 }
2421 
2422 module_init(nvme_tcp_init_module);
2423 module_exit(nvme_tcp_cleanup_module);
2424 
2425 MODULE_LICENSE("GPL v2");
