root/drivers/nvme/host/multipath.c


DEFINITIONS

This source file includes the following definitions.
  1. nvme_mpath_unfreeze
  2. nvme_mpath_wait_freeze
  3. nvme_mpath_start_freeze
  4. nvme_set_disk_name
  5. nvme_failover_req
  6. nvme_kick_requeue_lists
  7. nvme_mpath_clear_current_path
  8. nvme_mpath_clear_ctrl_paths
  9. nvme_path_is_disabled
  10. __nvme_find_path
  11. nvme_next_ns
  12. nvme_round_robin_path
  13. nvme_path_is_optimized
  14. nvme_find_path
  15. nvme_available_path
  16. nvme_ns_head_make_request
  17. nvme_requeue_work
  18. nvme_mpath_alloc_disk
  19. nvme_mpath_set_live
  20. nvme_parse_ana_log
  21. nvme_state_is_live
  22. nvme_update_ns_ana_state
  23. nvme_update_ana_state
  24. nvme_read_ana_log
  25. nvme_ana_work
  26. nvme_anatt_timeout
  27. nvme_mpath_stop
  28. nvme_subsys_iopolicy_show
  29. nvme_subsys_iopolicy_store
  30. ana_grpid_show
  31. ana_state_show
  32. nvme_set_ns_ana_state
  33. nvme_mpath_add_disk
  34. nvme_mpath_remove_disk
  35. nvme_mpath_init
  36. nvme_mpath_uninit

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2017-2018 Christoph Hellwig.
   4  */
   5 
   6 #include <linux/moduleparam.h>
   7 #include <trace/events/block.h>
   8 #include "nvme.h"
   9 
  10 static bool multipath = true;
  11 module_param(multipath, bool, 0444);
  12 MODULE_PARM_DESC(multipath,
  13         "turn on native support for multiple controllers per subsystem");
  14 
  15 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
  16 {
  17         struct nvme_ns_head *h;
  18 
  19         lockdep_assert_held(&subsys->lock);
  20         list_for_each_entry(h, &subsys->nsheads, entry)
  21                 if (h->disk)
  22                         blk_mq_unfreeze_queue(h->disk->queue);
  23 }
  24 
  25 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
  26 {
  27         struct nvme_ns_head *h;
  28 
  29         lockdep_assert_held(&subsys->lock);
  30         list_for_each_entry(h, &subsys->nsheads, entry)
  31                 if (h->disk)
  32                         blk_mq_freeze_queue_wait(h->disk->queue);
  33 }
  34 
  35 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
  36 {
  37         struct nvme_ns_head *h;
  38 
  39         lockdep_assert_held(&subsys->lock);
  40         list_for_each_entry(h, &subsys->nsheads, entry)
  41                 if (h->disk)
  42                         blk_freeze_queue_start(h->disk->queue);
  43 }
  44 
  45 /*
  46  * If multipathing is enabled we need to always use the subsystem instance
  47  * number for numbering our devices to avoid conflicts between subsystems that
  48  * have multiple controllers and thus use the multipath-aware subsystem node
  49  * and those that have a single controller and use the controller node
  50  * directly.
  51  */
  52 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
  53                         struct nvme_ctrl *ctrl, int *flags)
  54 {
  55         if (!multipath) {
  56                 sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
  57         } else if (ns->head->disk) {
  58                 sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
  59                                 ctrl->instance, ns->head->instance);
  60                 *flags = GENHD_FL_HIDDEN;
  61         } else {
  62                 sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
  63                                 ns->head->instance);
  64         }
  65 }
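
     /*
      * Illustrative example (instance numbers are arbitrary): with multipath
      * enabled a shared namespace is exposed as /dev/nvme0n1 through the
      * subsystem node, while each per-controller path to it becomes a hidden
      * /dev/nvme0c1n1 node; with multipath=0 the controller-local name
      * /dev/nvme1n1 is used instead.
      */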
  66 
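     /*
      * Failover handling for requests submitted through the multipath node:
      * steal the bios back onto the head's requeue list, then either clear
      * the current path (ANA or transient transport errors, kicking a fresh
      * ANA log read for the former) or reset the controller for unknown
      * errors, and finally schedule the requeue work to retry the I/O on
      * another path.
      */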
  67 void nvme_failover_req(struct request *req)
  68 {
  69         struct nvme_ns *ns = req->q->queuedata;
  70         u16 status = nvme_req(req)->status;
  71         unsigned long flags;
  72 
  73         spin_lock_irqsave(&ns->head->requeue_lock, flags);
  74         blk_steal_bios(&ns->head->requeue_list, req);
  75         spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
  76         blk_mq_end_request(req, 0);
  77 
  78         switch (status & 0x7ff) {
  79         case NVME_SC_ANA_TRANSITION:
  80         case NVME_SC_ANA_INACCESSIBLE:
  81         case NVME_SC_ANA_PERSISTENT_LOSS:
  82                 /*
  83                  * If we got back an ANA error we know the controller is alive,
   84                  * but not ready to serve this namespace.  The spec suggests
   85                  * we should update our general state here, but the admin and
   86                  * I/O queues are not serialized, so doing that is fundamentally
   87                  * racy.  Instead just clear the current path, mark the path as
   88                  * pending and kick off a re-read of the ANA
  89                  * log page ASAP.
  90                  */
  91                 nvme_mpath_clear_current_path(ns);
  92                 if (ns->ctrl->ana_log_buf) {
  93                         set_bit(NVME_NS_ANA_PENDING, &ns->flags);
  94                         queue_work(nvme_wq, &ns->ctrl->ana_work);
  95                 }
  96                 break;
  97         case NVME_SC_HOST_PATH_ERROR:
  98         case NVME_SC_HOST_ABORTED_CMD:
  99                 /*
 100                  * Temporary transport disruption in talking to the controller.
 101                  * Try to send on a new path.
 102                  */
 103                 nvme_mpath_clear_current_path(ns);
 104                 break;
 105         default:
 106                 /*
 107                  * Reset the controller for any non-ANA error as we don't know
 108                  * what caused the error.
 109                  */
 110                 nvme_reset_ctrl(ns->ctrl);
 111                 break;
 112         }
 113 
 114         kblockd_schedule_work(&ns->head->requeue_work);
 115 }
 116 
 117 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 118 {
 119         struct nvme_ns *ns;
 120 
 121         down_read(&ctrl->namespaces_rwsem);
 122         list_for_each_entry(ns, &ctrl->namespaces, list) {
 123                 if (ns->head->disk)
 124                         kblockd_schedule_work(&ns->head->requeue_work);
 125         }
 126         up_read(&ctrl->namespaces_rwsem);
 127 }
 128 
 129 static const char *nvme_ana_state_names[] = {
 130         [0]                             = "invalid state",
 131         [NVME_ANA_OPTIMIZED]            = "optimized",
 132         [NVME_ANA_NONOPTIMIZED]         = "non-optimized",
 133         [NVME_ANA_INACCESSIBLE]         = "inaccessible",
 134         [NVME_ANA_PERSISTENT_LOSS]      = "persistent-loss",
 135         [NVME_ANA_CHANGE]               = "change",
 136 };
 137 
 138 bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 139 {
 140         struct nvme_ns_head *head = ns->head;
 141         bool changed = false;
 142         int node;
 143 
 144         if (!head)
 145                 goto out;
 146 
 147         for_each_node(node) {
 148                 if (ns == rcu_access_pointer(head->current_path[node])) {
 149                         rcu_assign_pointer(head->current_path[node], NULL);
 150                         changed = true;
 151                 }
 152         }
 153 out:
 154         return changed;
 155 }
 156 
 157 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 158 {
 159         struct nvme_ns *ns;
 160 
 161         mutex_lock(&ctrl->scan_lock);
 162         down_read(&ctrl->namespaces_rwsem);
 163         list_for_each_entry(ns, &ctrl->namespaces, list)
 164                 if (nvme_mpath_clear_current_path(ns))
 165                         kblockd_schedule_work(&ns->head->requeue_work);
 166         up_read(&ctrl->namespaces_rwsem);
 167         mutex_unlock(&ctrl->scan_lock);
 168 }
 169 
 170 static bool nvme_path_is_disabled(struct nvme_ns *ns)
 171 {
 172         return ns->ctrl->state != NVME_CTRL_LIVE ||
 173                 test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
 174                 test_bit(NVME_NS_REMOVING, &ns->flags);
 175 }
 176 
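     /*
      * Scan all sibling paths and cache the best one for @node in
      * head->current_path[node]: the closest usable ANA-optimized path wins
      * (by NUMA distance with the "numa" iopolicy, otherwise all paths count
      * as LOCAL_DISTANCE), with the closest non-optimized path as fallback.
      */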
 177 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 178 {
 179         int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
 180         struct nvme_ns *found = NULL, *fallback = NULL, *ns;
 181 
 182         list_for_each_entry_rcu(ns, &head->list, siblings) {
 183                 if (nvme_path_is_disabled(ns))
 184                         continue;
 185 
 186                 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 187                         distance = node_distance(node, ns->ctrl->numa_node);
 188                 else
 189                         distance = LOCAL_DISTANCE;
 190 
 191                 switch (ns->ana_state) {
 192                 case NVME_ANA_OPTIMIZED:
 193                         if (distance < found_distance) {
 194                                 found_distance = distance;
 195                                 found = ns;
 196                         }
 197                         break;
 198                 case NVME_ANA_NONOPTIMIZED:
 199                         if (distance < fallback_distance) {
 200                                 fallback_distance = distance;
 201                                 fallback = ns;
 202                         }
 203                         break;
 204                 default:
 205                         break;
 206                 }
 207         }
 208 
 209         if (!found)
 210                 found = fallback;
 211         if (found)
 212                 rcu_assign_pointer(head->current_path[node], found);
 213         return found;
 214 }
 215 
 216 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
 217                 struct nvme_ns *ns)
 218 {
 219         ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
 220                         siblings);
 221         if (ns)
 222                 return ns;
 223         return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
 224 }
 225 
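     /*
      * Round-robin iopolicy: starting from the previously used path @old,
      * walk the sibling list once and pick the next usable ANA-optimized
      * path, falling back to a non-optimized one if that is all we can find.
      * The chosen path is cached in head->current_path[node].
      */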
 226 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 227                 int node, struct nvme_ns *old)
 228 {
 229         struct nvme_ns *ns, *found, *fallback = NULL;
 230 
 231         if (list_is_singular(&head->list)) {
 232                 if (nvme_path_is_disabled(old))
 233                         return NULL;
 234                 return old;
 235         }
 236 
 237         for (ns = nvme_next_ns(head, old);
 238              ns != old;
 239              ns = nvme_next_ns(head, ns)) {
 240                 if (nvme_path_is_disabled(ns))
 241                         continue;
 242 
 243                 if (ns->ana_state == NVME_ANA_OPTIMIZED) {
 244                         found = ns;
 245                         goto out;
 246                 }
 247                 if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
 248                         fallback = ns;
 249         }
 250 
 251         if (!fallback)
 252                 return NULL;
 253         found = fallback;
 254 out:
 255         rcu_assign_pointer(head->current_path[node], found);
 256         return found;
 257 }
 258 
 259 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 260 {
 261         return ns->ctrl->state == NVME_CTRL_LIVE &&
 262                 ns->ana_state == NVME_ANA_OPTIMIZED;
 263 }
 264 
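     /*
      * Return the path the next bio should be submitted on.  Callers must
      * hold the head->srcu read lock (see nvme_ns_head_make_request() below
      * for the canonical usage), which keeps the returned namespace alive
      * until the bio has been remapped to it.
      */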
 265 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 266 {
 267         int node = numa_node_id();
 268         struct nvme_ns *ns;
 269 
 270         ns = srcu_dereference(head->current_path[node], &head->srcu);
 271         if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
 272                 ns = nvme_round_robin_path(head, node, ns);
 273         if (unlikely(!ns || !nvme_path_is_optimized(ns)))
 274                 ns = __nvme_find_path(head, node);
 275         return ns;
 276 }
 277 
 278 static bool nvme_available_path(struct nvme_ns_head *head)
 279 {
 280         struct nvme_ns *ns;
 281 
 282         list_for_each_entry_rcu(ns, &head->list, siblings) {
 283                 switch (ns->ctrl->state) {
 284                 case NVME_CTRL_LIVE:
 285                 case NVME_CTRL_RESETTING:
 286                 case NVME_CTRL_CONNECTING:
  287                         /* controller is usable or may become usable again */
 288                         return true;
 289                 default:
 290                         break;
 291                 }
 292         }
 293         return false;
 294 }
 295 
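     /*
      * make_request handler of the multipath node: pick a path under the
      * srcu read lock and redirect the bio to that path's disk.  If no path
      * is usable right now but one may come back (a controller is live,
      * resetting or connecting), park the bio on the requeue list; otherwise
      * fail it with BLK_STS_IOERR.
      */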
 296 static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 297                 struct bio *bio)
 298 {
 299         struct nvme_ns_head *head = q->queuedata;
 300         struct device *dev = disk_to_dev(head->disk);
 301         struct nvme_ns *ns;
 302         blk_qc_t ret = BLK_QC_T_NONE;
 303         int srcu_idx;
 304 
 305         /*
 306          * The namespace might be going away and the bio might
 307          * be moved to a different queue via blk_steal_bios(),
 308          * so we need to use the bio_split pool from the original
 309          * queue to allocate the bvecs from.
 310          */
 311         blk_queue_split(q, &bio);
 312 
 313         srcu_idx = srcu_read_lock(&head->srcu);
 314         ns = nvme_find_path(head);
 315         if (likely(ns)) {
 316                 bio->bi_disk = ns->disk;
 317                 bio->bi_opf |= REQ_NVME_MPATH;
 318                 trace_block_bio_remap(bio->bi_disk->queue, bio,
 319                                       disk_devt(ns->head->disk),
 320                                       bio->bi_iter.bi_sector);
 321                 ret = direct_make_request(bio);
 322         } else if (nvme_available_path(head)) {
 323                 dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 324 
 325                 spin_lock_irq(&head->requeue_lock);
 326                 bio_list_add(&head->requeue_list, bio);
 327                 spin_unlock_irq(&head->requeue_lock);
 328         } else {
 329                 dev_warn_ratelimited(dev, "no available path - failing I/O\n");
 330 
 331                 bio->bi_status = BLK_STS_IOERR;
 332                 bio_endio(bio);
 333         }
 334 
 335         srcu_read_unlock(&head->srcu, srcu_idx);
 336         return ret;
 337 }
 338 
 339 static void nvme_requeue_work(struct work_struct *work)
 340 {
 341         struct nvme_ns_head *head =
 342                 container_of(work, struct nvme_ns_head, requeue_work);
 343         struct bio *bio, *next;
 344 
 345         spin_lock_irq(&head->requeue_lock);
 346         next = bio_list_get(&head->requeue_list);
 347         spin_unlock_irq(&head->requeue_lock);
 348 
 349         while ((bio = next) != NULL) {
 350                 next = bio->bi_next;
 351                 bio->bi_next = NULL;
 352 
 353                 /*
 354                  * Reset disk to the mpath node and resubmit to select a new
 355                  * path.
 356                  */
 357                 bio->bi_disk = head->disk;
 358                 generic_make_request(bio);
 359         }
 360 }
 361 
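     /*
      * Set up the per-ns_head multipath state and, if the subsystem reports
      * support for multiple controllers (CMIC bit 1) and multipath is
      * enabled, allocate the request queue and gendisk for the nvme%dn%d
      * subsystem node.  The disk is only registered with the block layer
      * later, in nvme_mpath_set_live().
      */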
 362 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 363 {
 364         struct request_queue *q;
 365         bool vwc = false;
 366 
 367         mutex_init(&head->lock);
 368         bio_list_init(&head->requeue_list);
 369         spin_lock_init(&head->requeue_lock);
 370         INIT_WORK(&head->requeue_work, nvme_requeue_work);
 371 
 372         /*
  373          * Add a multipath node if the subsystem supports multiple controllers.
 374          * We also do this for private namespaces as the namespace sharing data could
 375          * change after a rescan.
 376          */
 377         if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
 378                 return 0;
 379 
 380         q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
 381         if (!q)
 382                 goto out;
 383         q->queuedata = head;
 384         blk_queue_make_request(q, nvme_ns_head_make_request);
 385         blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
  386         /* set to a default value of 512 until the disk is validated */
 387         blk_queue_logical_block_size(q, 512);
 388         blk_set_stacking_limits(&q->limits);
 389 
  390         /* we need to propagate up the VWC settings */
 391         if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
 392                 vwc = true;
 393         blk_queue_write_cache(q, vwc, vwc);
 394 
 395         head->disk = alloc_disk(0);
 396         if (!head->disk)
 397                 goto out_cleanup_queue;
 398         head->disk->fops = &nvme_ns_head_ops;
 399         head->disk->private_data = head;
 400         head->disk->queue = q;
 401         head->disk->flags = GENHD_FL_EXT_DEVT;
 402         sprintf(head->disk->disk_name, "nvme%dn%d",
 403                         ctrl->subsys->instance, head->instance);
 404         return 0;
 405 
 406 out_cleanup_queue:
 407         blk_cleanup_queue(q);
 408 out:
 409         return -ENOMEM;
 410 }
 411 
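     /*
      * Called once a path reports a live ANA state: register the multipath
      * gendisk on first use, pre-populate the per-node path cache if this
      * path is optimized, and kick the requeue work so parked bios get
      * another chance to be dispatched.
      */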
 412 static void nvme_mpath_set_live(struct nvme_ns *ns)
 413 {
 414         struct nvme_ns_head *head = ns->head;
 415 
 416         lockdep_assert_held(&ns->head->lock);
 417 
 418         if (!head->disk)
 419                 return;
 420 
 421         if (!(head->disk->flags & GENHD_FL_UP))
 422                 device_add_disk(&head->subsys->dev, head->disk,
 423                                 nvme_ns_id_attr_groups);
 424 
 425         if (nvme_path_is_optimized(ns)) {
 426                 int node, srcu_idx;
 427 
 428                 srcu_idx = srcu_read_lock(&head->srcu);
 429                 for_each_node(node)
 430                         __nvme_find_path(head, node);
 431                 srcu_read_unlock(&head->srcu, srcu_idx);
 432         }
 433 
 434         synchronize_srcu(&ns->head->srcu);
 435         kblockd_schedule_work(&ns->head->requeue_work);
 436 }
 437 
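     /*
      * Walk the ANA log page in ctrl->ana_log_buf and invoke @cb for each
      * group descriptor, passing @data through.  Group id, state and the
      * variable-length NSID list are sanity-checked against the identify
      * data and ctrl->ana_log_size before the callback runs.
      */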
 438 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
 439                 int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
 440                         void *))
 441 {
 442         void *base = ctrl->ana_log_buf;
 443         size_t offset = sizeof(struct nvme_ana_rsp_hdr);
 444         int error, i;
 445 
 446         lockdep_assert_held(&ctrl->ana_lock);
 447 
 448         for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
 449                 struct nvme_ana_group_desc *desc = base + offset;
 450                 u32 nr_nsids = le32_to_cpu(desc->nnsids);
 451                 size_t nsid_buf_size = nr_nsids * sizeof(__le32);
 452 
 453                 if (WARN_ON_ONCE(desc->grpid == 0))
 454                         return -EINVAL;
 455                 if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
 456                         return -EINVAL;
 457                 if (WARN_ON_ONCE(desc->state == 0))
 458                         return -EINVAL;
 459                 if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
 460                         return -EINVAL;
 461 
 462                 offset += sizeof(*desc);
 463                 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
 464                         return -EINVAL;
 465 
 466                 error = cb(ctrl, desc, data);
 467                 if (error)
 468                         return error;
 469 
 470                 offset += nsid_buf_size;
 471                 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
 472                         return -EINVAL;
 473         }
 474 
 475         return 0;
 476 }
 477 
 478 static inline bool nvme_state_is_live(enum nvme_ana_state state)
 479 {
 480         return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
 481 }
 482 
 483 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 484                 struct nvme_ns *ns)
 485 {
 486         mutex_lock(&ns->head->lock);
 487         ns->ana_grpid = le32_to_cpu(desc->grpid);
 488         ns->ana_state = desc->state;
 489         clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
 490 
 491         if (nvme_state_is_live(ns->ana_state))
 492                 nvme_mpath_set_live(ns);
 493         mutex_unlock(&ns->head->lock);
 494 }
 495 
 496 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 497                 struct nvme_ana_group_desc *desc, void *data)
 498 {
 499         u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 500         unsigned *nr_change_groups = data;
 501         struct nvme_ns *ns;
 502 
 503         dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 504                         le32_to_cpu(desc->grpid),
 505                         nvme_ana_state_names[desc->state]);
 506 
 507         if (desc->state == NVME_ANA_CHANGE)
 508                 (*nr_change_groups)++;
 509 
 510         if (!nr_nsids)
 511                 return 0;
 512 
 513         down_read(&ctrl->namespaces_rwsem);
 514         list_for_each_entry(ns, &ctrl->namespaces, list) {
 515                 unsigned nsid = le32_to_cpu(desc->nsids[n]);
 516 
 517                 if (ns->head->ns_id < nsid)
 518                         continue;
 519                 if (ns->head->ns_id == nsid)
 520                         nvme_update_ns_ana_state(desc, ns);
 521                 if (++n == nr_nsids)
 522                         break;
 523         }
 524         up_read(&ctrl->namespaces_rwsem);
 525         return 0;
 526 }
 527 
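     /*
      * Fetch the ANA log page from the controller and feed it through
      * nvme_parse_ana_log() with nvme_update_ana_state() as the callback,
      * (re)arming the ANATT timer while any group is still in the change
      * state.
      */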
 528 static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
 529 {
 530         u32 nr_change_groups = 0;
 531         int error;
 532 
 533         mutex_lock(&ctrl->ana_lock);
 534         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
 535                         ctrl->ana_log_buf, ctrl->ana_log_size, 0);
 536         if (error) {
 537                 dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
 538                 goto out_unlock;
 539         }
 540 
 541         error = nvme_parse_ana_log(ctrl, &nr_change_groups,
 542                         nvme_update_ana_state);
 543         if (error)
 544                 goto out_unlock;
 545 
 546         /*
 547          * In theory we should have an ANATT timer per group as they might enter
 548          * the change state at different times.  But that is a lot of overhead
  549          * just to protect against a target that keeps entering new change
  550          * states while never finishing previous ones.  We'll still
 551          * eventually time out once all groups are in change state, so this
 552          * isn't a big deal.
 553          *
 554          * We also double the ANATT value to provide some slack for transports
 555          * or AEN processing overhead.
 556          */
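             /*
              * e.g. (illustrative): a controller reporting ANATT = 10 seconds
              * gives the timer a 20 second deadline before nvme_anatt_timeout()
              * resets the controller.
              */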
 557         if (nr_change_groups)
 558                 mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
 559         else
 560                 del_timer_sync(&ctrl->anatt_timer);
 561 out_unlock:
 562         mutex_unlock(&ctrl->ana_lock);
 563         return error;
 564 }
 565 
 566 static void nvme_ana_work(struct work_struct *work)
 567 {
 568         struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
 569 
 570         nvme_read_ana_log(ctrl);
 571 }
 572 
 573 static void nvme_anatt_timeout(struct timer_list *t)
 574 {
 575         struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
 576 
 577         dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
 578         nvme_reset_ctrl(ctrl);
 579 }
 580 
 581 void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 582 {
 583         if (!nvme_ctrl_use_ana(ctrl))
 584                 return;
 585         del_timer_sync(&ctrl->anatt_timer);
 586         cancel_work_sync(&ctrl->ana_work);
 587 }
 588 
 589 #define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
 590         struct device_attribute subsys_attr_##_name =   \
 591                 __ATTR(_name, _mode, _show, _store)
 592 
 593 static const char *nvme_iopolicy_names[] = {
 594         [NVME_IOPOLICY_NUMA]    = "numa",
 595         [NVME_IOPOLICY_RR]      = "round-robin",
 596 };
 597 
 598 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
 599                 struct device_attribute *attr, char *buf)
 600 {
 601         struct nvme_subsystem *subsys =
 602                 container_of(dev, struct nvme_subsystem, dev);
 603 
 604         return sprintf(buf, "%s\n",
 605                         nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
 606 }
 607 
 608 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 609                 struct device_attribute *attr, const char *buf, size_t count)
 610 {
 611         struct nvme_subsystem *subsys =
 612                 container_of(dev, struct nvme_subsystem, dev);
 613         int i;
 614 
 615         for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 616                 if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
 617                         WRITE_ONCE(subsys->iopolicy, i);
 618                         return count;
 619                 }
 620         }
 621 
 622         return -EINVAL;
 623 }
 624 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
 625                       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
 626 
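     /*
      * The iopolicy attribute is exposed per subsystem in sysfs; switching
      * policies at runtime typically looks like this (the exact path may
      * vary by system):
      *
      *   echo round-robin > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
      *   cat /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
      */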
 627 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 628                 char *buf)
 629 {
 630         return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
 631 }
 632 DEVICE_ATTR_RO(ana_grpid);
 633 
 634 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
 635                 char *buf)
 636 {
 637         struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 638 
 639         return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
 640 }
 641 DEVICE_ATTR_RO(ana_state);
 642 
 643 static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
 644                 struct nvme_ana_group_desc *desc, void *data)
 645 {
 646         struct nvme_ns *ns = data;
 647 
 648         if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
 649                 nvme_update_ns_ana_state(desc, ns);
 650                 return -ENXIO; /* just break out of the loop */
 651         }
 652 
 653         return 0;
 654 }
 655 
 656 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 657 {
 658         if (nvme_ctrl_use_ana(ns->ctrl)) {
 659                 mutex_lock(&ns->ctrl->ana_lock);
 660                 ns->ana_grpid = le32_to_cpu(id->anagrpid);
 661                 nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
 662                 mutex_unlock(&ns->ctrl->ana_lock);
 663         } else {
 664                 mutex_lock(&ns->head->lock);
  665                 ns->ana_state = NVME_ANA_OPTIMIZED;
 666                 nvme_mpath_set_live(ns);
 667                 mutex_unlock(&ns->head->lock);
 668         }
 669 }
 670 
 671 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 672 {
 673         if (!head->disk)
 674                 return;
 675         if (head->disk->flags & GENHD_FL_UP)
 676                 del_gendisk(head->disk);
 677         blk_set_queue_dying(head->disk->queue);
 678         /* make sure all pending bios are cleaned up */
 679         kblockd_schedule_work(&head->requeue_work);
 680         flush_work(&head->requeue_work);
 681         blk_cleanup_queue(head->disk->queue);
 682         put_disk(head->disk);
 683 }
 684 
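     /*
      * Controller-level ANA setup: record ANACAP/ANATT/NANAGRPID/ANAGRPMAX,
      * size a buffer for the worst-case ANA log page (header, one descriptor
      * per group, one NSID entry per supported namespace) and do the initial
      * log read.  As an illustrative example, 32 ANA groups and 1024
      * supported namespaces need 16 + 32 * 32 + 1024 * 4 = 5136 bytes,
      * assuming the usual 16-byte header and 32-byte group descriptors.
      */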
 685 int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 686 {
 687         int error;
 688 
 689         /* check if multipath is enabled and we have the capability */
 690         if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
 691                 return 0;
 692 
 693         ctrl->anacap = id->anacap;
 694         ctrl->anatt = id->anatt;
 695         ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
 696         ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
 697 
 698         mutex_init(&ctrl->ana_lock);
 699         timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
 700         ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
 701                 ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
 702         ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
 703 
 704         if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
 705                 dev_err(ctrl->device,
 706                         "ANA log page size (%zd) larger than MDTS (%d).\n",
 707                         ctrl->ana_log_size,
 708                         ctrl->max_hw_sectors << SECTOR_SHIFT);
 709                 dev_err(ctrl->device, "disabling ANA support.\n");
 710                 return 0;
 711         }
 712 
 713         INIT_WORK(&ctrl->ana_work, nvme_ana_work);
 714         kfree(ctrl->ana_log_buf);
 715         ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
 716         if (!ctrl->ana_log_buf) {
 717                 error = -ENOMEM;
 718                 goto out;
 719         }
 720 
 721         error = nvme_read_ana_log(ctrl);
 722         if (error)
 723                 goto out_free_ana_log_buf;
 724         return 0;
 725 out_free_ana_log_buf:
 726         kfree(ctrl->ana_log_buf);
 727         ctrl->ana_log_buf = NULL;
 728 out:
 729         return error;
 730 }
 731 
 732 void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 733 {
 734         kfree(ctrl->ana_log_buf);
 735         ctrl->ana_log_buf = NULL;
 736 }
 737 
