// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
        "turn on native support for multiple controllers per subsystem");

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_freeze_queue_start(h->disk->queue);
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
                        struct nvme_ctrl *ctrl, int *flags)
{
        if (!multipath) {
                sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
        } else if (ns->head->disk) {
                sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
                                ctrl->instance, ns->head->instance);
                *flags = GENHD_FL_HIDDEN;
        } else {
                sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
                                ns->head->instance);
        }
}

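/*
 * Called when a request fails on one controller: the bios are stolen from the
 * failed request, parked on the head's requeue list and the original request
 * is completed, so the I/O can be retried on another path.
 */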
void nvme_failover_req(struct request *req)
{
        struct nvme_ns *ns = req->q->queuedata;
        u16 status = nvme_req(req)->status;
        unsigned long flags;

        spin_lock_irqsave(&ns->head->requeue_lock, flags);
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
        blk_mq_end_request(req, 0);

        switch (status & 0x7ff) {
        case NVME_SC_ANA_TRANSITION:
        case NVME_SC_ANA_INACCESSIBLE:
        case NVME_SC_ANA_PERSISTENT_LOSS:
                /*
                 * If we got back an ANA error we know the controller is alive,
                 * but not ready to serve this namespace.  The spec suggests
                 * we should update our general state here, but due to the fact
                 * that the admin and I/O queues are not serialized that is
                 * fundamentally racy.  So instead just clear the current path,
                 * mark the path as pending and kick off a re-read of the ANA
                 * log page ASAP.
                 */
                nvme_mpath_clear_current_path(ns);
                if (ns->ctrl->ana_log_buf) {
                        set_bit(NVME_NS_ANA_PENDING, &ns->flags);
                        queue_work(nvme_wq, &ns->ctrl->ana_work);
                }
                break;
        case NVME_SC_HOST_PATH_ERROR:
        case NVME_SC_HOST_ABORTED_CMD:
                /*
                 * Temporary transport disruption in talking to the controller.
                 * Try to send on a new path.
                 */
                nvme_mpath_clear_current_path(ns);
                break;
        default:
                /*
                 * Reset the controller for any non-ANA error as we don't know
                 * what caused the error.
                 */
                nvme_reset_ctrl(ns->ctrl);
                break;
        }

        kblockd_schedule_work(&ns->head->requeue_work);
}

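/* Re-run the requeue lists of every multipath node owned by this controller. */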
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;

        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                if (ns->head->disk)
                        kblockd_schedule_work(&ns->head->requeue_work);
        }
        up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
        [0]                             = "invalid state",
        [NVME_ANA_OPTIMIZED]            = "optimized",
        [NVME_ANA_NONOPTIMIZED]         = "non-optimized",
        [NVME_ANA_INACCESSIBLE]         = "inaccessible",
        [NVME_ANA_PERSISTENT_LOSS]      = "persistent-loss",
        [NVME_ANA_CHANGE]               = "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
        struct nvme_ns_head *head = ns->head;
        bool changed = false;
        int node;

        if (!head)
                goto out;

        for_each_node(node) {
                if (ns == rcu_access_pointer(head->current_path[node])) {
                        rcu_assign_pointer(head->current_path[node], NULL);
                        changed = true;
                }
        }
out:
        return changed;
}

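/*
 * Clear the cached paths for every namespace on this controller and kick the
 * requeue lists so any pending bios get resubmitted on a different path.
 */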
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;

        mutex_lock(&ctrl->scan_lock);
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
                if (nvme_mpath_clear_current_path(ns))
                        kblockd_schedule_work(&ns->head->requeue_work);
        up_read(&ctrl->namespaces_rwsem);
        mutex_unlock(&ctrl->scan_lock);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
        return ns->ctrl->state != NVME_CTRL_LIVE ||
                test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
                test_bit(NVME_NS_REMOVING, &ns->flags);
}

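/*
 * Pick the best usable path for the given NUMA node: with the "numa" I/O
 * policy the ANA-optimized path with the smallest node distance wins,
 * otherwise all live paths are treated as equally close; non-optimized paths
 * are only used as a fallback.  The winner is cached in
 * head->current_path[node].
 */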
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
        int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
        struct nvme_ns *found = NULL, *fallback = NULL, *ns;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (nvme_path_is_disabled(ns))
                        continue;

                if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
                        distance = node_distance(node, ns->ctrl->numa_node);
                else
                        distance = LOCAL_DISTANCE;

                switch (ns->ana_state) {
                case NVME_ANA_OPTIMIZED:
                        if (distance < found_distance) {
                                found_distance = distance;
                                found = ns;
                        }
                        break;
                case NVME_ANA_NONOPTIMIZED:
                        if (distance < fallback_distance) {
                                fallback_distance = distance;
                                fallback = ns;
                        }
                        break;
                default:
                        break;
                }
        }

        if (!found)
                found = fallback;
        if (found)
                rcu_assign_pointer(head->current_path[node], found);
        return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
                struct nvme_ns *ns)
{
        ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
                        siblings);
        if (ns)
                return ns;
        return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

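/*
 * Round-robin policy: start from the path used last time and take the next
 * usable sibling, preferring ANA-optimized paths and falling back to a
 * non-optimized one if that is all that is left.
 */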
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
                int node, struct nvme_ns *old)
{
        struct nvme_ns *ns, *found, *fallback = NULL;

        if (list_is_singular(&head->list)) {
                if (nvme_path_is_disabled(old))
                        return NULL;
                return old;
        }

        for (ns = nvme_next_ns(head, old);
             ns != old;
             ns = nvme_next_ns(head, ns)) {
                if (nvme_path_is_disabled(ns))
                        continue;

                if (ns->ana_state == NVME_ANA_OPTIMIZED) {
                        found = ns;
                        goto out;
                }
                if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
                        fallback = ns;
        }

        if (!fallback)
                return NULL;
        found = fallback;
out:
        rcu_assign_pointer(head->current_path[node], found);
        return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
        return ns->ctrl->state == NVME_CTRL_LIVE &&
                ns->ana_state == NVME_ANA_OPTIMIZED;
}

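/*
 * Fast path for submission: reuse the SRCU-protected cached path for this
 * node and only fall back to a full search when the cache is empty, the
 * cached path is no longer optimized, or the round-robin policy asks for
 * the next sibling.
 */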
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
        int node = numa_node_id();
        struct nvme_ns *ns;

        ns = srcu_dereference(head->current_path[node], &head->srcu);
        if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
                ns = nvme_round_robin_path(head, node, ns);
        if (unlikely(!ns || !nvme_path_is_optimized(ns)))
                ns = __nvme_find_path(head, node);
        return ns;
}

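/*
 * A path counts as "available" as long as its controller is live or merely
 * resetting/reconnecting, i.e. it is worth queueing the I/O instead of
 * failing it outright.
 */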
static bool nvme_available_path(struct nvme_ns_head *head)
{
        struct nvme_ns *ns;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                switch (ns->ctrl->state) {
                case NVME_CTRL_LIVE:
                case NVME_CTRL_RESETTING:
                case NVME_CTRL_CONNECTING:
                        /* fallthru */
                        return true;
                default:
                        break;
                }
        }
        return false;
}

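/*
 * make_request entry point of the multipath node: pick a path under SRCU and
 * hand the bio straight to that path's queue; if no path is usable right now
 * but one may come back, park the bio on the requeue list, otherwise fail it.
 */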
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
                struct bio *bio)
{
        struct nvme_ns_head *head = q->queuedata;
        struct device *dev = disk_to_dev(head->disk);
        struct nvme_ns *ns;
        blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;

        /*
         * The namespace might be going away and the bio might
         * be moved to a different queue via blk_steal_bios(),
         * so we need to use the bio_split pool from the original
         * queue to allocate the bvecs from.
         */
        blk_queue_split(q, &bio);

        srcu_idx = srcu_read_lock(&head->srcu);
        ns = nvme_find_path(head);
        if (likely(ns)) {
                bio->bi_disk = ns->disk;
                bio->bi_opf |= REQ_NVME_MPATH;
                trace_block_bio_remap(bio->bi_disk->queue, bio,
                                      disk_devt(ns->head->disk),
                                      bio->bi_iter.bi_sector);
                ret = direct_make_request(bio);
        } else if (nvme_available_path(head)) {
                dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

                spin_lock_irq(&head->requeue_lock);
                bio_list_add(&head->requeue_list, bio);
                spin_unlock_irq(&head->requeue_lock);
        } else {
                dev_warn_ratelimited(dev, "no available path - failing I/O\n");

                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
        }

        srcu_read_unlock(&head->srcu, srcu_idx);
        return ret;
}

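/*
 * Worker that drains the requeue list: each parked bio is pointed back at
 * the multipath disk and resubmitted so path selection runs again.
 */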
static void nvme_requeue_work(struct work_struct *work)
{
        struct nvme_ns_head *head =
                container_of(work, struct nvme_ns_head, requeue_work);
        struct bio *bio, *next;

        spin_lock_irq(&head->requeue_lock);
        next = bio_list_get(&head->requeue_list);
        spin_unlock_irq(&head->requeue_lock);

        while ((bio = next) != NULL) {
                next = bio->bi_next;
                bio->bi_next = NULL;

                /*
                 * Reset disk to the mpath node and resubmit to select a new
                 * path.
                 */
                bio->bi_disk = head->disk;
                generic_make_request(bio);
        }
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
        struct request_queue *q;
        bool vwc = false;

        mutex_init(&head->lock);
        bio_list_init(&head->requeue_list);
        spin_lock_init(&head->requeue_lock);
        INIT_WORK(&head->requeue_work, nvme_requeue_work);

        /*
         * Add a multipath node if the subsystem supports multiple controllers.
         * We also do this for private namespaces as the namespace sharing data
         * could change after a rescan.
         */
        if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
                return 0;

        q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
        if (!q)
                goto out;
        q->queuedata = head;
        blk_queue_make_request(q, nvme_ns_head_make_request);
        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
        /* set to a default value for 512 until disk is validated */
        blk_queue_logical_block_size(q, 512);
        blk_set_stacking_limits(&q->limits);

        /* we need to propagate up the VWC settings */
        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
                vwc = true;
        blk_queue_write_cache(q, vwc, vwc);

        head->disk = alloc_disk(0);
        if (!head->disk)
                goto out_cleanup_queue;
        head->disk->fops = &nvme_ns_head_ops;
        head->disk->private_data = head;
        head->disk->queue = q;
        head->disk->flags = GENHD_FL_EXT_DEVT;
        sprintf(head->disk->disk_name, "nvme%dn%d",
                        ctrl->subsys->instance, head->instance);
        return 0;

out_cleanup_queue:
        blk_cleanup_queue(q);
out:
        return -ENOMEM;
}

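/*
 * Mark a path as usable: register the multipath disk on first use, re-run
 * path selection for every node if this path is ANA-optimized, and flush the
 * requeue list so waiting bios can make progress.
 */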
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
        struct nvme_ns_head *head = ns->head;

        lockdep_assert_held(&ns->head->lock);

        if (!head->disk)
                return;

        if (!(head->disk->flags & GENHD_FL_UP))
                device_add_disk(&head->subsys->dev, head->disk,
                                nvme_ns_id_attr_groups);

        if (nvme_path_is_optimized(ns)) {
                int node, srcu_idx;

                srcu_idx = srcu_read_lock(&head->srcu);
                for_each_node(node)
                        __nvme_find_path(head, node);
                srcu_read_unlock(&head->srcu, srcu_idx);
        }

        synchronize_srcu(&ns->head->srcu);
        kblockd_schedule_work(&ns->head->requeue_work);
}

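/*
 * Walk the ANA log page group descriptor by group descriptor, sanity checking
 * each entry against the advertised limits, and invoke the callback for every
 * descriptor; the variable-length NSID list following each descriptor is
 * skipped based on its nnsids count.
 */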
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
                int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
                                void *))
{
        void *base = ctrl->ana_log_buf;
        size_t offset = sizeof(struct nvme_ana_rsp_hdr);
        int error, i;

        lockdep_assert_held(&ctrl->ana_lock);

        for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
                struct nvme_ana_group_desc *desc = base + offset;
                u32 nr_nsids;
                size_t nsid_buf_size;

                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
                        return -EINVAL;

                nr_nsids = le32_to_cpu(desc->nnsids);
                nsid_buf_size = nr_nsids * sizeof(__le32);

                if (WARN_ON_ONCE(desc->grpid == 0))
                        return -EINVAL;
                if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
                        return -EINVAL;
                if (WARN_ON_ONCE(desc->state == 0))
                        return -EINVAL;
                if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
                        return -EINVAL;

                offset += sizeof(*desc);
                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
                        return -EINVAL;

                error = cb(ctrl, desc, data);
                if (error)
                        return error;

                offset += nsid_buf_size;
        }

        return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
        return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
                struct nvme_ns *ns)
{
        mutex_lock(&ns->head->lock);
        ns->ana_grpid = le32_to_cpu(desc->grpid);
        ns->ana_state = desc->state;
        clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

        if (nvme_state_is_live(ns->ana_state))
                nvme_mpath_set_live(ns);
        mutex_unlock(&ns->head->lock);
}

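/*
 * Per-group callback for nvme_parse_ana_log(): note groups in CHANGE state
 * and apply the group's state to every matching namespace.  The single linear
 * walk below relies on both the controller's namespace list and the
 * descriptor's NSID list being ordered by NSID.
 */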
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
                struct nvme_ana_group_desc *desc, void *data)
{
        u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
        unsigned *nr_change_groups = data;
        struct nvme_ns *ns;

        dev_dbg(ctrl->device, "ANA group %d: %s.\n",
                        le32_to_cpu(desc->grpid),
                        nvme_ana_state_names[desc->state]);

        if (desc->state == NVME_ANA_CHANGE)
                (*nr_change_groups)++;

        if (!nr_nsids)
                return 0;

        down_write(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                unsigned nsid = le32_to_cpu(desc->nsids[n]);

                if (ns->head->ns_id < nsid)
                        continue;
                if (ns->head->ns_id == nsid)
                        nvme_update_ns_ana_state(desc, ns);
                if (++n == nr_nsids)
                        break;
        }
        up_write(&ctrl->namespaces_rwsem);
        return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
        u32 nr_change_groups = 0;
        int error;

        mutex_lock(&ctrl->ana_lock);
        error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
                        ctrl->ana_log_buf, ctrl->ana_log_size, 0);
        if (error) {
                dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
                goto out_unlock;
        }

        error = nvme_parse_ana_log(ctrl, &nr_change_groups,
                        nvme_update_ana_state);
        if (error)
                goto out_unlock;

        /*
         * In theory we should have an ANATT timer per group as they might enter
         * the change state at different times.  But that is a lot of overhead
         * just to protect against a target that keeps entering new change
         * states while never finishing previous ones.  But we'll still
         * eventually time out once all groups are in change state, so this
         * isn't a big deal.
         *
         * We also double the ANATT value to provide some slack for transports
         * or AEN processing overhead.
         */
        if (nr_change_groups)
                mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
        else
                del_timer_sync(&ctrl->anatt_timer);
out_unlock:
        mutex_unlock(&ctrl->ana_lock);
        return error;
}

static void nvme_ana_work(struct work_struct *work)
{
        struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

        nvme_read_ana_log(ctrl);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
        struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

        dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
        nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
        if (!nvme_ctrl_use_ana(ctrl))
                return;
        del_timer_sync(&ctrl->anatt_timer);
        cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
        struct device_attribute subsys_attr_##_name =  \
                __ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
        [NVME_IOPOLICY_NUMA]    = "numa",
        [NVME_IOPOLICY_RR]      = "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct nvme_subsystem *subsys =
                container_of(dev, struct nvme_subsystem, dev);

        return sprintf(buf, "%s\n",
                        nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct nvme_subsystem *subsys =
                container_of(dev, struct nvme_subsystem, dev);
        int i;

        for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
                if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
                        WRITE_ONCE(subsys->iopolicy, i);
                        return count;
                }
        }

        return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
                      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
                char *buf)
{
        return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
                char *buf)
{
        struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

        return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
                struct nvme_ana_group_desc *desc, void *data)
{
        struct nvme_ns *ns = data;

        if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
                nvme_update_ns_ana_state(desc, ns);
                return -ENXIO; /* just break out of the loop */
        }

        return 0;
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
        if (nvme_ctrl_use_ana(ns->ctrl)) {
                mutex_lock(&ns->ctrl->ana_lock);
                ns->ana_grpid = le32_to_cpu(id->anagrpid);
                nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
                mutex_unlock(&ns->ctrl->ana_lock);
        } else {
                mutex_lock(&ns->head->lock);
                ns->ana_state = NVME_ANA_OPTIMIZED;
                nvme_mpath_set_live(ns);
                mutex_unlock(&ns->head->lock);
        }
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
        if (!head->disk)
                return;
        if (head->disk->flags & GENHD_FL_UP)
                del_gendisk(head->disk);
        blk_set_queue_dying(head->disk->queue);
        /* make sure all pending bios are cleaned up */
        kblockd_schedule_work(&head->requeue_work);
        flush_work(&head->requeue_work);
        blk_cleanup_queue(head->disk->queue);
        put_disk(head->disk);
}

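/*
 * Set up ANA support for a controller: cache the ANA fields from Identify
 * Controller, size the log buffer as header + one group descriptor per ANA
 * group + one NSID entry per namespace, and do an initial read of the log.
 */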
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
        int error;

        /* check if multipath is enabled and we have the capability */
        if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
                return 0;

        ctrl->anacap = id->anacap;
        ctrl->anatt = id->anatt;
        ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
        ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

        mutex_init(&ctrl->ana_lock);
        timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
        ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
                ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
        ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);

        if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
                dev_err(ctrl->device,
                        "ANA log page size (%zd) larger than MDTS (%d).\n",
                        ctrl->ana_log_size,
                        ctrl->max_hw_sectors << SECTOR_SHIFT);
                dev_err(ctrl->device, "disabling ANA support.\n");
                return 0;
        }

        INIT_WORK(&ctrl->ana_work, nvme_ana_work);
        ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
        if (!ctrl->ana_log_buf) {
                error = -ENOMEM;
                goto out;
        }

        error = nvme_read_ana_log(ctrl);
        if (error)
                goto out_free_ana_log_buf;
        return 0;

out_free_ana_log_buf:
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = NULL;
out:
        return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = NULL;
}