// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
	[NVME_IOPOLICY_QD]	= "queue-depth",
};

static int iopolicy = NVME_IOPOLICY_NUMA;
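
/*
 * The "iopolicy" module parameter only selects the default policy applied to
 * newly created subsystems (see nvme_mpath_default_iopolicy() below); each
 * subsystem can later be switched individually through its sysfs "iopolicy"
 * attribute.
 */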
static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else if (!strncmp(val, "queue-depth", 11))
		iopolicy = NVME_IOPOLICY_QD;
	else
		return -EINVAL;

	return 0;
}
static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}
void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}
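
/*
 * Failover: detach all bios from the failed request, point them back at the
 * multipath node and put them on the ns_head requeue list so that the requeue
 * work can resubmit them through a different path.
 */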
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
		 * will fail the I/O immediately with EAGAIN to the issuer.
		 * We are not in the issuer context which cannot block.  Clear
		 * the flag to avoid spurious EAGAIN I/O failures.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}
void nvme_mpath_start_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;
	struct gendisk *disk = ns->head->disk;

	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
		atomic_inc(&ns->ctrl->nr_active);
		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
	}

	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
		return;

	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
						      jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
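
/*
 * Counterpart of nvme_mpath_start_request(): drop the per-controller
 * nr_active count used by the queue-depth policy and complete the I/O
 * statistics accounting on the multipath node.
 */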
void nvme_mpath_end_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;

	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
		atomic_dec_if_positive(&ns->ctrl->nr_active);

	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;
	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
			 nvme_req(rq)->start_time);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};
bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}
static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);

	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}
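
/*
 * (Re)compute the preferred path for @node: pick the usable ANA-optimized
 * path with the smallest NUMA distance (all distances are equal unless the
 * "numa" policy is active), falling back to a non-optimized path, and cache
 * the result in head->current_path[node].
 */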
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}
static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}
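
/*
 * Round-robin policy: start searching at the path after the one used last for
 * this node so that consecutive bios are spread across all usable paths,
 * preferring ANA-optimized ones.
 */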
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns, *found = NULL;
	int node = numa_node_id();
	struct nvme_ns *old = srcu_dereference(head->current_path[node],
					       &head->srcu);

	if (unlikely(!old))
		return __nvme_find_path(head, node);

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}
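
/*
 * Queue-depth policy: choose the usable path whose controller currently has
 * the fewest outstanding requests (nr_active), preferring optimized paths;
 * an idle optimized path short-circuits the scan.
 */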
static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
{
	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
	unsigned int depth;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		depth = atomic_read(&ns->ctrl->nr_active);

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (depth < min_depth_opt) {
				min_depth_opt = depth;
				best_opt = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (depth < min_depth_nonopt) {
				min_depth_nonopt = depth;
				best_nonopt = ns;
			}
			break;
		default:
			break;
		}

		if (min_depth_opt == 0)
			return best_opt;
	}

	return best_opt ? best_opt : best_nonopt;
}
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}
static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	switch (READ_ONCE(head->subsys->iopolicy)) {
	case NVME_IOPOLICY_QD:
		return nvme_queue_depth_path(head);
	case NVME_IOPOLICY_RR:
		return nvme_round_robin_path(head);
	default:
		return nvme_numa_path(head);
	}
}
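
/*
 * Decide whether requeueing is worthwhile: as long as the head is live and at
 * least one controller is live, resetting or (re)connecting and has not hit
 * the fast_io_fail timeout, a usable path may still come back.
 */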
static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		return false;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (nvme_ctrl_state(ns->ctrl)) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			return true;
		default:
			break;
		}
	}

	return false;
}
static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}
static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
{
	if (!nvme_tryget_ns_head(disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk)
{
	nvme_put_ns_head(disk->private_data);
}
static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
		enum blk_unique_id type)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_get_unique_id(ns, id, type);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.get_unique_id	= nvme_ns_head_get_unique_id,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};
static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}
static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};
static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}
static void nvme_partition_scan_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, partition_scan_work);

	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
					     &head->disk->state)))
		return;

	mutex_lock(&head->disk->open_mutex);
	bdev_disk_changed(head->disk, false);
	mutex_unlock(&head->disk->open_mutex);
}
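
/*
 * Drain the requeue list and resubmit each bio through the multipath node;
 * submission then picks a (possibly different) path via nvme_find_path().
 */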
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct queue_limits lim;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);
	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		return 0;

	blk_set_stacking_limits(&lim);
	lim.dma_alignment = 3;
	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
	if (head->ids.csi == NVME_CSI_ZNS)
		lim.features |= BLK_FEAT_ZONED;

	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
	if (IS_ERR(head->disk))
		return PTR_ERR(head->disk);
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;

	/*
	 * We need to suppress the partition scan from occurring within the
	 * controller's scan_work context.  If a path error occurs here, the IO
	 * will wait until a path becomes available or all paths are torn down,
	 * but that action also occurs within scan_work, so it would deadlock.
	 * Defer the partition scan to a different context that does not block
	 * scan_work.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	return 0;
}
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
		kblockd_schedule_work(&head->partition_scan_work);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_online_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}
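
/*
 * Walk all group descriptors in the ANA log buffer, sanity-check each one
 * against the advertised limits and invoke @cb for it; a non-zero return
 * from the callback stops the walk.
 */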
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			  void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}
static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;
	int srcu_idx;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return 0;
}
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  But we'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}
static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}
void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}
static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}
void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}
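
/*
 * The I/O policy is exposed per subsystem and can be changed at runtime,
 * typically through sysfs, e.g.:
 *
 *	echo round-robin > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
 *
 * Switching policies deliberately clears all cached current_path pointers.
 */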
static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
		int iopolicy)
{
	struct nvme_ctrl *ctrl;
	int old_iopolicy = READ_ONCE(subsys->iopolicy);

	if (old_iopolicy == iopolicy)
		return;

	WRITE_ONCE(subsys->iopolicy, iopolicy);

	/* iopolicy changes clear the mpath by design */
	mutex_lock(&nvme_subsystems_lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		nvme_mpath_clear_ctrl_paths(ctrl);
	mutex_unlock(&nvme_subsystems_lock);

	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
		  subsys->subnqn,
		  nvme_iopolicy_names[old_iopolicy],
		  nvme_iopolicy_names[iopolicy]);
}
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			nvme_subsys_iopolicy_update(subsys, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		/*
		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
		 * to allow multipath to fail all I/O.
		 */
		synchronize_srcu(&head->srcu);
		kblockd_schedule_work(&head->requeue_work);
		del_gendisk(head->disk);
	}
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	flush_work(&head->partition_scan_work);
	put_disk(head->disk);
}
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}
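
/*
 * The ANA log buffer must hold the response header, one group descriptor per
 * ANA group (NANAGRPID) and one NSID per namespace (MNAN); if that exceeds a
 * single MDTS-sized transfer, ANA support is disabled.
 */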
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	/* initialize this in the identify path to cover controller resets */
	atomic_set(&ctrl->nr_active, 0);

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}
void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}