/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
        "turn on native support for multiple controllers per subsystem");

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_freeze_queue_start(h->disk->queue);
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
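/*
 * For example (illustrative values only): with subsystem instance 0, a
 * controller with CNTLID 1, and a namespace head with instance 1, the shared
 * multipath node is named "nvme0n1" while the hidden per-controller path
 * node for that controller is named "nvme0c1n1".
 */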
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
                        struct nvme_ctrl *ctrl, int *flags)
{
        if (!multipath) {
                sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
        } else if (ns->head->disk) {
                sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
                                ctrl->cntlid, ns->head->instance);
                *flags = GENHD_FL_HIDDEN;
        } else {
                sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
                                ns->head->instance);
        }
}

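/*
 * Called for a failed multipath request: steal its bios onto the head's
 * requeue list, complete the request itself, and decide from the status code
 * (with the DNR/More bits masked off by "& 0x7ff") whether to just switch
 * paths, kick off an ANA log re-read, or reset the controller.
 */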
void nvme_failover_req(struct request *req)
{
        struct nvme_ns *ns = req->q->queuedata;
        u16 status = nvme_req(req)->status;
        unsigned long flags;

        spin_lock_irqsave(&ns->head->requeue_lock, flags);
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
        blk_mq_end_request(req, 0);

        switch (status & 0x7ff) {
        case NVME_SC_ANA_TRANSITION:
        case NVME_SC_ANA_INACCESSIBLE:
        case NVME_SC_ANA_PERSISTENT_LOSS:
                /*
                 * If we got back an ANA error we know the controller is alive,
                 * but not ready to serve this namespace.  The spec suggests
                 * we should update our general state here, but due to the fact
                 * that the admin and I/O queues are not serialized that is
                 * fundamentally racy.  So instead just clear the current path,
                 * mark the path as pending and kick off a re-read of the ANA
                 * log page ASAP.
                 */
                nvme_mpath_clear_current_path(ns);
                if (ns->ctrl->ana_log_buf) {
                        set_bit(NVME_NS_ANA_PENDING, &ns->flags);
                        queue_work(nvme_wq, &ns->ctrl->ana_work);
                }
                break;
        case NVME_SC_HOST_PATH_ERROR:
                /*
                 * Temporary transport disruption in talking to the controller.
                 * Try to send on a new path.
                 */
                nvme_mpath_clear_current_path(ns);
                break;
        default:
                /*
                 * Reset the controller for any non-ANA error as we don't know
                 * what caused the error.
                 */
                nvme_reset_ctrl(ns->ctrl);
                break;
        }

        kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;

        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                if (ns->head->disk)
                        kblockd_schedule_work(&ns->head->requeue_work);
        }
        up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
        [0]                             = "invalid state",
        [NVME_ANA_OPTIMIZED]            = "optimized",
        [NVME_ANA_NONOPTIMIZED]         = "non-optimized",
        [NVME_ANA_INACCESSIBLE]         = "inaccessible",
        [NVME_ANA_PERSISTENT_LOSS]      = "persistent-loss",
        [NVME_ANA_CHANGE]               = "change",
};

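/*
 * Scan all paths of this namespace head and pick one: an ANA-optimized path
 * on a live controller is taken immediately, otherwise fall back to a
 * non-optimized one.  The chosen path is cached in head->current_path under
 * RCU so nvme_find_path() can reuse it on the fast path.
 */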
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
        struct nvme_ns *ns, *fallback = NULL;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (ns->ctrl->state != NVME_CTRL_LIVE ||
                    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
                        continue;
                switch (ns->ana_state) {
                case NVME_ANA_OPTIMIZED:
                        rcu_assign_pointer(head->current_path, ns);
                        return ns;
                case NVME_ANA_NONOPTIMIZED:
                        fallback = ns;
                        break;
                default:
                        break;
                }
        }

        if (fallback)
                rcu_assign_pointer(head->current_path, fallback);
        return fallback;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
        return ns->ctrl->state == NVME_CTRL_LIVE &&
                ns->ana_state == NVME_ANA_OPTIMIZED;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
        struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);

        if (unlikely(!ns || !nvme_path_is_optimized(ns)))
                ns = __nvme_find_path(head);
        return ns;
}

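/*
 * Bio submission for the multipath node: pick a path under SRCU protection,
 * redirect the bio to that path's disk, mark it with REQ_NVME_MPATH and issue
 * it directly.  If no usable path exists, requeue the bio as long as the head
 * still has sibling namespaces, otherwise fail it with an I/O error.
 */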
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
                struct bio *bio)
{
        struct nvme_ns_head *head = q->queuedata;
        struct device *dev = disk_to_dev(head->disk);
        struct nvme_ns *ns;
        blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;

        srcu_idx = srcu_read_lock(&head->srcu);
        ns = nvme_find_path(head);
        if (likely(ns)) {
                bio->bi_disk = ns->disk;
                bio->bi_opf |= REQ_NVME_MPATH;
                trace_block_bio_remap(bio->bi_disk->queue, bio,
                                      disk_devt(ns->head->disk),
                                      bio->bi_iter.bi_sector);
                ret = direct_make_request(bio);
        } else if (!list_empty_careful(&head->list)) {
                dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");

                spin_lock_irq(&head->requeue_lock);
                bio_list_add(&head->requeue_list, bio);
                spin_unlock_irq(&head->requeue_lock);
        } else {
                dev_warn_ratelimited(dev, "no path - failing I/O\n");

                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
        }

        srcu_read_unlock(&head->srcu, srcu_idx);
        return ret;
}

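/*
 * Poll the queue of the currently cached path, but only while that path is
 * live and ANA-optimized; otherwise report that nothing was found.
 */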
static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
{
        struct nvme_ns_head *head = q->queuedata;
        struct nvme_ns *ns;
        bool found = false;
        int srcu_idx;

        srcu_idx = srcu_read_lock(&head->srcu);
        ns = srcu_dereference(head->current_path, &head->srcu);
        if (likely(ns && nvme_path_is_optimized(ns)))
                found = ns->queue->poll_fn(q, qc);
        srcu_read_unlock(&head->srcu, srcu_idx);
        return found;
}

static void nvme_requeue_work(struct work_struct *work)
{
        struct nvme_ns_head *head =
                container_of(work, struct nvme_ns_head, requeue_work);
        struct bio *bio, *next;

        spin_lock_irq(&head->requeue_lock);
        next = bio_list_get(&head->requeue_list);
        spin_unlock_irq(&head->requeue_lock);

        while ((bio = next) != NULL) {
                next = bio->bi_next;
                bio->bi_next = NULL;

                /*
                 * Reset disk to the mpath node and resubmit to select a new
                 * path.
                 */
                bio->bi_disk = head->disk;
                generic_make_request(bio);
        }
}

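/*
 * Set up the per-head multipath state and, when the subsystem reports support
 * for multiple controllers (CMIC bit 1) and the multipath parameter is on,
 * allocate the shared request queue and gendisk for the multipath node.
 */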
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
        struct request_queue *q;
        bool vwc = false;

        mutex_init(&head->lock);
        bio_list_init(&head->requeue_list);
        spin_lock_init(&head->requeue_lock);
        INIT_WORK(&head->requeue_work, nvme_requeue_work);

        /*
         * Add a multipath node if the subsystem supports multiple controllers.
         * We also do this for private namespaces as the namespace sharing data
         * could change after a rescan.
         */
        if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
                return 0;

        q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
        if (!q)
                goto out;
        q->queuedata = head;
        blk_queue_make_request(q, nvme_ns_head_make_request);
        q->poll_fn = nvme_ns_head_poll;
        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
        /* set to a default value of 512 until the disk is validated */
        blk_queue_logical_block_size(q, 512);
        blk_set_stacking_limits(&q->limits);

        /* we need to propagate up the VWC settings */
        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
                vwc = true;
        blk_queue_write_cache(q, vwc, vwc);

        head->disk = alloc_disk(0);
        if (!head->disk)
                goto out_cleanup_queue;
        head->disk->fops = &nvme_ns_head_ops;
        head->disk->private_data = head;
        head->disk->queue = q;
        head->disk->flags = GENHD_FL_EXT_DEVT;
        sprintf(head->disk->disk_name, "nvme%dn%d",
                        ctrl->subsys->instance, head->instance);
        return 0;

out_cleanup_queue:
        blk_cleanup_queue(q);
out:
        return -ENOMEM;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
        struct nvme_ns_head *head = ns->head;

        lockdep_assert_held(&ns->head->lock);

        if (!head->disk)
                return;

        if (!(head->disk->flags & GENHD_FL_UP)) {
                device_add_disk(&head->subsys->dev, head->disk);
                if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
                                &nvme_ns_id_attr_group))
                        dev_warn(&head->subsys->dev,
                                 "failed to create id group.\n");
        }

        synchronize_srcu(&ns->head->srcu);
        kblockd_schedule_work(&ns->head->requeue_work);
}

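/*
 * Walk the ANA log buffer: a nvme_ana_rsp_hdr followed by one group
 * descriptor per ANA group, each immediately followed by its list of NSIDs.
 * Every descriptor is sanity-checked against the log size before the
 * callback is invoked for it; a non-zero callback return stops the walk.
 */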
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
                int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
                        void *))
{
        void *base = ctrl->ana_log_buf;
        size_t offset = sizeof(struct nvme_ana_rsp_hdr);
        int error, i;

        lockdep_assert_held(&ctrl->ana_lock);

        for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
                struct nvme_ana_group_desc *desc = base + offset;
                u32 nr_nsids = le32_to_cpu(desc->nnsids);
                size_t nsid_buf_size = nr_nsids * sizeof(__le32);

                if (WARN_ON_ONCE(desc->grpid == 0))
                        return -EINVAL;
                if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
                        return -EINVAL;
                if (WARN_ON_ONCE(desc->state == 0))
                        return -EINVAL;
                if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
                        return -EINVAL;

                offset += sizeof(*desc);
                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
                        return -EINVAL;

                error = cb(ctrl, desc, data);
                if (error)
                        return error;

                offset += nsid_buf_size;
                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
                        return -EINVAL;
        }

        return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
        return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
                struct nvme_ns *ns)
{
        mutex_lock(&ns->head->lock);
        ns->ana_grpid = le32_to_cpu(desc->grpid);
        ns->ana_state = desc->state;
        clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

        if (nvme_state_is_live(ns->ana_state))
                nvme_mpath_set_live(ns);
        mutex_unlock(&ns->head->lock);
}

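/*
 * nvme_parse_ana_log() callback used by nvme_read_ana_log(): log the state of
 * each ANA group, count groups that are in the change state, and apply the
 * group's state to every namespace of this controller that appears in the
 * descriptor's NSID list.
 */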
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
                struct nvme_ana_group_desc *desc, void *data)
{
        u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
        unsigned *nr_change_groups = data;
        struct nvme_ns *ns;

        dev_info(ctrl->device, "ANA group %d: %s.\n",
                        le32_to_cpu(desc->grpid),
                        nvme_ana_state_names[desc->state]);

        if (desc->state == NVME_ANA_CHANGE)
                (*nr_change_groups)++;

        if (!nr_nsids)
                return 0;

        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                unsigned nsid = le32_to_cpu(desc->nsids[n]);

                if (ns->head->ns_id < nsid)
                        continue;
                if (ns->head->ns_id == nsid)
                        nvme_update_ns_ana_state(desc, ns);
                if (++n == nr_nsids)
                        break;
        }
        up_read(&ctrl->namespaces_rwsem);
        return 0;
}

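/*
 * Fetch the ANA log page (optionally with RGO set to return group information
 * only) under ana_lock and feed it to nvme_update_ana_state().  The ANATT
 * timer is armed while any group reports the change state and cancelled once
 * none does.
 */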
static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
{
        u32 nr_change_groups = 0;
        int error;

        mutex_lock(&ctrl->ana_lock);
        error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
                        groups_only ? NVME_ANA_LOG_RGO : 0,
                        ctrl->ana_log_buf, ctrl->ana_log_size, 0);
        if (error) {
                dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
                goto out_unlock;
        }

        error = nvme_parse_ana_log(ctrl, &nr_change_groups,
                        nvme_update_ana_state);
        if (error)
                goto out_unlock;

        /*
         * In theory we should have an ANATT timer per group as they might enter
         * the change state at different times.  But that is a lot of overhead
         * just to protect against a target that keeps entering new change
         * states while never finishing previous ones.  But we'll still
         * eventually time out once all groups are in change state, so this
         * isn't a big deal.
         *
         * We also double the ANATT value to provide some slack for transports
         * or AEN processing overhead.
         */
        if (nr_change_groups)
                mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
        else
                del_timer_sync(&ctrl->anatt_timer);
out_unlock:
        mutex_unlock(&ctrl->ana_lock);
        return error;
}

static void nvme_ana_work(struct work_struct *work)
{
        struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

        nvme_read_ana_log(ctrl, false);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
        struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

        dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
        nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
        if (!nvme_ctrl_use_ana(ctrl))
                return;
        del_timer_sync(&ctrl->anatt_timer);
        cancel_work_sync(&ctrl->ana_work);
}

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
                char *buf)
{
        return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
                char *buf)
{
        struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

        return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
                struct nvme_ana_group_desc *desc, void *data)
{
        struct nvme_ana_group_desc *dst = data;

        if (desc->grpid != dst->grpid)
                return 0;

        *dst = *desc;
        return -ENXIO; /* just break out of the loop */
}

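/*
 * Called when a namespace is added: with ANA enabled, look up the namespace's
 * group descriptor in the cached ANA log and apply its state; without ANA,
 * mark the path as optimized right away.  If the underlying queue requires
 * stable pages, propagate BDI_CAP_STABLE_WRITES to the multipath node.
 */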
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
        if (nvme_ctrl_use_ana(ns->ctrl)) {
                struct nvme_ana_group_desc desc = {
                        .grpid = id->anagrpid,
                        .state = 0,
                };

                mutex_lock(&ns->ctrl->ana_lock);
                ns->ana_grpid = le32_to_cpu(id->anagrpid);
                nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
                mutex_unlock(&ns->ctrl->ana_lock);
                if (desc.state) {
                        /* found the group desc: update */
                        nvme_update_ns_ana_state(&desc, ns);
                }
        } else {
                mutex_lock(&ns->head->lock);
                ns->ana_state = NVME_ANA_OPTIMIZED;
                nvme_mpath_set_live(ns);
                mutex_unlock(&ns->head->lock);
        }

        if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
                struct gendisk *disk = ns->head->disk;

                if (disk)
                        disk->queue->backing_dev_info->capabilities |=
                                        BDI_CAP_STABLE_WRITES;
        }
}

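/*
 * Tear down the multipath node: unregister the sysfs attributes and gendisk
 * if they were added, mark the queue dying, and flush the requeue work so any
 * bios still on the requeue list are drained before the queue and disk are
 * released.
 */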
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
        if (!head->disk)
                return;
        if (head->disk->flags & GENHD_FL_UP) {
                sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
                                   &nvme_ns_id_attr_group);
                del_gendisk(head->disk);
        }
        blk_set_queue_dying(head->disk->queue);
        /* make sure all pending bios are cleaned up */
        kblockd_schedule_work(&head->requeue_work);
        flush_work(&head->requeue_work);
        blk_cleanup_queue(head->disk->queue);
        put_disk(head->disk);
}

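/*
 * Initialize ANA support for a controller that reports it (CMIC bit 3): cache
 * the ANA parameters from the Identify Controller data, size and allocate the
 * ANA log buffer (header plus one descriptor per group plus one NSID entry
 * per namespace), and do an initial read of the log.
 */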
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
        int error;

        /* check if multipath is enabled and we have the capability */
        if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
                return 0;

        ctrl->anacap = id->anacap;
        ctrl->anatt = id->anatt;
        ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
        ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

        mutex_init(&ctrl->ana_lock);
        timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
        ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
                ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
        ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);

        if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
                dev_err(ctrl->device,
                        "ANA log page size (%zd) larger than MDTS (%d).\n",
                        ctrl->ana_log_size,
                        ctrl->max_hw_sectors << SECTOR_SHIFT);
                dev_err(ctrl->device, "disabling ANA support.\n");
                return 0;
        }

        INIT_WORK(&ctrl->ana_work, nvme_ana_work);
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
        if (!ctrl->ana_log_buf) {
                error = -ENOMEM;
                goto out;
        }

        error = nvme_read_ana_log(ctrl, false);
        if (error)
                goto out_free_ana_log_buf;
        return 0;
out_free_ana_log_buf:
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = NULL;
out:
        return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = NULL;
}