/*
 * Copyright (c) 2017 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/moduleparam.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0644);
MODULE_PARM_DESC(multipath,
        "turn on native support for multiple controllers per subsystem");
void nvme_failover_req(struct request *req)
{
        struct nvme_ns *ns = req->q->queuedata;
        unsigned long flags;

        spin_lock_irqsave(&ns->head->requeue_lock, flags);
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
        blk_mq_end_request(req, 0);

        nvme_reset_ctrl(ns->ctrl);
        kblockd_schedule_work(&ns->head->requeue_work);
}
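
/*
 * A request is only eligible for failover if it was submitted through
 * the multipath node (REQ_NVME_MPATH set); blk_path_error() then decides
 * whether the completion status represents a retryable path error.
 */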
bool nvme_req_needs_failover(struct request *req, blk_status_t error)
{
        if (!(req->cmd_flags & REQ_NVME_MPATH))
                return false;
        return blk_path_error(error);
}
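
/*
 * Kick the requeue work for every namespace head on this controller,
 * e.g. once the controller comes back to life and its paths may be
 * usable again.
 */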
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;

        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                if (ns->head->disk)
                        kblockd_schedule_work(&ns->head->requeue_work);
        }
        up_read(&ctrl->namespaces_rwsem);
}
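
/*
 * Slow path: scan the sibling namespaces and pick the first one whose
 * controller is live, caching the result in head->current_path for
 * subsequent lookups.  Callers are expected to hold the SRCU read lock
 * on head->srcu.
 */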
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
        struct nvme_ns *ns;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (ns->ctrl->state == NVME_CTRL_LIVE) {
                        rcu_assign_pointer(head->current_path, ns);
                        return ns;
                }
        }

        return NULL;
}
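
/*
 * Fast path: reuse the cached current_path as long as its controller is
 * still live, and fall back to a full scan only when the cache is empty
 * or the cached path has gone away.
 */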
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
        struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);

        if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
                ns = __nvme_find_path(head);
        return ns;
}
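
/*
 * All I/O submitted to the multipath node lands here.  Three outcomes:
 * a live path exists and the bio is redirected to it; no path is usable
 * right now but sibling namespaces remain, so the bio is parked on the
 * requeue list; or no paths are left at all and the bio is failed.
 */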
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
                struct bio *bio)
{
        struct nvme_ns_head *head = q->queuedata;
        struct device *dev = disk_to_dev(head->disk);
        struct nvme_ns *ns;
        blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;

        srcu_idx = srcu_read_lock(&head->srcu);
        ns = nvme_find_path(head);
        if (likely(ns)) {
                bio->bi_disk = ns->disk;
                bio->bi_opf |= REQ_NVME_MPATH;
                ret = direct_make_request(bio);
        } else if (!list_empty_careful(&head->list)) {
                dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");

                spin_lock_irq(&head->requeue_lock);
                bio_list_add(&head->requeue_list, bio);
                spin_unlock_irq(&head->requeue_lock);
        } else {
                dev_warn_ratelimited(dev, "no path - failing I/O\n");

                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
        }

        srcu_read_unlock(&head->srcu, srcu_idx);
        return ret;
}
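
/*
 * Poll entry point for the multipath node: forward the poll to the
 * current path's queue, but only while that path's controller is still
 * live.
 */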
static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
{
        struct nvme_ns_head *head = q->queuedata;
        struct nvme_ns *ns;
        bool found = false;
        int srcu_idx;

        srcu_idx = srcu_read_lock(&head->srcu);
        ns = srcu_dereference(head->current_path, &head->srcu);
        if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
                found = ns->queue->poll_fn(q, qc);
        srcu_read_unlock(&head->srcu, srcu_idx);
        return found;
}
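
/*
 * Resubmit bios that were parked by nvme_failover_req() or by
 * nvme_ns_head_make_request().  The list is snapshotted under the lock,
 * then each bio is unchained and sent back through the multipath node
 * so that path selection runs afresh.
 */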
static void nvme_requeue_work(struct work_struct *work)
{
        struct nvme_ns_head *head =
                container_of(work, struct nvme_ns_head, requeue_work);
        struct bio *bio, *next;

        spin_lock_irq(&head->requeue_lock);
        next = bio_list_get(&head->requeue_list);
        spin_unlock_irq(&head->requeue_lock);

        while ((bio = next) != NULL) {
                next = bio->bi_next;
                bio->bi_next = NULL;

                /*
                 * Reset disk to the mpath node and resubmit to select a new
                 * path.
                 */
                bio->bi_disk = head->disk;
                generic_make_request(bio);
        }
}
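
/*
 * Set up the per-subsystem multipath block device.  The requeue
 * machinery is always initialized so that failover state exists even
 * when no multipath node is created; the gendisk and queue are only
 * allocated when the subsystem reports multi-controller support (CMIC
 * bit 1) and the multipath module parameter is enabled.
 */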
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
        struct request_queue *q;
        bool vwc = false;

        bio_list_init(&head->requeue_list);
        spin_lock_init(&head->requeue_lock);
        INIT_WORK(&head->requeue_work, nvme_requeue_work);

        /*
         * Add a multipath node if the subsystem supports multiple controllers.
         * We also do this for private namespaces as the namespace sharing data
         * could change after a rescan.
         */
        if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
                return 0;

        q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
        if (!q)
                goto out;
        q->queuedata = head;
        blk_queue_make_request(q, nvme_ns_head_make_request);
        q->poll_fn = nvme_ns_head_poll;
        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
        /* set to a default value of 512 until the disk is validated */
        blk_queue_logical_block_size(q, 512);

        /* we need to propagate up the VWC settings */
        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
                vwc = true;
        blk_queue_write_cache(q, vwc, vwc);

        head->disk = alloc_disk(0);
        if (!head->disk)
                goto out_cleanup_queue;
        head->disk->fops = &nvme_ns_head_ops;
        head->disk->private_data = head;
        head->disk->queue = q;
        head->disk->flags = GENHD_FL_EXT_DEVT;
        sprintf(head->disk->disk_name, "nvme%dn%d",
                        ctrl->subsys->instance, head->instance);
        return 0;

out_cleanup_queue:
        blk_cleanup_queue(q);
out:
        return -ENOMEM;
}
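
/*
 * Register the multipath gendisk once the first path shows up; repeat
 * calls are no-ops thanks to the GENHD_FL_UP check under the subsystem
 * lock.
 */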
void nvme_mpath_add_disk(struct nvme_ns_head *head)
{
        if (!head->disk)
                return;

        mutex_lock(&head->subsys->lock);
        if (!(head->disk->flags & GENHD_FL_UP)) {
                device_add_disk(&head->subsys->dev, head->disk);
                if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
                                &nvme_ns_id_attr_group))
                        pr_warn("%s: failed to create sysfs group for identification\n",
                                head->disk->disk_name);
        }
        mutex_unlock(&head->subsys->lock);
}
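
/*
 * Tear down in the reverse order of setup: unregister the disk so no new
 * I/O arrives, mark the queue dying, then run the requeue work one last
 * time so that any bios still parked on the requeue list are flushed out
 * before the queue is cleaned up.
 */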
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
        if (!head->disk)
                return;
        sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
                           &nvme_ns_id_attr_group);
        del_gendisk(head->disk);
        blk_set_queue_dying(head->disk->queue);
        /* make sure all pending bios are cleaned up */
        kblockd_schedule_work(&head->requeue_work);
        flush_work(&head->requeue_work);
        blk_cleanup_queue(head->disk->queue);
        put_disk(head->disk);
}