/*
 * Copyright (c) 2017 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
14 #include <linux/moduleparam.h>
17 static bool multipath
= true;
18 module_param(multipath
, bool, 0444);
19 MODULE_PARM_DESC(multipath
,
20 "turn on native support for multiple controllers per subsystem");
23 * If multipathing is enabled we need to always use the subsystem instance
24 * number for numbering our devices to avoid conflicts between subsystems that
25 * have multiple controllers and thus use the multipath-aware subsystem node
26 * and those that have a single controller and use the controller node
29 void nvme_set_disk_name(char *disk_name
, struct nvme_ns
*ns
,
30 struct nvme_ctrl
*ctrl
, int *flags
)
33 sprintf(disk_name
, "nvme%dn%d", ctrl
->instance
, ns
->head
->instance
);
34 } else if (ns
->head
->disk
) {
35 sprintf(disk_name
, "nvme%dc%dn%d", ctrl
->subsys
->instance
,
36 ctrl
->cntlid
, ns
->head
->instance
);
37 *flags
= GENHD_FL_HIDDEN
;
39 sprintf(disk_name
, "nvme%dn%d", ctrl
->subsys
->instance
,
44 void nvme_failover_req(struct request
*req
)
46 struct nvme_ns
*ns
= req
->q
->queuedata
;
49 spin_lock_irqsave(&ns
->head
->requeue_lock
, flags
);
50 blk_steal_bios(&ns
->head
->requeue_list
, req
);
51 spin_unlock_irqrestore(&ns
->head
->requeue_lock
, flags
);
52 blk_mq_end_request(req
, 0);
54 nvme_reset_ctrl(ns
->ctrl
);
55 kblockd_schedule_work(&ns
->head
->requeue_work
);
58 bool nvme_req_needs_failover(struct request
*req
, blk_status_t error
)
60 if (!(req
->cmd_flags
& REQ_NVME_MPATH
))
62 return blk_path_error(error
);
65 void nvme_kick_requeue_lists(struct nvme_ctrl
*ctrl
)
69 down_read(&ctrl
->namespaces_rwsem
);
70 list_for_each_entry(ns
, &ctrl
->namespaces
, list
) {
72 kblockd_schedule_work(&ns
->head
->requeue_work
);
74 up_read(&ctrl
->namespaces_rwsem
);
77 static struct nvme_ns
*__nvme_find_path(struct nvme_ns_head
*head
)
81 list_for_each_entry_rcu(ns
, &head
->list
, siblings
) {
82 if (ns
->ctrl
->state
== NVME_CTRL_LIVE
) {
83 rcu_assign_pointer(head
->current_path
, ns
);
91 inline struct nvme_ns
*nvme_find_path(struct nvme_ns_head
*head
)
93 struct nvme_ns
*ns
= srcu_dereference(head
->current_path
, &head
->srcu
);
95 if (unlikely(!ns
|| ns
->ctrl
->state
!= NVME_CTRL_LIVE
))
96 ns
= __nvme_find_path(head
);
100 static blk_qc_t
nvme_ns_head_make_request(struct request_queue
*q
,
103 struct nvme_ns_head
*head
= q
->queuedata
;
104 struct device
*dev
= disk_to_dev(head
->disk
);
106 blk_qc_t ret
= BLK_QC_T_NONE
;
109 srcu_idx
= srcu_read_lock(&head
->srcu
);
110 ns
= nvme_find_path(head
);
112 bio
->bi_disk
= ns
->disk
;
113 bio
->bi_opf
|= REQ_NVME_MPATH
;
114 ret
= direct_make_request(bio
);
115 } else if (!list_empty_careful(&head
->list
)) {
116 dev_warn_ratelimited(dev
, "no path available - requeuing I/O\n");
118 spin_lock_irq(&head
->requeue_lock
);
119 bio_list_add(&head
->requeue_list
, bio
);
120 spin_unlock_irq(&head
->requeue_lock
);
122 dev_warn_ratelimited(dev
, "no path - failing I/O\n");
124 bio
->bi_status
= BLK_STS_IOERR
;
128 srcu_read_unlock(&head
->srcu
, srcu_idx
);
132 static bool nvme_ns_head_poll(struct request_queue
*q
, blk_qc_t qc
)
134 struct nvme_ns_head
*head
= q
->queuedata
;
139 srcu_idx
= srcu_read_lock(&head
->srcu
);
140 ns
= srcu_dereference(head
->current_path
, &head
->srcu
);
141 if (likely(ns
&& ns
->ctrl
->state
== NVME_CTRL_LIVE
))
142 found
= ns
->queue
->poll_fn(q
, qc
);
143 srcu_read_unlock(&head
->srcu
, srcu_idx
);
147 static void nvme_requeue_work(struct work_struct
*work
)
149 struct nvme_ns_head
*head
=
150 container_of(work
, struct nvme_ns_head
, requeue_work
);
151 struct bio
*bio
, *next
;
153 spin_lock_irq(&head
->requeue_lock
);
154 next
= bio_list_get(&head
->requeue_list
);
155 spin_unlock_irq(&head
->requeue_lock
);
157 while ((bio
= next
) != NULL
) {
162 * Reset disk to the mpath node and resubmit to select a new
165 bio
->bi_disk
= head
->disk
;
166 generic_make_request(bio
);
170 int nvme_mpath_alloc_disk(struct nvme_ctrl
*ctrl
, struct nvme_ns_head
*head
)
172 struct request_queue
*q
;
175 bio_list_init(&head
->requeue_list
);
176 spin_lock_init(&head
->requeue_lock
);
177 INIT_WORK(&head
->requeue_work
, nvme_requeue_work
);
180 * Add a multipath node if the subsystems supports multiple controllers.
181 * We also do this for private namespaces as the namespace sharing data could
182 * change after a rescan.
184 if (!(ctrl
->subsys
->cmic
& (1 << 1)) || !multipath
)
187 q
= blk_alloc_queue_node(GFP_KERNEL
, NUMA_NO_NODE
, NULL
);
191 blk_queue_make_request(q
, nvme_ns_head_make_request
);
192 q
->poll_fn
= nvme_ns_head_poll
;
193 blk_queue_flag_set(QUEUE_FLAG_NONROT
, q
);
194 /* set to a default value for 512 until disk is validated */
195 blk_queue_logical_block_size(q
, 512);
197 /* we need to propagate up the VMC settings */
198 if (ctrl
->vwc
& NVME_CTRL_VWC_PRESENT
)
200 blk_queue_write_cache(q
, vwc
, vwc
);
202 head
->disk
= alloc_disk(0);
204 goto out_cleanup_queue
;
205 head
->disk
->fops
= &nvme_ns_head_ops
;
206 head
->disk
->private_data
= head
;
207 head
->disk
->queue
= q
;
208 head
->disk
->flags
= GENHD_FL_EXT_DEVT
;
209 sprintf(head
->disk
->disk_name
, "nvme%dn%d",
210 ctrl
->subsys
->instance
, head
->instance
);
214 blk_cleanup_queue(q
);
219 void nvme_mpath_add_disk(struct nvme_ns_head
*head
)
224 mutex_lock(&head
->subsys
->lock
);
225 if (!(head
->disk
->flags
& GENHD_FL_UP
)) {
226 device_add_disk(&head
->subsys
->dev
, head
->disk
);
227 if (sysfs_create_group(&disk_to_dev(head
->disk
)->kobj
,
228 &nvme_ns_id_attr_group
))
229 pr_warn("%s: failed to create sysfs group for identification\n",
230 head
->disk
->disk_name
);
232 mutex_unlock(&head
->subsys
->lock
);
235 void nvme_mpath_remove_disk(struct nvme_ns_head
*head
)
239 sysfs_remove_group(&disk_to_dev(head
->disk
)->kobj
,
240 &nvme_ns_id_attr_group
);
241 del_gendisk(head
->disk
);
242 blk_set_queue_dying(head
->disk
->queue
);
243 /* make sure all pending bios are cleaned up */
244 kblockd_schedule_work(&head
->requeue_work
);
245 flush_work(&head
->requeue_work
);
246 blk_cleanup_queue(head
->disk
->queue
);
247 put_disk(head
->disk
);