// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * (part of code stolen from loop.c)
 */
#define pr_fmt(fmt) "nbd: " fmt

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/blk-mq.h>

#include <linux/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>
#include <linux/nbd-netlink.h>
#include <net/genetlink.h>

#define CREATE_TRACE_POINTS
#include <trace/events/nbd.h>
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
static struct workqueue_struct *nbd_del_wq;
static int nbd_total_devices = 0;
struct nbd_sock {
	struct socket *sock;
	struct mutex tx_lock;
	struct request *pending;
	int sent;
	bool dead;
	int fallback_index;
	int cookie;
};
struct recv_thread_args {
	struct work_struct work;
	struct nbd_device *nbd;
	struct nbd_sock *nsock;
	int index;
};

struct link_dead_args {
	struct work_struct work;
	int index;
};
#define NBD_RT_TIMEDOUT			0
#define NBD_RT_DISCONNECT_REQUESTED	1
#define NBD_RT_DISCONNECTED		2
#define NBD_RT_HAS_PID_FILE		3
#define NBD_RT_HAS_CONFIG_REF		4
#define NBD_RT_BOUND			5
#define NBD_RT_DISCONNECT_ON_CLOSE	6
#define NBD_RT_HAS_BACKEND_FILE		7

#define NBD_DESTROY_ON_DISCONNECT	0
#define NBD_DISCONNECT_REQUESTED	1
struct nbd_config {
	u32 flags;
	unsigned long runtime_flags;
	u64 dead_conn_timeout;

	struct nbd_sock **socks;
	int num_connections;
	atomic_t live_connections;
	wait_queue_head_t conn_wait;

	atomic_t recv_threads;
	wait_queue_head_t recv_wq;
	unsigned int blksize_bits;
	loff_t bytesize;
#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;
#endif
};

static inline unsigned int nbd_blksize(struct nbd_config *config)
{
	return 1u << config->blksize_bits;
}
struct nbd_device {
	struct blk_mq_tag_set tag_set;

	int index;
	refcount_t config_refs;
	refcount_t refs;
	struct nbd_config *config;
	struct mutex config_lock;
	struct gendisk *disk;
	struct workqueue_struct *recv_workq;
	struct work_struct remove_work;

	struct list_head list;
	struct task_struct *task_setup;

	unsigned long flags;
	pid_t pid; /* pid of nbd-client, if attached */

	char *backend;
};

#define NBD_CMD_REQUEUED	1
/*
 * This flag will be set if nbd_queue_rq() succeeds, and will be checked and
 * cleared in completion. Both setting and clearing of the flag are protected
 * by cmd->lock.
 */
#define NBD_CMD_INFLIGHT	2

struct nbd_cmd {
	struct nbd_device *nbd;
	struct mutex lock;
	int index;
	int cookie;
	int retries;
	blk_status_t status;
	unsigned long flags;
	u32 cmd_cookie;
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

#define NBD_DEF_BLKSIZE_BITS 10

static unsigned int nbds_max = 16;
static int max_part = 16;
static int part_shift;
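/*
 * nbds_max and max_part are module parameters (see the module_param()
 * declarations at the end of this file); max_part feeds part_shift, which in
 * turn decides how the minor number space is split between devices and their
 * partitions.
 */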
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
static void nbd_config_put(struct nbd_device *nbd);
static void nbd_connect_reply(struct genl_info *info, int index);
static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
static void nbd_dead_link_work(struct work_struct *work);
static void nbd_disconnect_and_put(struct nbd_device *nbd);
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
	return disk_to_dev(nbd->disk);
}
static void nbd_requeue_cmd(struct nbd_cmd *cmd)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);

	lockdep_assert_held(&cmd->lock);

	/*
	 * Clear INFLIGHT flag so that this cmd won't be completed in
	 * normal completion path
	 *
	 * INFLIGHT flag will be set when the cmd is queued to nbd next
	 * time.
	 */
	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
	if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
		blk_mq_requeue_request(req, true);
}
#define NBD_COOKIE_BITS 32

static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);
	u32 tag = blk_mq_unique_tag(req);
	u64 cookie = cmd->cmd_cookie;

	return (cookie << NBD_COOKIE_BITS) | tag;
}

static u32 nbd_handle_to_tag(u64 handle)
{
	return (u32)handle;
}

static u32 nbd_handle_to_cookie(u64 handle)
{
	return (u32)(handle >> NBD_COOKIE_BITS);
}
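/*
 * The 64-bit handle sent on the wire packs the per-command reconnect cookie in
 * the upper NBD_COOKIE_BITS and the blk-mq unique tag in the lower 32 bits, so
 * a reply can be matched both to its request and to the socket generation it
 * was sent on.
 */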
static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case NBD_CMD_DISC: return "disconnect";
	case NBD_CMD_FLUSH: return "flush";
	case NBD_CMD_TRIM: return "trim/discard";
	}
	return "invalid";
}
static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nbd_device *nbd = disk->private_data;

	return sprintf(buf, "%d\n", nbd->pid);
}

static const struct device_attribute pid_attr = {
	.attr = { .name = "pid", .mode = 0444},
	.show = pid_show,
};
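/*
 * The attribute above shows up in sysfs as the "pid" file of the nbd disk
 * device and reports the nbd-client process currently driving the device.
 */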
static ssize_t backend_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nbd_device *nbd = disk->private_data;

	return sprintf(buf, "%s\n", nbd->backend ?: "");
}

static const struct device_attribute backend_attr = {
	.attr = { .name = "backend", .mode = 0444},
	.show = backend_show,
};
static void nbd_dev_remove(struct nbd_device *nbd)
{
	struct gendisk *disk = nbd->disk;

	blk_mq_free_tag_set(&nbd->tag_set);

	/*
	 * Remove from idr after del_gendisk() completes, so if the same ID is
	 * reused, the following add_disk() will succeed.
	 */
	mutex_lock(&nbd_index_mutex);
	idr_remove(&nbd_index_idr, nbd->index);
	mutex_unlock(&nbd_index_mutex);
	destroy_workqueue(nbd->recv_workq);
}

static void nbd_dev_remove_work(struct work_struct *work)
{
	nbd_dev_remove(container_of(work, struct nbd_device, remove_work));
}
static void nbd_put(struct nbd_device *nbd)
{
	if (!refcount_dec_and_test(&nbd->refs))
		return;

	/* Call del_gendisk() asynchronously to prevent deadlock */
	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
		queue_work(nbd_del_wq, &nbd->remove_work);
	else
		nbd_dev_remove(nbd);
}
static int nbd_disconnected(struct nbd_config *config)
{
	return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
}
static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
				int notify)
{
	if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
		struct link_dead_args *args;

		args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
		if (args) {
			INIT_WORK(&args->work, nbd_dead_link_work);
			args->index = nbd->index;
			queue_work(system_wq, &args->work);
		}
	}
	if (!nsock->dead) {
		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
					       &nbd->config->runtime_flags)) {
				set_bit(NBD_RT_DISCONNECTED,
					&nbd->config->runtime_flags);
				dev_info(nbd_to_dev(nbd),
					"Disconnected due to user request.\n");
			}
		}
	}
	nsock->dead = true;
	nsock->pending = NULL;
	nsock->sent = 0;
}
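/*
 * Callers of nbd_mark_nsock_dead() in this file take nsock->tx_lock around the
 * call (see sock_shutdown(), the timeout handler and recv_work()), so socket
 * state transitions stay serialized with in-flight transmissions.
 */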
static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
		loff_t blksize)
{
	struct queue_limits lim;
	int error;

	if (!blksize)
		blksize = 1u << NBD_DEF_BLKSIZE_BITS;

	if (blk_validate_block_size(blksize))
		return -EINVAL;

	nbd->config->bytesize = bytesize;
	nbd->config->blksize_bits = __ffs(blksize);

	lim = queue_limits_start_update(nbd->disk->queue);
	if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
		lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT;
	else
		lim.max_hw_discard_sectors = 0;
	if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) {
		lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
	} else if (nbd->config->flags & NBD_FLAG_SEND_FUA) {
		lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
	} else {
		lim.features |= BLK_FEAT_WRITE_CACHE;
		lim.features &= ~BLK_FEAT_FUA;
	}
	if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
		lim.features |= BLK_FEAT_ROTATIONAL;
	if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES)
		lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT;
	lim.logical_block_size = blksize;
	lim.physical_block_size = blksize;
	error = queue_limits_commit_update(nbd->disk->queue, &lim);
	if (error)
		return error;

	set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
	if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
	return 0;
}
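/*
 * The NBD_FLAG_* bits consumed above are the transmission flags advertised by
 * the server; they are translated here into block-layer queue limits and
 * features, so a server that does not advertise NBD_FLAG_SEND_FLUSH exposes
 * neither a write cache nor FUA to the upper layers.
 */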
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
		loff_t blksize)
{
	int error;

	blk_mq_freeze_queue(nbd->disk->queue);
	error = __nbd_set_size(nbd, bytesize, blksize);
	blk_mq_unfreeze_queue(nbd->disk->queue);

	return error;
}
static void nbd_complete_rq(struct request *req)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);

	dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
		cmd->status ? "failed" : "done");

	blk_mq_end_request(req, cmd->status);
}
/*
 * Forcibly shutdown the socket causing all listeners to error
 */
static void sock_shutdown(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;
	int i;

	if (config->num_connections == 0)
		return;
	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
		return;

	for (i = 0; i < config->num_connections; i++) {
		struct nbd_sock *nsock = config->socks[i];
		mutex_lock(&nsock->tx_lock);
		nbd_mark_nsock_dead(nbd, nsock, 0);
		mutex_unlock(&nsock->tx_lock);
	}
	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
}
static u32 req_to_nbd_cmd_type(struct request *req)
{
	switch (req_op(req)) {
	case REQ_OP_FLUSH:
		return NBD_CMD_FLUSH;
	case REQ_OP_WRITE:
		return NBD_CMD_WRITE;
	case REQ_OP_WRITE_ZEROES:
		return NBD_CMD_WRITE_ZEROES;
	default:
		return U32_MAX;
	}
}
static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd)
{
	if (refcount_inc_not_zero(&nbd->config_refs)) {
		/*
		 * Add smp_mb__after_atomic to ensure that reading nbd->config_refs
		 * and reading nbd->config is ordered. The pair is the barrier in
		 * nbd_alloc_and_init_config(), avoiding the case where
		 * nbd->config_refs becomes visible before nbd->config.
		 */
		smp_mb__after_atomic();
		return nbd->config;
	}

	return NULL;
}
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
	struct nbd_device *nbd = cmd->nbd;
	struct nbd_config *config;

	if (!mutex_trylock(&cmd->lock))
		return BLK_EH_RESET_TIMER;

	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
		mutex_unlock(&cmd->lock);
		return BLK_EH_DONE;
	}

	config = nbd_get_config_unlocked(nbd);
	if (!config) {
		cmd->status = BLK_STS_TIMEOUT;
		__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
		mutex_unlock(&cmd->lock);
		goto done;
	}

	if (config->num_connections > 1 ||
	    (config->num_connections == 1 && nbd->tag_set.timeout)) {
		dev_err_ratelimited(nbd_to_dev(nbd),
				    "Connection timed out, retrying (%d/%d alive)\n",
				    atomic_read(&config->live_connections),
				    config->num_connections);
		/*
		 * Hooray we have more connections, requeue this IO, the submit
		 * path will put it on a real connection. Or if only one
		 * connection is configured, the submit path will wait until
		 * a new connection is reconfigured or until the dead timeout.
		 */
		if (cmd->index < config->num_connections) {
			struct nbd_sock *nsock =
				config->socks[cmd->index];
			mutex_lock(&nsock->tx_lock);
			/* We can have multiple outstanding requests, so
			 * we don't want to mark the nsock dead if we've
			 * already reconnected with a new socket, so
			 * only mark it dead if it's the same socket we
			 * were sent out on.
			 */
			if (cmd->cookie == nsock->cookie)
				nbd_mark_nsock_dead(nbd, nsock, 1);
			mutex_unlock(&nsock->tx_lock);
		}
		nbd_requeue_cmd(cmd);
		mutex_unlock(&cmd->lock);
		return BLK_EH_DONE;
	}

	if (!nbd->tag_set.timeout) {
		/*
		 * Userspace sets timeout=0 to disable socket disconnection,
		 * so just warn and reset the timer.
		 */
		struct nbd_sock *nsock = config->socks[cmd->index];

		dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
			(unsigned long long)blk_rq_pos(req) << 9,
			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);

		mutex_lock(&nsock->tx_lock);
		if (cmd->cookie != nsock->cookie) {
			nbd_requeue_cmd(cmd);
			mutex_unlock(&nsock->tx_lock);
			mutex_unlock(&cmd->lock);
			return BLK_EH_DONE;
		}
		mutex_unlock(&nsock->tx_lock);
		mutex_unlock(&cmd->lock);
		return BLK_EH_RESET_TIMER;
	}

	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
	cmd->status = BLK_STS_IOERR;
	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
	mutex_unlock(&cmd->lock);
done:
	blk_mq_complete_request(req);
	return BLK_EH_DONE;
}
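/*
 * Timeout policy summary: with more than one connection (or a single
 * connection and a user-set timeout) the request is requeued onto another
 * socket; with timeout=0 the driver only warns and resets the timer;
 * otherwise the device is marked NBD_RT_TIMEDOUT and the request completes
 * with an I/O error.
 */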
static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
		       struct iov_iter *iter, int msg_flags, int *sent)
{
	int result;
	struct msghdr msg = {};
	unsigned int noreclaim_flag;

	if (unlikely(!sock)) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
		return -EINVAL;
	}

	msg.msg_iter = *iter;

	noreclaim_flag = memalloc_noreclaim_save();
	do {
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
		sock->sk->sk_use_task_frag = false;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = sock_sendmsg(sock, &msg);
		else
			result = sock_recvmsg(sock, &msg, msg.msg_flags);

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		if (sent)
			*sent += result;
	} while (msg_data_left(&msg));

	memalloc_noreclaim_restore(noreclaim_flag);

	return result;
}
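/*
 * The memalloc_noreclaim_save()/__GFP_MEMALLOC handling above keeps socket
 * allocations on this path from recursing into memory reclaim, which could
 * otherwise deadlock when the connection itself is what writeback is waiting
 * on (see the swap warning at the top of this file).
 */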
/*
 * Send or receive packet. Return a positive value on success and
 * negative value on failure, and never return 0.
 */
static int sock_xmit(struct nbd_device *nbd, int index, int send,
		     struct iov_iter *iter, int msg_flags, int *sent)
{
	struct nbd_config *config = nbd->config;
	struct socket *sock = config->socks[index]->sock;

	return __sock_xmit(nbd, sock, send, iter, msg_flags, sent);
}
/*
 * Different settings for sk->sk_sndtimeo can result in different return values
 * if there is a signal pending when we enter sendmsg, because reasons?
 */
static inline int was_interrupted(int result)
{
	return result == -ERESTARTSYS || result == -EINTR;
}
/*
 * Returns BLK_STS_RESOURCE if the caller should retry after a delay.
 * Returns BLK_STS_IOERR if sending failed.
 */
static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
				 int index)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);
	struct nbd_config *config = nbd->config;
	struct nbd_sock *nsock = config->socks[index];
	int result;
	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
	struct iov_iter from;
	struct bio *bio;
	u64 handle;
	u32 type;
	u32 nbd_cmd_flags = 0;
	int sent = nsock->sent, skip = 0;

	lockdep_assert_held(&cmd->lock);
	lockdep_assert_held(&nsock->tx_lock);

	iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));

	type = req_to_nbd_cmd_type(req);
	if (type == U32_MAX)
		return BLK_STS_IOERR;

	if (rq_data_dir(req) == WRITE &&
	    (config->flags & NBD_FLAG_READ_ONLY)) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Write on read-only\n");
		return BLK_STS_IOERR;
	}

	if (req->cmd_flags & REQ_FUA)
		nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
	if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES))
		nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;

	/* We did a partial send previously, and we at least sent the whole
	 * request struct, so just go and send the rest of the pages in the
	 * request.
	 */
	if (sent) {
		if (sent >= sizeof(request)) {
			skip = sent - sizeof(request);

			/* initialize handle for tracing purposes */
			handle = nbd_cmd_handle(cmd);

			goto send_pages;
		}
		iov_iter_advance(&from, sent);
	}
	cmd->index = index;
	cmd->cookie = nsock->cookie;
	request.type = htonl(type | nbd_cmd_flags);
	if (type != NBD_CMD_FLUSH) {
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(blk_rq_bytes(req));
	}
	handle = nbd_cmd_handle(cmd);
	request.cookie = cpu_to_be64(handle);

	trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));

	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
		req, nbdcmd_to_ascii(type),
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
	result = sock_xmit(nbd, index, 1, &from,
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
	trace_nbd_header_sent(req, handle);
	if (result < 0) {
		if (was_interrupted(result)) {
			/* If we haven't sent anything we can just return BUSY,
			 * however if we have sent something we need to make
			 * sure we only allow this req to be sent until we are
			 * completely done.
			 */
			if (sent) {
				nsock->pending = req;
				nsock->sent = sent;
			}
			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
			return BLK_STS_RESOURCE;
		}
		dev_err_ratelimited(disk_to_dev(nbd->disk),
			"Send control failed (result %d)\n", result);
		goto requeue;
	}
send_pages:
	if (type != NBD_CMD_WRITE)
		goto out;

	bio = req->bio;
	while (bio) {
		struct bio *next = bio->bi_next;
		struct bvec_iter iter;
		struct bio_vec bvec;

		bio_for_each_segment(bvec, bio, iter) {
			bool is_last = !next && bio_iter_last(bvec, iter);
			int flags = is_last ? 0 : MSG_MORE;

			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
			iov_iter_bvec(&from, ITER_SOURCE, &bvec, 1, bvec.bv_len);
			if (skip) {
				if (skip >= iov_iter_count(&from)) {
					skip -= iov_iter_count(&from);
					continue;
				}
				iov_iter_advance(&from, skip);
				skip = 0;
			}
			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
			if (result < 0) {
				if (was_interrupted(result)) {
					/* We've already sent the header, we
					 * have no choice but to set pending and
					 * return BUSY.
					 */
					nsock->pending = req;
					nsock->sent = sent;
					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
					return BLK_STS_RESOURCE;
				}
				dev_err(disk_to_dev(nbd->disk),
					"Send data failed (result %d)\n",
					result);
				goto requeue;
			}
			/*
			 * The completion might already have come in,
			 * so break for the last one instead of letting
			 * the iterator do it. This prevents use-after-free
			 * of the bio.
			 */
			if (is_last)
				break;
		}
		bio = next;
	}
out:
	trace_nbd_payload_sent(req, handle);
	nsock->pending = NULL;
	nsock->sent = 0;
	__set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
	return BLK_STS_OK;

requeue:
	/* retry on a different socket */
	dev_err_ratelimited(disk_to_dev(nbd->disk),
			    "Request send failed, requeueing\n");
	nbd_mark_nsock_dead(nbd, nsock, 1);
	nbd_requeue_cmd(cmd);
	return BLK_STS_OK;
}
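/*
 * Partial sends are resumed rather than restarted: nsock->pending remembers
 * the interrupted request and nsock->sent how many bytes already went out,
 * and nbd_handle_cmd() refuses to send anything else on that socket until the
 * pending request has been pushed out completely.
 */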
781 static int nbd_read_reply(struct nbd_device
*nbd
, struct socket
*sock
,
782 struct nbd_reply
*reply
)
784 struct kvec iov
= {.iov_base
= reply
, .iov_len
= sizeof(*reply
)};
789 iov_iter_kvec(&to
, ITER_DEST
, &iov
, 1, sizeof(*reply
));
790 result
= __sock_xmit(nbd
, sock
, 0, &to
, MSG_WAITALL
, NULL
);
792 if (!nbd_disconnected(nbd
->config
))
793 dev_err(disk_to_dev(nbd
->disk
),
794 "Receive control failed (result %d)\n", result
);
798 if (ntohl(reply
->magic
) != NBD_REPLY_MAGIC
) {
799 dev_err(disk_to_dev(nbd
->disk
), "Wrong magic (0x%lx)\n",
800 (unsigned long)ntohl(reply
->magic
));
807 /* NULL returned = something went wrong, inform userspace */
808 static struct nbd_cmd
*nbd_handle_reply(struct nbd_device
*nbd
, int index
,
809 struct nbd_reply
*reply
)
813 struct request
*req
= NULL
;
819 handle
= be64_to_cpu(reply
->cookie
);
820 tag
= nbd_handle_to_tag(handle
);
821 hwq
= blk_mq_unique_tag_to_hwq(tag
);
822 if (hwq
< nbd
->tag_set
.nr_hw_queues
)
823 req
= blk_mq_tag_to_rq(nbd
->tag_set
.tags
[hwq
],
824 blk_mq_unique_tag_to_tag(tag
));
825 if (!req
|| !blk_mq_request_started(req
)) {
826 dev_err(disk_to_dev(nbd
->disk
), "Unexpected reply (%d) %p\n",
828 return ERR_PTR(-ENOENT
);
830 trace_nbd_header_received(req
, handle
);
831 cmd
= blk_mq_rq_to_pdu(req
);
833 mutex_lock(&cmd
->lock
);
834 if (!test_bit(NBD_CMD_INFLIGHT
, &cmd
->flags
)) {
835 dev_err(disk_to_dev(nbd
->disk
), "Suspicious reply %d (status %u flags %lu)",
836 tag
, cmd
->status
, cmd
->flags
);
840 if (cmd
->index
!= index
) {
841 dev_err(disk_to_dev(nbd
->disk
), "Unexpected reply %d from different sock %d (expected %d)",
842 tag
, index
, cmd
->index
);
846 if (cmd
->cmd_cookie
!= nbd_handle_to_cookie(handle
)) {
847 dev_err(disk_to_dev(nbd
->disk
), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
848 req
, cmd
->cmd_cookie
, nbd_handle_to_cookie(handle
));
852 if (cmd
->status
!= BLK_STS_OK
) {
853 dev_err(disk_to_dev(nbd
->disk
), "Command already handled %p\n",
858 if (test_bit(NBD_CMD_REQUEUED
, &cmd
->flags
)) {
859 dev_err(disk_to_dev(nbd
->disk
), "Raced with timeout on req %p\n",
864 if (ntohl(reply
->error
)) {
865 dev_err(disk_to_dev(nbd
->disk
), "Other side returned error (%d)\n",
866 ntohl(reply
->error
));
867 cmd
->status
= BLK_STS_IOERR
;
871 dev_dbg(nbd_to_dev(nbd
), "request %p: got reply\n", req
);
872 if (rq_data_dir(req
) != WRITE
) {
873 struct req_iterator iter
;
877 rq_for_each_segment(bvec
, req
, iter
) {
878 iov_iter_bvec(&to
, ITER_DEST
, &bvec
, 1, bvec
.bv_len
);
879 result
= sock_xmit(nbd
, index
, 0, &to
, MSG_WAITALL
, NULL
);
881 dev_err(disk_to_dev(nbd
->disk
), "Receive data failed (result %d)\n",
884 * If we've disconnected, we need to make sure we
885 * complete this request, otherwise error out
886 * and let the timeout stuff handle resubmitting
887 * this request onto another connection.
889 if (nbd_disconnected(nbd
->config
)) {
890 cmd
->status
= BLK_STS_IOERR
;
896 dev_dbg(nbd_to_dev(nbd
), "request %p: got %d bytes data\n",
901 trace_nbd_payload_received(req
, handle
);
902 mutex_unlock(&cmd
->lock
);
903 return ret
? ERR_PTR(ret
) : cmd
;
906 static void recv_work(struct work_struct
*work
)
908 struct recv_thread_args
*args
= container_of(work
,
909 struct recv_thread_args
,
911 struct nbd_device
*nbd
= args
->nbd
;
912 struct nbd_config
*config
= nbd
->config
;
913 struct request_queue
*q
= nbd
->disk
->queue
;
914 struct nbd_sock
*nsock
= args
->nsock
;
919 struct nbd_reply reply
;
921 if (nbd_read_reply(nbd
, nsock
->sock
, &reply
))
925 * Grab .q_usage_counter so request pool won't go away, then no
926 * request use-after-free is possible during nbd_handle_reply().
927 * If queue is frozen, there won't be any inflight requests, we
928 * needn't to handle the incoming garbage message.
930 if (!percpu_ref_tryget(&q
->q_usage_counter
)) {
931 dev_err(disk_to_dev(nbd
->disk
), "%s: no io inflight\n",
936 cmd
= nbd_handle_reply(nbd
, args
->index
, &reply
);
938 percpu_ref_put(&q
->q_usage_counter
);
942 rq
= blk_mq_rq_from_pdu(cmd
);
943 if (likely(!blk_should_fake_timeout(rq
->q
))) {
946 mutex_lock(&cmd
->lock
);
947 complete
= __test_and_clear_bit(NBD_CMD_INFLIGHT
,
949 mutex_unlock(&cmd
->lock
);
951 blk_mq_complete_request(rq
);
953 percpu_ref_put(&q
->q_usage_counter
);
956 mutex_lock(&nsock
->tx_lock
);
957 nbd_mark_nsock_dead(nbd
, nsock
, 1);
958 mutex_unlock(&nsock
->tx_lock
);
961 atomic_dec(&config
->recv_threads
);
962 wake_up(&config
->recv_wq
);
966 static bool nbd_clear_req(struct request
*req
, void *data
)
968 struct nbd_cmd
*cmd
= blk_mq_rq_to_pdu(req
);
970 /* don't abort one completed request */
971 if (blk_mq_request_completed(req
))
974 mutex_lock(&cmd
->lock
);
975 if (!__test_and_clear_bit(NBD_CMD_INFLIGHT
, &cmd
->flags
)) {
976 mutex_unlock(&cmd
->lock
);
979 cmd
->status
= BLK_STS_IOERR
;
980 mutex_unlock(&cmd
->lock
);
982 blk_mq_complete_request(req
);
986 static void nbd_clear_que(struct nbd_device
*nbd
)
988 blk_mq_quiesce_queue(nbd
->disk
->queue
);
989 blk_mq_tagset_busy_iter(&nbd
->tag_set
, nbd_clear_req
, NULL
);
990 blk_mq_unquiesce_queue(nbd
->disk
->queue
);
991 dev_dbg(disk_to_dev(nbd
->disk
), "queue cleared\n");
994 static int find_fallback(struct nbd_device
*nbd
, int index
)
996 struct nbd_config
*config
= nbd
->config
;
998 struct nbd_sock
*nsock
= config
->socks
[index
];
999 int fallback
= nsock
->fallback_index
;
1001 if (test_bit(NBD_RT_DISCONNECTED
, &config
->runtime_flags
))
1004 if (config
->num_connections
<= 1) {
1005 dev_err_ratelimited(disk_to_dev(nbd
->disk
),
1006 "Dead connection, failed to find a fallback\n");
1010 if (fallback
>= 0 && fallback
< config
->num_connections
&&
1011 !config
->socks
[fallback
]->dead
)
1014 if (nsock
->fallback_index
< 0 ||
1015 nsock
->fallback_index
>= config
->num_connections
||
1016 config
->socks
[nsock
->fallback_index
]->dead
) {
1018 for (i
= 0; i
< config
->num_connections
; i
++) {
1021 if (!config
->socks
[i
]->dead
) {
1026 nsock
->fallback_index
= new_index
;
1027 if (new_index
< 0) {
1028 dev_err_ratelimited(disk_to_dev(nbd
->disk
),
1029 "Dead connection, failed to find a fallback\n");
1033 new_index
= nsock
->fallback_index
;
1037 static int wait_for_reconnect(struct nbd_device
*nbd
)
1039 struct nbd_config
*config
= nbd
->config
;
1040 if (!config
->dead_conn_timeout
)
1043 if (!wait_event_timeout(config
->conn_wait
,
1044 test_bit(NBD_RT_DISCONNECTED
,
1045 &config
->runtime_flags
) ||
1046 atomic_read(&config
->live_connections
) > 0,
1047 config
->dead_conn_timeout
))
1050 return !test_bit(NBD_RT_DISCONNECTED
, &config
->runtime_flags
);
1053 static blk_status_t
nbd_handle_cmd(struct nbd_cmd
*cmd
, int index
)
1055 struct request
*req
= blk_mq_rq_from_pdu(cmd
);
1056 struct nbd_device
*nbd
= cmd
->nbd
;
1057 struct nbd_config
*config
;
1058 struct nbd_sock
*nsock
;
1061 lockdep_assert_held(&cmd
->lock
);
1063 config
= nbd_get_config_unlocked(nbd
);
1065 dev_err_ratelimited(disk_to_dev(nbd
->disk
),
1066 "Socks array is empty\n");
1067 return BLK_STS_IOERR
;
1070 if (index
>= config
->num_connections
) {
1071 dev_err_ratelimited(disk_to_dev(nbd
->disk
),
1072 "Attempted send on invalid socket\n");
1073 nbd_config_put(nbd
);
1074 return BLK_STS_IOERR
;
1076 cmd
->status
= BLK_STS_OK
;
1078 nsock
= config
->socks
[index
];
1079 mutex_lock(&nsock
->tx_lock
);
1081 int old_index
= index
;
1082 index
= find_fallback(nbd
, index
);
1083 mutex_unlock(&nsock
->tx_lock
);
1085 if (wait_for_reconnect(nbd
)) {
1089 /* All the sockets should already be down at this point,
1090 * we just want to make sure that DISCONNECTED is set so
1091 * any requests that come in that were queue'ed waiting
1092 * for the reconnect timer don't trigger the timer again
1093 * and instead just error out.
1096 nbd_config_put(nbd
);
1097 return BLK_STS_IOERR
;
1102 /* Handle the case that we have a pending request that was partially
1103 * transmitted that _has_ to be serviced first. We need to call requeue
1104 * here so that it gets put _after_ the request that is already on the
1107 blk_mq_start_request(req
);
1108 if (unlikely(nsock
->pending
&& nsock
->pending
!= req
)) {
1109 nbd_requeue_cmd(cmd
);
1113 ret
= nbd_send_cmd(nbd
, cmd
, index
);
1115 mutex_unlock(&nsock
->tx_lock
);
1116 nbd_config_put(nbd
);
1120 static blk_status_t
nbd_queue_rq(struct blk_mq_hw_ctx
*hctx
,
1121 const struct blk_mq_queue_data
*bd
)
1123 struct nbd_cmd
*cmd
= blk_mq_rq_to_pdu(bd
->rq
);
1127 * Since we look at the bio's to send the request over the network we
1128 * need to make sure the completion work doesn't mark this request done
1129 * before we are done doing our send. This keeps us from dereferencing
1130 * freed data if we have particularly fast completions (ie we get the
1131 * completion before we exit sock_xmit on the last bvec) or in the case
1132 * that the server is misbehaving (or there was an error) before we're
1133 * done sending everything over the wire.
1135 mutex_lock(&cmd
->lock
);
1136 clear_bit(NBD_CMD_REQUEUED
, &cmd
->flags
);
1138 /* We can be called directly from the user space process, which means we
1139 * could possibly have signals pending so our sendmsg will fail. In
1140 * this case we need to return that we are busy, otherwise error out as
1143 ret
= nbd_handle_cmd(cmd
, hctx
->queue_num
);
1144 mutex_unlock(&cmd
->lock
);
1149 static struct socket
*nbd_get_socket(struct nbd_device
*nbd
, unsigned long fd
,
1152 struct socket
*sock
;
1155 sock
= sockfd_lookup(fd
, err
);
1159 if (sock
->ops
->shutdown
== sock_no_shutdown
) {
1160 dev_err(disk_to_dev(nbd
->disk
), "Unsupported socket: shutdown callout must be supported.\n");
1169 static int nbd_add_socket(struct nbd_device
*nbd
, unsigned long arg
,
1172 struct nbd_config
*config
= nbd
->config
;
1173 struct socket
*sock
;
1174 struct nbd_sock
**socks
;
1175 struct nbd_sock
*nsock
;
1178 /* Arg will be cast to int, check it to avoid overflow */
1181 sock
= nbd_get_socket(nbd
, arg
, &err
);
1186 * We need to make sure we don't get any errant requests while we're
1187 * reallocating the ->socks array.
1189 blk_mq_freeze_queue(nbd
->disk
->queue
);
1191 if (!netlink
&& !nbd
->task_setup
&&
1192 !test_bit(NBD_RT_BOUND
, &config
->runtime_flags
))
1193 nbd
->task_setup
= current
;
1196 (nbd
->task_setup
!= current
||
1197 test_bit(NBD_RT_BOUND
, &config
->runtime_flags
))) {
1198 dev_err(disk_to_dev(nbd
->disk
),
1199 "Device being setup by another task");
1204 nsock
= kzalloc(sizeof(*nsock
), GFP_KERNEL
);
1210 socks
= krealloc(config
->socks
, (config
->num_connections
+ 1) *
1211 sizeof(struct nbd_sock
*), GFP_KERNEL
);
1218 config
->socks
= socks
;
1220 nsock
->fallback_index
= -1;
1221 nsock
->dead
= false;
1222 mutex_init(&nsock
->tx_lock
);
1224 nsock
->pending
= NULL
;
1227 socks
[config
->num_connections
++] = nsock
;
1228 atomic_inc(&config
->live_connections
);
1229 blk_mq_unfreeze_queue(nbd
->disk
->queue
);
1234 blk_mq_unfreeze_queue(nbd
->disk
->queue
);
1239 static int nbd_reconnect_socket(struct nbd_device
*nbd
, unsigned long arg
)
1241 struct nbd_config
*config
= nbd
->config
;
1242 struct socket
*sock
, *old
;
1243 struct recv_thread_args
*args
;
1247 sock
= nbd_get_socket(nbd
, arg
, &err
);
1251 args
= kzalloc(sizeof(*args
), GFP_KERNEL
);
1257 for (i
= 0; i
< config
->num_connections
; i
++) {
1258 struct nbd_sock
*nsock
= config
->socks
[i
];
1263 mutex_lock(&nsock
->tx_lock
);
1265 mutex_unlock(&nsock
->tx_lock
);
1268 sk_set_memalloc(sock
->sk
);
1269 if (nbd
->tag_set
.timeout
)
1270 sock
->sk
->sk_sndtimeo
= nbd
->tag_set
.timeout
;
1271 atomic_inc(&config
->recv_threads
);
1272 refcount_inc(&nbd
->config_refs
);
1274 nsock
->fallback_index
= -1;
1276 nsock
->dead
= false;
1277 INIT_WORK(&args
->work
, recv_work
);
1280 args
->nsock
= nsock
;
1282 mutex_unlock(&nsock
->tx_lock
);
1285 clear_bit(NBD_RT_DISCONNECTED
, &config
->runtime_flags
);
1287 /* We take the tx_mutex in an error path in the recv_work, so we
1288 * need to queue_work outside of the tx_mutex.
1290 queue_work(nbd
->recv_workq
, &args
->work
);
1292 atomic_inc(&config
->live_connections
);
1293 wake_up(&config
->conn_wait
);
1301 static void nbd_bdev_reset(struct nbd_device
*nbd
)
1303 if (disk_openers(nbd
->disk
) > 1)
1305 set_capacity(nbd
->disk
, 0);
1308 static void nbd_parse_flags(struct nbd_device
*nbd
)
1310 if (nbd
->config
->flags
& NBD_FLAG_READ_ONLY
)
1311 set_disk_ro(nbd
->disk
, true);
1313 set_disk_ro(nbd
->disk
, false);
1316 static void send_disconnects(struct nbd_device
*nbd
)
1318 struct nbd_config
*config
= nbd
->config
;
1319 struct nbd_request request
= {
1320 .magic
= htonl(NBD_REQUEST_MAGIC
),
1321 .type
= htonl(NBD_CMD_DISC
),
1323 struct kvec iov
= {.iov_base
= &request
, .iov_len
= sizeof(request
)};
1324 struct iov_iter from
;
1327 for (i
= 0; i
< config
->num_connections
; i
++) {
1328 struct nbd_sock
*nsock
= config
->socks
[i
];
1330 iov_iter_kvec(&from
, ITER_SOURCE
, &iov
, 1, sizeof(request
));
1331 mutex_lock(&nsock
->tx_lock
);
1332 ret
= sock_xmit(nbd
, i
, 1, &from
, 0, NULL
);
1334 dev_err(disk_to_dev(nbd
->disk
),
1335 "Send disconnect failed %d\n", ret
);
1336 mutex_unlock(&nsock
->tx_lock
);
1340 static int nbd_disconnect(struct nbd_device
*nbd
)
1342 struct nbd_config
*config
= nbd
->config
;
1344 dev_info(disk_to_dev(nbd
->disk
), "NBD_DISCONNECT\n");
1345 set_bit(NBD_RT_DISCONNECT_REQUESTED
, &config
->runtime_flags
);
1346 set_bit(NBD_DISCONNECT_REQUESTED
, &nbd
->flags
);
1347 send_disconnects(nbd
);
1351 static void nbd_clear_sock(struct nbd_device
*nbd
)
1355 nbd
->task_setup
= NULL
;
1358 static void nbd_config_put(struct nbd_device
*nbd
)
1360 if (refcount_dec_and_mutex_lock(&nbd
->config_refs
,
1361 &nbd
->config_lock
)) {
1362 struct nbd_config
*config
= nbd
->config
;
1363 nbd_dev_dbg_close(nbd
);
1364 invalidate_disk(nbd
->disk
);
1365 if (nbd
->config
->bytesize
)
1366 kobject_uevent(&nbd_to_dev(nbd
)->kobj
, KOBJ_CHANGE
);
1367 if (test_and_clear_bit(NBD_RT_HAS_PID_FILE
,
1368 &config
->runtime_flags
))
1369 device_remove_file(disk_to_dev(nbd
->disk
), &pid_attr
);
1371 if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE
,
1372 &config
->runtime_flags
)) {
1373 device_remove_file(disk_to_dev(nbd
->disk
), &backend_attr
);
1374 kfree(nbd
->backend
);
1375 nbd
->backend
= NULL
;
1377 nbd_clear_sock(nbd
);
1378 if (config
->num_connections
) {
1380 for (i
= 0; i
< config
->num_connections
; i
++) {
1381 sockfd_put(config
->socks
[i
]->sock
);
1382 kfree(config
->socks
[i
]);
1384 kfree(config
->socks
);
1389 nbd
->tag_set
.timeout
= 0;
1391 mutex_unlock(&nbd
->config_lock
);
1393 module_put(THIS_MODULE
);
1397 static int nbd_start_device(struct nbd_device
*nbd
)
1399 struct nbd_config
*config
= nbd
->config
;
1400 int num_connections
= config
->num_connections
;
1407 if (num_connections
> 1 &&
1408 !(config
->flags
& NBD_FLAG_CAN_MULTI_CONN
)) {
1409 dev_err(disk_to_dev(nbd
->disk
), "server does not support multiple connections per device.\n");
1413 blk_mq_update_nr_hw_queues(&nbd
->tag_set
, config
->num_connections
);
1414 nbd
->pid
= task_pid_nr(current
);
1416 nbd_parse_flags(nbd
);
1418 error
= device_create_file(disk_to_dev(nbd
->disk
), &pid_attr
);
1420 dev_err(disk_to_dev(nbd
->disk
), "device_create_file failed for pid!\n");
1423 set_bit(NBD_RT_HAS_PID_FILE
, &config
->runtime_flags
);
1425 nbd_dev_dbg_init(nbd
);
1426 for (i
= 0; i
< num_connections
; i
++) {
1427 struct recv_thread_args
*args
;
1429 args
= kzalloc(sizeof(*args
), GFP_KERNEL
);
1433 * If num_connections is m (2 < m),
1434 * and NO.1 ~ NO.n(1 < n < m) kzallocs are successful.
1435 * But NO.(n + 1) failed. We still have n recv threads.
1436 * So, add flush_workqueue here to prevent recv threads
1437 * dropping the last config_refs and trying to destroy
1438 * the workqueue from inside the workqueue.
1441 flush_workqueue(nbd
->recv_workq
);
1444 sk_set_memalloc(config
->socks
[i
]->sock
->sk
);
1445 if (nbd
->tag_set
.timeout
)
1446 config
->socks
[i
]->sock
->sk
->sk_sndtimeo
=
1447 nbd
->tag_set
.timeout
;
1448 atomic_inc(&config
->recv_threads
);
1449 refcount_inc(&nbd
->config_refs
);
1450 INIT_WORK(&args
->work
, recv_work
);
1452 args
->nsock
= config
->socks
[i
];
1454 queue_work(nbd
->recv_workq
, &args
->work
);
1456 return nbd_set_size(nbd
, config
->bytesize
, nbd_blksize(config
));
1459 static int nbd_start_device_ioctl(struct nbd_device
*nbd
)
1461 struct nbd_config
*config
= nbd
->config
;
1464 ret
= nbd_start_device(nbd
);
1469 set_bit(GD_NEED_PART_SCAN
, &nbd
->disk
->state
);
1470 mutex_unlock(&nbd
->config_lock
);
1471 ret
= wait_event_interruptible(config
->recv_wq
,
1472 atomic_read(&config
->recv_threads
) == 0);
1478 flush_workqueue(nbd
->recv_workq
);
1479 mutex_lock(&nbd
->config_lock
);
1480 nbd_bdev_reset(nbd
);
1481 /* user requested, ignore socket errors */
1482 if (test_bit(NBD_RT_DISCONNECT_REQUESTED
, &config
->runtime_flags
))
1484 if (test_bit(NBD_RT_TIMEDOUT
, &config
->runtime_flags
))
1489 static void nbd_clear_sock_ioctl(struct nbd_device
*nbd
)
1491 nbd_clear_sock(nbd
);
1492 disk_force_media_change(nbd
->disk
);
1493 nbd_bdev_reset(nbd
);
1494 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF
,
1495 &nbd
->config
->runtime_flags
))
1496 nbd_config_put(nbd
);
1499 static void nbd_set_cmd_timeout(struct nbd_device
*nbd
, u64 timeout
)
1501 nbd
->tag_set
.timeout
= timeout
* HZ
;
1503 blk_queue_rq_timeout(nbd
->disk
->queue
, timeout
* HZ
);
1505 blk_queue_rq_timeout(nbd
->disk
->queue
, 30 * HZ
);
1508 /* Must be called with config_lock held */
1509 static int __nbd_ioctl(struct block_device
*bdev
, struct nbd_device
*nbd
,
1510 unsigned int cmd
, unsigned long arg
)
1512 struct nbd_config
*config
= nbd
->config
;
1516 case NBD_DISCONNECT
:
1517 return nbd_disconnect(nbd
);
1518 case NBD_CLEAR_SOCK
:
1519 nbd_clear_sock_ioctl(nbd
);
1522 return nbd_add_socket(nbd
, arg
, false);
1523 case NBD_SET_BLKSIZE
:
1524 return nbd_set_size(nbd
, config
->bytesize
, arg
);
1526 return nbd_set_size(nbd
, arg
, nbd_blksize(config
));
1527 case NBD_SET_SIZE_BLOCKS
:
1528 if (check_shl_overflow(arg
, config
->blksize_bits
, &bytesize
))
1530 return nbd_set_size(nbd
, bytesize
, nbd_blksize(config
));
1531 case NBD_SET_TIMEOUT
:
1532 nbd_set_cmd_timeout(nbd
, arg
);
1536 config
->flags
= arg
;
1539 return nbd_start_device_ioctl(nbd
);
1542 * This is for compatibility only. The queue is always cleared
1543 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1546 case NBD_PRINT_DEBUG
:
1548 * For compatibility only, we no longer keep a list of
1549 * outstanding requests.
1556 static int nbd_ioctl(struct block_device
*bdev
, blk_mode_t mode
,
1557 unsigned int cmd
, unsigned long arg
)
1559 struct nbd_device
*nbd
= bdev
->bd_disk
->private_data
;
1560 struct nbd_config
*config
= nbd
->config
;
1561 int error
= -EINVAL
;
1563 if (!capable(CAP_SYS_ADMIN
))
1566 /* The block layer will pass back some non-nbd ioctls in case we have
1567 * special handling for them, but we don't so just return an error.
1569 if (_IOC_TYPE(cmd
) != 0xab)
1572 mutex_lock(&nbd
->config_lock
);
1574 /* Don't allow ioctl operations on a nbd device that was created with
1575 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1577 if (!test_bit(NBD_RT_BOUND
, &config
->runtime_flags
) ||
1578 (cmd
== NBD_DISCONNECT
|| cmd
== NBD_CLEAR_SOCK
))
1579 error
= __nbd_ioctl(bdev
, nbd
, cmd
, arg
);
1581 dev_err(nbd_to_dev(nbd
), "Cannot use ioctl interface on a netlink controlled device.\n");
1582 mutex_unlock(&nbd
->config_lock
);
1586 static int nbd_alloc_and_init_config(struct nbd_device
*nbd
)
1588 struct nbd_config
*config
;
1590 if (WARN_ON(nbd
->config
))
1593 if (!try_module_get(THIS_MODULE
))
1596 config
= kzalloc(sizeof(struct nbd_config
), GFP_NOFS
);
1598 module_put(THIS_MODULE
);
1602 atomic_set(&config
->recv_threads
, 0);
1603 init_waitqueue_head(&config
->recv_wq
);
1604 init_waitqueue_head(&config
->conn_wait
);
1605 config
->blksize_bits
= NBD_DEF_BLKSIZE_BITS
;
1606 atomic_set(&config
->live_connections
, 0);
1608 nbd
->config
= config
;
1610 * Order refcount_set(&nbd->config_refs, 1) and nbd->config assignment,
1611 * its pair is the barrier in nbd_get_config_unlocked().
1612 * So nbd_get_config_unlocked() won't see nbd->config as null after
1613 * refcount_inc_not_zero() succeed.
1615 smp_mb__before_atomic();
1616 refcount_set(&nbd
->config_refs
, 1);
1621 static int nbd_open(struct gendisk
*disk
, blk_mode_t mode
)
1623 struct nbd_device
*nbd
;
1624 struct nbd_config
*config
;
1627 mutex_lock(&nbd_index_mutex
);
1628 nbd
= disk
->private_data
;
1633 if (!refcount_inc_not_zero(&nbd
->refs
)) {
1638 config
= nbd_get_config_unlocked(nbd
);
1640 mutex_lock(&nbd
->config_lock
);
1641 if (refcount_inc_not_zero(&nbd
->config_refs
)) {
1642 mutex_unlock(&nbd
->config_lock
);
1645 ret
= nbd_alloc_and_init_config(nbd
);
1647 mutex_unlock(&nbd
->config_lock
);
1651 refcount_inc(&nbd
->refs
);
1652 mutex_unlock(&nbd
->config_lock
);
1654 set_bit(GD_NEED_PART_SCAN
, &disk
->state
);
1655 } else if (nbd_disconnected(config
)) {
1657 set_bit(GD_NEED_PART_SCAN
, &disk
->state
);
1660 mutex_unlock(&nbd_index_mutex
);
1664 static void nbd_release(struct gendisk
*disk
)
1666 struct nbd_device
*nbd
= disk
->private_data
;
1668 if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE
, &nbd
->config
->runtime_flags
) &&
1669 disk_openers(disk
) == 0)
1670 nbd_disconnect_and_put(nbd
);
1672 nbd_config_put(nbd
);
1676 static void nbd_free_disk(struct gendisk
*disk
)
1678 struct nbd_device
*nbd
= disk
->private_data
;
static const struct block_device_operations nbd_fops =
{
	.owner =	THIS_MODULE,
	.release =	nbd_release,
	.compat_ioctl =	nbd_ioctl,
	.free_disk =	nbd_free_disk,
};
1693 #if IS_ENABLED(CONFIG_DEBUG_FS)
1695 static int nbd_dbg_tasks_show(struct seq_file
*s
, void *unused
)
1697 struct nbd_device
*nbd
= s
->private;
1700 seq_printf(s
, "recv: %d\n", nbd
->pid
);
1705 DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks
);
1707 static int nbd_dbg_flags_show(struct seq_file
*s
, void *unused
)
1709 struct nbd_device
*nbd
= s
->private;
1710 u32 flags
= nbd
->config
->flags
;
1712 seq_printf(s
, "Hex: 0x%08x\n\n", flags
);
1714 seq_puts(s
, "Known flags:\n");
1716 if (flags
& NBD_FLAG_HAS_FLAGS
)
1717 seq_puts(s
, "NBD_FLAG_HAS_FLAGS\n");
1718 if (flags
& NBD_FLAG_READ_ONLY
)
1719 seq_puts(s
, "NBD_FLAG_READ_ONLY\n");
1720 if (flags
& NBD_FLAG_SEND_FLUSH
)
1721 seq_puts(s
, "NBD_FLAG_SEND_FLUSH\n");
1722 if (flags
& NBD_FLAG_SEND_FUA
)
1723 seq_puts(s
, "NBD_FLAG_SEND_FUA\n");
1724 if (flags
& NBD_FLAG_SEND_TRIM
)
1725 seq_puts(s
, "NBD_FLAG_SEND_TRIM\n");
1726 if (flags
& NBD_FLAG_SEND_WRITE_ZEROES
)
1727 seq_puts(s
, "NBD_FLAG_SEND_WRITE_ZEROES\n");
1728 if (flags
& NBD_FLAG_ROTATIONAL
)
1729 seq_puts(s
, "NBD_FLAG_ROTATIONAL\n");
1734 DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags
);
1736 static int nbd_dev_dbg_init(struct nbd_device
*nbd
)
1739 struct nbd_config
*config
= nbd
->config
;
1744 dir
= debugfs_create_dir(nbd_name(nbd
), nbd_dbg_dir
);
1746 dev_err(nbd_to_dev(nbd
), "Failed to create debugfs dir for '%s'\n",
1750 config
->dbg_dir
= dir
;
1752 debugfs_create_file("tasks", 0444, dir
, nbd
, &nbd_dbg_tasks_fops
);
1753 debugfs_create_u64("size_bytes", 0444, dir
, &config
->bytesize
);
1754 debugfs_create_u32("timeout", 0444, dir
, &nbd
->tag_set
.timeout
);
1755 debugfs_create_u32("blocksize_bits", 0444, dir
, &config
->blksize_bits
);
1756 debugfs_create_file("flags", 0444, dir
, nbd
, &nbd_dbg_flags_fops
);
1761 static void nbd_dev_dbg_close(struct nbd_device
*nbd
)
1763 debugfs_remove_recursive(nbd
->config
->dbg_dir
);
1766 static int nbd_dbg_init(void)
1768 struct dentry
*dbg_dir
;
1770 dbg_dir
= debugfs_create_dir("nbd", NULL
);
1771 if (IS_ERR(dbg_dir
))
1774 nbd_dbg_dir
= dbg_dir
;
1779 static void nbd_dbg_close(void)
1781 debugfs_remove_recursive(nbd_dbg_dir
);
1784 #else /* IS_ENABLED(CONFIG_DEBUG_FS) */
1786 static int nbd_dev_dbg_init(struct nbd_device
*nbd
)
1791 static void nbd_dev_dbg_close(struct nbd_device
*nbd
)
1795 static int nbd_dbg_init(void)
1800 static void nbd_dbg_close(void)
static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
			    unsigned int hctx_idx, unsigned int numa_node)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->nbd = set->driver_data;
	mutex_init(&cmd->lock);
	return 0;
}
static const struct blk_mq_ops nbd_mq_ops = {
	.queue_rq	= nbd_queue_rq,
	.complete	= nbd_complete_rq,
	.init_request	= nbd_init_request,
	.timeout	= nbd_xmit_timeout,
};
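/*
 * These blk-mq callbacks are the request-side entry points of the driver:
 * nbd_queue_rq() submits a request over a socket, nbd_xmit_timeout() handles
 * expired requests, and nbd_complete_rq() finishes them once a reply arrived.
 */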
1823 static struct nbd_device
*nbd_dev_add(int index
, unsigned int refs
)
1825 struct queue_limits lim
= {
1826 .max_hw_sectors
= 65536,
1827 .io_opt
= 256 << SECTOR_SHIFT
,
1828 .max_segments
= USHRT_MAX
,
1829 .max_segment_size
= UINT_MAX
,
1831 struct nbd_device
*nbd
;
1832 struct gendisk
*disk
;
1835 nbd
= kzalloc(sizeof(struct nbd_device
), GFP_KERNEL
);
1839 nbd
->tag_set
.ops
= &nbd_mq_ops
;
1840 nbd
->tag_set
.nr_hw_queues
= 1;
1841 nbd
->tag_set
.queue_depth
= 128;
1842 nbd
->tag_set
.numa_node
= NUMA_NO_NODE
;
1843 nbd
->tag_set
.cmd_size
= sizeof(struct nbd_cmd
);
1844 nbd
->tag_set
.flags
= BLK_MQ_F_SHOULD_MERGE
|
1846 nbd
->tag_set
.driver_data
= nbd
;
1847 INIT_WORK(&nbd
->remove_work
, nbd_dev_remove_work
);
1848 nbd
->backend
= NULL
;
1850 err
= blk_mq_alloc_tag_set(&nbd
->tag_set
);
1854 mutex_lock(&nbd_index_mutex
);
1856 err
= idr_alloc(&nbd_index_idr
, nbd
, index
, index
+ 1,
1861 err
= idr_alloc(&nbd_index_idr
, nbd
, 0,
1862 (MINORMASK
>> part_shift
) + 1, GFP_KERNEL
);
1867 mutex_unlock(&nbd_index_mutex
);
1871 disk
= blk_mq_alloc_disk(&nbd
->tag_set
, &lim
, NULL
);
1873 err
= PTR_ERR(disk
);
1878 nbd
->recv_workq
= alloc_workqueue("nbd%d-recv",
1879 WQ_MEM_RECLAIM
| WQ_HIGHPRI
|
1880 WQ_UNBOUND
, 0, nbd
->index
);
1881 if (!nbd
->recv_workq
) {
1882 dev_err(disk_to_dev(nbd
->disk
), "Could not allocate knbd recv work queue.\n");
1887 mutex_init(&nbd
->config_lock
);
1888 refcount_set(&nbd
->config_refs
, 0);
1890 * Start out with a zero references to keep other threads from using
1891 * this device until it is fully initialized.
1893 refcount_set(&nbd
->refs
, 0);
1894 INIT_LIST_HEAD(&nbd
->list
);
1895 disk
->major
= NBD_MAJOR
;
1896 disk
->first_minor
= index
<< part_shift
;
1897 disk
->minors
= 1 << part_shift
;
1898 disk
->fops
= &nbd_fops
;
1899 disk
->private_data
= nbd
;
1900 sprintf(disk
->disk_name
, "nbd%d", index
);
1901 err
= add_disk(disk
);
1906 * Now publish the device.
1908 refcount_set(&nbd
->refs
, refs
);
1909 nbd_total_devices
++;
1913 destroy_workqueue(nbd
->recv_workq
);
1917 mutex_lock(&nbd_index_mutex
);
1918 idr_remove(&nbd_index_idr
, index
);
1919 mutex_unlock(&nbd_index_mutex
);
1921 blk_mq_free_tag_set(&nbd
->tag_set
);
1925 return ERR_PTR(err
);
1928 static struct nbd_device
*nbd_find_get_unused(void)
1930 struct nbd_device
*nbd
;
1933 lockdep_assert_held(&nbd_index_mutex
);
1935 idr_for_each_entry(&nbd_index_idr
, nbd
, id
) {
1936 if (refcount_read(&nbd
->config_refs
) ||
1937 test_bit(NBD_DESTROY_ON_DISCONNECT
, &nbd
->flags
))
1939 if (refcount_inc_not_zero(&nbd
->refs
))
1946 /* Netlink interface. */
1947 static const struct nla_policy nbd_attr_policy
[NBD_ATTR_MAX
+ 1] = {
1948 [NBD_ATTR_INDEX
] = { .type
= NLA_U32
},
1949 [NBD_ATTR_SIZE_BYTES
] = { .type
= NLA_U64
},
1950 [NBD_ATTR_BLOCK_SIZE_BYTES
] = { .type
= NLA_U64
},
1951 [NBD_ATTR_TIMEOUT
] = { .type
= NLA_U64
},
1952 [NBD_ATTR_SERVER_FLAGS
] = { .type
= NLA_U64
},
1953 [NBD_ATTR_CLIENT_FLAGS
] = { .type
= NLA_U64
},
1954 [NBD_ATTR_SOCKETS
] = { .type
= NLA_NESTED
},
1955 [NBD_ATTR_DEAD_CONN_TIMEOUT
] = { .type
= NLA_U64
},
1956 [NBD_ATTR_DEVICE_LIST
] = { .type
= NLA_NESTED
},
1957 [NBD_ATTR_BACKEND_IDENTIFIER
] = { .type
= NLA_STRING
},
1960 static const struct nla_policy nbd_sock_policy
[NBD_SOCK_MAX
+ 1] = {
1961 [NBD_SOCK_FD
] = { .type
= NLA_U32
},
1964 /* We don't use this right now since we don't parse the incoming list, but we
1965 * still want it here so userspace knows what to expect.
1967 static const struct nla_policy
__attribute__((unused
))
1968 nbd_device_policy
[NBD_DEVICE_ATTR_MAX
+ 1] = {
1969 [NBD_DEVICE_INDEX
] = { .type
= NLA_U32
},
1970 [NBD_DEVICE_CONNECTED
] = { .type
= NLA_U8
},
1973 static int nbd_genl_size_set(struct genl_info
*info
, struct nbd_device
*nbd
)
1975 struct nbd_config
*config
= nbd
->config
;
1976 u64 bsize
= nbd_blksize(config
);
1977 u64 bytes
= config
->bytesize
;
1979 if (info
->attrs
[NBD_ATTR_SIZE_BYTES
])
1980 bytes
= nla_get_u64(info
->attrs
[NBD_ATTR_SIZE_BYTES
]);
1982 if (info
->attrs
[NBD_ATTR_BLOCK_SIZE_BYTES
])
1983 bsize
= nla_get_u64(info
->attrs
[NBD_ATTR_BLOCK_SIZE_BYTES
]);
1985 if (bytes
!= config
->bytesize
|| bsize
!= nbd_blksize(config
))
1986 return nbd_set_size(nbd
, bytes
, bsize
);
1990 static int nbd_genl_connect(struct sk_buff
*skb
, struct genl_info
*info
)
1992 struct nbd_device
*nbd
;
1993 struct nbd_config
*config
;
1996 bool put_dev
= false;
1998 if (!netlink_capable(skb
, CAP_SYS_ADMIN
))
2001 if (info
->attrs
[NBD_ATTR_INDEX
]) {
2002 index
= nla_get_u32(info
->attrs
[NBD_ATTR_INDEX
]);
2005 * Too big first_minor can cause duplicate creation of
2006 * sysfs files/links, since index << part_shift might overflow, or
2007 * MKDEV() expect that the max bits of first_minor is 20.
2009 if (index
< 0 || index
> MINORMASK
>> part_shift
) {
2010 pr_err("illegal input index %d\n", index
);
2014 if (GENL_REQ_ATTR_CHECK(info
, NBD_ATTR_SOCKETS
)) {
2015 pr_err("must specify at least one socket\n");
2018 if (GENL_REQ_ATTR_CHECK(info
, NBD_ATTR_SIZE_BYTES
)) {
2019 pr_err("must specify a size in bytes for the device\n");
2023 mutex_lock(&nbd_index_mutex
);
2025 nbd
= nbd_find_get_unused();
2027 nbd
= idr_find(&nbd_index_idr
, index
);
2029 if ((test_bit(NBD_DESTROY_ON_DISCONNECT
, &nbd
->flags
) &&
2030 test_bit(NBD_DISCONNECT_REQUESTED
, &nbd
->flags
)) ||
2031 !refcount_inc_not_zero(&nbd
->refs
)) {
2032 mutex_unlock(&nbd_index_mutex
);
2033 pr_err("device at index %d is going down\n",
2039 mutex_unlock(&nbd_index_mutex
);
2042 nbd
= nbd_dev_add(index
, 2);
2044 pr_err("failed to add new device\n");
2045 return PTR_ERR(nbd
);
2049 mutex_lock(&nbd
->config_lock
);
2050 if (refcount_read(&nbd
->config_refs
)) {
2051 mutex_unlock(&nbd
->config_lock
);
2055 pr_err("nbd%d already in use\n", index
);
2059 ret
= nbd_alloc_and_init_config(nbd
);
2061 mutex_unlock(&nbd
->config_lock
);
2063 pr_err("couldn't allocate config\n");
2067 config
= nbd
->config
;
2068 set_bit(NBD_RT_BOUND
, &config
->runtime_flags
);
2069 ret
= nbd_genl_size_set(info
, nbd
);
2073 if (info
->attrs
[NBD_ATTR_TIMEOUT
])
2074 nbd_set_cmd_timeout(nbd
,
2075 nla_get_u64(info
->attrs
[NBD_ATTR_TIMEOUT
]));
2076 if (info
->attrs
[NBD_ATTR_DEAD_CONN_TIMEOUT
]) {
2077 config
->dead_conn_timeout
=
2078 nla_get_u64(info
->attrs
[NBD_ATTR_DEAD_CONN_TIMEOUT
]);
2079 config
->dead_conn_timeout
*= HZ
;
2081 if (info
->attrs
[NBD_ATTR_SERVER_FLAGS
])
2083 nla_get_u64(info
->attrs
[NBD_ATTR_SERVER_FLAGS
]);
2084 if (info
->attrs
[NBD_ATTR_CLIENT_FLAGS
]) {
2085 u64 flags
= nla_get_u64(info
->attrs
[NBD_ATTR_CLIENT_FLAGS
]);
2086 if (flags
& NBD_CFLAG_DESTROY_ON_DISCONNECT
) {
2088 * We have 1 ref to keep the device around, and then 1
2089 * ref for our current operation here, which will be
2090 * inherited by the config. If we already have
2091 * DESTROY_ON_DISCONNECT set then we know we don't have
2092 * that extra ref already held so we don't need the
2095 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT
,
2099 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT
,
2101 refcount_inc(&nbd
->refs
);
2103 if (flags
& NBD_CFLAG_DISCONNECT_ON_CLOSE
) {
2104 set_bit(NBD_RT_DISCONNECT_ON_CLOSE
,
2105 &config
->runtime_flags
);
2109 if (info
->attrs
[NBD_ATTR_SOCKETS
]) {
2110 struct nlattr
*attr
;
2113 nla_for_each_nested(attr
, info
->attrs
[NBD_ATTR_SOCKETS
],
2115 struct nlattr
*socks
[NBD_SOCK_MAX
+1];
2117 if (nla_type(attr
) != NBD_SOCK_ITEM
) {
2118 pr_err("socks must be embedded in a SOCK_ITEM attr\n");
2122 ret
= nla_parse_nested_deprecated(socks
, NBD_SOCK_MAX
,
2127 pr_err("error processing sock list\n");
2131 if (!socks
[NBD_SOCK_FD
])
2133 fd
= (int)nla_get_u32(socks
[NBD_SOCK_FD
]);
2134 ret
= nbd_add_socket(nbd
, fd
, true);
2139 ret
= nbd_start_device(nbd
);
2142 if (info
->attrs
[NBD_ATTR_BACKEND_IDENTIFIER
]) {
2143 nbd
->backend
= nla_strdup(info
->attrs
[NBD_ATTR_BACKEND_IDENTIFIER
],
2145 if (!nbd
->backend
) {
2150 ret
= device_create_file(disk_to_dev(nbd
->disk
), &backend_attr
);
2152 dev_err(disk_to_dev(nbd
->disk
),
2153 "device_create_file failed for backend!\n");
2156 set_bit(NBD_RT_HAS_BACKEND_FILE
, &config
->runtime_flags
);
2158 mutex_unlock(&nbd
->config_lock
);
2160 set_bit(NBD_RT_HAS_CONFIG_REF
, &config
->runtime_flags
);
2161 refcount_inc(&nbd
->config_refs
);
2162 nbd_connect_reply(info
, nbd
->index
);
2164 nbd_config_put(nbd
);
2170 static void nbd_disconnect_and_put(struct nbd_device
*nbd
)
2172 mutex_lock(&nbd
->config_lock
);
2173 nbd_disconnect(nbd
);
2175 wake_up(&nbd
->config
->conn_wait
);
2177 * Make sure recv thread has finished, we can safely call nbd_clear_que()
2178 * to cancel the inflight I/Os.
2180 flush_workqueue(nbd
->recv_workq
);
2182 nbd
->task_setup
= NULL
;
2183 mutex_unlock(&nbd
->config_lock
);
2185 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF
,
2186 &nbd
->config
->runtime_flags
))
2187 nbd_config_put(nbd
);
2190 static int nbd_genl_disconnect(struct sk_buff
*skb
, struct genl_info
*info
)
2192 struct nbd_device
*nbd
;
2195 if (!netlink_capable(skb
, CAP_SYS_ADMIN
))
2198 if (GENL_REQ_ATTR_CHECK(info
, NBD_ATTR_INDEX
)) {
2199 pr_err("must specify an index to disconnect\n");
2202 index
= nla_get_u32(info
->attrs
[NBD_ATTR_INDEX
]);
2203 mutex_lock(&nbd_index_mutex
);
2204 nbd
= idr_find(&nbd_index_idr
, index
);
2206 mutex_unlock(&nbd_index_mutex
);
2207 pr_err("couldn't find device at index %d\n", index
);
2210 if (!refcount_inc_not_zero(&nbd
->refs
)) {
2211 mutex_unlock(&nbd_index_mutex
);
2212 pr_err("device at index %d is going down\n", index
);
2215 mutex_unlock(&nbd_index_mutex
);
2216 if (!refcount_inc_not_zero(&nbd
->config_refs
))
2218 nbd_disconnect_and_put(nbd
);
2219 nbd_config_put(nbd
);
2225 static int nbd_genl_reconfigure(struct sk_buff
*skb
, struct genl_info
*info
)
2227 struct nbd_device
*nbd
= NULL
;
2228 struct nbd_config
*config
;
2231 bool put_dev
= false;
2233 if (!netlink_capable(skb
, CAP_SYS_ADMIN
))
2236 if (GENL_REQ_ATTR_CHECK(info
, NBD_ATTR_INDEX
)) {
2237 pr_err("must specify a device to reconfigure\n");
2240 index
= nla_get_u32(info
->attrs
[NBD_ATTR_INDEX
]);
2241 mutex_lock(&nbd_index_mutex
);
2242 nbd
= idr_find(&nbd_index_idr
, index
);
2244 mutex_unlock(&nbd_index_mutex
);
2245 pr_err("couldn't find a device at index %d\n", index
);
2249 if (info
->attrs
[NBD_ATTR_BACKEND_IDENTIFIER
]) {
2250 if (nla_strcmp(info
->attrs
[NBD_ATTR_BACKEND_IDENTIFIER
],
2252 mutex_unlock(&nbd_index_mutex
);
2253 dev_err(nbd_to_dev(nbd
),
2254 "backend image doesn't match with %s\n",
2259 mutex_unlock(&nbd_index_mutex
);
2260 dev_err(nbd_to_dev(nbd
), "must specify backend\n");
2264 if (!refcount_inc_not_zero(&nbd
->refs
)) {
2265 mutex_unlock(&nbd_index_mutex
);
2266 pr_err("device at index %d is going down\n", index
);
2269 mutex_unlock(&nbd_index_mutex
);
2271 config
= nbd_get_config_unlocked(nbd
);
2273 dev_err(nbd_to_dev(nbd
),
2274 "not configured, cannot reconfigure\n");
2279 mutex_lock(&nbd
->config_lock
);
2280 if (!test_bit(NBD_RT_BOUND
, &config
->runtime_flags
) ||
2282 dev_err(nbd_to_dev(nbd
),
2283 "not configured, cannot reconfigure\n");
2288 ret
= nbd_genl_size_set(info
, nbd
);
2292 if (info
->attrs
[NBD_ATTR_TIMEOUT
])
2293 nbd_set_cmd_timeout(nbd
,
2294 nla_get_u64(info
->attrs
[NBD_ATTR_TIMEOUT
]));
2295 if (info
->attrs
[NBD_ATTR_DEAD_CONN_TIMEOUT
]) {
2296 config
->dead_conn_timeout
=
2297 nla_get_u64(info
->attrs
[NBD_ATTR_DEAD_CONN_TIMEOUT
]);
2298 config
->dead_conn_timeout
*= HZ
;
2300 if (info
->attrs
[NBD_ATTR_CLIENT_FLAGS
]) {
2301 u64 flags
= nla_get_u64(info
->attrs
[NBD_ATTR_CLIENT_FLAGS
]);
2302 if (flags
& NBD_CFLAG_DESTROY_ON_DISCONNECT
) {
2303 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT
,
2307 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT
,
2309 refcount_inc(&nbd
->refs
);
2312 if (flags
& NBD_CFLAG_DISCONNECT_ON_CLOSE
) {
2313 set_bit(NBD_RT_DISCONNECT_ON_CLOSE
,
2314 &config
->runtime_flags
);
2316 clear_bit(NBD_RT_DISCONNECT_ON_CLOSE
,
2317 &config
->runtime_flags
);
2321 if (info
->attrs
[NBD_ATTR_SOCKETS
]) {
2322 struct nlattr
*attr
;
2325 nla_for_each_nested(attr
, info
->attrs
[NBD_ATTR_SOCKETS
],
2327 struct nlattr
*socks
[NBD_SOCK_MAX
+1];
2329 if (nla_type(attr
) != NBD_SOCK_ITEM
) {
2330 pr_err("socks must be embedded in a SOCK_ITEM attr\n");
2334 ret
= nla_parse_nested_deprecated(socks
, NBD_SOCK_MAX
,
2339 pr_err("error processing sock list\n");
2343 if (!socks
[NBD_SOCK_FD
])
2345 fd
= (int)nla_get_u32(socks
[NBD_SOCK_FD
]);
2346 ret
= nbd_reconnect_socket(nbd
, fd
);
2352 dev_info(nbd_to_dev(nbd
), "reconnected socket\n");
2356 mutex_unlock(&nbd
->config_lock
);
2357 nbd_config_put(nbd
);
2364 static const struct genl_small_ops nbd_connect_genl_ops
[] = {
2366 .cmd
= NBD_CMD_CONNECT
,
2367 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
2368 .doit
= nbd_genl_connect
,
2371 .cmd
= NBD_CMD_DISCONNECT
,
2372 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
2373 .doit
= nbd_genl_disconnect
,
2376 .cmd
= NBD_CMD_RECONFIGURE
,
2377 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
2378 .doit
= nbd_genl_reconfigure
,
2381 .cmd
= NBD_CMD_STATUS
,
2382 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
2383 .doit
= nbd_genl_status
,
2387 static const struct genl_multicast_group nbd_mcast_grps
[] = {
2388 { .name
= NBD_GENL_MCAST_GROUP_NAME
, },
2391 static struct genl_family nbd_genl_family __ro_after_init
= {
2393 .name
= NBD_GENL_FAMILY_NAME
,
2394 .version
= NBD_GENL_VERSION
,
2395 .module
= THIS_MODULE
,
2396 .small_ops
= nbd_connect_genl_ops
,
2397 .n_small_ops
= ARRAY_SIZE(nbd_connect_genl_ops
),
2398 .resv_start_op
= NBD_CMD_STATUS
+ 1,
2399 .maxattr
= NBD_ATTR_MAX
,
2401 .policy
= nbd_attr_policy
,
2402 .mcgrps
= nbd_mcast_grps
,
2403 .n_mcgrps
= ARRAY_SIZE(nbd_mcast_grps
),
2405 MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME
);
2407 static int populate_nbd_status(struct nbd_device
*nbd
, struct sk_buff
*reply
)
2409 struct nlattr
*dev_opt
;
2413 /* This is a little racey, but for status it's ok. The
2414 * reason we don't take a ref here is because we can't
2415 * take a ref in the index == -1 case as we would need
2416 * to put under the nbd_index_mutex, which could
2417 * deadlock if we are configured to remove ourselves
2418 * once we're disconnected.
2420 if (refcount_read(&nbd
->config_refs
))
2422 dev_opt
= nla_nest_start_noflag(reply
, NBD_DEVICE_ITEM
);
2425 ret
= nla_put_u32(reply
, NBD_DEVICE_INDEX
, nbd
->index
);
2428 ret
= nla_put_u8(reply
, NBD_DEVICE_CONNECTED
,
2432 nla_nest_end(reply
, dev_opt
);
2436 static int status_cb(int id
, void *ptr
, void *data
)
2438 struct nbd_device
*nbd
= ptr
;
2439 return populate_nbd_status(nbd
, (struct sk_buff
*)data
);
2442 static int nbd_genl_status(struct sk_buff
*skb
, struct genl_info
*info
)
2444 struct nlattr
*dev_list
;
2445 struct sk_buff
*reply
;
2451 if (info
->attrs
[NBD_ATTR_INDEX
])
2452 index
= nla_get_u32(info
->attrs
[NBD_ATTR_INDEX
]);
2454 mutex_lock(&nbd_index_mutex
);
2456 msg_size
= nla_total_size(nla_attr_size(sizeof(u32
)) +
2457 nla_attr_size(sizeof(u8
)));
2458 msg_size
*= (index
== -1) ? nbd_total_devices
: 1;
2460 reply
= genlmsg_new(msg_size
, GFP_KERNEL
);
2463 reply_head
= genlmsg_put_reply(reply
, info
, &nbd_genl_family
, 0,
2470 dev_list
= nla_nest_start_noflag(reply
, NBD_ATTR_DEVICE_LIST
);
2478 ret
= idr_for_each(&nbd_index_idr
, &status_cb
, reply
);
2484 struct nbd_device
*nbd
;
2485 nbd
= idr_find(&nbd_index_idr
, index
);
2487 ret
= populate_nbd_status(nbd
, reply
);
2494 nla_nest_end(reply
, dev_list
);
2495 genlmsg_end(reply
, reply_head
);
2496 ret
= genlmsg_reply(reply
, info
);
2498 mutex_unlock(&nbd_index_mutex
);
2502 static void nbd_connect_reply(struct genl_info
*info
, int index
)
2504 struct sk_buff
*skb
;
2508 skb
= genlmsg_new(nla_total_size(sizeof(u32
)), GFP_KERNEL
);
2511 msg_head
= genlmsg_put_reply(skb
, info
, &nbd_genl_family
, 0,
2517 ret
= nla_put_u32(skb
, NBD_ATTR_INDEX
, index
);
2522 genlmsg_end(skb
, msg_head
);
2523 genlmsg_reply(skb
, info
);
2526 static void nbd_mcast_index(int index
)
2528 struct sk_buff
*skb
;
2532 skb
= genlmsg_new(nla_total_size(sizeof(u32
)), GFP_KERNEL
);
2535 msg_head
= genlmsg_put(skb
, 0, 0, &nbd_genl_family
, 0,
2541 ret
= nla_put_u32(skb
, NBD_ATTR_INDEX
, index
);
2546 genlmsg_end(skb
, msg_head
);
2547 genlmsg_multicast(&nbd_genl_family
, skb
, 0, 0, GFP_KERNEL
);
2550 static void nbd_dead_link_work(struct work_struct
*work
)
2552 struct link_dead_args
*args
= container_of(work
, struct link_dead_args
,
2554 nbd_mcast_index(args
->index
);
2558 static int __init
nbd_init(void)
2562 BUILD_BUG_ON(sizeof(struct nbd_request
) != 28);
2565 pr_err("max_part must be >= 0\n");
2571 part_shift
= fls(max_part
);
2574 * Adjust max_part according to part_shift as it is exported
2575 * to user space so that user can know the max number of
2576 * partition kernel should be able to manage.
2578 * Note that -1 is required because partition 0 is reserved
2579 * for the whole disk.
2581 max_part
= (1UL << part_shift
) - 1;
2584 if ((1UL << part_shift
) > DISK_MAX_PARTS
)
2587 if (nbds_max
> 1UL << (MINORBITS
- part_shift
))
2590 if (register_blkdev(NBD_MAJOR
, "nbd"))
2593 nbd_del_wq
= alloc_workqueue("nbd-del", WQ_UNBOUND
, 0);
2595 unregister_blkdev(NBD_MAJOR
, "nbd");
2599 if (genl_register_family(&nbd_genl_family
)) {
2600 destroy_workqueue(nbd_del_wq
);
2601 unregister_blkdev(NBD_MAJOR
, "nbd");
2606 for (i
= 0; i
< nbds_max
; i
++)
2611 static int nbd_exit_cb(int id
, void *ptr
, void *data
)
2613 struct list_head
*list
= (struct list_head
*)data
;
2614 struct nbd_device
*nbd
= ptr
;
2616 /* Skip nbd that is being removed asynchronously */
2617 if (refcount_read(&nbd
->refs
))
2618 list_add_tail(&nbd
->list
, list
);
2623 static void __exit
nbd_cleanup(void)
2625 struct nbd_device
*nbd
;
2626 LIST_HEAD(del_list
);
2629 * Unregister netlink interface prior to waiting
2630 * for the completion of netlink commands.
2632 genl_unregister_family(&nbd_genl_family
);
2636 mutex_lock(&nbd_index_mutex
);
2637 idr_for_each(&nbd_index_idr
, &nbd_exit_cb
, &del_list
);
2638 mutex_unlock(&nbd_index_mutex
);
2640 while (!list_empty(&del_list
)) {
2641 nbd
= list_first_entry(&del_list
, struct nbd_device
, list
);
2642 list_del_init(&nbd
->list
);
2643 if (refcount_read(&nbd
->config_refs
))
2644 pr_err("possibly leaking nbd_config (ref %d)\n",
2645 refcount_read(&nbd
->config_refs
));
2646 if (refcount_read(&nbd
->refs
) != 1)
2647 pr_err("possibly leaking a device\n");
2651 /* Also wait for nbd_dev_remove_work() completes */
2652 destroy_workqueue(nbd_del_wq
);
2654 idr_destroy(&nbd_index_idr
);
2655 unregister_blkdev(NBD_MAJOR
, "nbd");
module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
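/*
 * Example (hypothetical values): loading the module with
 *   modprobe nbd nbds_max=4 max_part=8
 * pre-creates /dev/nbd0../dev/nbd3 with room for 8 partitions each; further
 * devices can still be created on demand through the netlink interface.
 */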