// SPDX-License-Identifier: GPL-2.0-only
/*
 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
 * Shaohua Li <shli@fb.com>
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/init.h>
#include "null_blk.h"

#undef pr_fmt
#define pr_fmt(fmt)	"null_blk: " fmt

#define TICKS_PER_SEC		50ULL
#define TIMER_INTERVAL		(NSEC_PER_SEC / TICKS_PER_SEC)

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static DECLARE_FAULT_ATTR(null_timeout_attr);
static DECLARE_FAULT_ATTR(null_requeue_attr);
static DECLARE_FAULT_ATTR(null_init_hctx_attr);
#endif
static inline u64 mb_per_tick(int mbps)
{
	return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
}
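/*
 * Worked example of the throttling budget: with mbps=100 and TICKS_PER_SEC=50,
 * mb_per_tick() returns roughly 2 MiB, i.e. the bandwidth timer refills
 * nullb->cur_bytes with about 2 MiB every TIMER_INTERVAL (20 ms).
 */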
/*
 * Status flags for nullb_device.
 *
 * CONFIGURED:	Device has been configured and turned on. Cannot reconfigure.
 * UP:		Device is currently on and visible in userspace.
 * THROTTLED:	Device is being throttled.
 * CACHE:	Device is using a write-back cache.
 */
enum nullb_device_flags {
	NULLB_DEV_FL_CONFIGURED	= 0,
	NULLB_DEV_FL_UP		= 1,
	NULLB_DEV_FL_THROTTLED	= 2,
	NULLB_DEV_FL_CACHE	= 3,
};
#define MAP_SZ		((PAGE_SIZE >> SECTOR_SHIFT) + 2)

/*
 * nullb_page is a page in memory for nullb devices.
 *
 * @page:	The page holding the data.
 * @bitmap:	The bitmap represents which sector in the page has data.
 *		Each bit represents one block size. For example, sector 8
 *		will use the 7th bit of the bitmap.
 *
 * The highest 2 bits of the bitmap are for special purposes. LOCK means the
 * cache page is being flushed to storage. FREE means the cache page is freed
 * and should be skipped from flushing to storage. Please see
 * null_make_cache_space().
 */
struct nullb_page {
	struct page *page;
	DECLARE_BITMAP(bitmap, MAP_SZ);
};
#define NULLB_PAGE_LOCK (MAP_SZ - 1)
#define NULLB_PAGE_FREE (MAP_SZ - 2)
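/*
 * Illustrative layout, assuming 4K pages and 512-byte sectors: MAP_SZ is 10,
 * bits 0-7 track the sectors stored in the page, bit 8 (NULLB_PAGE_FREE)
 * marks a cache page that has been freed, and bit 9 (NULLB_PAGE_LOCK) marks
 * a cache page that is currently being flushed to the data store.
 */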
static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
static DEFINE_IDA(nullb_indexes);
static struct blk_mq_tag_set tag_set;
static bool g_virt_boundary;
module_param_named(virt_boundary, g_virt_boundary, bool, 0444);
MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False");

static int g_no_sched;
module_param_named(no_sched, g_no_sched, int, 0444);
MODULE_PARM_DESC(no_sched, "No io scheduler");

static int g_submit_queues = 1;
module_param_named(submit_queues, g_submit_queues, int, 0444);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");

static int g_poll_queues = 1;
module_param_named(poll_queues, g_poll_queues, int, 0444);
MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues");

static int g_home_node = NUMA_NO_NODE;
module_param_named(home_node, g_home_node, int, 0444);
MODULE_PARM_DESC(home_node, "Home node for the device");
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
/*
 * For more details about fault injection, please refer to
 * Documentation/fault-injection/fault-injection.rst.
 */
static char g_timeout_str[80];
module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>");

static char g_requeue_str[80];
module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>");

static char g_init_hctx_str[80];
module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
#endif
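/*
 * Illustrative usage (the values are examples, not defaults):
 *
 *   modprobe null_blk timeout="1,100,0,-1"
 *
 * i.e. interval 1, probability 100%, space 0 and unlimited times, which makes
 * every request take the fake-timeout path.
 */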
/*
 * Historic queue modes.
 *
 * These days nothing but NULL_Q_MQ is actually supported, but we keep the
 * enum around for error reporting.
 */
enum {
	NULL_Q_BIO	= 0,
	NULL_Q_RQ	= 1,
	NULL_Q_MQ	= 2,
};

static int g_queue_mode = NULL_Q_MQ;

static int null_param_store_val(const char *str, int *val, int min, int max)
{
	int new_val, ret;

	ret = kstrtoint(str, 10, &new_val);
	if (ret)
		return -EINVAL;

	if (new_val < min || new_val > max)
		return -EINVAL;

	*val = new_val;
	return 0;
}
static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
{
	return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
}

static const struct kernel_param_ops null_queue_mode_param_ops = {
	.set	= null_set_queue_mode,
	.get	= param_get_int,
};

device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
static int g_gb = 250;
module_param_named(gb, g_gb, int, 0444);
MODULE_PARM_DESC(gb, "Size in GB");

static int g_bs = 512;
module_param_named(bs, g_bs, int, 0444);
MODULE_PARM_DESC(bs, "Block size (in bytes)");

static int g_max_sectors;
module_param_named(max_sectors, g_max_sectors, int, 0444);
MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)");

static unsigned int nr_devices = 1;
module_param(nr_devices, uint, 0444);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");

static bool g_blocking;
module_param_named(blocking, g_blocking, bool, 0444);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");

static bool g_shared_tags;
module_param_named(shared_tags, g_shared_tags, bool, 0444);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");

static bool g_shared_tag_bitmap;
module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");
static int g_irqmode = NULL_IRQ_SOFTIRQ;

static int null_set_irqmode(const char *str, const struct kernel_param *kp)
{
	return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
					NULL_IRQ_TIMER);
}

static const struct kernel_param_ops null_irqmode_param_ops = {
	.set	= null_set_irqmode,
	.get	= param_get_int,
};

device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
static unsigned long g_completion_nsec = 10000;
module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");

static int g_hw_queue_depth = 64;
module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");

static bool g_use_per_node_hctx;
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");

static bool g_memory_backed;
module_param_named(memory_backed, g_memory_backed, bool, 0444);
MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");

static bool g_discard;
module_param_named(discard, g_discard, bool, 0444);
MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");

static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");

static bool g_fua = true;
module_param_named(fua, g_fua, bool, 0444);
MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true");

static unsigned int g_mbps;
module_param_named(mbps, g_mbps, uint, 0444);
MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
static bool g_zoned;
module_param_named(zoned, g_zoned, bool, S_IRUGO);
MODULE_PARM_DESC(zoned, "Make the device a host-managed zoned block device. Default: false");

static unsigned long g_zone_size = 256;
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be a power of two. Default: 256");

static unsigned long g_zone_capacity;
module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");

static unsigned int g_zone_nr_conv;
module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");

static unsigned int g_zone_max_open;
module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");

static unsigned int g_zone_max_active;
module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");

static int g_zone_append_max_sectors = INT_MAX;
module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
MODULE_PARM_DESC(zone_append_max_sectors,
		 "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");

static bool g_zone_full;
module_param_named(zone_full, g_zone_full, bool, S_IRUGO);
MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false");
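/*
 * Illustrative zoned setup (the values are examples only):
 *
 *   modprobe null_blk zoned=1 zone_size=64 zone_nr_conv=4 memory_backed=1
 *
 * creates a host-managed zoned device with 64 MB zones, four conventional
 * zones and a memory-backed data store.
 */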
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
static int null_add_dev(struct nullb_device *dev);
static struct nullb *null_find_dev_by_name(const char *name);
static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
static inline struct nullb_device *to_nullb_device(struct config_item *item)
{
	return item ? container_of(to_config_group(item), struct nullb_device, group) : NULL;
}

static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
{
	return snprintf(page, PAGE_SIZE, "%u\n", val);
}

static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
	char *page)
{
	return snprintf(page, PAGE_SIZE, "%lu\n", val);
}

static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
{
	return snprintf(page, PAGE_SIZE, "%u\n", val);
}

static ssize_t nullb_device_uint_attr_store(unsigned int *val,
	const char *page, size_t count)
{
	unsigned int tmp;
	int result;

	result = kstrtouint(page, 0, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
	const char *page, size_t count)
{
	unsigned long tmp;
	int result;

	result = kstrtoul(page, 0, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
	size_t count)
{
	bool tmp;
	int result;

	result = kstrtobool(page, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}
/* The following macro should only be used with TYPE = {uint, ulong, bool}. */
#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY)				\
static ssize_t								\
nullb_device_##NAME##_show(struct config_item *item, char *page)	\
{									\
	return nullb_device_##TYPE##_attr_show(				\
				to_nullb_device(item)->NAME, page);	\
}									\
static ssize_t								\
nullb_device_##NAME##_store(struct config_item *item, const char *page,\
			    size_t count)				\
{									\
	int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\
	struct nullb_device *dev = to_nullb_device(item);		\
	TYPE new_value = 0;						\
	int ret;							\
									\
	ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\
	if (ret < 0)							\
		return ret;						\
	if (apply_fn)							\
		ret = apply_fn(dev, new_value);				\
	else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags))	\
		ret = -EBUSY;						\
	if (ret < 0)							\
		return ret;						\
	dev->NAME = new_value;						\
	return count;							\
}									\
CONFIGFS_ATTR(nullb_device_, NAME);
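/*
 * For example, NULLB_DEVICE_ATTR(blocksize, uint, NULL) expands to
 * nullb_device_blocksize_show()/_store() plus the nullb_device_attr_blocksize
 * configfs attribute; with a NULL apply function the store path only accepts
 * new values while the device has not been configured yet.
 */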
static int nullb_update_nr_hw_queues(struct nullb_device *dev,
				     unsigned int submit_queues,
				     unsigned int poll_queues)
{
	struct blk_mq_tag_set *set;
	int ret, nr_hw_queues;

	if (!dev->nullb)
		return 0;

	/*
	 * Make sure at least one submit queue exists.
	 */
	if (!submit_queues)
		return -EINVAL;

	/*
	 * Make sure that null_init_hctx() does not access nullb->queues[] past
	 * the end of that array.
	 */
	if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues)
		return -EINVAL;

	/*
	 * Keep previous and new queue numbers in nullb_device for reference in
	 * the callback function null_map_queues().
	 */
	dev->prev_submit_queues = dev->submit_queues;
	dev->prev_poll_queues = dev->poll_queues;
	dev->submit_queues = submit_queues;
	dev->poll_queues = poll_queues;

	set = dev->nullb->tag_set;
	nr_hw_queues = submit_queues + poll_queues;
	blk_mq_update_nr_hw_queues(set, nr_hw_queues);
	ret = set->nr_hw_queues == nr_hw_queues ? 0 : -ENOMEM;

	if (ret) {
		/* on error, revert the queue numbers */
		dev->submit_queues = dev->prev_submit_queues;
		dev->poll_queues = dev->prev_poll_queues;
	}

	return ret;
}

static int nullb_apply_submit_queues(struct nullb_device *dev,
				     unsigned int submit_queues)
{
	int ret;

	mutex_lock(&lock);
	ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
	mutex_unlock(&lock);

	return ret;
}

static int nullb_apply_poll_queues(struct nullb_device *dev,
				   unsigned int poll_queues)
{
	int ret;

	mutex_lock(&lock);
	ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
	mutex_unlock(&lock);

	return ret;
}
NULLB_DEVICE_ATTR(size, ulong, NULL);
NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues);
NULLB_DEVICE_ATTR(home_node, uint, NULL);
NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
NULLB_DEVICE_ATTR(blocksize, uint, NULL);
NULLB_DEVICE_ATTR(max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(irqmode, uint, NULL);
NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
NULLB_DEVICE_ATTR(index, uint, NULL);
NULLB_DEVICE_ATTR(blocking, bool, NULL);
NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL);
NULLB_DEVICE_ATTR(memory_backed, bool, NULL);
NULLB_DEVICE_ATTR(discard, bool, NULL);
NULLB_DEVICE_ATTR(mbps, uint, NULL);
NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
NULLB_DEVICE_ATTR(zoned, bool, NULL);
NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(zone_full, bool, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
NULLB_DEVICE_ATTR(fua, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
	return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
}

static ssize_t nullb_device_power_store(struct config_item *item,
					const char *page, size_t count)
{
	struct nullb_device *dev = to_nullb_device(item);
	bool newp = false;
	ssize_t ret;

	ret = nullb_device_bool_attr_store(&newp, page, count);
	if (ret < 0)
		return ret;

	if (!dev->power && newp) {
		if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
			return count;

		ret = null_add_dev(dev);
		if (ret) {
			clear_bit(NULLB_DEV_FL_UP, &dev->flags);
			return ret;
		}

		set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
		dev->power = newp;
	} else if (dev->power && !newp) {
		if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
			dev->power = newp;
			null_del_dev(dev->nullb);
		}
		clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
	}

	return count;
}

CONFIGFS_ATTR(nullb_device_, power);
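/*
 * Illustrative configfs flow for the power attribute (the paths assume the
 * default configfs mount point):
 *
 *   mkdir /sys/kernel/config/nullb/nullb1
 *   echo 4096 > /sys/kernel/config/nullb/nullb1/blocksize
 *   echo 1 > /sys/kernel/config/nullb/nullb1/power    # calls null_add_dev()
 *   echo 0 > /sys/kernel/config/nullb/nullb1/power    # calls null_del_dev()
 */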
static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
{
	struct nullb_device *t_dev = to_nullb_device(item);

	return badblocks_show(&t_dev->badblocks, page, 0);
}

static ssize_t nullb_device_badblocks_store(struct config_item *item,
					    const char *page, size_t count)
{
	struct nullb_device *t_dev = to_nullb_device(item);
	char *orig, *buf, *tmp;
	u64 start, end;
	int ret;

	orig = kstrndup(page, count, GFP_KERNEL);
	if (!orig)
		return -ENOMEM;

	buf = strstrip(orig);

	ret = -EINVAL;
	if (buf[0] != '+' && buf[0] != '-')
		goto out;
	tmp = strchr(&buf[1], '-');
	if (!tmp)
		goto out;
	*tmp = '\0';
	ret = kstrtoull(buf + 1, 0, &start);
	if (ret)
		goto out;
	ret = kstrtoull(tmp + 1, 0, &end);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (start > end)
		goto out;
	/* enable badblocks */
	cmpxchg(&t_dev->badblocks.shift, -1, 0);
	if (buf[0] == '+')
		ret = badblocks_set(&t_dev->badblocks, start,
				    end - start + 1, 1);
	else
		ret = badblocks_clear(&t_dev->badblocks, start,
				      end - start + 1);
	if (ret == 0)
		ret = count;
out:
	kfree(orig);
	return ret;
}

CONFIGFS_ATTR(nullb_device_, badblocks);
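/*
 * Illustrative usage: "+<start>-<end>" marks a sector range bad and
 * "-<start>-<end>" clears it again, e.g.
 *
 *   echo "+0-7" > /sys/kernel/config/nullb/nullb1/badblocks
 */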
static ssize_t nullb_device_zone_readonly_store(struct config_item *item,
						const char *page, size_t count)
{
	struct nullb_device *dev = to_nullb_device(item);

	return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY);
}

CONFIGFS_ATTR_WO(nullb_device_, zone_readonly);

static ssize_t nullb_device_zone_offline_store(struct config_item *item,
					       const char *page, size_t count)
{
	struct nullb_device *dev = to_nullb_device(item);

	return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE);
}

CONFIGFS_ATTR_WO(nullb_device_, zone_offline);
static struct configfs_attribute *nullb_device_attrs[] = {
	&nullb_device_attr_size,
	&nullb_device_attr_completion_nsec,
	&nullb_device_attr_submit_queues,
	&nullb_device_attr_poll_queues,
	&nullb_device_attr_home_node,
	&nullb_device_attr_queue_mode,
	&nullb_device_attr_blocksize,
	&nullb_device_attr_max_sectors,
	&nullb_device_attr_irqmode,
	&nullb_device_attr_hw_queue_depth,
	&nullb_device_attr_index,
	&nullb_device_attr_blocking,
	&nullb_device_attr_use_per_node_hctx,
	&nullb_device_attr_power,
	&nullb_device_attr_memory_backed,
	&nullb_device_attr_discard,
	&nullb_device_attr_mbps,
	&nullb_device_attr_cache_size,
	&nullb_device_attr_badblocks,
	&nullb_device_attr_zoned,
	&nullb_device_attr_zone_size,
	&nullb_device_attr_zone_capacity,
	&nullb_device_attr_zone_nr_conv,
	&nullb_device_attr_zone_max_open,
	&nullb_device_attr_zone_max_active,
	&nullb_device_attr_zone_append_max_sectors,
	&nullb_device_attr_zone_readonly,
	&nullb_device_attr_zone_offline,
	&nullb_device_attr_zone_full,
	&nullb_device_attr_virt_boundary,
	&nullb_device_attr_no_sched,
	&nullb_device_attr_shared_tags,
	&nullb_device_attr_shared_tag_bitmap,
	&nullb_device_attr_fua,
	NULL,
};
static void nullb_device_release(struct config_item *item)
{
	struct nullb_device *dev = to_nullb_device(item);

	null_free_device_storage(dev, false);
	null_free_dev(dev);
}

static struct configfs_item_operations nullb_device_ops = {
	.release	= nullb_device_release,
};

static const struct config_item_type nullb_device_type = {
	.ct_item_ops	= &nullb_device_ops,
	.ct_attrs	= nullb_device_attrs,
	.ct_owner	= THIS_MODULE,
};

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION

static void nullb_add_fault_config(struct nullb_device *dev)
{
	fault_config_init(&dev->timeout_config, "timeout_inject");
	fault_config_init(&dev->requeue_config, "requeue_inject");
	fault_config_init(&dev->init_hctx_fault_config, "init_hctx_fault_inject");

	configfs_add_default_group(&dev->timeout_config.group, &dev->group);
	configfs_add_default_group(&dev->requeue_config.group, &dev->group);
	configfs_add_default_group(&dev->init_hctx_fault_config.group, &dev->group);
}

#else

static void nullb_add_fault_config(struct nullb_device *dev)
{
}

#endif

static struct config_group *
nullb_group_make_group(struct config_group *group, const char *name)
{
	struct nullb_device *dev;

	if (null_find_dev_by_name(name))
		return ERR_PTR(-EEXIST);

	dev = null_alloc_dev();
	if (!dev)
		return ERR_PTR(-ENOMEM);

	config_group_init_type_name(&dev->group, name, &nullb_device_type);
	nullb_add_fault_config(dev);

	return &dev->group;
}

static void
nullb_group_drop_item(struct config_group *group, struct config_item *item)
{
	struct nullb_device *dev = to_nullb_device(item);

	if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
		mutex_lock(&lock);
		dev->power = false;
		null_del_dev(dev->nullb);
		mutex_unlock(&lock);
	}

	config_item_put(item);
}

static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
	return snprintf(page, PAGE_SIZE,
			"badblocks,blocking,blocksize,cache_size,fua,"
			"completion_nsec,discard,home_node,hw_queue_depth,"
			"irqmode,max_sectors,mbps,memory_backed,no_sched,"
			"poll_queues,power,queue_mode,shared_tag_bitmap,"
			"shared_tags,size,submit_queues,use_per_node_hctx,"
			"virt_boundary,zoned,zone_capacity,zone_max_active,"
			"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
			"zone_size,zone_append_max_sectors,zone_full\n");
}

CONFIGFS_ATTR_RO(memb_group_, features);

static struct configfs_attribute *nullb_group_attrs[] = {
	&memb_group_attr_features,
	NULL,
};

static struct configfs_group_operations nullb_group_ops = {
	.make_group	= nullb_group_make_group,
	.drop_item	= nullb_group_drop_item,
};

static const struct config_item_type nullb_group_type = {
	.ct_group_ops	= &nullb_group_ops,
	.ct_attrs	= nullb_group_attrs,
	.ct_owner	= THIS_MODULE,
};

static struct configfs_subsystem nullb_subsys = {
	.su_group = {
		.cg_item = {
			.ci_namebuf = "nullb",
			.ci_type = &nullb_group_type,
		},
	},
};
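/*
 * Illustrative check from userspace: the read-only "features" attribute on
 * the subsystem directory advertises the per-device attributes this build
 * supports, e.g.
 *
 *   cat /sys/kernel/config/nullb/features
 */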
static inline int null_cache_active(struct nullb *nullb)
{
	return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
}
static struct nullb_device *null_alloc_dev(void)
{
	struct nullb_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	dev->timeout_config.attr = null_timeout_attr;
	dev->requeue_config.attr = null_requeue_attr;
	dev->init_hctx_fault_config.attr = null_init_hctx_attr;
#endif

	INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
	INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
	if (badblocks_init(&dev->badblocks, 0)) {
		kfree(dev);
		return NULL;
	}

	dev->size = g_gb * 1024;
	dev->completion_nsec = g_completion_nsec;
	dev->submit_queues = g_submit_queues;
	dev->prev_submit_queues = g_submit_queues;
	dev->poll_queues = g_poll_queues;
	dev->prev_poll_queues = g_poll_queues;
	dev->home_node = g_home_node;
	dev->queue_mode = g_queue_mode;
	dev->blocksize = g_bs;
	dev->max_sectors = g_max_sectors;
	dev->irqmode = g_irqmode;
	dev->hw_queue_depth = g_hw_queue_depth;
	dev->blocking = g_blocking;
	dev->memory_backed = g_memory_backed;
	dev->discard = g_discard;
	dev->cache_size = g_cache_size;
	dev->mbps = g_mbps;
	dev->use_per_node_hctx = g_use_per_node_hctx;
	dev->zoned = g_zoned;
	dev->zone_size = g_zone_size;
	dev->zone_capacity = g_zone_capacity;
	dev->zone_nr_conv = g_zone_nr_conv;
	dev->zone_max_open = g_zone_max_open;
	dev->zone_max_active = g_zone_max_active;
	dev->zone_append_max_sectors = g_zone_append_max_sectors;
	dev->zone_full = g_zone_full;
	dev->virt_boundary = g_virt_boundary;
	dev->no_sched = g_no_sched;
	dev->shared_tags = g_shared_tags;
	dev->shared_tag_bitmap = g_shared_tag_bitmap;
	dev->fua = g_fua;

	return dev;
}
static void null_free_dev(struct nullb_device *dev)
{
	if (!dev)
		return;

	null_free_zoned_dev(dev);
	badblocks_exit(&dev->badblocks);
	kfree(dev);
}

static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{
	struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);

	blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error);
	return HRTIMER_NORESTART;
}

static void null_cmd_end_timer(struct nullb_cmd *cmd)
{
	ktime_t kt = cmd->nq->dev->completion_nsec;

	hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
}

static void null_complete_rq(struct request *rq)
{
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

	blk_mq_end_request(rq, cmd->error);
}
static struct nullb_page *null_alloc_page(void)
{
	struct nullb_page *t_page;

	t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO);
	if (!t_page)
		return NULL;

	t_page->page = alloc_pages(GFP_NOIO, 0);
	if (!t_page->page) {
		kfree(t_page);
		return NULL;
	}

	memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
	return t_page;
}

static void null_free_page(struct nullb_page *t_page)
{
	__set_bit(NULLB_PAGE_FREE, t_page->bitmap);
	if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
		return;
	__free_page(t_page->page);
	kfree(t_page);
}

static bool null_page_empty(struct nullb_page *page)
{
	int size = MAP_SZ - 2;

	return find_first_bit(page->bitmap, size) == size;
}
static void null_free_sector(struct nullb *nullb, sector_t sector,
			     bool is_cache)
{
	unsigned int sector_bit;
	u64 idx;
	struct nullb_page *t_page, *ret;
	struct radix_tree_root *root;

	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
	idx = sector >> PAGE_SECTORS_SHIFT;
	sector_bit = (sector & SECTOR_MASK);

	t_page = radix_tree_lookup(root, idx);
	if (t_page) {
		__clear_bit(sector_bit, t_page->bitmap);

		if (null_page_empty(t_page)) {
			ret = radix_tree_delete_item(root, idx, t_page);
			WARN_ON(ret != t_page);
			null_free_page(ret);
			if (is_cache)
				nullb->dev->curr_cache -= PAGE_SIZE;
		}
	}
}

static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
	struct nullb_page *t_page, bool is_cache)
{
	struct radix_tree_root *root;

	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;

	if (radix_tree_insert(root, idx, t_page)) {
		null_free_page(t_page);
		t_page = radix_tree_lookup(root, idx);
		WARN_ON(!t_page || t_page->page->index != idx);
	} else if (is_cache)
		nullb->dev->curr_cache += PAGE_SIZE;

	return t_page;
}
static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
{
	unsigned long pos = 0;
	int nr_pages;
	struct nullb_page *ret, *t_pages[FREE_BATCH];
	struct radix_tree_root *root;

	root = is_cache ? &dev->cache : &dev->data;

	do {
		int i;

		nr_pages = radix_tree_gang_lookup(root,
				(void **)t_pages, pos, FREE_BATCH);

		for (i = 0; i < nr_pages; i++) {
			pos = t_pages[i]->page->index;
			ret = radix_tree_delete_item(root, pos, t_pages[i]);
			WARN_ON(ret != t_pages[i]);
			null_free_page(ret);
		}

		pos++;
	} while (nr_pages == FREE_BATCH);

	if (is_cache)
		dev->curr_cache = 0;
}
static struct nullb_page *__null_lookup_page(struct nullb *nullb,
	sector_t sector, bool for_write, bool is_cache)
{
	unsigned int sector_bit;
	u64 idx;
	struct nullb_page *t_page;
	struct radix_tree_root *root;

	idx = sector >> PAGE_SECTORS_SHIFT;
	sector_bit = (sector & SECTOR_MASK);

	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
	t_page = radix_tree_lookup(root, idx);
	WARN_ON(t_page && t_page->page->index != idx);

	if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
		return t_page;

	return NULL;
}

static struct nullb_page *null_lookup_page(struct nullb *nullb,
	sector_t sector, bool for_write, bool ignore_cache)
{
	struct nullb_page *page = NULL;

	if (!ignore_cache)
		page = __null_lookup_page(nullb, sector, for_write, true);
	if (page)
		return page;

	return __null_lookup_page(nullb, sector, for_write, false);
}
static struct nullb_page *null_insert_page(struct nullb *nullb,
					   sector_t sector, bool ignore_cache)
	__releases(&nullb->lock)
	__acquires(&nullb->lock)
{
	u64 idx;
	struct nullb_page *t_page;

	t_page = null_lookup_page(nullb, sector, true, ignore_cache);
	if (t_page)
		return t_page;

	spin_unlock_irq(&nullb->lock);

	t_page = null_alloc_page();
	if (!t_page)
		goto out_lock;

	if (radix_tree_preload(GFP_NOIO))
		goto out_freepage;

	spin_lock_irq(&nullb->lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	t_page->page->index = idx;
	t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
	radix_tree_preload_end();

	return t_page;
out_freepage:
	null_free_page(t_page);
out_lock:
	spin_lock_irq(&nullb->lock);
	return null_lookup_page(nullb, sector, true, ignore_cache);
}
static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
{
	int i;
	unsigned int offset;
	u64 idx;
	struct nullb_page *t_page, *ret;
	void *dst, *src;

	idx = c_page->page->index;

	t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);

	__clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
	if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
		null_free_page(c_page);
		if (t_page && null_page_empty(t_page)) {
			ret = radix_tree_delete_item(&nullb->dev->data,
						     idx, t_page);
			null_free_page(t_page);
		}
		return 0;
	}

	if (!t_page)
		return -ENOMEM;

	src = kmap_local_page(c_page->page);
	dst = kmap_local_page(t_page->page);

	for (i = 0; i < PAGE_SECTORS;
			i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
		if (test_bit(i, c_page->bitmap)) {
			offset = (i << SECTOR_SHIFT);
			memcpy(dst + offset, src + offset,
			       nullb->dev->blocksize);
			__set_bit(i, t_page->bitmap);
		}
	}

	kunmap_local(dst);
	kunmap_local(src);

	ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
	null_free_page(ret);
	nullb->dev->curr_cache -= PAGE_SIZE;

	return 0;
}
static int null_make_cache_space(struct nullb *nullb, unsigned long n)
{
	int i, err, nr_pages;
	struct nullb_page *c_pages[FREE_BATCH];
	unsigned long flushed = 0, one_round;

again:
	if ((nullb->dev->cache_size * 1024 * 1024) >
	     nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
		return 0;

	nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
			(void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
	/*
	 * null_flush_cache_page() could unlock before using the c_pages. To
	 * avoid a race, don't allow the pages to be freed here.
	 */
	for (i = 0; i < nr_pages; i++) {
		nullb->cache_flush_pos = c_pages[i]->page->index;
		/*
		 * We found a page which is being flushed to disk by another
		 * thread.
		 */
		if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
			c_pages[i] = NULL;
		else
			__set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
	}

	one_round = 0;
	for (i = 0; i < nr_pages; i++) {
		if (c_pages[i] == NULL)
			continue;
		err = null_flush_cache_page(nullb, c_pages[i]);
		if (err)
			return err;
		one_round++;
	}
	flushed += one_round << PAGE_SHIFT;

	if (n > flushed) {
		/* not enough space, retry from the beginning */
		nullb->cache_flush_pos = 0;
		if (one_round == 0) {
			/* give other threads a chance */
			spin_unlock_irq(&nullb->lock);
			spin_lock_irq(&nullb->lock);
		}
		goto again;
	}
	return 0;
}
static int copy_to_nullb(struct nullb *nullb, struct page *source,
	unsigned int off, sector_t sector, size_t n, bool is_fua)
{
	size_t temp, count = 0;
	unsigned int offset;
	struct nullb_page *t_page;

	while (count < n) {
		temp = min_t(size_t, nullb->dev->blocksize, n - count);

		if (null_cache_active(nullb) && !is_fua)
			null_make_cache_space(nullb, PAGE_SIZE);

		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
		t_page = null_insert_page(nullb, sector,
			!null_cache_active(nullb) || is_fua);
		if (!t_page)
			return -ENOSPC;

		memcpy_page(t_page->page, offset, source, off + count, temp);

		__set_bit(sector & SECTOR_MASK, t_page->bitmap);

		if (is_fua)
			null_free_sector(nullb, sector, true);

		count += temp;
		sector += temp >> SECTOR_SHIFT;
	}
	return 0;
}
*nullb
, struct page
*dest
,
1141 unsigned int off
, sector_t sector
, size_t n
)
1143 size_t temp
, count
= 0;
1144 unsigned int offset
;
1145 struct nullb_page
*t_page
;
1148 temp
= min_t(size_t, nullb
->dev
->blocksize
, n
- count
);
1150 offset
= (sector
& SECTOR_MASK
) << SECTOR_SHIFT
;
1151 t_page
= null_lookup_page(nullb
, sector
, false,
1152 !null_cache_active(nullb
));
1155 memcpy_page(dest
, off
+ count
, t_page
->page
, offset
,
1158 zero_user(dest
, off
+ count
, temp
);
1161 sector
+= temp
>> SECTOR_SHIFT
;
1166 static void nullb_fill_pattern(struct nullb
*nullb
, struct page
*page
,
1167 unsigned int len
, unsigned int off
)
1169 memset_page(page
, off
, 0xff, len
);
blk_status_t null_handle_discard(struct nullb_device *dev,
				 sector_t sector, sector_t nr_sectors)
{
	struct nullb *nullb = dev->nullb;
	size_t n = nr_sectors << SECTOR_SHIFT;
	size_t temp;

	spin_lock_irq(&nullb->lock);
	while (n > 0) {
		temp = min_t(size_t, n, dev->blocksize);
		null_free_sector(nullb, sector, false);
		if (null_cache_active(nullb))
			null_free_sector(nullb, sector, true);
		sector += temp >> SECTOR_SHIFT;
		n -= temp;
	}
	spin_unlock_irq(&nullb->lock);

	return BLK_STS_OK;
}

static blk_status_t null_handle_flush(struct nullb *nullb)
{
	int err;

	if (!null_cache_active(nullb))
		return 0;

	spin_lock_irq(&nullb->lock);
	while (true) {
		err = null_make_cache_space(nullb,
			nullb->dev->cache_size * 1024 * 1024);
		if (err || nullb->dev->curr_cache == 0)
			break;
	}

	WARN_ON(!radix_tree_empty(&nullb->dev->cache));
	spin_unlock_irq(&nullb->lock);
	return errno_to_blk_status(err);
}
static int null_transfer(struct nullb *nullb, struct page *page,
	unsigned int len, unsigned int off, bool is_write, sector_t sector,
	bool is_fua)
{
	struct nullb_device *dev = nullb->dev;
	unsigned int valid_len = len;
	int err = 0;

	if (!is_write) {
		if (dev->zoned)
			valid_len = null_zone_valid_read_len(nullb,
				sector, len);

		if (valid_len) {
			err = copy_from_nullb(nullb, page, off,
				sector, valid_len);
			off += valid_len;
			len -= valid_len;
		}

		if (len)
			nullb_fill_pattern(nullb, page, len, off);
		flush_dcache_page(page);
	} else {
		flush_dcache_page(page);
		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
	}

	return err;
}
static blk_status_t null_handle_rq(struct nullb_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct nullb *nullb = cmd->nq->dev->nullb;
	int err = 0;
	unsigned int len;
	sector_t sector = blk_rq_pos(rq);
	struct req_iterator iter;
	struct bio_vec bvec;

	spin_lock_irq(&nullb->lock);
	rq_for_each_segment(bvec, rq, iter) {
		len = bvec.bv_len;
		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
				    op_is_write(req_op(rq)), sector,
				    rq->cmd_flags & REQ_FUA);
		if (err)
			break;
		sector += len >> SECTOR_SHIFT;
	}
	spin_unlock_irq(&nullb->lock);

	return errno_to_blk_status(err);
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
{
	struct nullb_device *dev = cmd->nq->dev;
	struct nullb *nullb = dev->nullb;
	blk_status_t sts = BLK_STS_OK;
	struct request *rq = blk_mq_rq_from_pdu(cmd);

	if (!hrtimer_active(&nullb->bw_timer))
		hrtimer_restart(&nullb->bw_timer);

	if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
		blk_mq_stop_hw_queues(nullb->q);
		/* race with timer */
		if (atomic_long_read(&nullb->cur_bytes) > 0)
			blk_mq_start_stopped_hw_queues(nullb->q, true);
		/* requeue request */
		sts = BLK_STS_DEV_RESOURCE;
	}
	return sts;
}

static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
						 sector_t sector,
						 sector_t nr_sectors)
{
	struct badblocks *bb = &cmd->nq->dev->badblocks;
	sector_t first_bad;
	int bad_sectors;

	if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
		return BLK_STS_IOERR;

	return BLK_STS_OK;
}
static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
						     enum req_op op,
						     sector_t sector,
						     sector_t nr_sectors)
{
	struct nullb_device *dev = cmd->nq->dev;

	if (op == REQ_OP_DISCARD)
		return null_handle_discard(dev, sector, nr_sectors);

	return null_handle_rq(cmd);
}

static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct nullb_device *dev = cmd->nq->dev;
	struct bio *bio;

	if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) {
		__rq_for_each_bio(bio, rq)
			zero_fill_bio(bio);
	}
}
static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);

	/*
	 * Since root privileges are required to configure the null_blk
	 * driver, it is fine that this driver does not initialize the
	 * data buffers of read commands. Zero-initialize these buffers
	 * anyway if KMSAN is enabled, to prevent KMSAN from complaining
	 * about null_blk not initializing read data buffers.
	 */
	if (IS_ENABLED(CONFIG_KMSAN))
		nullb_zero_read_cmd_buffer(cmd);

	/* Complete IO by inline, softirq or timer */
	switch (cmd->nq->dev->irqmode) {
	case NULL_IRQ_SOFTIRQ:
		blk_mq_complete_request(rq);
		break;
	case NULL_IRQ_NONE:
		blk_mq_end_request(rq, cmd->error);
		break;
	case NULL_IRQ_TIMER:
		null_cmd_end_timer(cmd);
		break;
	}
}
blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
			      sector_t sector, unsigned int nr_sectors)
{
	struct nullb_device *dev = cmd->nq->dev;
	blk_status_t ret;

	if (dev->badblocks.shift != -1) {
		ret = null_handle_badblocks(cmd, sector, nr_sectors);
		if (ret != BLK_STS_OK)
			return ret;
	}

	if (dev->memory_backed)
		return null_handle_memory_backed(cmd, op, sector, nr_sectors);

	return BLK_STS_OK;
}

static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
			    sector_t nr_sectors, enum req_op op)
{
	struct nullb_device *dev = cmd->nq->dev;
	struct nullb *nullb = dev->nullb;
	blk_status_t sts;

	if (op == REQ_OP_FLUSH) {
		cmd->error = null_handle_flush(nullb);
		goto out;
	}

	if (dev->zoned)
		sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors);
	else
		sts = null_process_cmd(cmd, op, sector, nr_sectors);

	/* Do not overwrite errors (e.g. timeout errors) */
	if (cmd->error == BLK_STS_OK)
		cmd->error = sts;

out:
	nullb_complete_cmd(cmd);
}
static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
{
	struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
	unsigned int mbps = nullb->dev->mbps;

	if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
		return HRTIMER_NORESTART;

	atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
	blk_mq_start_stopped_hw_queues(nullb->q, true);

	hrtimer_forward_now(&nullb->bw_timer, timer_interval);

	return HRTIMER_RESTART;
}

static void nullb_setup_bwtimer(struct nullb *nullb)
{
	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);

	hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	nullb->bw_timer.function = nullb_bwtimer_fn;
	atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
	hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
}
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION

static bool should_timeout_request(struct request *rq)
{
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct nullb_device *dev = cmd->nq->dev;

	return should_fail(&dev->timeout_config.attr, 1);
}

static bool should_requeue_request(struct request *rq)
{
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct nullb_device *dev = cmd->nq->dev;

	return should_fail(&dev->requeue_config.attr, 1);
}

static bool should_init_hctx_fail(struct nullb_device *dev)
{
	return should_fail(&dev->init_hctx_fault_config.attr, 1);
}

#else

static bool should_timeout_request(struct request *rq)
{
	return false;
}

static bool should_requeue_request(struct request *rq)
{
	return false;
}

static bool should_init_hctx_fail(struct nullb_device *dev)
{
	return false;
}

#endif
static void null_map_queues(struct blk_mq_tag_set *set)
{
	struct nullb *nullb = set->driver_data;
	int i, qoff;
	unsigned int submit_queues = g_submit_queues;
	unsigned int poll_queues = g_poll_queues;

	if (nullb) {
		struct nullb_device *dev = nullb->dev;

		/*
		 * Refer to nr_hw_queues of the tag set to check if the
		 * expected number of hardware queues are prepared. If the
		 * block layer failed to prepare them, use the previous
		 * numbers of submit queues and poll queues to map queues.
		 */
		if (set->nr_hw_queues ==
		    dev->submit_queues + dev->poll_queues) {
			submit_queues = dev->submit_queues;
			poll_queues = dev->poll_queues;
		} else if (set->nr_hw_queues ==
			   dev->prev_submit_queues + dev->prev_poll_queues) {
			submit_queues = dev->prev_submit_queues;
			poll_queues = dev->prev_poll_queues;
		} else {
			pr_warn("tag set has unexpected nr_hw_queues: %d\n",
				set->nr_hw_queues);
			WARN_ON_ONCE(true);
			submit_queues = 1;
			poll_queues = 0;
		}
	}

	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		switch (i) {
		case HCTX_TYPE_DEFAULT:
			map->nr_queues = submit_queues;
			break;
		case HCTX_TYPE_READ:
			map->nr_queues = 0;
			continue;
		case HCTX_TYPE_POLL:
			map->nr_queues = poll_queues;
			break;
		}
		map->queue_offset = qoff;
		qoff += map->nr_queues;
		blk_mq_map_queues(map);
	}
}
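/*
 * Example mapping, assuming submit_queues=2 and poll_queues=2: the tag set
 * has four hardware queues, HCTX_TYPE_DEFAULT covers hw queues 0-1,
 * HCTX_TYPE_READ gets no queues of its own, and HCTX_TYPE_POLL covers hw
 * queues 2-3 (queue_offset 2).
 */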
static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct nullb_queue *nq = hctx->driver_data;
	struct request *rq;
	LIST_HEAD(list);
	int nr = 0;

	spin_lock(&nq->poll_lock);
	list_splice_init(&nq->poll_list, &list);
	list_for_each_entry(rq, &list, queuelist)
		blk_mq_set_request_complete(rq);
	spin_unlock(&nq->poll_lock);

	while (!list_empty(&list)) {
		struct nullb_cmd *cmd;
		struct request *req;

		req = list_first_entry(&list, struct request, queuelist);
		list_del_init(&req->queuelist);
		cmd = blk_mq_rq_to_pdu(req);
		cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
					      blk_rq_sectors(req));
		if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
					 blk_mq_end_request_batch))
			blk_mq_end_request(req, cmd->error);
		nr++;
	}

	return nr;
}
null_timeout_rq(struct request
*rq
)
1555 struct blk_mq_hw_ctx
*hctx
= rq
->mq_hctx
;
1556 struct nullb_cmd
*cmd
= blk_mq_rq_to_pdu(rq
);
1558 if (hctx
->type
== HCTX_TYPE_POLL
) {
1559 struct nullb_queue
*nq
= hctx
->driver_data
;
1561 spin_lock(&nq
->poll_lock
);
1562 /* The request may have completed meanwhile. */
1563 if (blk_mq_request_completed(rq
)) {
1564 spin_unlock(&nq
->poll_lock
);
1567 list_del_init(&rq
->queuelist
);
1568 spin_unlock(&nq
->poll_lock
);
1571 pr_info("rq %p timed out\n", rq
);
1574 * If the device is marked as blocking (i.e. memory backed or zoned
1575 * device), the submission path may be blocked waiting for resources
1576 * and cause real timeouts. For these real timeouts, the submission
1577 * path will complete the request using blk_mq_complete_request().
1578 * Only fake timeouts need to execute blk_mq_complete_request() here.
1580 cmd
->error
= BLK_STS_TIMEOUT
;
1581 if (cmd
->fake_timeout
|| hctx
->type
== HCTX_TYPE_POLL
)
1582 blk_mq_complete_request(rq
);
static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
				  const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct nullb_queue *nq = hctx->driver_data;
	sector_t nr_sectors = blk_rq_sectors(rq);
	sector_t sector = blk_rq_pos(rq);
	const bool is_poll = hctx->type == HCTX_TYPE_POLL;

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

	if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) {
		hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		cmd->timer.function = null_cmd_timer_expired;
	}
	cmd->error = BLK_STS_OK;
	cmd->nq = nq;
	cmd->fake_timeout = should_timeout_request(rq) ||
		blk_should_fake_timeout(rq->q);

	if (should_requeue_request(rq)) {
		/*
		 * Alternate between hitting the core BUSY path, and the
		 * driver driven requeue path.
		 */
		nq->requeue_selection++;
		if (nq->requeue_selection & 1)
			return BLK_STS_RESOURCE;
		blk_mq_requeue_request(rq, true);
		return BLK_STS_OK;
	}

	if (test_bit(NULLB_DEV_FL_THROTTLED, &nq->dev->flags)) {
		blk_status_t sts = null_handle_throttled(cmd);

		if (sts != BLK_STS_OK)
			return sts;
	}

	blk_mq_start_request(rq);

	if (is_poll) {
		spin_lock(&nq->poll_lock);
		list_add_tail(&rq->queuelist, &nq->poll_list);
		spin_unlock(&nq->poll_lock);
		return BLK_STS_OK;
	}
	if (cmd->fake_timeout)
		return BLK_STS_OK;

	null_handle_cmd(cmd, sector, nr_sectors, req_op(rq));
	return BLK_STS_OK;
}

static void null_queue_rqs(struct rq_list *rqlist)
{
	struct rq_list requeue_list = {};
	struct blk_mq_queue_data bd = { };
	blk_status_t ret;

	do {
		struct request *rq = rq_list_pop(rqlist);

		bd.rq = rq;
		ret = null_queue_rq(rq->mq_hctx, &bd);
		if (ret != BLK_STS_OK)
			rq_list_add_tail(&requeue_list, rq);
	} while (!rq_list_empty(rqlist));

	*rqlist = requeue_list;
}

static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{
	nq->dev = nullb->dev;
	INIT_LIST_HEAD(&nq->poll_list);
	spin_lock_init(&nq->poll_lock);
}

static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
			  unsigned int hctx_idx)
{
	struct nullb *nullb = hctx->queue->queuedata;
	struct nullb_queue *nq;

	if (should_init_hctx_fail(nullb->dev))
		return -EFAULT;

	nq = &nullb->queues[hctx_idx];
	hctx->driver_data = nq;
	null_init_queue(nullb, nq);

	return 0;
}

static const struct blk_mq_ops null_mq_ops = {
	.queue_rq	= null_queue_rq,
	.queue_rqs	= null_queue_rqs,
	.complete	= null_complete_rq,
	.timeout	= null_timeout_rq,
	.poll		= null_poll,
	.map_queues	= null_map_queues,
	.init_hctx	= null_init_hctx,
};
*nullb
)
1694 struct nullb_device
*dev
;
1701 ida_free(&nullb_indexes
, nullb
->index
);
1703 list_del_init(&nullb
->list
);
1705 del_gendisk(nullb
->disk
);
1707 if (test_bit(NULLB_DEV_FL_THROTTLED
, &nullb
->dev
->flags
)) {
1708 hrtimer_cancel(&nullb
->bw_timer
);
1709 atomic_long_set(&nullb
->cur_bytes
, LONG_MAX
);
1710 blk_mq_start_stopped_hw_queues(nullb
->q
, true);
1713 put_disk(nullb
->disk
);
1714 if (nullb
->tag_set
== &nullb
->__tag_set
)
1715 blk_mq_free_tag_set(nullb
->tag_set
);
1716 kfree(nullb
->queues
);
1717 if (null_cache_active(nullb
))
1718 null_free_device_storage(nullb
->dev
, true);
1723 static void null_config_discard(struct nullb
*nullb
, struct queue_limits
*lim
)
1725 if (nullb
->dev
->discard
== false)
1728 if (!nullb
->dev
->memory_backed
) {
1729 nullb
->dev
->discard
= false;
1730 pr_info("discard option is ignored without memory backing\n");
1734 if (nullb
->dev
->zoned
) {
1735 nullb
->dev
->discard
= false;
1736 pr_info("discard option is ignored in zoned mode\n");
1740 lim
->max_hw_discard_sectors
= UINT_MAX
>> 9;
1743 static const struct block_device_operations null_ops
= {
1744 .owner
= THIS_MODULE
,
1745 .report_zones
= null_report_zones
,
1748 static int setup_queues(struct nullb
*nullb
)
1750 int nqueues
= nr_cpu_ids
;
1753 nqueues
+= g_poll_queues
;
1755 nullb
->queues
= kcalloc(nqueues
, sizeof(struct nullb_queue
),
1763 static int null_init_tag_set(struct blk_mq_tag_set
*set
, int poll_queues
)
1765 set
->ops
= &null_mq_ops
;
1766 set
->cmd_size
= sizeof(struct nullb_cmd
);
1767 set
->timeout
= 5 * HZ
;
1770 set
->nr_hw_queues
+= poll_queues
;
1773 return blk_mq_alloc_tag_set(set
);
1776 static int null_init_global_tag_set(void)
1783 tag_set
.nr_hw_queues
= g_submit_queues
;
1784 tag_set
.queue_depth
= g_hw_queue_depth
;
1785 tag_set
.numa_node
= g_home_node
;
1786 tag_set
.flags
= BLK_MQ_F_SHOULD_MERGE
;
1788 tag_set
.flags
|= BLK_MQ_F_NO_SCHED
;
1789 if (g_shared_tag_bitmap
)
1790 tag_set
.flags
|= BLK_MQ_F_TAG_HCTX_SHARED
;
1792 tag_set
.flags
|= BLK_MQ_F_BLOCKING
;
1794 error
= null_init_tag_set(&tag_set
, g_poll_queues
);
1800 static int null_setup_tagset(struct nullb
*nullb
)
1802 if (nullb
->dev
->shared_tags
) {
1803 nullb
->tag_set
= &tag_set
;
1804 return null_init_global_tag_set();
1807 nullb
->tag_set
= &nullb
->__tag_set
;
1808 nullb
->tag_set
->driver_data
= nullb
;
1809 nullb
->tag_set
->nr_hw_queues
= nullb
->dev
->submit_queues
;
1810 nullb
->tag_set
->queue_depth
= nullb
->dev
->hw_queue_depth
;
1811 nullb
->tag_set
->numa_node
= nullb
->dev
->home_node
;
1812 nullb
->tag_set
->flags
= BLK_MQ_F_SHOULD_MERGE
;
1813 if (nullb
->dev
->no_sched
)
1814 nullb
->tag_set
->flags
|= BLK_MQ_F_NO_SCHED
;
1815 if (nullb
->dev
->shared_tag_bitmap
)
1816 nullb
->tag_set
->flags
|= BLK_MQ_F_TAG_HCTX_SHARED
;
1817 if (nullb
->dev
->blocking
)
1818 nullb
->tag_set
->flags
|= BLK_MQ_F_BLOCKING
;
1819 return null_init_tag_set(nullb
->tag_set
, nullb
->dev
->poll_queues
);
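/*
 * Note on tag sets: with shared_tags=1 every nullb device queues on the single
 * module-wide tag_set, which is sized from the g_* module parameters, while
 * the default is a private __tag_set per device sized from that device's own
 * (configfs-adjustable) values.
 */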
static int null_validate_conf(struct nullb_device *dev)
{
	if (dev->queue_mode == NULL_Q_RQ) {
		pr_err("legacy IO path is no longer available\n");
		return -EINVAL;
	}
	if (dev->queue_mode == NULL_Q_BIO) {
		pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n");
		dev->queue_mode = NULL_Q_MQ;
	}

	if (dev->use_per_node_hctx) {
		if (dev->submit_queues != nr_online_nodes)
			dev->submit_queues = nr_online_nodes;
	} else if (dev->submit_queues > nr_cpu_ids)
		dev->submit_queues = nr_cpu_ids;
	else if (dev->submit_queues == 0)
		dev->submit_queues = 1;
	dev->prev_submit_queues = dev->submit_queues;

	if (dev->poll_queues > g_poll_queues)
		dev->poll_queues = g_poll_queues;
	dev->prev_poll_queues = dev->poll_queues;
	dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);

	/* Do memory allocation, so set blocking */
	if (dev->memory_backed)
		dev->blocking = true;
	else /* cache is meaningless */
		dev->cache_size = 0;
	dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
				dev->cache_size);
	dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);

	if (dev->zoned &&
	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
		pr_err("zone_size must be power-of-two\n");
		return -EINVAL;
	}

	return 0;
}
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static bool __null_setup_fault(struct fault_attr *attr, char *str)
{
	if (!str[0])
		return true;

	if (!setup_fault_attr(attr, str))
		return false;

	attr->verbose = 0;
	return true;
}
#endif

static bool null_setup_fault(void)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
		return false;
	if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
		return false;
	if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str))
		return false;
#endif
	return true;
}
static int null_add_dev(struct nullb_device *dev)
{
	struct queue_limits lim = {
		.logical_block_size	= dev->blocksize,
		.physical_block_size	= dev->blocksize,
		.max_hw_sectors		= dev->max_sectors,
	};
	struct nullb *nullb;
	int rv;

	rv = null_validate_conf(dev);
	if (rv)
		return rv;

	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
	if (!nullb) {
		rv = -ENOMEM;
		goto out;
	}
	nullb->dev = dev;
	dev->nullb = nullb;

	spin_lock_init(&nullb->lock);

	rv = setup_queues(nullb);
	if (rv)
		goto out_free_nullb;

	rv = null_setup_tagset(nullb);
	if (rv)
		goto out_cleanup_queues;

	if (dev->virt_boundary)
		lim.virt_boundary_mask = PAGE_SIZE - 1;
	null_config_discard(nullb, &lim);
	if (dev->zoned) {
		rv = null_init_zoned_dev(dev, &lim);
		if (rv)
			goto out_cleanup_tags;
	}

	if (dev->cache_size > 0) {
		set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
		lim.features |= BLK_FEAT_WRITE_CACHE;
		if (dev->fua)
			lim.features |= BLK_FEAT_FUA;
	}

	nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb);
	if (IS_ERR(nullb->disk)) {
		rv = PTR_ERR(nullb->disk);
		goto out_cleanup_zone;
	}
	nullb->q = nullb->disk->queue;

	if (dev->mbps) {
		set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
		nullb_setup_bwtimer(nullb);
	}

	nullb->q->queuedata = nullb;

	rv = ida_alloc(&nullb_indexes, GFP_KERNEL);
	if (rv < 0)
		goto out_cleanup_disk;

	nullb->index = rv;
	dev->index = rv;

	if (config_item_name(&dev->group.cg_item)) {
		/* Use configfs dir name as the device name */
		snprintf(nullb->disk_name, sizeof(nullb->disk_name),
			 "%s", config_item_name(&dev->group.cg_item));
	} else {
		sprintf(nullb->disk_name, "nullb%d", nullb->index);
	}

	set_capacity(nullb->disk,
		((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT);
	nullb->disk->major = null_major;
	nullb->disk->first_minor = nullb->index;
	nullb->disk->minors = 1;
	nullb->disk->fops = &null_ops;
	nullb->disk->private_data = nullb;
	strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN);

	if (nullb->dev->zoned) {
		rv = null_register_zoned_dev(nullb);
		if (rv)
			goto out_ida_free;
	}

	rv = add_disk(nullb->disk);
	if (rv)
		goto out_ida_free;

	list_add_tail(&nullb->list, &nullb_list);

	pr_info("disk %s created\n", nullb->disk_name);

	return 0;

out_ida_free:
	ida_free(&nullb_indexes, nullb->index);
out_cleanup_disk:
	put_disk(nullb->disk);
out_cleanup_zone:
	null_free_zoned_dev(dev);
out_cleanup_tags:
	if (nullb->tag_set == &nullb->__tag_set)
		blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues:
	kfree(nullb->queues);
out_free_nullb:
	kfree(nullb);
	dev->nullb = NULL;
out:
	return rv;
}
static struct nullb *null_find_dev_by_name(const char *name)
{
	struct nullb *nullb = NULL, *nb;

	mutex_lock(&lock);
	list_for_each_entry(nb, &nullb_list, list) {
		if (strcmp(nb->disk_name, name) == 0) {
			nullb = nb;
			break;
		}
	}
	mutex_unlock(&lock);

	return nullb;
}

static int null_create_dev(void)
{
	struct nullb_device *dev;
	int ret;

	dev = null_alloc_dev();
	if (!dev)
		return -ENOMEM;

	mutex_lock(&lock);
	ret = null_add_dev(dev);
	mutex_unlock(&lock);
	if (ret) {
		null_free_dev(dev);
		return ret;
	}

	return 0;
}

static void null_destroy_dev(struct nullb *nullb)
{
	struct nullb_device *dev = nullb->dev;

	null_del_dev(nullb);
	null_free_device_storage(dev, false);
	null_free_dev(dev);
}
static int __init null_init(void)
{
	int ret = 0;
	unsigned int i;
	struct nullb *nullb;

	if (g_bs > PAGE_SIZE) {
		pr_warn("invalid block size\n");
		pr_warn("defaults block size to %lu\n", PAGE_SIZE);
		g_bs = PAGE_SIZE;
	}

	if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
		pr_err("invalid home_node value\n");
		g_home_node = NUMA_NO_NODE;
	}

	if (!null_setup_fault())
		return -EINVAL;

	if (g_queue_mode == NULL_Q_RQ) {
		pr_err("legacy IO path is no longer available\n");
		return -EINVAL;
	}

	if (g_use_per_node_hctx) {
		if (g_submit_queues != nr_online_nodes) {
			pr_warn("submit_queues param is set to %u.\n",
				nr_online_nodes);
			g_submit_queues = nr_online_nodes;
		}
	} else if (g_submit_queues > nr_cpu_ids) {
		g_submit_queues = nr_cpu_ids;
	} else if (g_submit_queues <= 0) {
		g_submit_queues = 1;
	}

	config_group_init(&nullb_subsys.su_group);
	mutex_init(&nullb_subsys.su_mutex);

	ret = configfs_register_subsystem(&nullb_subsys);
	if (ret)
		return ret;

	mutex_init(&lock);

	null_major = register_blkdev(0, "nullb");
	if (null_major < 0) {
		ret = null_major;
		goto err_conf;
	}

	for (i = 0; i < nr_devices; i++) {
		ret = null_create_dev();
		if (ret)
			goto err_dev;
	}

	pr_info("module loaded\n");
	return 0;

err_dev:
	while (!list_empty(&nullb_list)) {
		nullb = list_entry(nullb_list.next, struct nullb, list);
		null_destroy_dev(nullb);
	}
	unregister_blkdev(null_major, "nullb");
err_conf:
	configfs_unregister_subsystem(&nullb_subsys);
	return ret;
}
static void __exit null_exit(void)
{
	struct nullb *nullb;

	configfs_unregister_subsystem(&nullb_subsys);

	unregister_blkdev(null_major, "nullb");

	mutex_lock(&lock);
	while (!list_empty(&nullb_list)) {
		nullb = list_entry(nullb_list.next, struct nullb, list);
		null_destroy_dev(nullb);
	}
	mutex_unlock(&lock);

	if (tag_set.ops)
		blk_mq_free_tag_set(&tag_set);

	mutex_destroy(&lock);
}

module_init(null_init);
module_exit(null_exit);
MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
MODULE_DESCRIPTION("multi queue aware block test driver");
MODULE_LICENSE("GPL");