// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Device-mapper target to emulate smaller logical block
 * size on backing devices exposing (natively) larger ones.
 *
 * E.g. 512 byte sector emulation on 4K native disks.
 */

#include "dm.h"

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/dm-bufio.h>

#define DM_MSG_PREFIX "ebs"

static void ebs_dtr(struct dm_target *ti);

/* Emulated block size context. */
struct ebs_c {
        struct dm_dev *dev;             /* Underlying device to emulate block size on. */
        struct dm_bufio_client *bufio;  /* Use dm-bufio for read and read-modify-write processing. */
        struct workqueue_struct *wq;    /* Workqueue for ^ processing of bios. */
        struct work_struct ws;          /* Work item used for ^. */
        struct bio_list bios_in;        /* Worker bios input list. */
        spinlock_t lock;                /* Guard bios input list above. */
        sector_t start;                 /* <start> table line argument, see ebs_ctr below. */
        unsigned int e_bs;              /* Emulated block size in sectors exposed to upper layer. */
        unsigned int u_bs;              /* Underlying block size in sectors retrieved from/set on lower layer device. */
        unsigned char block_shift;      /* bitshift sectors -> blocks used in dm-bufio API. */
        bool u_bs_set:1;                /* Flag to indicate underlying block size is set on table line. */
};

static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector)
{
        return sector >> ec->block_shift;
}

static inline sector_t __block_mod(sector_t sector, unsigned int bs)
{
        return sector & (bs - 1);
}

/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */
static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio)
{
        sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio);

        return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0);
}

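/*
 * Worked example (illustrative values): with ec->u_bs = 8 (a 4 KiB underlying
 * block in 512 byte sectors, block_shift = 3), a bio starting at sector 3 with
 * bio_sectors() = 10 yields end_sector = 3 + 10 = 13, i.e. 13 >> 3 = 1 whole
 * block plus a 5 sector remainder, so __nr_blocks() returns 2 underlying
 * blocks touched by the bio.
 */
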
static inline bool __ebs_check_bs(unsigned int bs)
{
        return bs && is_power_of_2(bs);
}

/*
 * READ/WRITE:
 *
 * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages.
 */
static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv,
                         struct bvec_iter *iter)
{
        int r = 0;
        unsigned char *ba, *pa;
        unsigned int cur_len;
        unsigned int bv_len = bv->bv_len;
        unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs));
        sector_t block = __sector_to_block(ec, iter->bi_sector);
        struct dm_buffer *b;

        if (unlikely(!bv->bv_page || !bv_len))
                return -EIO;

        pa = bvec_virt(bv);

        /* Handle overlapping page <-> blocks */
        while (bv_len) {
                cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len);

                /* Avoid reading for writes in case bio vector's page overwrites block completely. */
                if (op == REQ_OP_READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio))
                        ba = dm_bufio_read(ec->bufio, block, &b);
                else
                        ba = dm_bufio_new(ec->bufio, block, &b);

                if (IS_ERR(ba)) {
                        /*
                         * Carry on with next buffer, if any, to issue all possible
                         * data but return error.
                         */
                        r = PTR_ERR(ba);
                } else {
                        /* Copy data to/from bio to buffer if read/new was successful above. */
                        ba += buf_off;
                        if (op == REQ_OP_READ) {
                                memcpy(pa, ba, cur_len);
                                flush_dcache_page(bv->bv_page);
                        } else {
                                flush_dcache_page(bv->bv_page);
                                memcpy(ba, pa, cur_len);
                                dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len);
                        }

                        dm_bufio_release(b);
                }

                pa += cur_len;
                bv_len -= cur_len;
                buf_off = 0;
                block++;
        }

        return r;
}

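/*
 * Example of the read-modify-write path above (illustrative values): a
 * 512 byte write landing at byte offset 1024 of a 4096 byte underlying block
 * has buf_off = 1024 and bv_len = 512 < block size, so the block is fetched
 * via dm_bufio_read(), the 512 bytes are copied in at that offset, and only
 * bytes 1024..1536 are marked dirty via dm_bufio_mark_partial_buffer_dirty().
 */
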
/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */
static int __ebs_rw_bio(struct ebs_c *ec, enum req_op op, struct bio *bio)
{
        int r = 0, rr;
        struct bio_vec bv;
        struct bvec_iter iter;

        bio_for_each_bvec(bv, bio, iter) {
                rr = __ebs_rw_bvec(ec, op, &bv, &iter);
                if (rr)
                        r = rr;
        }

        return r;
}

/*
 * Discard bio's blocks, i.e. pass discards down.
 *
 * Avoid discarding partial blocks at beginning and end;
 * return 0 in case no blocks can be discarded as a result.
 */
static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio)
{
        sector_t block, blocks, sector = bio->bi_iter.bi_sector;

        block = __sector_to_block(ec, sector);
        blocks = __nr_blocks(ec, bio);

        /*
         * Partial first underlying block (__nr_blocks() may have
         * resulted in one block).
         */
        if (__block_mod(sector, ec->u_bs)) {
                block++;
                blocks--;
        }

        /* Partial last underlying block if any. */
        if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs))
                blocks--;

        return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0;
}

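/*
 * Example of the partial-block trimming above (illustrative values): with
 * ec->u_bs = 8, a discard covering sectors 3..20 initially maps to blocks
 * 0..2.  The misaligned head (sectors 3..7) and tail (sectors 16..20) are
 * dropped, leaving only block 1 (sectors 8..15) to be discarded on the
 * lower device.
 */
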
/* Release the bio's blocks from the bufio cache. */
static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio)
{
        sector_t blocks, sector = bio->bi_iter.bi_sector;

        blocks = __nr_blocks(ec, bio);

        dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks);
}

/* Worker function to process incoming bios. */
static void __ebs_process_bios(struct work_struct *ws)
{
        int r;
        bool write = false;
        sector_t block1, block2;
        struct ebs_c *ec = container_of(ws, struct ebs_c, ws);
        struct bio *bio;
        struct bio_list bios;

        bio_list_init(&bios);

        spin_lock_irq(&ec->lock);
        bios = ec->bios_in;
        bio_list_init(&ec->bios_in);
        spin_unlock_irq(&ec->lock);

        /* Prefetch all read and any mis-aligned write buffers */
        bio_list_for_each(bio, &bios) {
                block1 = __sector_to_block(ec, bio->bi_iter.bi_sector);
                if (bio_op(bio) == REQ_OP_READ)
                        dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio));
                else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) {
                        block2 = __sector_to_block(ec, bio_end_sector(bio));
                        if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs))
                                dm_bufio_prefetch(ec->bufio, block1, 1);
                        if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1)
                                dm_bufio_prefetch(ec->bufio, block2, 1);
                }
        }

        bio_list_for_each(bio, &bios) {
                r = -EIO;
                if (bio_op(bio) == REQ_OP_READ)
                        r = __ebs_rw_bio(ec, REQ_OP_READ, bio);
                else if (bio_op(bio) == REQ_OP_WRITE) {
                        write = true;
                        r = __ebs_rw_bio(ec, REQ_OP_WRITE, bio);
                } else if (bio_op(bio) == REQ_OP_DISCARD) {
                        __ebs_forget_bio(ec, bio);
                        r = __ebs_discard_bio(ec, bio);
                }

                if (r < 0)
                        bio->bi_status = errno_to_blk_status(r);
        }

        /*
         * We write dirty buffers after processing I/O on them
         * but before we endio thus addressing REQ_FUA/REQ_SYNC.
         */
        r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0;

        while ((bio = bio_list_pop(&bios))) {
                /* Any other request is endioed. */
                if (unlikely(r && bio_op(bio) == REQ_OP_WRITE))
                        bio_io_error(bio);
                else
                        bio_endio(bio);
        }
}

/*
 * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>]
 *
 * <dev_path>: path of the underlying device
 * <offset>: offset in 512 bytes sectors into <dev_path>
 * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer
 * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer;
 *          optional, if not supplied, retrieve logical block size from underlying device
 */
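/*
 * Example table line (hypothetical device path and size): emulate 512 byte
 * logical blocks on a 4K-native /dev/sdX spanning 2097152 sectors:
 *
 *      0 2097152 ebs /dev/sdX 0 1 8
 *
 * The trailing "8" (<ubs>) may be omitted to have the logical block size of
 * /dev/sdX probed instead.
 */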
static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        int r;
        unsigned short tmp1;
        unsigned long long tmp;
        char dummy;
        struct ebs_c *ec;

        if (argc < 3 || argc > 4) {
                ti->error = "Invalid argument count";
                return -EINVAL;
        }

        ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL);
        if (!ec) {
                ti->error = "Cannot allocate ebs context";
                return -ENOMEM;
        }

        r = -EINVAL;
        if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 ||
            tmp != (sector_t)tmp ||
            (sector_t)tmp >= ti->len) {
                ti->error = "Invalid device offset sector";
                goto bad;
        }
        ec->start = tmp;

        if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 ||
            !__ebs_check_bs(tmp1) ||
            to_bytes(tmp1) > PAGE_SIZE) {
                ti->error = "Invalid emulated block size";
                goto bad;
        }
        ec->e_bs = tmp1;

        if (argc > 3) {
                if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) {
                        ti->error = "Invalid underlying block size";
                        goto bad;
                }
                ec->u_bs = tmp1;
                ec->u_bs_set = true;
        } else
                ec->u_bs_set = false;

        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev);
        if (r) {
                ti->error = "Device lookup failed";
                ec->dev = NULL;
                goto bad;
        }

        r = -EINVAL;
        if (!ec->u_bs_set) {
                ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev));
                if (!__ebs_check_bs(ec->u_bs)) {
                        ti->error = "Invalid retrieved underlying block size";
                        goto bad;
                }
        }

        if (!ec->u_bs_set && ec->e_bs == ec->u_bs)
                DMINFO("Emulation superfluous: emulated equal to underlying block size");

        if (__block_mod(ec->start, ec->u_bs)) {
                ti->error = "Device offset must be multiple of underlying block size";
                goto bad;
        }

        ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1,
                                           0, NULL, NULL, 0);
        if (IS_ERR(ec->bufio)) {
                ti->error = "Cannot create dm bufio client";
                r = PTR_ERR(ec->bufio);
                ec->bufio = NULL;
                goto bad;
        }

        ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
        if (!ec->wq) {
                ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue";
                r = -ENOMEM;
                goto bad;
        }

        ec->block_shift = __ffs(ec->u_bs);
        INIT_WORK(&ec->ws, &__ebs_process_bios);
        bio_list_init(&ec->bios_in);
        spin_lock_init(&ec->lock);

        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_secure_erase_bios = 0;
        ti->num_write_zeroes_bios = 0;
        return 0;
bad:
        ebs_dtr(ti);
        return r;
}

static void ebs_dtr(struct dm_target *ti)
{
        struct ebs_c *ec = ti->private;

        if (ec->wq)
                destroy_workqueue(ec->wq);
        if (ec->bufio)
                dm_bufio_client_destroy(ec->bufio);
        if (ec->dev)
                dm_put_device(ti, ec->dev);
        kfree(ec);
}

static int ebs_map(struct dm_target *ti, struct bio *bio)
{
        struct ebs_c *ec = ti->private;

        bio_set_dev(bio, ec->dev->bdev);
        bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

        if (unlikely(bio_op(bio) == REQ_OP_FLUSH))
                return DM_MAPIO_REMAPPED;
        /*
         * Only queue for bufio processing in case of partial or overlapping buffers
         * -or-
         * emulation with ebs == ubs aiming for tests of dm-bufio overhead.
         */
        if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) ||
                   __block_mod(bio_end_sector(bio), ec->u_bs) ||
                   ec->e_bs == ec->u_bs)) {
                spin_lock_irq(&ec->lock);
                bio_list_add(&ec->bios_in, bio);
                spin_unlock_irq(&ec->lock);

                queue_work(ec->wq, &ec->ws);

                return DM_MAPIO_SUBMITTED;
        }

        /* Forget any buffer content relative to this direct backing device I/O. */
        __ebs_forget_bio(ec, bio);

        return DM_MAPIO_REMAPPED;
}

static void ebs_status(struct dm_target *ti, status_type_t type,
                       unsigned int status_flags, char *result, unsigned int maxlen)
{
        struct ebs_c *ec = ti->private;

        switch (type) {
        case STATUSTYPE_INFO:
                *result = '\0';
                break;
        case STATUSTYPE_TABLE:
                snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u",
                         ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs);
                break;
        case STATUSTYPE_IMA:
                *result = '\0';
                break;
        }
}

static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
        struct ebs_c *ec = ti->private;
        struct dm_dev *dev = ec->dev;

        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
        *bdev = dev->bdev;
        return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev));
}

static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
        struct ebs_c *ec = ti->private;

        limits->logical_block_size = to_bytes(ec->e_bs);
        limits->physical_block_size = to_bytes(ec->u_bs);
        limits->alignment_offset = limits->physical_block_size;
        limits->io_min = limits->logical_block_size;
}

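/*
 * For the 512 byte on 4K example (e_bs = 1, u_bs = 8) this advertises a
 * 512 byte logical block size and io_min together with a 4096 byte physical
 * block size, so upper layers can issue 512 byte I/O while the larger
 * underlying granularity remains visible as a hint.
 */
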
static int ebs_iterate_devices(struct dm_target *ti,
                               iterate_devices_callout_fn fn, void *data)
{
        struct ebs_c *ec = ti->private;

        return fn(ti, ec->dev, ec->start, ti->len, data);
}

static struct target_type ebs_target = {
        .name            = "ebs",
        .version         = {1, 0, 1},
        .features        = DM_TARGET_PASSES_INTEGRITY,
        .module          = THIS_MODULE,
        .ctr             = ebs_ctr,
        .dtr             = ebs_dtr,
        .map             = ebs_map,
        .status          = ebs_status,
        .io_hints        = ebs_io_hints,
        .prepare_ioctl   = ebs_prepare_ioctl,
        .iterate_devices = ebs_iterate_devices,
};
module_dm(ebs);

MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@lists.linux.dev>");
MODULE_DESCRIPTION(DM_NAME " emulated block size target");
MODULE_LICENSE("GPL");