/*
 * Copyright (C) 2020 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Device-mapper target to emulate smaller logical block
 * size on backing devices exposing (natively) larger ones.
 *
 * E.g. 512 byte sector emulation on 4K native disks.
 */

#include "dm.h"
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/dm-bufio.h>

#define DM_MSG_PREFIX "ebs"

static void ebs_dtr(struct dm_target *ti);

/* Emulated block size context. */
struct ebs_c {
	struct dm_dev *dev;		/* Underlying device to emulate block size on. */
	struct dm_bufio_client *bufio;	/* Use dm-bufio for read and read-modify-write processing. */
	struct workqueue_struct *wq;	/* Workqueue for ^ processing of bios. */
	struct work_struct ws;		/* Work item used for ^. */
	struct bio_list bios_in;	/* Worker bios input list. */
	spinlock_t lock;		/* Guard bios input list above. */
	sector_t start;			/* <start> table line argument, see ebs_ctr below. */
	unsigned int e_bs;		/* Emulated block size in sectors exposed to upper layer. */
	unsigned int u_bs;		/* Underlying block size in sectors retrieved from/set on lower layer device. */
	unsigned char block_shift;	/* Bitshift sectors -> blocks used in dm-bufio API. */
	bool u_bs_set:1;		/* Flag to indicate underlying block size is set on table line. */
};

static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector)
{
	return sector >> ec->block_shift;
}

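/* sector % bs, with bs a power of 2 (enforced by __ebs_check_bs() below); avoids a division. */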
static inline sector_t __block_mod(sector_t sector, unsigned int bs)
{
	return sector & (bs - 1);
}

/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */
static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio)
{
	sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio);

	return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0);
}

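/*
 * Worked example for __nr_blocks(), assuming u_bs = 8 (i.e. 4K underlying
 * blocks): a 4 sector bio starting at sector 6 gives end_sector =
 * __block_mod(6, 8) + 4 = 10; __sector_to_block() yields 1 full block and
 * the remaining partial block adds 1, i.e. 2 blocks, matching the two
 * underlying blocks the bio actually touches.
 */
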
static inline bool __ebs_check_bs(unsigned int bs)
{
	return bs && is_power_of_2(bs);
}

/* READ/WRITE: copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. */
static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter)
{
	int r = 0;
	unsigned char *ba, *pa;
	unsigned int cur_len;
	unsigned int bv_len = bv->bv_len;
	unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs));
	sector_t block = __sector_to_block(ec, iter->bi_sector);
	struct dm_buffer *b;

	if (unlikely(!bv->bv_page || !bv_len))
		return -EIO;

	pa = page_address(bv->bv_page) + bv->bv_offset;

	/* Handle overlapping page <-> blocks */
	while (bv_len) {
		cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len);

		/* Avoid reading for writes in case bio vector's page overwrites block completely. */
		if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio))
			ba = dm_bufio_read(ec->bufio, block, &b);
		else
			ba = dm_bufio_new(ec->bufio, block, &b);

		if (IS_ERR(ba)) {
			/*
			 * Carry on with next buffer, if any, to issue all possible
			 * data but return error.
			 */
			r = PTR_ERR(ba);
		} else {
			/* Copy data to/from bio to buffer if read/new was successful above. */
			ba += buf_off;
			if (rw == READ) {
				memcpy(pa, ba, cur_len);
				flush_dcache_page(bv->bv_page);
			} else {
				flush_dcache_page(bv->bv_page);
				memcpy(ba, pa, cur_len);
				dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len);
			}

			dm_bufio_release(b);
		}

		pa += cur_len;
		bv_len -= cur_len;
		buf_off = 0;
		block++;
	}

	return r;
}

/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */
static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio)
{
	int r = 0, rr;
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_bvec(bv, bio, iter) {
		rr = __ebs_rw_bvec(ec, rw, &bv, &iter);
		if (rr)
			r = rr;
	}

	return r;
}

/*
 * Discard bio's blocks, i.e. pass discards down.
 *
 * Avoid discarding partial blocks at beginning and end;
 * return 0 in case no blocks can be discarded as a result.
 */
static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t block, blocks, sector = bio->bi_iter.bi_sector;

	block = __sector_to_block(ec, sector);
	blocks = __nr_blocks(ec, bio);

	/*
	 * Partial first underlying block (__nr_blocks() may have
	 * resulted in one block).
	 */
	if (__block_mod(sector, ec->u_bs)) {
		block++;
		blocks--;
	}

	/* Partial last underlying block if any. */
	if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs))
		blocks--;

	return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0;
}

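/*
 * Worked example for __ebs_discard_bio(), again assuming u_bs = 8: a discard
 * of sectors 4..19 spans underlying blocks 0..2, but only block 1 (sectors
 * 8..15) is fully covered, so the partial first and last blocks are skipped
 * and a single block discard is issued for block 1.
 */
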
/* Release the bio's blocks from the bufio cache. */
static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t blocks, sector = bio->bi_iter.bi_sector;

	blocks = __nr_blocks(ec, bio);

	dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks);
}

/* Worker function to process incoming bios. */
static void __ebs_process_bios(struct work_struct *ws)
{
	int r;
	bool write = false;
	sector_t block1, block2;
	struct ebs_c *ec = container_of(ws, struct ebs_c, ws);
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irq(&ec->lock);
	bios = ec->bios_in;
	bio_list_init(&ec->bios_in);
	spin_unlock_irq(&ec->lock);

	/* Prefetch all read and any mis-aligned write buffers */
	bio_list_for_each(bio, &bios) {
		block1 = __sector_to_block(ec, bio->bi_iter.bi_sector);
		if (bio_op(bio) == REQ_OP_READ)
			dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio));
		else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) {
			block2 = __sector_to_block(ec, bio_end_sector(bio));
			if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs))
				dm_bufio_prefetch(ec->bufio, block1, 1);
			if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1)
				dm_bufio_prefetch(ec->bufio, block2, 1);
		}
	}

	bio_list_for_each(bio, &bios) {
		r = -EIO;
		if (bio_op(bio) == REQ_OP_READ)
			r = __ebs_rw_bio(ec, READ, bio);
		else if (bio_op(bio) == REQ_OP_WRITE) {
			write = true;
			r = __ebs_rw_bio(ec, WRITE, bio);
		} else if (bio_op(bio) == REQ_OP_DISCARD) {
			__ebs_forget_bio(ec, bio);
			r = __ebs_discard_bio(ec, bio);
		}

		if (r < 0)
			bio->bi_status = errno_to_blk_status(r);
	}

	/*
	 * We write dirty buffers after processing I/O on them
	 * but before we endio thus addressing REQ_FUA/REQ_SYNC.
	 */
	r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0;

	while ((bio = bio_list_pop(&bios))) {
		/* Any other request is endioed. */
		if (unlikely(r && bio_op(bio) == REQ_OP_WRITE))
			bio_io_error(bio);
		else
			bio_endio(bio);
	}
}

/*
 * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>]
 *
 * <dev_path>: path of the underlying device
 * <offset>: offset in 512 byte sectors into <dev_path>
 * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer
 * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer;
 *	    optional, if not supplied, retrieve logical block size from underlying device
 */
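/*
 * Example table line (hypothetical 4K native device /dev/sdX, emulating
 * 512 byte sectors across its whole length):
 *
 *   dmsetup create ebs-sdX --table \
 *     "0 $(blockdev --getsz /dev/sdX) ebs /dev/sdX 0 1 8"
 */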
static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	unsigned short tmp1;
	unsigned long long tmp;
	char dummy;
	struct ebs_c *ec;

	if (argc < 3 || argc > 4) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL);
	if (!ec) {
		ti->error = "Cannot allocate ebs context";
		return -ENOMEM;
	}

	r = -EINVAL;
	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 ||
	    tmp != (sector_t)tmp ||
	    (sector_t)tmp >= ti->len) {
		ti->error = "Invalid device offset sector";
		goto bad;
	}
	ec->start = tmp;

	if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 ||
	    !__ebs_check_bs(tmp1) ||
	    to_bytes(tmp1) > PAGE_SIZE) {
		ti->error = "Invalid emulated block size";
		goto bad;
	}
	ec->e_bs = tmp1;

	if (argc > 3) {
		if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) {
			ti->error = "Invalid underlying block size";
			goto bad;
		}
		ec->u_bs = tmp1;
		ec->u_bs_set = true;
	} else
		ec->u_bs_set = false;

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev);
	if (r) {
		ti->error = "Device lookup failed";
		ec->dev = NULL;
		goto bad;
	}

	r = -EINVAL;
	if (!ec->u_bs_set) {
		ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev));
		if (!__ebs_check_bs(ec->u_bs)) {
			ti->error = "Invalid retrieved underlying block size";
			goto bad;
		}
	}

	if (!ec->u_bs_set && ec->e_bs == ec->u_bs)
		DMINFO("Emulation superfluous: emulated equal to underlying block size");

	if (__block_mod(ec->start, ec->u_bs)) {
		ti->error = "Device offset must be multiple of underlying block size";
		goto bad;
	}

	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL);
	if (IS_ERR(ec->bufio)) {
		ti->error = "Cannot create dm bufio client";
		r = PTR_ERR(ec->bufio);
		ec->bufio = NULL;
		goto bad;
	}

	ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!ec->wq) {
		ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue";
		r = -ENOMEM;
		goto bad;
	}

	ec->block_shift = __ffs(ec->u_bs);
	INIT_WORK(&ec->ws, &__ebs_process_bios);
	bio_list_init(&ec->bios_in);
	spin_lock_init(&ec->lock);

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_secure_erase_bios = 0;
	ti->num_write_same_bios = 0;
	ti->num_write_zeroes_bios = 0;
	return 0;
bad:
	ebs_dtr(ti);
	return r;
}

static void ebs_dtr(struct dm_target *ti)
{
	struct ebs_c *ec = ti->private;

	if (ec->wq)
		destroy_workqueue(ec->wq);
	if (ec->bufio)
		dm_bufio_client_destroy(ec->bufio);
	if (ec->dev)
		dm_put_device(ti, ec->dev);

	kfree(ec);
}

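/*
 * Mapping summary: bios that start and end on underlying block boundaries
 * (and with ebs != ubs) are remapped straight to the backing device; anything
 * else is queued to the worker for read-modify-write via dm-bufio. E.g. with
 * ebs = 1 and ubs = 8, a 512 byte write at sector 3 is queued, whereas an
 * aligned 4K write passes through.
 */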
static int ebs_map(struct dm_target *ti, struct bio *bio)
{
	struct ebs_c *ec = ti->private;

	bio_set_dev(bio, ec->dev->bdev);
	bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely(bio_op(bio) == REQ_OP_FLUSH))
		return DM_MAPIO_REMAPPED;
	/*
	 * Only queue for bufio processing in case of partial or overlapping buffers
	 * -or-
	 * emulation with ebs == ubs aiming for tests of dm-bufio overhead.
	 */
	if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) ||
		   __block_mod(bio_end_sector(bio), ec->u_bs) ||
		   ec->e_bs == ec->u_bs)) {
		spin_lock_irq(&ec->lock);
		bio_list_add(&ec->bios_in, bio);
		spin_unlock_irq(&ec->lock);

		queue_work(ec->wq, &ec->ws);

		return DM_MAPIO_SUBMITTED;
	}

	/* Forget any buffer content relative to this direct backing device I/O. */
	__ebs_forget_bio(ec, bio);

	return DM_MAPIO_REMAPPED;
}

static void ebs_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	struct ebs_c *ec = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		*result = '\0';
		break;
	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u",
			 ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs);
		break;
	}
}

static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct ebs_c *ec = ti->private;
	struct dm_dev *dev = ec->dev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	*bdev = dev->bdev;

	return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT);
}

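/*
 * Stacked block size limits example: with <ebs> = 1 and <ubs> = 8,
 * ebs_io_hints() below reports a 512 byte logical and a 4096 byte physical
 * block size to the layers above.
 */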
static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct ebs_c *ec = ti->private;

	limits->logical_block_size = to_bytes(ec->e_bs);
	limits->physical_block_size = to_bytes(ec->u_bs);
	limits->alignment_offset = limits->physical_block_size;
	blk_limits_io_min(limits, limits->logical_block_size);
}

static int ebs_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct ebs_c *ec = ti->private;

	return fn(ti, ec->dev, ec->start, ti->len, data);
}

static struct target_type ebs_target = {
	.name		 = "ebs",
	.version	 = {1, 0, 1},
	.features	 = DM_TARGET_PASSES_INTEGRITY,
	.module		 = THIS_MODULE,
	.ctr		 = ebs_ctr,
	.dtr		 = ebs_dtr,
	.map		 = ebs_map,
	.status		 = ebs_status,
	.io_hints	 = ebs_io_hints,
	.prepare_ioctl	 = ebs_prepare_ioctl,
	.iterate_devices = ebs_iterate_devices,
};

static int __init dm_ebs_init(void)
{
	int r = dm_register_target(&ebs_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void dm_ebs_exit(void)
{
	dm_unregister_target(&ebs_target);
}

module_init(dm_ebs_init);
module_exit(dm_ebs_exit);

MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " emulated block size target");
MODULE_LICENSE("GPL");