2 * Copyright (C) 2014 Facebook. All rights reserved.
4 * This file is released under the GPL.
7 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/blkdev.h>
12 #include <linux/bio.h>
13 #include <linux/dax.h>
14 #include <linux/slab.h>
15 #include <linux/kthread.h>
16 #include <linux/freezer.h>
17 #include <linux/uio.h>
19 #define DM_MSG_PREFIX "log-writes"
22 * This target will sequentially log all writes to the target device onto the
23 * log device. This is helpful for replaying writes to check for fs consistency
24 * at all times. This target provides a mechanism to mark specific events to
25 * check data at a later time. So for example you would:
29 * dmsetup message /dev/whatever mark mymark
32 * Then replay the log up to mymark and check the contents of the replay to
33 * verify it matches what was written.
35 * We log writes only after they have been flushed, this makes the log describe
36 * close to the order in which the data hits the actual disk, not its cache. So
37 * for example the following sequence (W means write, C means complete)
39 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
41 * Would result in the log looking like this:
43 * c,a,flush,fuad,b,<other writes>,<next flush>
45 * This is meant to help expose problems where file systems do not properly wait
46 * on data being written before invoking a FLUSH. FUA bypasses cache so once it
47 * completes it is added to the log as it should be on disk.
49 * We treat DISCARDs as if they don't bypass cache so that they are logged in
50 * order of completion along with the normal writes. If we didn't do it this
51 * way we would process all the discards first and then write all the data, when
52 * in fact we want to do the data and the discard in the order that they
53 * complete.
55 #define LOG_FLUSH_FLAG (1 << 0)
56 #define LOG_FUA_FLAG (1 << 1)
57 #define LOG_DISCARD_FLAG (1 << 2)
58 #define LOG_MARK_FLAG (1 << 3)
59 #define LOG_METADATA_FLAG (1 << 4)
61 #define WRITE_LOG_VERSION 1ULL
62 #define WRITE_LOG_MAGIC 0x6a736677736872ULL
63 #define WRITE_LOG_SUPER_SECTOR 0
66 * The disk format for this is braindead simple.
68 * At byte 0 we have our super, followed by the following sequence,
69 * repeated once per log entry:
71 * [ 1 sector ][ entry->nr_sectors ]
72 * [log_write_entry][ data written ]
74 * The log_write_entry takes up a full sector so we can have arbitrary length
75 * marks and it leaves us room for extra content in the future.
79 * Basic info about the log for userspace.
/*
 * Super block of the log. log_super() fills magic, version and nr_entries
 * (little-endian 64-bit) and sectorsize (little-endian 32-bit) and writes it
 * at WRITE_LOG_SUPER_SECTOR.
 * NOTE(review): the member declarations are missing from this listing.
 */
81 struct log_write_super
{
89 * sector - the sector we wrote.
90 * nr_sectors - the number of sectors we wrote.
91 * flags - flags for this log entry.
92 * data_len - the size of the data in this log entry, this is for private log
93 * entry stuff, the MARK data provided by userspace for example.
/*
 * Header written in front of every logged write. log_one_block() fills
 * sector, nr_sectors, flags and data_len, each converted with cpu_to_le64().
 * NOTE(review): the member declarations are missing from this listing.
 */
95 struct log_write_entry
{
/*
 * Per-target context, stored in ti->private.
 *
 * logdev            - device the log is written to.
 * pending_blocks    - count of pending_block objects not yet released;
 *                     log_writes_dtr() waits for it to reach zero.
 * next_sector       - next free position on the log device, in 512-byte
 *                     sectors.
 * logging_enabled   - cleared on any logging error or when the log device
 *                     runs out of space.
 * blocks_lock       - taken around accesses to the two lists below and
 *                     around writes to logging_enabled.
 * unflushed_blocks  - completed writes still waiting for a flush.
 * logging_blocks    - blocks queued for the logging kthread.
 * wait              - waitqueue log_writes_dtr() sleeps on.
 * log_kthread       - thread running log_writes_kthread().
 * super_done        - completed by log_end_super() when a super write ends.
 *
 * NOTE(review): other members used elsewhere in the file (dev, io_blocks,
 * sectorsize, sectorshift, end_sector, logged_entries) and the closing brace
 * are missing from this listing.
 */
102 struct log_writes_c
{
104 struct dm_dev
*logdev
;
109 atomic_t pending_blocks
;
110 sector_t next_sector
;
112 bool logging_enabled
;
113 bool device_supports_discard
;
114 spinlock_t blocks_lock
;
115 struct list_head unflushed_blocks
;
116 struct list_head logging_blocks
;
117 wait_queue_head_t wait
;
118 struct task_struct
*log_kthread
;
119 struct completion super_done
;
/*
 * One write (or mark/flush/discard) waiting to be logged.
 *
 * list - links the block onto unflushed_blocks or logging_blocks; for a
 *        flush it is also used as the head that collects the unflushed
 *        blocks spliced in by log_writes_map().
 * vecs - trailing array of copied data pages; log_writes_map() sizes the
 *        allocation by bio_segments(bio).
 *
 * NOTE(review): members used elsewhere (vec_cnt, flags, sector, nr_sectors,
 * data, datalen) and the closing brace are missing from this listing.
 */
122 struct pending_block
{
129 struct list_head list
;
130 struct bio_vec vecs
[0];
/*
 * Per-bio private data (size registered via ti->per_io_data_size).
 * normal_end_io() reads ->block to find the pending block of a write.
 */
133 struct per_bio_data
{
134 struct pending_block
*block
;
137 static inline sector_t
bio_to_dev_sectors(struct log_writes_c
*lc
,
140 return sectors
>> (lc
->sectorshift
- SECTOR_SHIFT
);
143 static inline sector_t
dev_to_bio_sectors(struct log_writes_c
*lc
,
146 return sectors
<< (lc
->sectorshift
- SECTOR_SHIFT
);
/*
 * Drop one count from lc->pending_blocks. When it reaches zero, issue a
 * barrier and check whether anyone is sleeping on lc->wait
 * (log_writes_dtr() waits there for the counters to drain).
 * NOTE(review): the statement guarded by waitqueue_active() is missing from
 * this listing - presumably a wake-up of lc->wait; confirm.
 */
149 static void put_pending_block(struct log_writes_c
*lc
)
151 if (atomic_dec_and_test(&lc
->pending_blocks
)) {
152 smp_mb__after_atomic();
153 if (waitqueue_active(&lc
->wait
))
/*
 * Drop one count from lc->io_blocks. When it reaches zero, issue a barrier
 * and check whether anyone is sleeping on lc->wait (log_writes_dtr() waits
 * there for the counters to drain).
 * NOTE(review): the statement guarded by waitqueue_active() is missing from
 * this listing - presumably a wake-up of lc->wait; confirm.
 */
158 static void put_io_block(struct log_writes_c
*lc
)
160 if (atomic_dec_and_test(&lc
->io_blocks
)) {
161 smp_mb__after_atomic();
162 if (waitqueue_active(&lc
->wait
))
/*
 * Completion handler for bios written to the log device; bio->bi_private
 * carries the log_writes_c. On an I/O error it reports the status and
 * disables further logging under blocks_lock.
 * NOTE(review): the declaration of 'flags' and everything after the error
 * branch are missing from this listing.
 */
167 static void log_end_io(struct bio
*bio
)
169 struct log_writes_c
*lc
= bio
->bi_private
;
171 if (bio
->bi_status
) {
174 DMERR("Error writing log block, error=%d", bio
->bi_status
);
175 spin_lock_irqsave(&lc
->blocks_lock
, flags
);
176 lc
->logging_enabled
= false;
177 spin_unlock_irqrestore(&lc
->blocks_lock
, flags
);
/*
 * Completion handler installed by write_metadata() for writes to
 * WRITE_LOG_SUPER_SECTOR. Signals lc->super_done, which log_super() waits
 * on so that super writes are serialised.
 * NOTE(review): any statements after complete() are missing from this
 * listing.
 */
185 static void log_end_super(struct bio
*bio
)
187 struct log_writes_c
*lc
= bio
->bi_private
;
189 complete(&lc
->super_done
);
194 * Meant to be called if there is an error, it will free all the pages
195 * associated with the block.
/*
 * Release a pending block: free every non-NULL page held in block->vecs
 * (vec_cnt entries) and drop the pending_blocks count via
 * put_pending_block().
 * NOTE(review): the declaration of 'i' and the lines between the loop and
 * put_pending_block() are missing from this listing - presumably where
 * block->data and the block itself are freed; confirm.
 */
197 static void free_pending_block(struct log_writes_c
*lc
,
198 struct pending_block
*block
)
202 for (i
= 0; i
< block
->vec_cnt
; i
++) {
203 if (block
->vecs
[i
].bv_page
)
204 __free_page(block
->vecs
[i
].bv_page
);
208 put_pending_block(lc
);
211 static int write_metadata(struct log_writes_c
*lc
, void *entry
,
212 size_t entrylen
, void *data
, size_t datalen
,
220 bio
= bio_alloc(GFP_KERNEL
, 1);
222 DMERR("Couldn't alloc log bio");
225 bio
->bi_iter
.bi_size
= 0;
226 bio
->bi_iter
.bi_sector
= sector
;
227 bio_set_dev(bio
, lc
->logdev
->bdev
);
228 bio
->bi_end_io
= (sector
== WRITE_LOG_SUPER_SECTOR
) ?
229 log_end_super
: log_end_io
;
230 bio
->bi_private
= lc
;
231 bio_set_op_attrs(bio
, REQ_OP_WRITE
, 0);
233 page
= alloc_page(GFP_KERNEL
);
235 DMERR("Couldn't alloc log page");
240 ptr
= kmap_atomic(page
);
241 memcpy(ptr
, entry
, entrylen
);
243 memcpy(ptr
+ entrylen
, data
, datalen
);
244 memset(ptr
+ entrylen
+ datalen
, 0,
245 lc
->sectorsize
- entrylen
- datalen
);
248 ret
= bio_add_page(bio
, page
, lc
->sectorsize
, 0);
249 if (ret
!= lc
->sectorsize
) {
250 DMERR("Couldn't add page to the log block");
263 static int write_inline_data(struct log_writes_c
*lc
, void *entry
,
264 size_t entrylen
, void *data
, size_t datalen
,
267 int num_pages
, bio_pages
, pg_datalen
, pg_sectorlen
, i
;
274 num_pages
= ALIGN(datalen
, PAGE_SIZE
) >> PAGE_SHIFT
;
275 bio_pages
= min(num_pages
, BIO_MAX_PAGES
);
277 atomic_inc(&lc
->io_blocks
);
279 bio
= bio_alloc(GFP_KERNEL
, bio_pages
);
281 DMERR("Couldn't alloc inline data bio");
285 bio
->bi_iter
.bi_size
= 0;
286 bio
->bi_iter
.bi_sector
= sector
;
287 bio_set_dev(bio
, lc
->logdev
->bdev
);
288 bio
->bi_end_io
= log_end_io
;
289 bio
->bi_private
= lc
;
290 bio_set_op_attrs(bio
, REQ_OP_WRITE
, 0);
292 for (i
= 0; i
< bio_pages
; i
++) {
293 pg_datalen
= min_t(int, datalen
, PAGE_SIZE
);
294 pg_sectorlen
= ALIGN(pg_datalen
, lc
->sectorsize
);
296 page
= alloc_page(GFP_KERNEL
);
298 DMERR("Couldn't alloc inline data page");
302 ptr
= kmap_atomic(page
);
303 memcpy(ptr
, data
, pg_datalen
);
304 if (pg_sectorlen
> pg_datalen
)
305 memset(ptr
+ pg_datalen
, 0, pg_sectorlen
- pg_datalen
);
308 ret
= bio_add_page(bio
, page
, pg_sectorlen
, 0);
309 if (ret
!= pg_sectorlen
) {
310 DMERR("Couldn't add page of inline data");
315 datalen
-= pg_datalen
;
320 sector
+= bio_pages
* PAGE_SECTORS
;
331 static int log_one_block(struct log_writes_c
*lc
,
332 struct pending_block
*block
, sector_t sector
)
335 struct log_write_entry entry
;
336 size_t metadatalen
, ret
;
339 entry
.sector
= cpu_to_le64(block
->sector
);
340 entry
.nr_sectors
= cpu_to_le64(block
->nr_sectors
);
341 entry
.flags
= cpu_to_le64(block
->flags
);
342 entry
.data_len
= cpu_to_le64(block
->datalen
);
344 metadatalen
= (block
->flags
& LOG_MARK_FLAG
) ? block
->datalen
: 0;
345 if (write_metadata(lc
, &entry
, sizeof(entry
), block
->data
,
346 metadatalen
, sector
)) {
347 free_pending_block(lc
, block
);
351 sector
+= dev_to_bio_sectors(lc
, 1);
353 if (block
->datalen
&& metadatalen
== 0) {
354 if (write_inline_data(lc
, &entry
, sizeof(entry
), block
->data
,
355 block
->datalen
, sector
)) {
356 free_pending_block(lc
, block
);
359 /* we don't support both inline data & bio data */
366 atomic_inc(&lc
->io_blocks
);
367 bio
= bio_alloc(GFP_KERNEL
, min(block
->vec_cnt
, BIO_MAX_PAGES
));
369 DMERR("Couldn't alloc log bio");
372 bio
->bi_iter
.bi_size
= 0;
373 bio
->bi_iter
.bi_sector
= sector
;
374 bio_set_dev(bio
, lc
->logdev
->bdev
);
375 bio
->bi_end_io
= log_end_io
;
376 bio
->bi_private
= lc
;
377 bio_set_op_attrs(bio
, REQ_OP_WRITE
, 0);
379 for (i
= 0; i
< block
->vec_cnt
; i
++) {
381 * The page offset is always 0 because we allocate a new page
382 * for every bvec in the original bio for simplicity sake.
384 ret
= bio_add_page(bio
, block
->vecs
[i
].bv_page
,
385 block
->vecs
[i
].bv_len
, 0);
386 if (ret
!= block
->vecs
[i
].bv_len
) {
387 atomic_inc(&lc
->io_blocks
);
389 bio
= bio_alloc(GFP_KERNEL
, min(block
->vec_cnt
- i
, BIO_MAX_PAGES
));
391 DMERR("Couldn't alloc log bio");
394 bio
->bi_iter
.bi_size
= 0;
395 bio
->bi_iter
.bi_sector
= sector
;
396 bio_set_dev(bio
, lc
->logdev
->bdev
);
397 bio
->bi_end_io
= log_end_io
;
398 bio
->bi_private
= lc
;
399 bio_set_op_attrs(bio
, REQ_OP_WRITE
, 0);
401 ret
= bio_add_page(bio
, block
->vecs
[i
].bv_page
,
402 block
->vecs
[i
].bv_len
, 0);
403 if (ret
!= block
->vecs
[i
].bv_len
) {
404 DMERR("Couldn't add page on new bio?");
409 sector
+= block
->vecs
[i
].bv_len
>> SECTOR_SHIFT
;
415 put_pending_block(lc
);
418 free_pending_block(lc
, block
);
423 static int log_super(struct log_writes_c
*lc
)
425 struct log_write_super super
;
427 super
.magic
= cpu_to_le64(WRITE_LOG_MAGIC
);
428 super
.version
= cpu_to_le64(WRITE_LOG_VERSION
);
429 super
.nr_entries
= cpu_to_le64(lc
->logged_entries
);
430 super
.sectorsize
= cpu_to_le32(lc
->sectorsize
);
432 if (write_metadata(lc
, &super
, sizeof(super
), NULL
, 0,
433 WRITE_LOG_SUPER_SECTOR
)) {
434 DMERR("Couldn't write super");
439 * Super sector should be written in-order, otherwise the
440 * nr_entries could be rewritten incorrectly by an old bio.
442 wait_for_completion_io(&lc
->super_done
);
447 static inline sector_t
logdev_last_sector(struct log_writes_c
*lc
)
449 return i_size_read(lc
->logdev
->bdev
->bd_inode
) >> SECTOR_SHIFT
;
452 static int log_writes_kthread(void *arg
)
454 struct log_writes_c
*lc
= (struct log_writes_c
*)arg
;
457 while (!kthread_should_stop()) {
459 bool logging_enabled
;
460 struct pending_block
*block
= NULL
;
463 spin_lock_irq(&lc
->blocks_lock
);
464 if (!list_empty(&lc
->logging_blocks
)) {
465 block
= list_first_entry(&lc
->logging_blocks
,
466 struct pending_block
, list
);
467 list_del_init(&block
->list
);
468 if (!lc
->logging_enabled
)
471 sector
= lc
->next_sector
;
472 if (!(block
->flags
& LOG_DISCARD_FLAG
))
473 lc
->next_sector
+= dev_to_bio_sectors(lc
, block
->nr_sectors
);
474 lc
->next_sector
+= dev_to_bio_sectors(lc
, 1);
477 * Apparently the size of the device may not be known
478 * right away, so handle this properly.
481 lc
->end_sector
= logdev_last_sector(lc
);
482 if (lc
->end_sector
&&
483 lc
->next_sector
>= lc
->end_sector
) {
484 DMERR("Ran out of space on the logdev");
485 lc
->logging_enabled
= false;
488 lc
->logged_entries
++;
489 atomic_inc(&lc
->io_blocks
);
491 super
= (block
->flags
& (LOG_FUA_FLAG
| LOG_MARK_FLAG
));
493 atomic_inc(&lc
->io_blocks
);
496 logging_enabled
= lc
->logging_enabled
;
497 spin_unlock_irq(&lc
->blocks_lock
);
499 if (logging_enabled
) {
500 ret
= log_one_block(lc
, block
, sector
);
504 spin_lock_irq(&lc
->blocks_lock
);
505 lc
->logging_enabled
= false;
506 spin_unlock_irq(&lc
->blocks_lock
);
509 free_pending_block(lc
, block
);
513 if (!try_to_freeze()) {
514 set_current_state(TASK_INTERRUPTIBLE
);
515 if (!kthread_should_stop() &&
516 list_empty(&lc
->logging_blocks
))
518 __set_current_state(TASK_RUNNING
);
525 * Construct a log-writes mapping:
526 * log-writes <dev_path> <log_dev_path>
528 static int log_writes_ctr(struct dm_target
*ti
, unsigned int argc
, char **argv
)
530 struct log_writes_c
*lc
;
531 struct dm_arg_set as
;
532 const char *devname
, *logdevname
;
539 ti
->error
= "Invalid argument count";
543 lc
= kzalloc(sizeof(struct log_writes_c
), GFP_KERNEL
);
545 ti
->error
= "Cannot allocate context";
548 spin_lock_init(&lc
->blocks_lock
);
549 INIT_LIST_HEAD(&lc
->unflushed_blocks
);
550 INIT_LIST_HEAD(&lc
->logging_blocks
);
551 init_waitqueue_head(&lc
->wait
);
552 init_completion(&lc
->super_done
);
553 atomic_set(&lc
->io_blocks
, 0);
554 atomic_set(&lc
->pending_blocks
, 0);
556 devname
= dm_shift_arg(&as
);
557 ret
= dm_get_device(ti
, devname
, dm_table_get_mode(ti
->table
), &lc
->dev
);
559 ti
->error
= "Device lookup failed";
563 logdevname
= dm_shift_arg(&as
);
564 ret
= dm_get_device(ti
, logdevname
, dm_table_get_mode(ti
->table
),
567 ti
->error
= "Log device lookup failed";
568 dm_put_device(ti
, lc
->dev
);
572 lc
->sectorsize
= bdev_logical_block_size(lc
->dev
->bdev
);
573 lc
->sectorshift
= ilog2(lc
->sectorsize
);
574 lc
->log_kthread
= kthread_run(log_writes_kthread
, lc
, "log-write");
575 if (IS_ERR(lc
->log_kthread
)) {
576 ret
= PTR_ERR(lc
->log_kthread
);
577 ti
->error
= "Couldn't alloc kthread";
578 dm_put_device(ti
, lc
->dev
);
579 dm_put_device(ti
, lc
->logdev
);
584 * next_sector is in 512b sectors to correspond to what bi_sector expects.
585 * The super starts at sector 0, and the next_sector is the next logical
586 * one based on the sectorsize of the device.
588 lc
->next_sector
= lc
->sectorsize
>> SECTOR_SHIFT
;
589 lc
->logging_enabled
= true;
590 lc
->end_sector
= logdev_last_sector(lc
);
591 lc
->device_supports_discard
= true;
593 ti
->num_flush_bios
= 1;
594 ti
->flush_supported
= true;
595 ti
->num_discard_bios
= 1;
596 ti
->discards_supported
= true;
597 ti
->per_io_data_size
= sizeof(struct per_bio_data
);
/*
 * Queue a MARK entry carrying the string @data.
 *
 * The mark text is duplicated with kstrndup(), limited so that it fits in
 * one log sector after the log_write_entry header (maxsize - 1 characters).
 * The new block is counted in pending_blocks, flagged LOG_MARK_FLAG,
 * appended to logging_blocks under blocks_lock, and the logging kthread is
 * woken.
 * NOTE(review): the NULL checks, the error-path cleanup and the return
 * statements are missing from this listing.
 */
606 static int log_mark(struct log_writes_c
*lc
, char *data
)
608 struct pending_block
*block
;
609 size_t maxsize
= lc
->sectorsize
- sizeof(struct log_write_entry
);
611 block
= kzalloc(sizeof(struct pending_block
), GFP_KERNEL
);
613 DMERR("Error allocating pending block");
617 block
->data
= kstrndup(data
, maxsize
- 1, GFP_KERNEL
);
619 DMERR("Error copying mark data");
623 atomic_inc(&lc
->pending_blocks
);
624 block
->datalen
= strlen(block
->data
);
625 block
->flags
|= LOG_MARK_FLAG
;
626 spin_lock_irq(&lc
->blocks_lock
);
627 list_add_tail(&block
->list
, &lc
->logging_blocks
);
628 spin_unlock_irq(&lc
->blocks_lock
);
629 wake_up_process(lc
->log_kthread
);
/*
 * Target destructor.
 *
 * Moves every still-unflushed block onto logging_blocks, queues a final
 * "dm-log-writes-end" mark (result ignored), wakes the logging kthread and
 * sleeps on lc->wait until both io_blocks and pending_blocks are zero. It
 * then stops the kthread, warns if either list is non-empty, and releases
 * both devices.
 * NOTE(review): any statements after the second dm_put_device() are missing
 * from this listing.
 */
633 static void log_writes_dtr(struct dm_target
*ti
)
635 struct log_writes_c
*lc
= ti
->private;
637 spin_lock_irq(&lc
->blocks_lock
);
638 list_splice_init(&lc
->unflushed_blocks
, &lc
->logging_blocks
);
639 spin_unlock_irq(&lc
->blocks_lock
);
642 * This is just nice to have since it'll update the super to include the
643 * unflushed blocks, if it fails we don't really care.
645 log_mark(lc
, "dm-log-writes-end");
646 wake_up_process(lc
->log_kthread
);
647 wait_event(lc
->wait
, !atomic_read(&lc
->io_blocks
) &&
648 !atomic_read(&lc
->pending_blocks
));
649 kthread_stop(lc
->log_kthread
);
651 WARN_ON(!list_empty(&lc
->logging_blocks
));
652 WARN_ON(!list_empty(&lc
->unflushed_blocks
));
653 dm_put_device(ti
, lc
->dev
);
654 dm_put_device(ti
, lc
->logdev
);
658 static void normal_map_bio(struct dm_target
*ti
, struct bio
*bio
)
660 struct log_writes_c
*lc
= ti
->private;
662 bio_set_dev(bio
, lc
->dev
->bdev
);
665 static int log_writes_map(struct dm_target
*ti
, struct bio
*bio
)
667 struct log_writes_c
*lc
= ti
->private;
668 struct per_bio_data
*pb
= dm_per_bio_data(bio
, sizeof(struct per_bio_data
));
669 struct pending_block
*block
;
670 struct bvec_iter iter
;
674 bool flush_bio
= (bio
->bi_opf
& REQ_PREFLUSH
);
675 bool fua_bio
= (bio
->bi_opf
& REQ_FUA
);
676 bool discard_bio
= (bio_op(bio
) == REQ_OP_DISCARD
);
677 bool meta_bio
= (bio
->bi_opf
& REQ_META
);
681 /* Don't bother doing anything if logging has been disabled */
682 if (!lc
->logging_enabled
)
686 * Map reads as normal.
688 if (bio_data_dir(bio
) == READ
)
691 /* No sectors and not a flush? Don't care */
692 if (!bio_sectors(bio
) && !flush_bio
)
696 * Discards will have bi_size set but there's no actual data, so just
697 * allocate the size of the pending block.
700 alloc_size
= sizeof(struct pending_block
);
702 alloc_size
= sizeof(struct pending_block
) + sizeof(struct bio_vec
) * bio_segments(bio
);
704 block
= kzalloc(alloc_size
, GFP_NOIO
);
706 DMERR("Error allocating pending block");
707 spin_lock_irq(&lc
->blocks_lock
);
708 lc
->logging_enabled
= false;
709 spin_unlock_irq(&lc
->blocks_lock
);
710 return DM_MAPIO_KILL
;
712 INIT_LIST_HEAD(&block
->list
);
714 atomic_inc(&lc
->pending_blocks
);
717 block
->flags
|= LOG_FLUSH_FLAG
;
719 block
->flags
|= LOG_FUA_FLAG
;
721 block
->flags
|= LOG_DISCARD_FLAG
;
723 block
->flags
|= LOG_METADATA_FLAG
;
725 block
->sector
= bio_to_dev_sectors(lc
, bio
->bi_iter
.bi_sector
);
726 block
->nr_sectors
= bio_to_dev_sectors(lc
, bio_sectors(bio
));
728 /* We don't need the data, just submit */
730 WARN_ON(flush_bio
|| fua_bio
);
731 if (lc
->device_supports_discard
)
734 return DM_MAPIO_SUBMITTED
;
737 /* Flush bio, splice the unflushed blocks onto this list and submit */
738 if (flush_bio
&& !bio_sectors(bio
)) {
739 spin_lock_irq(&lc
->blocks_lock
);
740 list_splice_init(&lc
->unflushed_blocks
, &block
->list
);
741 spin_unlock_irq(&lc
->blocks_lock
);
746 * We will write this bio somewhere else way later so we need to copy
747 * the actual contents into new pages so we know the data will always be
750 * We do this because this could be a bio from O_DIRECT in which case we
751 * can't just hold onto the page until some later point, we have to
752 * manually copy the contents.
754 bio_for_each_segment(bv
, bio
, iter
) {
758 page
= alloc_page(GFP_NOIO
);
760 DMERR("Error allocing page");
761 free_pending_block(lc
, block
);
762 spin_lock_irq(&lc
->blocks_lock
);
763 lc
->logging_enabled
= false;
764 spin_unlock_irq(&lc
->blocks_lock
);
765 return DM_MAPIO_KILL
;
768 src
= kmap_atomic(bv
.bv_page
);
769 dst
= kmap_atomic(page
);
770 memcpy(dst
, src
+ bv
.bv_offset
, bv
.bv_len
);
773 block
->vecs
[i
].bv_page
= page
;
774 block
->vecs
[i
].bv_len
= bv
.bv_len
;
779 /* Had a flush with data in it, weird */
781 spin_lock_irq(&lc
->blocks_lock
);
782 list_splice_init(&lc
->unflushed_blocks
, &block
->list
);
783 spin_unlock_irq(&lc
->blocks_lock
);
786 normal_map_bio(ti
, bio
);
787 return DM_MAPIO_REMAPPED
;
/*
 * end_io hook: called when a bio mapped by this target completes.
 *
 * For a write that has a pending block attached (pb->block), the block is
 * queued under blocks_lock according to its flags:
 *  - LOG_FLUSH_FLAG: the blocks collected on block->list are spliced onto
 *    the tail of logging_blocks, followed by the flush block itself, and
 *    the logging kthread is woken;
 *  - LOG_FUA_FLAG: the block goes straight onto logging_blocks and the
 *    kthread is woken;
 *  - the remaining list_add_tail() puts the block on unflushed_blocks.
 * Returns DM_ENDIO_DONE.
 * NOTE(review): the third parameter, the declaration of 'flags' and the
 * 'else' in front of the unflushed_blocks insertion are missing from this
 * listing; the last bullet assumes that 'else' exists.
 */
790 static int normal_end_io(struct dm_target
*ti
, struct bio
*bio
,
793 struct log_writes_c
*lc
= ti
->private;
794 struct per_bio_data
*pb
= dm_per_bio_data(bio
, sizeof(struct per_bio_data
));
796 if (bio_data_dir(bio
) == WRITE
&& pb
->block
) {
797 struct pending_block
*block
= pb
->block
;
800 spin_lock_irqsave(&lc
->blocks_lock
, flags
);
801 if (block
->flags
& LOG_FLUSH_FLAG
) {
802 list_splice_tail_init(&block
->list
, &lc
->logging_blocks
);
803 list_add_tail(&block
->list
, &lc
->logging_blocks
);
804 wake_up_process(lc
->log_kthread
);
805 } else if (block
->flags
& LOG_FUA_FLAG
) {
806 list_add_tail(&block
->list
, &lc
->logging_blocks
);
807 wake_up_process(lc
->log_kthread
);
809 list_add_tail(&block
->list
, &lc
->unflushed_blocks
);
810 spin_unlock_irqrestore(&lc
->blocks_lock
, flags
);
813 return DM_ENDIO_DONE
;
817 * INFO format: <logged entries> <highest allocated sector>
/*
 * status hook.
 *
 * STATUSTYPE_INFO emits the number of logged entries and next_sector - 1,
 * followed by " logging_disabled" when logging has been turned off.
 * STATUSTYPE_TABLE emits the names of the data device and the log device.
 * NOTE(review): the last parameter, the local declarations, the switch
 * statement itself and the break statements are missing from this listing.
 */
819 static void log_writes_status(struct dm_target
*ti
, status_type_t type
,
820 unsigned status_flags
, char *result
,
824 struct log_writes_c
*lc
= ti
->private;
827 case STATUSTYPE_INFO
:
828 DMEMIT("%llu %llu", lc
->logged_entries
,
829 (unsigned long long)lc
->next_sector
- 1);
830 if (!lc
->logging_enabled
)
831 DMEMIT(" logging_disabled");
834 case STATUSTYPE_TABLE
:
835 DMEMIT("%s %s", lc
->dev
->name
, lc
->logdev
->name
);
/*
 * prepare_ioctl hook. Compares the target length (ti->len) with the size of
 * the data device in 512-byte sectors.
 * NOTE(review): the assignment to *bdev and both return statements are
 * missing from this listing, so the values returned for the matching and
 * non-matching cases cannot be confirmed here.
 */
840 static int log_writes_prepare_ioctl(struct dm_target
*ti
,
841 struct block_device
**bdev
)
843 struct log_writes_c
*lc
= ti
->private;
844 struct dm_dev
*dev
= lc
->dev
;
848 * Only pass ioctls through if the device sizes match exactly.
850 if (ti
->len
!= i_size_read(dev
->bdev
->bd_inode
) >> SECTOR_SHIFT
)
855 static int log_writes_iterate_devices(struct dm_target
*ti
,
856 iterate_devices_callout_fn fn
,
859 struct log_writes_c
*lc
= ti
->private;
861 return fn(ti
, lc
->dev
, 0, ti
->len
, data
);
865 * Messages supported:
866 * mark <mark data> - specify the marked data.
/*
 * message hook. The only visible command is "mark" (matched
 * case-insensitively), which passes argv[1] to log_mark(). A wrong argument
 * count or an unknown command is reported with DMWARN.
 * NOTE(review): the declaration of 'r', the argument-count test, the 'else'
 * and the return statements are missing from this listing.
 */
868 static int log_writes_message(struct dm_target
*ti
, unsigned argc
, char **argv
,
869 char *result
, unsigned maxlen
)
872 struct log_writes_c
*lc
= ti
->private;
875 DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc
);
879 if (!strcasecmp(argv
[0], "mark"))
880 r
= log_mark(lc
, argv
[1]);
882 DMWARN("Unrecognised log writes target message received: %s", argv
[0]);
/*
 * io_hints hook.
 *
 * When the data device has no request queue or does not support discard,
 * records that in lc->device_supports_discard and sets the discard
 * granularity to the logical sector size and max_discard_sectors to
 * UINT_MAX >> SECTOR_SHIFT. The logical and physical block sizes are then
 * copied from the data device, and io_min is set to the physical block size.
 * NOTE(review): the brace closing the 'if' is missing from this listing;
 * the description assumes it follows the max_discard_sectors assignment.
 */
887 static void log_writes_io_hints(struct dm_target
*ti
, struct queue_limits
*limits
)
889 struct log_writes_c
*lc
= ti
->private;
890 struct request_queue
*q
= bdev_get_queue(lc
->dev
->bdev
);
892 if (!q
|| !blk_queue_discard(q
)) {
893 lc
->device_supports_discard
= false;
894 limits
->discard_granularity
= lc
->sectorsize
;
895 limits
->max_discard_sectors
= (UINT_MAX
>> SECTOR_SHIFT
);
897 limits
->logical_block_size
= bdev_logical_block_size(lc
->dev
->bdev
);
898 limits
->physical_block_size
= bdev_physical_block_size(lc
->dev
->bdev
);
899 limits
->io_min
= limits
->physical_block_size
;
902 #if IS_ENABLED(CONFIG_DAX_DRIVER)
903 static int log_dax(struct log_writes_c
*lc
, sector_t sector
, size_t bytes
,
906 struct pending_block
*block
;
911 block
= kzalloc(sizeof(struct pending_block
), GFP_KERNEL
);
913 DMERR("Error allocating dax pending block");
917 block
->data
= kzalloc(bytes
, GFP_KERNEL
);
919 DMERR("Error allocating dax data space");
924 /* write data provided via the iterator */
925 if (!copy_from_iter(block
->data
, bytes
, i
)) {
926 DMERR("Error copying dax data");
932 /* rewind the iterator so that the block driver can use it */
933 iov_iter_revert(i
, bytes
);
935 block
->datalen
= bytes
;
936 block
->sector
= bio_to_dev_sectors(lc
, sector
);
937 block
->nr_sectors
= ALIGN(bytes
, lc
->sectorsize
) >> lc
->sectorshift
;
939 atomic_inc(&lc
->pending_blocks
);
940 spin_lock_irq(&lc
->blocks_lock
);
941 list_add_tail(&block
->list
, &lc
->unflushed_blocks
);
942 spin_unlock_irq(&lc
->blocks_lock
);
943 wake_up_process(lc
->log_kthread
);
948 static long log_writes_dax_direct_access(struct dm_target
*ti
, pgoff_t pgoff
,
949 long nr_pages
, void **kaddr
, pfn_t
*pfn
)
951 struct log_writes_c
*lc
= ti
->private;
952 sector_t sector
= pgoff
* PAGE_SECTORS
;
955 ret
= bdev_dax_pgoff(lc
->dev
->bdev
, sector
, nr_pages
* PAGE_SIZE
, &pgoff
);
958 return dax_direct_access(lc
->dev
->dax_dev
, pgoff
, nr_pages
, kaddr
, pfn
);
961 static size_t log_writes_dax_copy_from_iter(struct dm_target
*ti
,
962 pgoff_t pgoff
, void *addr
, size_t bytes
,
965 struct log_writes_c
*lc
= ti
->private;
966 sector_t sector
= pgoff
* PAGE_SECTORS
;
969 if (bdev_dax_pgoff(lc
->dev
->bdev
, sector
, ALIGN(bytes
, PAGE_SIZE
), &pgoff
))
972 /* Don't bother doing anything if logging has been disabled */
973 if (!lc
->logging_enabled
)
976 err
= log_dax(lc
, sector
, bytes
, i
);
978 DMWARN("Error %d logging DAX write", err
);
982 return dax_copy_from_iter(lc
->dev
->dax_dev
, pgoff
, addr
, bytes
, i
);
985 static size_t log_writes_dax_copy_to_iter(struct dm_target
*ti
,
986 pgoff_t pgoff
, void *addr
, size_t bytes
,
989 struct log_writes_c
*lc
= ti
->private;
990 sector_t sector
= pgoff
* PAGE_SECTORS
;
992 if (bdev_dax_pgoff(lc
->dev
->bdev
, sector
, ALIGN(bytes
, PAGE_SIZE
), &pgoff
))
994 return dax_copy_to_iter(lc
->dev
->dax_dev
, pgoff
, addr
, bytes
, i
);
998 #define log_writes_dax_direct_access NULL
999 #define log_writes_dax_copy_from_iter NULL
1000 #define log_writes_dax_copy_to_iter NULL
1003 static struct target_type log_writes_target
= {
1004 .name
= "log-writes",
1005 .version
= {1, 1, 0},
1006 .module
= THIS_MODULE
,
1007 .ctr
= log_writes_ctr
,
1008 .dtr
= log_writes_dtr
,
1009 .map
= log_writes_map
,
1010 .end_io
= normal_end_io
,
1011 .status
= log_writes_status
,
1012 .prepare_ioctl
= log_writes_prepare_ioctl
,
1013 .message
= log_writes_message
,
1014 .iterate_devices
= log_writes_iterate_devices
,
1015 .io_hints
= log_writes_io_hints
,
1016 .direct_access
= log_writes_dax_direct_access
,
1017 .dax_copy_from_iter
= log_writes_dax_copy_from_iter
,
1018 .dax_copy_to_iter
= log_writes_dax_copy_to_iter
,
/*
 * Module init: registers log_writes_target with device-mapper and logs
 * "register failed" together with the result code.
 * NOTE(review): the condition guarding the DMERR and the return statement
 * are missing from this listing.
 */
1021 static int __init
dm_log_writes_init(void)
1023 int r
= dm_register_target(&log_writes_target
);
1026 DMERR("register failed %d", r
);
1031 static void __exit
dm_log_writes_exit(void)
1033 dm_unregister_target(&log_writes_target
);
1036 module_init(dm_log_writes_init
);
1037 module_exit(dm_log_writes_exit
);
1039 MODULE_DESCRIPTION(DM_NAME
" log writes target");
1040 MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
1041 MODULE_LICENSE("GPL");