/*
 * Copyright (C) 2014 Facebook. All rights reserved.
 *
 * This file is released under the GPL.
 */
7 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/blkdev.h>
12 #include <linux/bio.h>
13 #include <linux/slab.h>
14 #include <linux/kthread.h>
15 #include <linux/freezer.h>
17 #define DM_MSG_PREFIX "log-writes"
/*
 * This target will sequentially log all writes to the target device onto the
 * log device.  This is helpful for replaying writes to check for fs consistency
 * at all times.  This target provides a mechanism to mark specific events to
 * check data at a later time.  So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed, this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache.  So
 * for example the following sequence (W means write, C means complete)
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * Would result in the log looking like this:
 *
 * c,a,flush,fuad,b,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
 * completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass cache so that they are logged in
 * order of completion along with the normal writes.  If we didn't do it this
 * way we would process all the discards first and then write all the data, when
 * in fact we want to do the data and the discard in the order that they
 * completed.
 */
/* Flag bits recorded in the flags field of a log entry. */
#define LOG_FLUSH_FLAG (1 << 0)
#define LOG_FUA_FLAG (1 << 1)
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)

/* Version and magic number written into the log super block. */
#define WRITE_LOG_VERSION 1
#define WRITE_LOG_MAGIC 0x6a736677736872
/*
 * The disk format for this is braindead simple.
 *
 * At byte 0 we have our super, followed by the following sequence for
 * nr_entries:
 *
 * [   1 sector    ][  entry->nr_sectors ]
 * [log_write_entry][    data written    ]
 *
 * The log_write_entry takes up a full sector so we can have arbitrary length
 * marks and it leaves us room for extra content in the future.
 */
75 * Basic info about the log for userspace.
77 struct log_write_super
{
85 * sector - the sector we wrote.
86 * nr_sectors - the number of sectors we wrote.
87 * flags - flags for this log entry.
88 * data_len - the size of the data in this log entry, this is for private log
89 * entry stuff, the MARK data provided by userspace for example.
91 struct log_write_entry
{
100 struct dm_dev
*logdev
;
104 atomic_t pending_blocks
;
105 sector_t next_sector
;
107 bool logging_enabled
;
108 bool device_supports_discard
;
109 spinlock_t blocks_lock
;
110 struct list_head unflushed_blocks
;
111 struct list_head logging_blocks
;
112 wait_queue_head_t wait
;
113 struct task_struct
*log_kthread
;
116 struct pending_block
{
123 struct list_head list
;
124 struct bio_vec vecs
[0];
/*
 * Per-bio private data handed out by dm_per_bio_data(); links a bio passing
 * through log_writes_map() to the pending_block created for it, so that
 * normal_end_io() can find the block again.
 */
struct per_bio_data {
	struct pending_block *block;
};
131 static void put_pending_block(struct log_writes_c
*lc
)
133 if (atomic_dec_and_test(&lc
->pending_blocks
)) {
134 smp_mb__after_atomic();
135 if (waitqueue_active(&lc
->wait
))
140 static void put_io_block(struct log_writes_c
*lc
)
142 if (atomic_dec_and_test(&lc
->io_blocks
)) {
143 smp_mb__after_atomic();
144 if (waitqueue_active(&lc
->wait
))
149 static void log_end_io(struct bio
*bio
, int err
)
151 struct log_writes_c
*lc
= bio
->bi_private
;
152 struct bio_vec
*bvec
;
158 DMERR("Error writing log block, error=%d", err
);
159 spin_lock_irqsave(&lc
->blocks_lock
, flags
);
160 lc
->logging_enabled
= false;
161 spin_unlock_irqrestore(&lc
->blocks_lock
, flags
);
164 bio_for_each_segment_all(bvec
, bio
, i
)
165 __free_page(bvec
->bv_page
);
172 * Meant to be called if there is an error, it will free all the pages
173 * associated with the block.
175 static void free_pending_block(struct log_writes_c
*lc
,
176 struct pending_block
*block
)
180 for (i
= 0; i
< block
->vec_cnt
; i
++) {
181 if (block
->vecs
[i
].bv_page
)
182 __free_page(block
->vecs
[i
].bv_page
);
186 put_pending_block(lc
);
189 static int write_metadata(struct log_writes_c
*lc
, void *entry
,
190 size_t entrylen
, void *data
, size_t datalen
,
198 bio
= bio_alloc(GFP_KERNEL
, 1);
200 DMERR("Couldn't alloc log bio");
203 bio
->bi_iter
.bi_size
= 0;
204 bio
->bi_iter
.bi_sector
= sector
;
205 bio
->bi_bdev
= lc
->logdev
->bdev
;
206 bio
->bi_end_io
= log_end_io
;
207 bio
->bi_private
= lc
;
208 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
210 page
= alloc_page(GFP_KERNEL
);
212 DMERR("Couldn't alloc log page");
217 ptr
= kmap_atomic(page
);
218 memcpy(ptr
, entry
, entrylen
);
220 memcpy(ptr
+ entrylen
, data
, datalen
);
221 memset(ptr
+ entrylen
+ datalen
, 0,
222 lc
->sectorsize
- entrylen
- datalen
);
225 ret
= bio_add_page(bio
, page
, lc
->sectorsize
, 0);
226 if (ret
!= lc
->sectorsize
) {
227 DMERR("Couldn't add page to the log block");
230 submit_bio(WRITE
, bio
);
240 static int log_one_block(struct log_writes_c
*lc
,
241 struct pending_block
*block
, sector_t sector
)
244 struct log_write_entry entry
;
248 entry
.sector
= cpu_to_le64(block
->sector
);
249 entry
.nr_sectors
= cpu_to_le64(block
->nr_sectors
);
250 entry
.flags
= cpu_to_le64(block
->flags
);
251 entry
.data_len
= cpu_to_le64(block
->datalen
);
252 if (write_metadata(lc
, &entry
, sizeof(entry
), block
->data
,
253 block
->datalen
, sector
)) {
254 free_pending_block(lc
, block
);
262 atomic_inc(&lc
->io_blocks
);
263 bio
= bio_alloc(GFP_KERNEL
, min(block
->vec_cnt
, BIO_MAX_PAGES
));
265 DMERR("Couldn't alloc log bio");
268 bio
->bi_iter
.bi_size
= 0;
269 bio
->bi_iter
.bi_sector
= sector
;
270 bio
->bi_bdev
= lc
->logdev
->bdev
;
271 bio
->bi_end_io
= log_end_io
;
272 bio
->bi_private
= lc
;
273 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
275 for (i
= 0; i
< block
->vec_cnt
; i
++) {
277 * The page offset is always 0 because we allocate a new page
278 * for every bvec in the original bio for simplicity sake.
280 ret
= bio_add_page(bio
, block
->vecs
[i
].bv_page
,
281 block
->vecs
[i
].bv_len
, 0);
282 if (ret
!= block
->vecs
[i
].bv_len
) {
283 atomic_inc(&lc
->io_blocks
);
284 submit_bio(WRITE
, bio
);
285 bio
= bio_alloc(GFP_KERNEL
, min(block
->vec_cnt
- i
, BIO_MAX_PAGES
));
287 DMERR("Couldn't alloc log bio");
290 bio
->bi_iter
.bi_size
= 0;
291 bio
->bi_iter
.bi_sector
= sector
;
292 bio
->bi_bdev
= lc
->logdev
->bdev
;
293 bio
->bi_end_io
= log_end_io
;
294 bio
->bi_private
= lc
;
295 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
297 ret
= bio_add_page(bio
, block
->vecs
[i
].bv_page
,
298 block
->vecs
[i
].bv_len
, 0);
299 if (ret
!= block
->vecs
[i
].bv_len
) {
300 DMERR("Couldn't add page on new bio?");
305 sector
+= block
->vecs
[i
].bv_len
>> SECTOR_SHIFT
;
307 submit_bio(WRITE
, bio
);
311 put_pending_block(lc
);
314 free_pending_block(lc
, block
);
319 static int log_super(struct log_writes_c
*lc
)
321 struct log_write_super super
;
323 super
.magic
= cpu_to_le64(WRITE_LOG_MAGIC
);
324 super
.version
= cpu_to_le64(WRITE_LOG_VERSION
);
325 super
.nr_entries
= cpu_to_le64(lc
->logged_entries
);
326 super
.sectorsize
= cpu_to_le32(lc
->sectorsize
);
328 if (write_metadata(lc
, &super
, sizeof(super
), NULL
, 0, 0)) {
329 DMERR("Couldn't write super");
336 static inline sector_t
logdev_last_sector(struct log_writes_c
*lc
)
338 return i_size_read(lc
->logdev
->bdev
->bd_inode
) >> SECTOR_SHIFT
;
341 static int log_writes_kthread(void *arg
)
343 struct log_writes_c
*lc
= (struct log_writes_c
*)arg
;
346 while (!kthread_should_stop()) {
348 bool logging_enabled
;
349 struct pending_block
*block
= NULL
;
352 spin_lock_irq(&lc
->blocks_lock
);
353 if (!list_empty(&lc
->logging_blocks
)) {
354 block
= list_first_entry(&lc
->logging_blocks
,
355 struct pending_block
, list
);
356 list_del_init(&block
->list
);
357 if (!lc
->logging_enabled
)
360 sector
= lc
->next_sector
;
361 if (block
->flags
& LOG_DISCARD_FLAG
)
364 lc
->next_sector
+= block
->nr_sectors
+ 1;
367 * Apparently the size of the device may not be known
368 * right away, so handle this properly.
371 lc
->end_sector
= logdev_last_sector(lc
);
372 if (lc
->end_sector
&&
373 lc
->next_sector
>= lc
->end_sector
) {
374 DMERR("Ran out of space on the logdev");
375 lc
->logging_enabled
= false;
378 lc
->logged_entries
++;
379 atomic_inc(&lc
->io_blocks
);
381 super
= (block
->flags
& (LOG_FUA_FLAG
| LOG_MARK_FLAG
));
383 atomic_inc(&lc
->io_blocks
);
386 logging_enabled
= lc
->logging_enabled
;
387 spin_unlock_irq(&lc
->blocks_lock
);
389 if (logging_enabled
) {
390 ret
= log_one_block(lc
, block
, sector
);
394 spin_lock_irq(&lc
->blocks_lock
);
395 lc
->logging_enabled
= false;
396 spin_unlock_irq(&lc
->blocks_lock
);
399 free_pending_block(lc
, block
);
403 if (!try_to_freeze()) {
404 set_current_state(TASK_INTERRUPTIBLE
);
405 if (!kthread_should_stop() &&
406 !atomic_read(&lc
->pending_blocks
))
408 __set_current_state(TASK_RUNNING
);
415 * Construct a log-writes mapping:
416 * log-writes <dev_path> <log_dev_path>
418 static int log_writes_ctr(struct dm_target
*ti
, unsigned int argc
, char **argv
)
420 struct log_writes_c
*lc
;
421 struct dm_arg_set as
;
422 const char *devname
, *logdevname
;
428 ti
->error
= "Invalid argument count";
432 lc
= kzalloc(sizeof(struct log_writes_c
), GFP_KERNEL
);
434 ti
->error
= "Cannot allocate context";
437 spin_lock_init(&lc
->blocks_lock
);
438 INIT_LIST_HEAD(&lc
->unflushed_blocks
);
439 INIT_LIST_HEAD(&lc
->logging_blocks
);
440 init_waitqueue_head(&lc
->wait
);
441 lc
->sectorsize
= 1 << SECTOR_SHIFT
;
442 atomic_set(&lc
->io_blocks
, 0);
443 atomic_set(&lc
->pending_blocks
, 0);
445 devname
= dm_shift_arg(&as
);
446 if (dm_get_device(ti
, devname
, dm_table_get_mode(ti
->table
), &lc
->dev
)) {
447 ti
->error
= "Device lookup failed";
451 logdevname
= dm_shift_arg(&as
);
452 if (dm_get_device(ti
, logdevname
, dm_table_get_mode(ti
->table
), &lc
->logdev
)) {
453 ti
->error
= "Log device lookup failed";
454 dm_put_device(ti
, lc
->dev
);
458 lc
->log_kthread
= kthread_run(log_writes_kthread
, lc
, "log-write");
459 if (IS_ERR(lc
->log_kthread
)) {
460 ti
->error
= "Couldn't alloc kthread";
461 dm_put_device(ti
, lc
->dev
);
462 dm_put_device(ti
, lc
->logdev
);
466 /* We put the super at sector 0, start logging at sector 1 */
468 lc
->logging_enabled
= true;
469 lc
->end_sector
= logdev_last_sector(lc
);
470 lc
->device_supports_discard
= true;
472 ti
->num_flush_bios
= 1;
473 ti
->flush_supported
= true;
474 ti
->num_discard_bios
= 1;
475 ti
->discards_supported
= true;
476 ti
->per_bio_data_size
= sizeof(struct per_bio_data
);
485 static int log_mark(struct log_writes_c
*lc
, char *data
)
487 struct pending_block
*block
;
488 size_t maxsize
= lc
->sectorsize
- sizeof(struct log_write_entry
);
490 block
= kzalloc(sizeof(struct pending_block
), GFP_KERNEL
);
492 DMERR("Error allocating pending block");
496 block
->data
= kstrndup(data
, maxsize
, GFP_KERNEL
);
498 DMERR("Error copying mark data");
502 atomic_inc(&lc
->pending_blocks
);
503 block
->datalen
= strlen(block
->data
);
504 block
->flags
|= LOG_MARK_FLAG
;
505 spin_lock_irq(&lc
->blocks_lock
);
506 list_add_tail(&block
->list
, &lc
->logging_blocks
);
507 spin_unlock_irq(&lc
->blocks_lock
);
508 wake_up_process(lc
->log_kthread
);
512 static void log_writes_dtr(struct dm_target
*ti
)
514 struct log_writes_c
*lc
= ti
->private;
516 spin_lock_irq(&lc
->blocks_lock
);
517 list_splice_init(&lc
->unflushed_blocks
, &lc
->logging_blocks
);
518 spin_unlock_irq(&lc
->blocks_lock
);
521 * This is just nice to have since it'll update the super to include the
522 * unflushed blocks, if it fails we don't really care.
524 log_mark(lc
, "dm-log-writes-end");
525 wake_up_process(lc
->log_kthread
);
526 wait_event(lc
->wait
, !atomic_read(&lc
->io_blocks
) &&
527 !atomic_read(&lc
->pending_blocks
));
528 kthread_stop(lc
->log_kthread
);
530 WARN_ON(!list_empty(&lc
->logging_blocks
));
531 WARN_ON(!list_empty(&lc
->unflushed_blocks
));
532 dm_put_device(ti
, lc
->dev
);
533 dm_put_device(ti
, lc
->logdev
);
537 static void normal_map_bio(struct dm_target
*ti
, struct bio
*bio
)
539 struct log_writes_c
*lc
= ti
->private;
541 bio
->bi_bdev
= lc
->dev
->bdev
;
544 static int log_writes_map(struct dm_target
*ti
, struct bio
*bio
)
546 struct log_writes_c
*lc
= ti
->private;
547 struct per_bio_data
*pb
= dm_per_bio_data(bio
, sizeof(struct per_bio_data
));
548 struct pending_block
*block
;
549 struct bvec_iter iter
;
553 bool flush_bio
= (bio
->bi_rw
& REQ_FLUSH
);
554 bool fua_bio
= (bio
->bi_rw
& REQ_FUA
);
555 bool discard_bio
= (bio
->bi_rw
& REQ_DISCARD
);
559 /* Don't bother doing anything if logging has been disabled */
560 if (!lc
->logging_enabled
)
564 * Map reads as normal.
566 if (bio_data_dir(bio
) == READ
)
569 /* No sectors and not a flush? Don't care */
570 if (!bio_sectors(bio
) && !flush_bio
)
574 * Discards will have bi_size set but there's no actual data, so just
575 * allocate the size of the pending block.
578 alloc_size
= sizeof(struct pending_block
);
580 alloc_size
= sizeof(struct pending_block
) + sizeof(struct bio_vec
) * bio_segments(bio
);
582 block
= kzalloc(alloc_size
, GFP_NOIO
);
584 DMERR("Error allocating pending block");
585 spin_lock_irq(&lc
->blocks_lock
);
586 lc
->logging_enabled
= false;
587 spin_unlock_irq(&lc
->blocks_lock
);
590 INIT_LIST_HEAD(&block
->list
);
592 atomic_inc(&lc
->pending_blocks
);
595 block
->flags
|= LOG_FLUSH_FLAG
;
597 block
->flags
|= LOG_FUA_FLAG
;
599 block
->flags
|= LOG_DISCARD_FLAG
;
601 block
->sector
= bio
->bi_iter
.bi_sector
;
602 block
->nr_sectors
= bio_sectors(bio
);
604 /* We don't need the data, just submit */
606 WARN_ON(flush_bio
|| fua_bio
);
607 if (lc
->device_supports_discard
)
610 return DM_MAPIO_SUBMITTED
;
613 /* Flush bio, splice the unflushed blocks onto this list and submit */
614 if (flush_bio
&& !bio_sectors(bio
)) {
615 spin_lock_irq(&lc
->blocks_lock
);
616 list_splice_init(&lc
->unflushed_blocks
, &block
->list
);
617 spin_unlock_irq(&lc
->blocks_lock
);
622 * We will write this bio somewhere else way later so we need to copy
623 * the actual contents into new pages so we know the data will always be
626 * We do this because this could be a bio from O_DIRECT in which case we
627 * can't just hold onto the page until some later point, we have to
628 * manually copy the contents.
630 bio_for_each_segment(bv
, bio
, iter
) {
634 page
= alloc_page(GFP_NOIO
);
636 DMERR("Error allocing page");
637 free_pending_block(lc
, block
);
638 spin_lock_irq(&lc
->blocks_lock
);
639 lc
->logging_enabled
= false;
640 spin_unlock_irq(&lc
->blocks_lock
);
644 src
= kmap_atomic(bv
.bv_page
);
645 dst
= kmap_atomic(page
);
646 memcpy(dst
, src
+ bv
.bv_offset
, bv
.bv_len
);
649 block
->vecs
[i
].bv_page
= page
;
650 block
->vecs
[i
].bv_len
= bv
.bv_len
;
655 /* Had a flush with data in it, weird */
657 spin_lock_irq(&lc
->blocks_lock
);
658 list_splice_init(&lc
->unflushed_blocks
, &block
->list
);
659 spin_unlock_irq(&lc
->blocks_lock
);
662 normal_map_bio(ti
, bio
);
663 return DM_MAPIO_REMAPPED
;
666 static int normal_end_io(struct dm_target
*ti
, struct bio
*bio
, int error
)
668 struct log_writes_c
*lc
= ti
->private;
669 struct per_bio_data
*pb
= dm_per_bio_data(bio
, sizeof(struct per_bio_data
));
671 if (bio_data_dir(bio
) == WRITE
&& pb
->block
) {
672 struct pending_block
*block
= pb
->block
;
675 spin_lock_irqsave(&lc
->blocks_lock
, flags
);
676 if (block
->flags
& LOG_FLUSH_FLAG
) {
677 list_splice_tail_init(&block
->list
, &lc
->logging_blocks
);
678 list_add_tail(&block
->list
, &lc
->logging_blocks
);
679 wake_up_process(lc
->log_kthread
);
680 } else if (block
->flags
& LOG_FUA_FLAG
) {
681 list_add_tail(&block
->list
, &lc
->logging_blocks
);
682 wake_up_process(lc
->log_kthread
);
684 list_add_tail(&block
->list
, &lc
->unflushed_blocks
);
685 spin_unlock_irqrestore(&lc
->blocks_lock
, flags
);
692 * INFO format: <logged entries> <highest allocated sector>
694 static void log_writes_status(struct dm_target
*ti
, status_type_t type
,
695 unsigned status_flags
, char *result
,
699 struct log_writes_c
*lc
= ti
->private;
702 case STATUSTYPE_INFO
:
703 DMEMIT("%llu %llu", lc
->logged_entries
,
704 (unsigned long long)lc
->next_sector
- 1);
705 if (!lc
->logging_enabled
)
706 DMEMIT(" logging_disabled");
709 case STATUSTYPE_TABLE
:
710 DMEMIT("%s %s", lc
->dev
->name
, lc
->logdev
->name
);
715 static int log_writes_ioctl(struct dm_target
*ti
, unsigned int cmd
,
718 struct log_writes_c
*lc
= ti
->private;
719 struct dm_dev
*dev
= lc
->dev
;
723 * Only pass ioctls through if the device sizes match exactly.
725 if (ti
->len
!= i_size_read(dev
->bdev
->bd_inode
) >> SECTOR_SHIFT
)
726 r
= scsi_verify_blk_ioctl(NULL
, cmd
);
728 return r
? : __blkdev_driver_ioctl(dev
->bdev
, dev
->mode
, cmd
, arg
);
731 static int log_writes_merge(struct dm_target
*ti
, struct bvec_merge_data
*bvm
,
732 struct bio_vec
*biovec
, int max_size
)
734 struct log_writes_c
*lc
= ti
->private;
735 struct request_queue
*q
= bdev_get_queue(lc
->dev
->bdev
);
737 if (!q
->merge_bvec_fn
)
740 bvm
->bi_bdev
= lc
->dev
->bdev
;
741 bvm
->bi_sector
= dm_target_offset(ti
, bvm
->bi_sector
);
743 return min(max_size
, q
->merge_bvec_fn(q
, bvm
, biovec
));
746 static int log_writes_iterate_devices(struct dm_target
*ti
,
747 iterate_devices_callout_fn fn
,
750 struct log_writes_c
*lc
= ti
->private;
752 return fn(ti
, lc
->dev
, 0, ti
->len
, data
);
756 * Messages supported:
757 * mark <mark data> - specify the marked data.
759 static int log_writes_message(struct dm_target
*ti
, unsigned argc
, char **argv
)
762 struct log_writes_c
*lc
= ti
->private;
765 DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc
);
769 if (!strcasecmp(argv
[0], "mark"))
770 r
= log_mark(lc
, argv
[1]);
772 DMWARN("Unrecognised log writes target message received: %s", argv
[0]);
777 static void log_writes_io_hints(struct dm_target
*ti
, struct queue_limits
*limits
)
779 struct log_writes_c
*lc
= ti
->private;
780 struct request_queue
*q
= bdev_get_queue(lc
->dev
->bdev
);
782 if (!q
|| !blk_queue_discard(q
)) {
783 lc
->device_supports_discard
= false;
784 limits
->discard_granularity
= 1 << SECTOR_SHIFT
;
785 limits
->max_discard_sectors
= (UINT_MAX
>> SECTOR_SHIFT
);
789 static struct target_type log_writes_target
= {
790 .name
= "log-writes",
791 .version
= {1, 0, 0},
792 .module
= THIS_MODULE
,
793 .ctr
= log_writes_ctr
,
794 .dtr
= log_writes_dtr
,
795 .map
= log_writes_map
,
796 .end_io
= normal_end_io
,
797 .status
= log_writes_status
,
798 .ioctl
= log_writes_ioctl
,
799 .merge
= log_writes_merge
,
800 .message
= log_writes_message
,
801 .iterate_devices
= log_writes_iterate_devices
,
802 .io_hints
= log_writes_io_hints
,
805 static int __init
dm_log_writes_init(void)
807 int r
= dm_register_target(&log_writes_target
);
810 DMERR("register failed %d", r
);
815 static void __exit
dm_log_writes_exit(void)
817 dm_unregister_target(&log_writes_target
);
820 module_init(dm_log_writes_init
);
821 module_exit(dm_log_writes_exit
);
823 MODULE_DESCRIPTION(DM_NAME
" log writes target");
824 MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
825 MODULE_LICENSE("GPL");