// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>

#define DM_MSG_PREFIX "writecache"

#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		0
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000

#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
#define DM_WRITECACHE_HAS_PMEM
#endif

#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1
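/*
 * On-media layout: a struct wc_memory_superblock at the start of the cache
 * device, followed by one struct wc_memory_entry per cache block, followed
 * (aligned to the block size) by the cached data blocks themselves.
 * pmem_assign() is used for every metadata store: with persistent memory it
 * writes through the CPU cache via memcpy_flushcache(), otherwise it is a
 * plain assignment and the change is persisted later by writing the dirty
 * metadata regions back to the SSD.
 */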
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[32];
	};
	struct wc_memory_entry entries[0];
};

struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
	bool write_in_progress
#if BITS_PER_LONG == 64
		:1
#endif
	;
	unsigned long index
#if BITS_PER_LONG == 64
		:47
#endif
	;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};

#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
#define WC_MODE_FUA(wc)				((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)			false
#define WC_MODE_FUA(wc)				false
#endif
#define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))

struct dm_writecache {
	struct mutex lock;
	struct list_head lru;
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;

	unsigned uncommitted_blocks;
	unsigned autocommit_blocks;
	unsigned max_writeback_jobs;

	int error;

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	void *block_start;
	struct wc_entry *entries;
	unsigned block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;

	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;

	unsigned writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;
	unsigned dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;
};
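/*
 * Concurrency model (as implemented below): wc->lock serializes access to
 * the rb-tree, lru list, freelist and counters; endio_list_lock is a raw
 * spinlock because writeback completions add themselves to endio_list from
 * interrupt context and wake the endio thread, which then drains the list
 * and completes the entries with the mutex held.
 */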
#define WB_LIST_INLINE		16

struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned wc_list_n;
	unsigned page_offset;
	struct page *page;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned n_entries;
	int error;
};

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
					    "A percentage of time allocated for data copying");

static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}

static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}

#ifdef DM_WRITECACHE_HAS_PMEM
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	pfn_t pfn;
	int id;
	struct page **pages;

	wc->memory_vmapped = false;

	if (!wc->ssd_dev->dax_dev) {
		r = -EOPNOTSUPP;
		goto err1;
	}
	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_t_has_page(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		long i;
		wc->memory_map = NULL;
		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;
			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
						NULL, &pfn);
			if (daa <= 0) {
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_t_has_page(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			while (daa-- && i < p) {
				pages[i++] = pfn_t_to_page(pfn);
				pfn.val++;
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		kvfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	kvfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
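/*
 * persistent_memory_claim() maps the whole cache device through the DAX
 * interface.  If dax_direct_access() returns the full range in one go, the
 * returned kernel address is used directly; otherwise the pages are
 * collected range by range and stitched together with vmap(), and
 * memory_vmapped is set so the release and cache-flush helpers know to use
 * the vmalloc variants.
 */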
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	BUG();
	return -EOPNOTSUPP;
}
#endif

static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}

static struct page *persistent_memory_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static unsigned persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}

static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}

static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}

static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}

static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	return &sb(wc)->entries[e->index];
}

static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
{
	return (char *)wc->block_start + (e->index << wc->block_size_bits);
}

static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
	       ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}

static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}

static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}

static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}

static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}

#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
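/*
 * Example of the layout arithmetic above (illustrative numbers, not from the
 * original source): with a 4096-byte block size, block_size_bits is 12, so
 * cache block i stores its data at start_sector + metadata_sectors + i * 8
 * sectors, and its on-media metadata in sb(wc)->entries[i].
 */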
static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);

struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void ssd_commit_flushed(struct dm_writecache *wc)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned i = 0;

	while (1) {
		unsigned j;
		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_op = REQ_OP_WRITE;
		req.bi_op_flags = REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL);
		i = j;
	}

	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
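/*
 * In SSD mode, metadata persistence is batched: writecache_flush_region()
 * only marks BITMAP_GRANULARITY-sized chunks dirty, and ssd_commit_flushed()
 * later walks the bitmap, writes each dirty run of chunks with one async
 * dm-io request, waits for all of them and finishes with a disk flush.
 */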
static void writecache_commit_flushed(struct dm_writecache *wc)
{
	if (WC_MODE_PMEM(wc))
		wmb();
	else
		ssd_commit_flushed(wc);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_op = REQ_OP_WRITE;
	req.bi_op_flags = REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}

static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}

#define WFE_RETURN_FOLLOWING	1
#define WFE_LOWEST_SEQ		2

static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					      uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;
		node = (read_original_sector(wc, e) >= block ?
			e->rb_node.rb_left : e->rb_node.rb_right);
		if (unlikely(!node)) {
			if (!(flags & WFE_RETURN_FOLLOWING)) {
				return NULL;
			}
			if (read_original_sector(wc, e) >= block) {
				break;
			} else {
				node = rb_next(&e->rb_node);
				if (unlikely(!node)) {
					return NULL;
				}
				e = container_of(node, struct wc_entry, rb_node);
				break;
			}
		}
	}

	while (1) {
		struct wc_entry *e2;
		if (flags & WFE_LOWEST_SEQ)
			node = rb_prev(&e->rb_node);
		else
			node = rb_next(&e->rb_node);
		if (unlikely(!node))
			return e;
		e2 = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e2) != block)
			return e;
		e = e2;
	}
}
static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
{
	struct wc_entry *e;
	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;

	while (*node) {
		e = container_of(*node, struct wc_entry, rb_node);
		parent = &e->rb_node;
		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}
	rb_link_node(&ins->rb_node, parent, node);
	rb_insert_color(&ins->rb_node, &wc->tree);
	list_add(&ins->lru, &wc->lru);
}

static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
{
	list_del(&e->lru);
	rb_erase(&e->rb_node, &wc->tree);
}

static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
{
	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;

		if (unlikely(!*node))
			wc->current_free = e;
		while (*node) {
			parent = *node;
			if (&e->rb_node < *node)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;
		}
		rb_link_node(&e->rb_node, parent, node);
		rb_insert_color(&e->rb_node, &wc->freetree);
	} else {
		list_add_tail(&e->lru, &wc->freelist);
	}
	wc->freelist_size++;
}

static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
{
	struct wc_entry *e;

	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node *next;
		if (unlikely(!wc->current_free))
			return NULL;
		e = wc->current_free;
		next = rb_next(&e->rb_node);
		rb_erase(&e->rb_node, &wc->freetree);
		if (unlikely(!next))
			next = rb_first(&wc->freetree);
		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
	} else {
		if (unlikely(list_empty(&wc->freelist)))
			return NULL;
		e = container_of(wc->freelist.next, struct wc_entry, lru);
		list_del(&e->lru);
	}
	wc->freelist_size--;
	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
		queue_work(wc->writeback_wq, &wc->writeback_work);

	return e;
}

static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_unlink(wc, e);
	writecache_add_to_freelist(wc, e);
	clear_seq_count(wc, e);
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (unlikely(waitqueue_active(&wc->freelist_wait)))
		wake_up(&wc->freelist_wait);
}

static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}

static void writecache_poison_lists(struct dm_writecache *wc)
{
	/*
	 * Catch incorrect access to these values while the device is suspended.
	 */
	memset(&wc->tree, -1, sizeof wc->tree);
	wc->lru.next = LIST_POISON1;
	wc->lru.prev = LIST_POISON2;
	wc->freelist.next = LIST_POISON1;
	wc->freelist.prev = LIST_POISON2;
}

static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (WC_MODE_PMEM(wc))
		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
}

static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
{
	return read_seq_count(wc, e) < wc->seq_count;
}
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	del_timer(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
	}
	writecache_commit_flushed(wc);

	writecache_wait_for_ios(wc, WRITE);

	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
	writecache_commit_flushed(wc);

	wc->overwrote_committed = false;

	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
	}

	if (need_flush_after_free)
		writecache_commit_flushed(wc);
}
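/*
 * writecache_flush() flushes the metadata of all not-yet-committed entries
 * (and, in pmem mode, their data), commits that, then increments and
 * persists sb->seq_count so those entries become "committed".  It finishes
 * by freeing older committed entries that cache the same original sectors,
 * leaving only the newest copy of each block.
 */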
static void writecache_flush_work(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);

	wc_lock(wc);
	writecache_flush(wc);
	wc_unlock(wc);
}

static void writecache_autocommit_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
	if (!writecache_has_error(wc))
		queue_work(wc->writeback_wq, &wc->flush_work);
}

static void writecache_schedule_autocommit(struct dm_writecache *wc)
{
	if (!timer_pending(&wc->autocommit_timer))
		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
}

static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
{
	struct wc_entry *e;
	bool discarded_something = false;

	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
	if (unlikely(!e))
		return;

	while (read_original_sector(wc, e) < end) {
		struct rb_node *node = rb_next(&e->rb_node);

		if (likely(!e->write_in_progress)) {
			if (!discarded_something) {
				writecache_wait_for_ios(wc, READ);
				writecache_wait_for_ios(wc, WRITE);
				discarded_something = true;
			}
			writecache_free_entry(wc, e);
		}

		if (!node)
			break;

		e = container_of(node, struct wc_entry, rb_node);
	}

	if (discarded_something)
		writecache_commit_flushed(wc);
}

static bool writecache_wait_for_writeback(struct dm_writecache *wc)
{
	if (wc->writeback_size) {
		writecache_wait_on_freelist(wc);
		return true;
	}
	return false;
}

static void writecache_suspend(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	bool flush_on_suspend;

	del_timer_sync(&wc->autocommit_timer);

	wc_lock(wc);
	writecache_flush(wc);
	flush_on_suspend = wc->flush_on_suspend;
	if (flush_on_suspend) {
		wc->flush_on_suspend = false;
		wc->writeback_all++;
		queue_work(wc->writeback_wq, &wc->writeback_work);
	}
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	if (flush_on_suspend)
		wc->writeback_all--;
	while (writecache_wait_for_writeback(wc));

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	writecache_poison_lists(wc);
	wc_unlock(wc);
}

static int writecache_alloc_entries(struct dm_writecache *wc)
{
	size_t b;

	if (wc->entries)
		return 0;
	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
	if (!wc->entries)
		return -ENOMEM;
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		e->index = b;
		e->write_in_progress = false;
	}

	return 0;
}
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	if (WC_MODE_PMEM(wc))
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);

	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;
		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		if (!writecache_entry_is_committed(wc, e)) {
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc);
	}

	wc_unlock(wc);
}
static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	if (dm_suspended(wc->ti)) {
		wc_unlock(wc);
		return -EBUSY;
	}
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}

	writecache_flush(wc);
	wc->writeback_all++;
	queue_work(wc->writeback_wq, &wc->writeback_work);
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	wc->writeback_all--;
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}
	wc_unlock(wc);

	return 0;
}

static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	wc->flush_on_suspend = true;
	wc_unlock(wc);

	return 0;
}

static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
			      char *result, unsigned maxlen)
{
	int r = -EINVAL;
	struct dm_writecache *wc = ti->private;

	if (!strcasecmp(argv[0], "flush"))
		r = process_flush_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "flush_on_suspend"))
		r = process_flush_on_suspend_mesg(argc, argv, wc);
	else
		DMERR("unrecognised message received: %s", argv[0]);

	return r;
}

static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
	void *buf;
	unsigned long flags;
	unsigned size;
	int rw = bio_data_dir(bio);
	unsigned remaining_size = wc->block_size;

	do {
		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
		buf = bvec_kmap_irq(&bv, &flags);
		size = bv.bv_len;
		if (unlikely(size > remaining_size))
			size = remaining_size;

		if (rw == READ) {
			int r;
			r = memcpy_mcsafe(buf, data, size);
			flush_dcache_page(bio_page(bio));
			if (unlikely(r)) {
				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
				bio->bi_status = BLK_STS_IOERR;
			}
		} else {
			flush_dcache_page(bio_page(bio));
			memcpy_flushcache(data, buf, size);
		}

		bvec_kunmap_irq(buf, &flags);

		data = (char *)data + size;
		remaining_size -= size;
		bio_advance(bio, size);
	} while (unlikely(remaining_size));
}

static int writecache_flush_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct bio *bio;

		wc_lock(wc);
		bio = bio_list_pop(&wc->flush_list);
		if (!bio) {
			set_current_state(TASK_INTERRUPTIBLE);
			wc_unlock(wc);

			if (unlikely(kthread_should_stop())) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}

		if (bio_op(bio) == REQ_OP_DISCARD) {
			writecache_discard(wc, bio->bi_iter.bi_sector,
					   bio_end_sector(bio));
			wc_unlock(wc);
			bio_set_dev(bio, wc->dev->bdev);
			generic_make_request(bio);
		} else {
			writecache_flush(wc);
			wc_unlock(wc);
			if (writecache_has_error(wc))
				bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		}
	}

	return 0;
}

static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
{
	if (bio_list_empty(&wc->flush_list))
		wake_up_process(wc->flush_thread);
	bio_list_add(&wc->flush_list, bio);
}
static int writecache_map(struct dm_target *ti, struct bio *bio)
{
	struct wc_entry *e;
	struct dm_writecache *wc = ti->private;

	bio->bi_private = NULL;

	wc_lock(wc);

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		if (writecache_has_error(wc))
			goto unlock_error;
		if (WC_MODE_PMEM(wc)) {
			writecache_flush(wc);
			if (writecache_has_error(wc))
				goto unlock_error;
			goto unlock_submit;
		} else {
			writecache_offload_bio(wc, bio);
			goto unlock_return;
		}
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
				(wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		goto unlock_error;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		if (writecache_has_error(wc))
			goto unlock_error;
		if (WC_MODE_PMEM(wc)) {
			writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
			goto unlock_remap_origin;
		} else {
			writecache_offload_bio(wc, bio);
			goto unlock_return;
		}
	}

	if (bio_data_dir(bio) == READ) {
read_next_block:
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
			if (WC_MODE_PMEM(wc)) {
				bio_copy_block(wc, bio, memory_data(wc, e));
				if (bio->bi_iter.bi_size)
					goto read_next_block;
				goto unlock_submit;
			} else {
				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
				bio_set_dev(bio, wc->ssd_dev->bdev);
				bio->bi_iter.bi_sector = cache_sector(wc, e);
				if (!writecache_entry_is_committed(wc, e))
					writecache_wait_for_ios(wc, WRITE);
				goto unlock_remap;
			}
		} else {
			if (e) {
				sector_t next_boundary =
					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
					dm_accept_partial_bio(bio, next_boundary);
				}
			}
			goto unlock_remap_origin;
		}
	} else {
		do {
			if (writecache_has_error(wc))
				goto unlock_error;
			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
			if (e) {
				if (!writecache_entry_is_committed(wc, e))
					goto bio_copy;
				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
					wc->overwrote_committed = true;
					goto bio_copy;
				}
			}
			e = writecache_pop_from_freelist(wc);
			if (unlikely(!e)) {
				writecache_wait_on_freelist(wc);
				continue;
			}
			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
			writecache_insert_entry(wc, e);
			wc->uncommitted_blocks++;
bio_copy:
			if (WC_MODE_PMEM(wc)) {
				bio_copy_block(wc, bio, memory_data(wc, e));
			} else {
				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
				bio_set_dev(bio, wc->ssd_dev->bdev);
				bio->bi_iter.bi_sector = cache_sector(wc, e);
				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
					wc->uncommitted_blocks = 0;
					queue_work(wc->writeback_wq, &wc->flush_work);
				} else {
					writecache_schedule_autocommit(wc);
				}
				goto unlock_remap;
			}
		} while (bio->bi_iter.bi_size);

		if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
			writecache_flush(wc);
		else
			writecache_schedule_autocommit(wc);
		goto unlock_submit;
	}

unlock_remap_origin:
	bio_set_dev(bio, wc->dev->bdev);
	wc_unlock(wc);
	return DM_MAPIO_REMAPPED;

unlock_remap:
	/* make sure that writecache_end_io decrements bio_in_progress: */
	bio->bi_private = (void *)1;
	atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
	wc_unlock(wc);
	return DM_MAPIO_REMAPPED;

unlock_submit:
	wc_unlock(wc);
	bio_endio(bio);
	return DM_MAPIO_SUBMITTED;

unlock_return:
	wc_unlock(wc);
	return DM_MAPIO_SUBMITTED;

unlock_error:
	wc_unlock(wc);
	bio_io_error(bio);
	return DM_MAPIO_SUBMITTED;
}
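/*
 * Mapping summary: flushes and discards are handled inline in pmem mode and
 * offloaded to the flush thread in SSD mode.  Reads are served from the
 * cache when the block is present (copied from pmem, or remapped to the
 * cache device), otherwise remapped to the origin.  Writes allocate a free
 * cache entry per block; in pmem mode the data is copied immediately, in
 * SSD mode the bio is remapped to the cache device and tracked via
 * bio_in_progress so flushing can wait for it.
 */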
static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct dm_writecache *wc = ti->private;

	if (bio->bi_private != NULL) {
		int dir = bio_data_dir(bio);
		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
				wake_up(&wc->bio_in_progress_wait[dir]);
	}
	return 0;
}

static int writecache_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	struct dm_writecache *wc = ti->private;

	return fn(ti, wc->dev, 0, ti->len, data);
}

static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dm_writecache *wc = ti->private;

	if (limits->logical_block_size < wc->block_size)
		limits->logical_block_size = wc->block_size;

	if (limits->physical_block_size < wc->block_size)
		limits->physical_block_size = wc->block_size;

	if (limits->io_min < wc->block_size)
		limits->io_min = wc->block_size;
}

static void writecache_writeback_endio(struct bio *bio)
{
	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
	struct dm_writecache *wc = wb->wc;
	unsigned long flags;

	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&wb->endio_entry, &wc->endio_list);
	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
}

static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
{
	struct copy_struct *c = ptr;
	struct dm_writecache *wc = c->wc;

	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;

	raw_spin_lock_irq(&wc->endio_list_lock);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&c->endio_entry, &wc->endio_list);
	raw_spin_unlock_irq(&wc->endio_list_lock);
}
static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
	unsigned i;
	struct writeback_struct *wb;
	struct wc_entry *e;
	unsigned long n_walked = 0;

	do {
		wb = list_entry(list->next, struct writeback_struct, endio_entry);
		list_del(&wb->endio_entry);

		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
					"write error %d", wb->bio.bi_status);
		i = 0;
		do {
			e = wb->wc_list[i];
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);
			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			n_walked++;
			if (unlikely(n_walked >= ENDIO_LATENCY)) {
				writecache_commit_flushed(wc);
				wc_unlock(wc);
				wc_lock(wc);
				n_walked = 0;
			}
		} while (++i < wb->wc_list_n);

		if (wb->wc_list != wb->wc_list_inline)
			kfree(wb->wc_list);
		bio_put(&wb->bio);
	} while (!list_empty(list));
}

static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
{
	struct copy_struct *c;
	struct wc_entry *e;

	do {
		c = list_entry(list->next, struct copy_struct, endio_entry);
		list_del(&c->endio_entry);

		if (unlikely(c->error))
			writecache_error(wc, c->error, "copy error");

		e = c->e;
		do {
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);

			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			e++;
		} while (--c->n_entries);
		mempool_free(c, &wc->copy_pool);
	} while (!list_empty(list));
}

static int writecache_endio_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct list_head list;

		raw_spin_lock_irq(&wc->endio_list_lock);
		if (!list_empty(&wc->endio_list))
			goto pop_from_list;
		set_current_state(TASK_INTERRUPTIBLE);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (unlikely(kthread_should_stop())) {
			set_current_state(TASK_RUNNING);
			break;
		}

		schedule();
		continue;

pop_from_list:
		list = wc->endio_list;
		list.next->prev = list.prev->next = &list;
		INIT_LIST_HEAD(&wc->endio_list);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (!WC_MODE_FUA(wc))
			writecache_disk_flush(wc, wc->dev);

		wc_lock(wc);

		if (WC_MODE_PMEM(wc)) {
			__writecache_endio_pmem(wc, &list);
		} else {
			__writecache_endio_ssd(wc, &list);
			writecache_wait_for_ios(wc, READ);
		}

		writecache_commit_flushed(wc);

		wc_unlock(wc);
	}

	return 0;
}

static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
{
	struct dm_writecache *wc = wb->wc;
	unsigned block_size = wc->block_size;
	void *address = memory_data(wc, e);

	persistent_memory_flush_cache(address, block_size);
	return bio_add_page(&wb->bio, persistent_memory_page(address),
			    block_size, persistent_memory_page_offset(address)) != 0;
}

struct writeback_list {
	struct list_head list;
	size_t size;
};

static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
{
	if (unlikely(wc->max_writeback_jobs)) {
		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
			wc_lock(wc);
			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
				writecache_wait_on_freelist(wc);
			wc_unlock(wc);
		}
	}
	cond_resched();
}

static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct bio *bio;
	struct writeback_struct *wb;
	unsigned max_pages;

	while (wbl->size) {
		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		max_pages = e->wc_list_contiguous;

		bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
		wb = container_of(bio, struct writeback_struct, bio);
		wb->wc = wc;
		wb->bio.bi_end_io = writecache_writeback_endio;
		bio_set_dev(&wb->bio, wc->dev->bdev);
		wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
		wb->page_offset = PAGE_SIZE;
		if (max_pages <= WB_LIST_INLINE ||
		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
							   GFP_NOIO | __GFP_NORETRY |
							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
			wb->wc_list = wb->wc_list_inline;
			max_pages = WB_LIST_INLINE;
		}

		BUG_ON(!wc_add_block(wb, e, GFP_NOIO));

		wb->wc_list[0] = e;
		wb->wc_list_n = 1;

		while (wbl->size && wb->wc_list_n < max_pages) {
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
				break;
			wbl->size--;
			list_del(&f->lru);
			wb->wc_list[wb->wc_list_n++] = f;
			e = f;
		}
		bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
		if (writecache_has_error(wc)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(&wb->bio);
		} else {
			submit_bio(&wb->bio);
		}

		__writeback_throttle(wc, wbl);
	}
}
static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct dm_io_region from, to;
	struct copy_struct *c;

	while (wbl->size) {
		unsigned n_sectors;

		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);

		from.bdev = wc->ssd_dev->bdev;
		from.sector = cache_sector(wc, e);
		from.count = n_sectors;
		to.bdev = wc->dev->bdev;
		to.sector = read_original_sector(wc, e);
		to.count = n_sectors;

		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
		c->wc = wc;
		c->e = e;
		c->n_entries = e->wc_list_contiguous;

		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
			wbl->size--;
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			BUG_ON(f != e + 1);
			list_del(&f->lru);
			e = f;
		}

		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);

		__writeback_throttle(wc, wbl);
	}
}

static void writecache_writeback(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
	struct blk_plug plug;
	struct wc_entry *e, *f, *g;
	struct rb_node *node, *next_node;
	struct list_head skipped;
	struct writeback_list wbl;
	unsigned long n_walked;

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed) {
		writecache_wait_for_ios(wc, WRITE);
	}

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {

		n_walked++;
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
			queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e))) {
			writecache_flush(wc);
		}
		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_del(&e->lru);
				list_add(&e->lru, &skipped);
				continue;
			}
		}
		wc->writeback_size++;
		list_del(&e->lru);
		list_add(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, g) ==
			    read_original_sector(wc, f)) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_del(&g->lru);
			list_add(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_PAGES;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
				break;
		}
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc));
		wc_unlock(wc);
	}
}
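/*
 * Writeback picks the oldest committed entries off the tail of the lru list
 * and groups blocks whose original sectors are contiguous
 * (wc_list_contiguous), so pmem writeback can submit one multi-page bio and
 * SSD writeback one kcopyd copy per run.  Entries whose older copy is still
 * being written back are skipped and respliced onto the lru.
 */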
static int calculate_memory_size(uint64_t device_size, unsigned block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}
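/*
 * Illustrative sizing (example numbers, not from the original source): with
 * block_size = 4096 and 16-byte metadata entries, a 1 GiB cache device gives
 * n_blocks ~= 2^30 / 4112 ~= 261000 blocks; the superblock plus entries are
 * then rounded up to a whole number of blocks and n_blocks is decremented
 * until metadata and data both fit in the device.
 */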
static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	for (b = 0; b < wc->n_blocks; b++)
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc);
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
	writecache_commit_flushed(wc);

	return 0;
}

static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	if (wc->entries)
		vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	if (wc->dirty_bitmap)
		vfree(wc->dirty_bitmap);

	kfree(wc);
}
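/*
 * Constructor arguments, as parsed below:
 *   <p|s> <origin device> <cache device> <block size> <#feature args> [args...]
 * An illustrative table line (device paths and sizes are examples only):
 *   echo "0 `blockdev --getsz /dev/origin` writecache s /dev/origin /dev/cache 4096 0" \
 *       | dmsetup create wc
 */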
static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 10, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;

	mutex_init(&wc->lock);
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);

	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeabck", WQ_MEM_RECLAIM, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}
	wake_up_process(wc->endio_thread);

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
		 */
		r = -EOPNOTSUPP;
		ti->error = "Persistent memory or DAX not supported on this system";
		goto bad;
#endif
	} else {
		goto bad_arguments;
	}

	if (WC_MODE_PMEM(wc)) {
		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
				offsetof(struct writeback_struct, bio),
				BIOSET_NEED_BVECS);
		if (r) {
			ti->error = "Could not allocate bio set";
			goto bad;
		}
	} else {
		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
		if (r) {
			ti->error = "Could not allocate mempool";
			goto bad;
		}
	}

	/*
	 * Parse the origin data device
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
	if (r) {
		ti->error = "Origin data device lookup failed";
		goto bad;
	}

	/*
	 * Parse cache data device (be it pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
	if (r) {
		ti->error = "Cache data device lookup failed";
		goto bad;
	}
	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);

	/*
	 * Parse the cache block size
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
	    (wc->block_size & (wc->block_size - 1))) {
		r = -EINVAL;
		ti->error = "Invalid block size";
		goto bad;
	}
	wc->block_size_bits = __ffs(wc->block_size);

	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned autocommit_msecs;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			if (autocommit_msecs > 3600000)
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "fua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
		struct dm_io_region region;
		struct dm_io_request req;
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}
		wake_up_process(wc->flush_thread);

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
		/* this is limitation of test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		region.bdev = wc->ssd_dev->bdev;
		region.sector = wc->start_sector;
		region.count = wc->metadata_sectors;
		req.bi_op = REQ_OP_READ;
		req.bi_op_flags = REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map;
		req.client = wc->dm_io;
		req.notify.fn = NULL;

		r = dm_io(&req, 1, &region, NULL);
		if (r) {
			ti->error = "Unable to read metadata";
			goto bad;
		}
	}

	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}
static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result, unsigned maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned extra_args;
	unsigned sz = 0;
	uint64_t x;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
				wc->dev->name, wc->ssd_dev->name, wc->block_size);
		extra_args = 0;
		if (wc->start_sector)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->writeback_fua_set)
			extra_args++;

		DMEMIT("%u", extra_args);
		if (wc->start_sector)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set) {
			x = (uint64_t)wc->freelist_high_watermark * 100;
			x += wc->n_blocks / 2;
			do_div(x, (size_t)wc->n_blocks);
			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
		}
		if (wc->low_wm_percent_set) {
			x = (uint64_t)wc->freelist_low_watermark * 100;
			x += wc->n_blocks / 2;
			do_div(x, (size_t)wc->n_blocks);
			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
		}
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		break;
	}
}

static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 1, 1},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};

static int __init dm_writecache_init(void)
{
	int r;

	r = dm_register_target(&writecache_target);
	if (r < 0) {
		DMERR("register failed %d", r);
		return r;
	}

	return 0;
}

static void __exit dm_writecache_exit(void)
{
	dm_unregister_target(&writecache_target);
}

module_init(dm_writecache_init);
module_exit(dm_writecache_exit);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");