// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>

#define DM_MSG_PREFIX "writecache"
#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		0
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
#define MAX_AGE_DIV			16
#define MAX_AGE_UNSPECIFIED		-1UL

#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif
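/*
 * Illustrative only: the tunables above are the built-in defaults and
 * correspond to the optional table arguments parsed in writecache_ctr().
 * A hypothetical SSD-backed table line (device paths and sector count are
 * placeholders, not part of this driver) could look like:
 *
 *	dmsetup create wc --table "0 <origin_sectors> writecache s \
 *		/dev/mapper/origin /dev/mapper/fast 4096 \
 *		4 high_watermark 50 low_watermark 45"
 */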
#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
#define DM_WRITECACHE_HAS_PMEM
#endif

#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1
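/*
 * Usage sketch, taken from how this file applies pmem_assign() further down:
 * persistent metadata fields are never stored with a plain assignment in
 * pmem mode, e.g. writecache_flush() persists the sequence counter with
 *
 *	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
 *
 * so the store goes through memcpy_flushcache() when DM_WRITECACHE_HAS_PMEM
 * is defined and degrades to a plain assignment otherwise.
 */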
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[32];
	};
	struct wc_memory_entry entries[0];
};
struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
	bool write_in_progress
#if BITS_PER_LONG == 64
		:1
#endif
	;
	unsigned long index
#if BITS_PER_LONG == 64
		:47
#endif
	;
	unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};
#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
#define WC_MODE_FUA(wc)				((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)			false
#define WC_MODE_FUA(wc)				false
#endif
#define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))
struct dm_writecache {
	struct mutex lock;
	struct list_head lru;
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;
	unsigned long max_age;

	unsigned uncommitted_blocks;
	unsigned autocommit_blocks;
	unsigned max_writeback_jobs;

	int error;

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	struct timer_list max_age_timer;

	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	void *block_start;
	struct wc_entry *entries;
	unsigned block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;

	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;
	bool cleaner:1;

	unsigned writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;
	unsigned dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;
};
#define WB_LIST_INLINE		16

struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned n_entries;
	int error;
};

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
					    "A percentage of time allocated for data copying");
static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}

static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}
#ifdef DM_WRITECACHE_HAS_PMEM
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	pfn_t pfn;
	int id;
	struct page **pages;
	sector_t offset;

	wc->memory_vmapped = false;

	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	offset = get_start_sect(wc->ssd_dev->bdev);
	if (offset & (PAGE_SIZE / 512 - 1)) {
		r = -EINVAL;
		goto err1;
	}
	offset >>= PAGE_SHIFT - 9;

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_t_has_page(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		long i;

		wc->memory_map = NULL;
		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;

			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
						NULL, &pfn);
			if (daa <= 0) {
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_t_has_page(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			while (daa-- && i < p) {
				pages[i++] = pfn_t_to_page(pfn);
				pfn.val++;
				if (!(i & 15))
					cond_resched();
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		kvfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	kvfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	return -EOPNOTSUPP;
}
#endif

static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}

static struct page *persistent_memory_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static unsigned persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}

static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}

static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}
static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}

static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	return &sb(wc)->entries[e->index];
}

static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
{
	return (char *)wc->block_start + (e->index << wc->block_size_bits);
}

static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}
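/*
 * Worked example (illustrative numbers, not taken from a real device): with a
 * 4096-byte cache block, block_size_bits = 12, so the entry with index 10
 * maps to cache sector start_sector + metadata_sectors + (10 << (12 - 9)) =
 * start_sector + metadata_sectors + 80, i.e. data blocks are laid out
 * back-to-back right after the on-SSD metadata area.
 */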
static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}

static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}

static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}

static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}
#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned i = 0;

	while (1) {
		unsigned j;
		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_op = REQ_OP_WRITE;
		req.bi_op_flags = REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL);
		i = j;
	}

	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	if (wait_for_ios)
		writecache_wait_for_ios(wc, WRITE);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
static void ssd_commit_superblock(struct dm_writecache *wc)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = 0;
	region.count = PAGE_SIZE;

	if (unlikely(region.sector + region.count > wc->metadata_sectors))
		region.count = wc->metadata_sectors - region.sector;

	region.sector += wc->start_sector;

	req.bi_op = REQ_OP_WRITE;
	req.bi_op_flags = REQ_SYNC | REQ_FUA;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;
	req.notify.context = NULL;

	r = dm_io(&req, 1, &region, NULL);
	if (unlikely(r))
		writecache_error(wc, r, "error writing superblock");
}
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	if (WC_MODE_PMEM(wc))
		pmem_wmb();
	else
		ssd_commit_flushed(wc, wait_for_ios);
}
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_op = REQ_OP_WRITE;
	req.bi_op_flags = REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}
#define WFE_RETURN_FOLLOWING	1
#define WFE_LOWEST_SEQ		2

static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					      uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;

		node = (read_original_sector(wc, e) >= block ?
			e->rb_node.rb_left : e->rb_node.rb_right);
		if (unlikely(!node)) {
			if (!(flags & WFE_RETURN_FOLLOWING))
				return NULL;
			if (read_original_sector(wc, e) >= block) {
				return e;
			} else {
				node = rb_next(&e->rb_node);
				if (unlikely(!node))
					return NULL;
				e = container_of(node, struct wc_entry, rb_node);
				return e;
			}
		}
	}

	while (1) {
		struct wc_entry *e2;
		if (flags & WFE_LOWEST_SEQ)
			node = rb_prev(&e->rb_node);
		else
			node = rb_next(&e->rb_node);
		if (unlikely(!node))
			return e;
		e2 = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e2) != block)
			return e;
		e = e2;
	}
}
static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
{
	struct wc_entry *e;
	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;

	while (*node) {
		e = container_of(*node, struct wc_entry, rb_node);
		parent = &e->rb_node;
		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}
	rb_link_node(&ins->rb_node, parent, node);
	rb_insert_color(&ins->rb_node, &wc->tree);
	list_add(&ins->lru, &wc->lru);
	ins->age = jiffies;
}

static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
{
	list_del(&e->lru);
	rb_erase(&e->rb_node, &wc->tree);
}
static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
{
	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;

		if (unlikely(!*node))
			wc->current_free = e;
		while (*node) {
			parent = *node;
			if (&e->rb_node < *node)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;
		}
		rb_link_node(&e->rb_node, parent, node);
		rb_insert_color(&e->rb_node, &wc->freetree);
	} else {
		list_add_tail(&e->lru, &wc->freelist);
	}
	wc->freelist_size++;
}
static inline void writecache_verify_watermark(struct dm_writecache *wc)
{
	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
		queue_work(wc->writeback_wq, &wc->writeback_work);
}
static void writecache_max_age_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);

	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
		queue_work(wc->writeback_wq, &wc->writeback_work);
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
	}
}
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
	struct wc_entry *e;

	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node *next;
		if (unlikely(!wc->current_free))
			return NULL;
		e = wc->current_free;
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		next = rb_next(&e->rb_node);
		rb_erase(&e->rb_node, &wc->freetree);
		if (unlikely(!next))
			next = rb_first(&wc->freetree);
		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
	} else {
		if (unlikely(list_empty(&wc->freelist)))
			return NULL;
		e = container_of(wc->freelist.next, struct wc_entry, lru);
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		list_del(&e->lru);
	}
	wc->freelist_size--;

	writecache_verify_watermark(wc);

	return e;
}
static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_unlink(wc, e);
	writecache_add_to_freelist(wc, e);
	clear_seq_count(wc, e);
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (unlikely(waitqueue_active(&wc->freelist_wait)))
		wake_up(&wc->freelist_wait);
}
static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}
static void writecache_poison_lists(struct dm_writecache *wc)
{
	/*
	 * Catch incorrect access to these values while the device is suspended.
	 */
	memset(&wc->tree, -1, sizeof wc->tree);
	wc->lru.next = LIST_POISON1;
	wc->lru.prev = LIST_POISON2;
	wc->freelist.next = LIST_POISON1;
	wc->freelist.prev = LIST_POISON2;
}
static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (WC_MODE_PMEM(wc))
		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
}

static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
{
	return read_seq_count(wc, e) < wc->seq_count;
}
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	del_timer(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
		cond_resched();
	}
	writecache_commit_flushed(wc, true);

	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	if (WC_MODE_PMEM(wc))
		writecache_commit_flushed(wc, false);
	else
		ssd_commit_superblock(wc);

	wc->overwrote_committed = false;

	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
		cond_resched();
	}

	if (need_flush_after_free)
		writecache_commit_flushed(wc, false);
}
static void writecache_flush_work(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);

	wc_lock(wc);
	writecache_flush(wc);
	wc_unlock(wc);
}
static void writecache_autocommit_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
	if (!writecache_has_error(wc))
		queue_work(wc->writeback_wq, &wc->flush_work);
}

static void writecache_schedule_autocommit(struct dm_writecache *wc)
{
	if (!timer_pending(&wc->autocommit_timer))
		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
}
static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
{
	struct wc_entry *e;
	bool discarded_something = false;

	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
	if (unlikely(!e))
		return;

	while (read_original_sector(wc, e) < end) {
		struct rb_node *node = rb_next(&e->rb_node);

		if (likely(!e->write_in_progress)) {
			if (!discarded_something) {
				if (!WC_MODE_PMEM(wc)) {
					writecache_wait_for_ios(wc, READ);
					writecache_wait_for_ios(wc, WRITE);
				}
				discarded_something = true;
			}
			if (!writecache_entry_is_committed(wc, e))
				wc->uncommitted_blocks--;
			writecache_free_entry(wc, e);
		}

		if (unlikely(!node))
			break;

		e = container_of(node, struct wc_entry, rb_node);
	}

	if (discarded_something)
		writecache_commit_flushed(wc, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
{
	if (wc->writeback_size) {
		writecache_wait_on_freelist(wc);
		return true;
	}
	return false;
}
static void writecache_suspend(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	bool flush_on_suspend;

	del_timer_sync(&wc->autocommit_timer);
	del_timer_sync(&wc->max_age_timer);

	wc_lock(wc);
	writecache_flush(wc);
	flush_on_suspend = wc->flush_on_suspend;
	if (flush_on_suspend) {
		wc->flush_on_suspend = false;
		wc->writeback_all++;
		queue_work(wc->writeback_wq, &wc->writeback_work);
	}
	wc_unlock(wc);

	drain_workqueue(wc->writeback_wq);

	wc_lock(wc);
	if (flush_on_suspend)
		wc->writeback_all--;
	while (writecache_wait_for_writeback(wc));

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	writecache_poison_lists(wc);
	wc_unlock(wc);
}
static int writecache_alloc_entries(struct dm_writecache *wc)
{
	size_t b;

	if (wc->entries)
		return 0;
	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
	if (!wc->entries)
		return -ENOMEM;
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		e->index = b;
		e->write_in_progress = false;
		cond_resched();
	}

	return 0;
}
static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
{
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = wc->start_sector;
	region.count = n_sectors;
	req.bi_op = REQ_OP_READ;
	req.bi_op_flags = REQ_SYNC;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	return dm_io(&req, 1, &region, NULL);
}
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;
			writecache_error(wc, r, "unable to read metadata: %d", r);
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;
		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		if (!writecache_entry_is_committed(wc, e)) {
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}
static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	if (dm_suspended(wc->ti)) {
		wc_unlock(wc);
		return -EBUSY;
	}
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}

	writecache_flush(wc);
	wc->writeback_all++;
	queue_work(wc->writeback_wq, &wc->writeback_work);
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	wc->writeback_all--;
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}
	wc_unlock(wc);

	return 0;
}
static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	wc->flush_on_suspend = true;
	wc_unlock(wc);

	return 0;
}
static void activate_cleaner(struct dm_writecache *wc)
{
	wc->flush_on_suspend = true;
	wc->cleaner = true;
	wc->freelist_high_watermark = wc->n_blocks;
	wc->freelist_low_watermark = wc->n_blocks;
}
static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	activate_cleaner(wc);
	if (!dm_suspended(wc->ti))
		writecache_verify_watermark(wc);
	wc_unlock(wc);

	return 0;
}
static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
			      char *result, unsigned maxlen)
{
	int r = -EINVAL;
	struct dm_writecache *wc = ti->private;

	if (!strcasecmp(argv[0], "flush"))
		r = process_flush_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "flush_on_suspend"))
		r = process_flush_on_suspend_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "cleaner"))
		r = process_cleaner_mesg(argc, argv, wc);
	else
		DMERR("unrecognised message received: %s", argv[0]);

	return r;
}
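/*
 * The messages above are sent through the generic device-mapper message
 * interface; for example (the device name is a placeholder):
 *
 *	dmsetup message /dev/mapper/wc 0 flush
 *	dmsetup message /dev/mapper/wc 0 flush_on_suspend
 *	dmsetup message /dev/mapper/wc 0 cleaner
 *
 * "flush" returns only after the cached data has been committed and the
 * writeback triggered by it has finished (see process_flush_mesg() above).
 */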
static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
{
	/*
	 * clflushopt performs better with block size 1024, 2048, 4096
	 * non-temporal stores perform better with block size 512
	 *
	 * block size   512             1024            2048            4096
	 * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
	 * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
	 *
	 * We see that movnti performs better for 512-byte blocks, and
	 * clflushopt performs better for 1024-byte and larger blocks. So, we
	 * prefer clflushopt for sizes >= 768.
	 *
	 * NOTE: this happens to be the case now (with dm-writecache's single
	 * threaded model) but re-evaluate this once memcpy_flushcache() is
	 * enabled to use movdir64b which might invalidate this performance
	 * advantage seen with cache-allocating-writes plus flushing.
	 */
#ifdef CONFIG_X86
	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
	    likely(boot_cpu_data.x86_clflush_size == 64) &&
	    likely(size >= 768)) {
		do {
			memcpy((void *)dest, (void *)source, 64);
			clflushopt((void *)dest);
			dest += 64;
			source += 64;
			size -= 64;
		} while (size >= 64);
		return;
	}
#endif
	memcpy_flushcache(dest, source, size);
}
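/*
 * Rough arithmetic behind the fast path above (illustrative): for a 4096-byte
 * cache block the clflushopt loop runs 4096 / 64 = 64 iterations, each copying
 * one cache line and then flushing it, while a 512-byte block stays on the
 * plain memcpy_flushcache() path because 512 < 768.
 */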
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
	void *buf;
	unsigned long flags;
	unsigned size;
	int rw = bio_data_dir(bio);
	unsigned remaining_size = wc->block_size;

	do {
		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
		buf = bvec_kmap_irq(&bv, &flags);
		size = bv.bv_len;
		if (unlikely(size > remaining_size))
			size = remaining_size;

		if (rw == READ) {
			int r;
			r = copy_mc_to_kernel(buf, data, size);
			flush_dcache_page(bio_page(bio));
			if (unlikely(r)) {
				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
				bio->bi_status = BLK_STS_IOERR;
			}
		} else {
			flush_dcache_page(bio_page(bio));
			memcpy_flushcache_optimized(data, buf, size);
		}

		bvec_kunmap_irq(buf, &flags);

		data = (char *)data + size;
		remaining_size -= size;
		bio_advance(bio, size);
	} while (unlikely(remaining_size));
}
static int writecache_flush_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct bio *bio;

		wc_lock(wc);
		bio = bio_list_pop(&wc->flush_list);
		if (!bio) {
			set_current_state(TASK_INTERRUPTIBLE);
			wc_unlock(wc);

			if (unlikely(kthread_should_stop())) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}

		if (bio_op(bio) == REQ_OP_DISCARD) {
			writecache_discard(wc, bio->bi_iter.bi_sector,
					   bio_end_sector(bio));
			wc_unlock(wc);
			bio_set_dev(bio, wc->dev->bdev);
			submit_bio_noacct(bio);
		} else {
			writecache_flush(wc);
			wc_unlock(wc);
			if (writecache_has_error(wc))
				bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		}
	}

	return 0;
}
static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
{
	if (bio_list_empty(&wc->flush_list))
		wake_up_process(wc->flush_thread);
	bio_list_add(&wc->flush_list, bio);
}
static int writecache_map(struct dm_target *ti, struct bio *bio)
{
	struct wc_entry *e;
	struct dm_writecache *wc = ti->private;

	bio->bi_private = NULL;

	wc_lock(wc);

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		if (writecache_has_error(wc))
			goto unlock_error;
		if (WC_MODE_PMEM(wc)) {
			writecache_flush(wc);
			if (writecache_has_error(wc))
				goto unlock_error;
			goto unlock_submit;
		} else {
			writecache_offload_bio(wc, bio);
			goto unlock_return;
		}
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
				(wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		goto unlock_error;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		if (writecache_has_error(wc))
			goto unlock_error;
		if (WC_MODE_PMEM(wc)) {
			writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
			goto unlock_remap_origin;
		} else {
			writecache_offload_bio(wc, bio);
			goto unlock_return;
		}
	}

	if (bio_data_dir(bio) == READ) {
read_next_block:
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
			if (WC_MODE_PMEM(wc)) {
				bio_copy_block(wc, bio, memory_data(wc, e));
				if (bio->bi_iter.bi_size)
					goto read_next_block;
				goto unlock_submit;
			} else {
				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
				bio_set_dev(bio, wc->ssd_dev->bdev);
				bio->bi_iter.bi_sector = cache_sector(wc, e);
				if (!writecache_entry_is_committed(wc, e))
					writecache_wait_for_ios(wc, WRITE);
				goto unlock_remap;
			}
		} else {
			if (e) {
				sector_t next_boundary =
					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
					dm_accept_partial_bio(bio, next_boundary);
				}
			}
			goto unlock_remap_origin;
		}
	} else {
		do {
			bool found_entry = false;
			if (writecache_has_error(wc))
				goto unlock_error;
			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
			if (e) {
				if (!writecache_entry_is_committed(wc, e))
					goto bio_copy;
				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
					wc->overwrote_committed = true;
					goto bio_copy;
				}
				found_entry = true;
			} else {
				if (unlikely(wc->cleaner))
					goto direct_write;
			}
			e = writecache_pop_from_freelist(wc, (sector_t)-1);
			if (unlikely(!e)) {
				if (!found_entry) {
direct_write:
					e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
					if (e) {
						sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
						BUG_ON(!next_boundary);
						if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
							dm_accept_partial_bio(bio, next_boundary);
						}
					}
					goto unlock_remap_origin;
				}
				writecache_wait_on_freelist(wc);
				continue;
			}
			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
			writecache_insert_entry(wc, e);
			wc->uncommitted_blocks++;
bio_copy:
			if (WC_MODE_PMEM(wc)) {
				bio_copy_block(wc, bio, memory_data(wc, e));
			} else {
				unsigned bio_size = wc->block_size;
				sector_t start_cache_sec = cache_sector(wc, e);
				sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);

				while (bio_size < bio->bi_iter.bi_size) {
					struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
					if (!f)
						break;
					write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
									(bio_size >> SECTOR_SHIFT), wc->seq_count);
					writecache_insert_entry(wc, f);
					wc->uncommitted_blocks++;
					bio_size += wc->block_size;
					current_cache_sec += wc->block_size >> SECTOR_SHIFT;
				}

				bio_set_dev(bio, wc->ssd_dev->bdev);
				bio->bi_iter.bi_sector = start_cache_sec;
				dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);

				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
					wc->uncommitted_blocks = 0;
					queue_work(wc->writeback_wq, &wc->flush_work);
				} else {
					writecache_schedule_autocommit(wc);
				}
				goto unlock_remap;
			}
		} while (bio->bi_iter.bi_size);

		if (unlikely(bio->bi_opf & REQ_FUA ||
			     wc->uncommitted_blocks >= wc->autocommit_blocks))
			writecache_flush(wc);
		else
			writecache_schedule_autocommit(wc);
		goto unlock_submit;
	}

unlock_remap_origin:
	bio_set_dev(bio, wc->dev->bdev);
	wc_unlock(wc);
	return DM_MAPIO_REMAPPED;

unlock_remap:
	/* make sure that writecache_end_io decrements bio_in_progress: */
	bio->bi_private = (void *)1;
	atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
	wc_unlock(wc);
	return DM_MAPIO_REMAPPED;

unlock_submit:
	wc_unlock(wc);
	bio_endio(bio);
	return DM_MAPIO_SUBMITTED;

unlock_return:
	wc_unlock(wc);
	return DM_MAPIO_SUBMITTED;

unlock_error:
	wc_unlock(wc);
	bio_io_error(bio);
	return DM_MAPIO_SUBMITTED;
}
static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct dm_writecache *wc = ti->private;

	if (bio->bi_private != NULL) {
		int dir = bio_data_dir(bio);
		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
				wake_up(&wc->bio_in_progress_wait[dir]);
	}
	return 0;
}
static int writecache_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	struct dm_writecache *wc = ti->private;

	return fn(ti, wc->dev, 0, ti->len, data);
}
static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dm_writecache *wc = ti->private;

	if (limits->logical_block_size < wc->block_size)
		limits->logical_block_size = wc->block_size;

	if (limits->physical_block_size < wc->block_size)
		limits->physical_block_size = wc->block_size;

	if (limits->io_min < wc->block_size)
		limits->io_min = wc->block_size;
}
static void writecache_writeback_endio(struct bio *bio)
{
	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
	struct dm_writecache *wc = wb->wc;
	unsigned long flags;

	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&wb->endio_entry, &wc->endio_list);
	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
}
static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
{
	struct copy_struct *c = ptr;
	struct dm_writecache *wc = c->wc;

	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;

	raw_spin_lock_irq(&wc->endio_list_lock);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&c->endio_entry, &wc->endio_list);
	raw_spin_unlock_irq(&wc->endio_list_lock);
}
static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
	unsigned i;
	struct writeback_struct *wb;
	struct wc_entry *e;
	unsigned long n_walked = 0;

	do {
		wb = list_entry(list->next, struct writeback_struct, endio_entry);
		list_del(&wb->endio_entry);

		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
					"write error %d", wb->bio.bi_status);
		i = 0;
		do {
			e = wb->wc_list[i];
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);
			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			n_walked++;
			if (unlikely(n_walked >= ENDIO_LATENCY)) {
				writecache_commit_flushed(wc, false);
				wc_unlock(wc);
				wc_lock(wc);
				n_walked = 0;
			}
		} while (++i < wb->wc_list_n);

		if (wb->wc_list != wb->wc_list_inline)
			kfree(wb->wc_list);
		bio_put(&wb->bio);
	} while (!list_empty(list));
}
static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
{
	struct copy_struct *c;
	struct wc_entry *e;

	do {
		c = list_entry(list->next, struct copy_struct, endio_entry);
		list_del(&c->endio_entry);

		if (unlikely(c->error))
			writecache_error(wc, c->error, "copy error");

		e = c->e;
		do {
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);

			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			e++;
		} while (--c->n_entries);
		mempool_free(c, &wc->copy_pool);
	} while (!list_empty(list));
}
static int writecache_endio_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct list_head list;

		raw_spin_lock_irq(&wc->endio_list_lock);
		if (!list_empty(&wc->endio_list))
			goto pop_from_list;
		set_current_state(TASK_INTERRUPTIBLE);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (unlikely(kthread_should_stop())) {
			set_current_state(TASK_RUNNING);
			break;
		}

		schedule();

		continue;

pop_from_list:
		list = wc->endio_list;
		list.next->prev = list.prev->next = &list;
		INIT_LIST_HEAD(&wc->endio_list);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (!WC_MODE_FUA(wc))
			writecache_disk_flush(wc, wc->dev);

		wc_lock(wc);

		if (WC_MODE_PMEM(wc)) {
			__writecache_endio_pmem(wc, &list);
		} else {
			__writecache_endio_ssd(wc, &list);
			writecache_wait_for_ios(wc, READ);
		}

		writecache_commit_flushed(wc, false);

		wc_unlock(wc);
	}

	return 0;
}
static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
{
	struct dm_writecache *wc = wb->wc;
	unsigned block_size = wc->block_size;
	void *address = memory_data(wc, e);

	persistent_memory_flush_cache(address, block_size);
	return bio_add_page(&wb->bio, persistent_memory_page(address),
			    block_size, persistent_memory_page_offset(address)) != 0;
}
struct writeback_list {
	struct list_head list;
	size_t size;
};

static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
{
	if (unlikely(wc->max_writeback_jobs)) {
		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
			wc_lock(wc);
			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
				writecache_wait_on_freelist(wc);
			wc_unlock(wc);
		}
	}
	cond_resched();
}
static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct bio *bio;
	struct writeback_struct *wb;
	unsigned max_pages;

	while (wbl->size) {
		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		max_pages = e->wc_list_contiguous;

		bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
		wb = container_of(bio, struct writeback_struct, bio);
		wb->wc = wc;
		bio->bi_end_io = writecache_writeback_endio;
		bio_set_dev(bio, wc->dev->bdev);
		bio->bi_iter.bi_sector = read_original_sector(wc, e);
		if (max_pages <= WB_LIST_INLINE ||
		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
							   GFP_NOIO | __GFP_NORETRY |
							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
			wb->wc_list = wb->wc_list_inline;
			max_pages = WB_LIST_INLINE;
		}

		BUG_ON(!wc_add_block(wb, e, GFP_NOIO));

		wb->wc_list[0] = e;
		wb->wc_list_n = 1;

		while (wbl->size && wb->wc_list_n < max_pages) {
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
				break;
			wbl->size--;
			list_del(&f->lru);
			wb->wc_list[wb->wc_list_n++] = f;
			e = f;
		}
		bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
		if (writecache_has_error(wc)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		} else {
			submit_bio(bio);
		}

		__writeback_throttle(wc, wbl);
	}
}
static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct dm_io_region from, to;
	struct copy_struct *c;

	while (wbl->size) {
		unsigned n_sectors;

		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);

		from.bdev = wc->ssd_dev->bdev;
		from.sector = cache_sector(wc, e);
		from.count = n_sectors;
		to.bdev = wc->dev->bdev;
		to.sector = read_original_sector(wc, e);
		to.count = n_sectors;

		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
		c->wc = wc;
		c->e = e;
		c->n_entries = e->wc_list_contiguous;

		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
			wbl->size--;
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			BUG_ON(f != e + 1);
			list_del(&f->lru);
			e = f;
		}

		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);

		__writeback_throttle(wc, wbl);
	}
}
static void writecache_writeback(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
	struct blk_plug plug;
	struct wc_entry *f, *g, *e = NULL;
	struct rb_node *node, *next_node;
	struct list_head skipped;
	struct writeback_list wbl;
	unsigned long n_walked;

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed) {
		writecache_wait_for_ios(wc, WRITE);
	}

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {

		n_walked++;
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
			queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		if (unlikely(wc->writeback_all)) {
			if (unlikely(!e)) {
				writecache_flush(wc);
				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
			} else
				e = g;
		} else
			e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e))) {
			writecache_flush(wc);
		}
		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_del(&e->lru);
				list_add(&e->lru, &skipped);
				cond_resched();
				continue;
			}
		}
		wc->writeback_size++;
		list_del(&e->lru);
		list_add(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, g) ==
			    read_original_sector(wc, f))) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_del(&g->lru);
			list_add(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_PAGES;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
				if (unlikely(wc->writeback_all)) {
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc));
		wc_unlock(wc);
	}
}
static int calculate_memory_size(uint64_t device_size, unsigned block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}
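/*
 * Worked example (illustrative sizes only): with block_size = 4096 and a
 * 1 GiB cache device, sizeof(struct wc_memory_entry) is 16 bytes, so the
 * first estimate is n_blocks = 1073741824 / (4096 + 16), roughly 261000
 * blocks; the loop above then shrinks n_blocks until the block-aligned
 * entries[] array plus n_blocks data blocks fit inside device_size.
 */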
static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	for (b = 0; b < wc->n_blocks; b++) {
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
		cond_resched();
	}

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc, false);
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
	writecache_commit_flushed(wc, false);

	return 0;
}
static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	if (wc->entries)
		vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	if (wc->dirty_bitmap)
		vfree(wc->dirty_bitmap);

	kfree(wc);
}
static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 16, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;

	mutex_init(&wc->lock);
	wc->max_age = MAX_AGE_UNSPECIFIED;
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);

	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}
	wake_up_process(wc->endio_thread);

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
		 */
		r = -EOPNOTSUPP;
		ti->error = "Persistent memory or DAX not supported on this system";
		goto bad;
#endif
	} else {
		goto bad_arguments;
	}

	if (WC_MODE_PMEM(wc)) {
		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
				offsetof(struct writeback_struct, bio),
				BIOSET_NEED_BVECS);
		if (r) {
			ti->error = "Could not allocate bio set";
			goto bad;
		}
	} else {
		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
		if (r) {
			ti->error = "Could not allocate mempool";
			goto bad;
		}
	}

	/*
	 * Parse the origin data device
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
	if (r) {
		ti->error = "Origin data device lookup failed";
		goto bad;
	}

	/*
	 * Parse cache data device (be it pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
	if (r) {
		ti->error = "Cache data device lookup failed";
		goto bad;
	}
	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);

	/*
	 * Parse the cache block size
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
	    (wc->block_size & (wc->block_size - 1))) {
		r = -EINVAL;
		ti->error = "Invalid block size";
		goto bad;
	}
	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
		r = -EINVAL;
		ti->error = "Block size is smaller than device logical block size";
		goto bad;
	}
	wc->block_size_bits = __ffs(wc->block_size);

	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned autocommit_msecs;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			if (autocommit_msecs > 3600000)
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
			unsigned max_age_msecs;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
				goto invalid_optional;
			if (max_age_msecs > 86400000)
				goto invalid_optional;
			wc->max_age = msecs_to_jiffies(max_age_msecs);
		} else if (!strcasecmp(string, "cleaner")) {
			wc->cleaner = true;
		} else if (!strcasecmp(string, "fua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
			r = -EOPNOTSUPP;
			ti->error = "Asynchronous persistent memory not supported as pmem cache";
			goto bad;
		}

		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}
		wake_up_process(wc->flush_thread);

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
		/* this is limitation of test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
		if (r) {
			ti->error = "Unable to read first block of metadata";
			goto bad;
		}
	}

	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = copy_mc_to_kernel(&s, sb(wc),
				      sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	if (wc->cleaner)
		activate_cleaner(wc);

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}
static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result, unsigned maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned extra_args;
	unsigned sz = 0;
	uint64_t x;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
				wc->dev->name, wc->ssd_dev->name, wc->block_size);
		extra_args = 0;
		if (wc->start_sector)
			extra_args += 2;
		if (wc->high_wm_percent_set && !wc->cleaner)
			extra_args += 2;
		if (wc->low_wm_percent_set && !wc->cleaner)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->max_age != MAX_AGE_UNSPECIFIED)
			extra_args += 2;
		if (wc->cleaner)
			extra_args++;
		if (wc->writeback_fua_set)
			extra_args++;

		DMEMIT("%u", extra_args);
		if (wc->start_sector)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set && !wc->cleaner) {
			x = (uint64_t)wc->freelist_high_watermark * 100;
			x += wc->n_blocks / 2;
			do_div(x, (size_t)wc->n_blocks);
			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
		}
		if (wc->low_wm_percent_set && !wc->cleaner) {
			x = (uint64_t)wc->freelist_low_watermark * 100;
			x += wc->n_blocks / 2;
			do_div(x, (size_t)wc->n_blocks);
			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
		}
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
		if (wc->max_age != MAX_AGE_UNSPECIFIED)
			DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age));
		if (wc->cleaner)
			DMEMIT(" cleaner");
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		break;
	}
}
static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 3, 0},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};

static int __init dm_writecache_init(void)
{
	int r;

	r = dm_register_target(&writecache_target);
	if (r < 0) {
		DMERR("register failed %d", r);
		return r;
	}

	return 0;
}

static void __exit dm_writecache_exit(void)
{
	dm_unregister_target(&writecache_target);
}

module_init(dm_writecache_init);
module_exit(dm_writecache_exit);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");