// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
#include <linux/delay.h>
#include "dm-io-tracker.h"

#define DM_MSG_PREFIX "writecache"

#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
#define MAX_AGE_DIV			16
#define MAX_AGE_UNSPECIFIED		-1UL
#define PAUSE_WRITEBACK			(HZ * 3)

#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif

#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1
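/*
 * Note on pmem_assign(): when persistent-memory support is compiled in,
 * assignments to on-media metadata go through memcpy_flushcache() so the
 * store is pushed out of the CPU cache; otherwise it degrades to a plain
 * assignment, because the metadata only lives in ordinary kernel memory
 * and is written back to the SSD explicitly.
 */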
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[8];
	};
	struct wc_memory_entry entries[];
};

struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
#if BITS_PER_LONG == 64
	bool write_in_progress : 1;
	unsigned long index : 47;
#else
	bool write_in_progress;
	unsigned long index;
#endif
	unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};

#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)	((wc)->pmem_mode)
#define WC_MODE_FUA(wc)		((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)	false
#define WC_MODE_FUA(wc)		false
#endif
#define WC_MODE_SORT_FREELIST(wc)	(!WC_MODE_PMEM(wc))
struct dm_writecache {
	struct mutex lock;
	struct list_head lru;
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;
	unsigned long max_age;
	unsigned long pause;

	unsigned int uncommitted_blocks;
	unsigned int autocommit_blocks;
	unsigned int max_writeback_jobs;

	int error;

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	struct timer_list max_age_timer;

	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	sector_t data_device_sectors;
	void *block_start;
	struct wc_entry *entries;
	unsigned int block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;

	bool start_sector_set:1;
	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool max_age_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;
	bool cleaner:1;
	bool cleaner_set:1;
	bool metadata_only:1;
	bool pause_set:1;

	unsigned int high_wm_percent_value;
	unsigned int low_wm_percent_value;
	unsigned int autocommit_time_value;
	unsigned int max_age_value;
	unsigned int pause_value;

	unsigned int writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_tracker iot;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;
	unsigned int dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;

	struct {
		unsigned long long reads;
		unsigned long long read_hits;
		unsigned long long writes;
		unsigned long long write_hits_uncommitted;
		unsigned long long write_hits_committed;
		unsigned long long writes_around;
		unsigned long long writes_allocate;
		unsigned long long writes_blocked_on_freelist;
		unsigned long long flushes;
		unsigned long long discards;
	} stats;
};
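/*
 * Overview of the main state above: "tree" indexes cached blocks by their
 * original (origin-device) sector, "lru" orders them for writeback, and
 * free cache blocks sit either on "freelist" (pmem mode) or on the sorted
 * "freetree" (SSD mode, so sequential cache sectors can be handed out).
 * The watermarks and the stats block feed the writeback and status paths.
 */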
#define WB_LIST_INLINE		16

struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned int wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned int n_entries;
	int error;
};

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
					    "A percentage of time allocated for data copying");
static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}

static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}
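/*
 * Locking model: almost all target state is protected by the single
 * wc->lock mutex taken via wc_lock()/wc_unlock(); only the completion
 * lists handed over from interrupt context use the separate
 * endio_list_lock raw spinlock.
 */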
#ifdef DM_WRITECACHE_HAS_PMEM
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	pfn_t pfn;
	int id;
	struct page **pages;
	sector_t offset;

	wc->memory_vmapped = false;

	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	offset = get_start_sect(wc->ssd_dev->bdev);
	if (offset & (PAGE_SIZE / 512 - 1)) {
		r = -EINVAL;
		goto err1;
	}
	offset >>= PAGE_SHIFT - 9;

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
			       &wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_t_has_page(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		long i;

		wc->memory_map = NULL;
		pages = vmalloc_array(p, sizeof(struct page *));
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;

			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
						p - i, DAX_ACCESS, NULL, &pfn);
			if (daa <= 0) {
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_t_has_page(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			while (daa-- && i < p) {
				pages[i++] = pfn_t_to_page(pfn);
				pfn.val++;
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		vfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	vfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	return -EOPNOTSUPP;
}
#endif
static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}

static struct page *persistent_memory_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static unsigned int persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}

static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}

static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}
static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}

static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	return &sb(wc)->entries[e->index];
}

static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
{
	return (char *)wc->block_start + (e->index << wc->block_size_bits);
}

static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}
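/*
 * Cache-device layout implied by the helpers above: after the optional
 * start_sector offset comes the superblock plus the array of
 * wc_memory_entry metadata (metadata_sectors long), followed by the data
 * blocks; memory_data()/cache_sector() translate an entry index into the
 * in-memory and on-disk location of its data block respectively.
 */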
static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}

static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}

static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}

static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}
#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
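/*
 * Error handling is "sticky": the cmpxchg above records only the first
 * error code, waiters on the freelist are woken so they can bail out,
 * and writecache_has_error() is consulted on the I/O paths to fail
 * subsequent requests quickly.
 */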
static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned int i = 0;

	while (1) {
		unsigned int j;

		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
		i = j;
	}

	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	if (wait_for_ios)
		writecache_wait_for_ios(wc, WRITE);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
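/*
 * SSD-mode metadata commit: the dirty bitmap marks which
 * BITMAP_GRANULARITY-sized regions of the in-memory metadata copy have
 * changed, ssd_commit_flushed() writes only those ranges back to the
 * cache device via dm-io, and then issues a disk flush before clearing
 * the bitmap.
 */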
static void ssd_commit_superblock(struct dm_writecache *wc)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = 0;
	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;

	if (unlikely(region.sector + region.count > wc->metadata_sectors))
		region.count = wc->metadata_sectors - region.sector;

	region.sector += wc->start_sector;

	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;
	req.notify.context = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error writing superblock");
}
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	if (WC_MODE_PMEM(wc))
		pmem_wmb();
	else
		ssd_commit_flushed(wc, wait_for_ios);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}
#define WFE_RETURN_FOLLOWING	1
#define WFE_LOWEST_SEQ		2

static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					       uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;

		node = (read_original_sector(wc, e) >= block ?
			e->rb_node.rb_left : e->rb_node.rb_right);
		if (unlikely(!node)) {
			if (!(flags & WFE_RETURN_FOLLOWING))
				return NULL;
			if (read_original_sector(wc, e) >= block)
				return e;

			node = rb_next(&e->rb_node);
			if (unlikely(!node))
				return NULL;

			e = container_of(node, struct wc_entry, rb_node);
			return e;
		}
	}

	while (1) {
		struct wc_entry *e2;

		if (flags & WFE_LOWEST_SEQ)
			node = rb_prev(&e->rb_node);
		else
			node = rb_next(&e->rb_node);
		if (unlikely(!node))
			return e;
		e2 = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e2) != block)
			return e;
		e = e2;
	}
}
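/*
 * Lookup semantics: WFE_RETURN_FOLLOWING makes the search return the next
 * entry after "block" when there is no exact match (used for discard and
 * for trimming a bio at the next cached boundary); WFE_LOWEST_SEQ walks
 * duplicates towards the entry with the lowest sequence count instead of
 * the highest.
 */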
static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
{
	struct wc_entry *e;
	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;

	while (*node) {
		e = container_of(*node, struct wc_entry, rb_node);
		parent = &e->rb_node;
		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}
	rb_link_node(&ins->rb_node, parent, node);
	rb_insert_color(&ins->rb_node, &wc->tree);
	list_add(&ins->lru, &wc->lru);
	ins->age = jiffies;
}

static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
{
	list_del(&e->lru);
	rb_erase(&e->rb_node, &wc->tree);
}
static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
{
	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;

		if (unlikely(!*node))
			wc->current_free = e;
		while (*node) {
			parent = *node;
			if (&e->rb_node < *node)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;
		}
		rb_link_node(&e->rb_node, parent, node);
		rb_insert_color(&e->rb_node, &wc->freetree);
	} else {
		list_add_tail(&e->lru, &wc->freelist);
	}
	wc->freelist_size++;
}
static inline void writecache_verify_watermark(struct dm_writecache *wc)
{
	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
		queue_work(wc->writeback_wq, &wc->writeback_work);
}

static void writecache_max_age_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);

	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
		queue_work(wc->writeback_wq, &wc->writeback_work);
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
	}
}
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
	struct wc_entry *e;

	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node *next;

		if (unlikely(!wc->current_free))
			return NULL;
		e = wc->current_free;
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		next = rb_next(&e->rb_node);
		rb_erase(&e->rb_node, &wc->freetree);
		if (unlikely(!next))
			next = rb_first(&wc->freetree);
		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
	} else {
		if (unlikely(list_empty(&wc->freelist)))
			return NULL;
		e = container_of(wc->freelist.next, struct wc_entry, lru);
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		list_del(&e->lru);
	}
	wc->freelist_size--;

	writecache_verify_watermark(wc);

	return e;
}
static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_unlink(wc, e);
	writecache_add_to_freelist(wc, e);
	clear_seq_count(wc, e);
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (unlikely(waitqueue_active(&wc->freelist_wait)))
		wake_up(&wc->freelist_wait);
}

static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}
static void writecache_poison_lists(struct dm_writecache *wc)
{
	/*
	 * Catch incorrect access to these values while the device is suspended.
	 */
	memset(&wc->tree, -1, sizeof(wc->tree));
	wc->lru.next = LIST_POISON1;
	wc->lru.prev = LIST_POISON2;
	wc->freelist.next = LIST_POISON1;
	wc->freelist.prev = LIST_POISON2;
}
static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (WC_MODE_PMEM(wc))
		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
}

static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
{
	return read_seq_count(wc, e) < wc->seq_count;
}
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	del_timer(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
		cond_resched();
	}
	writecache_commit_flushed(wc, true);

	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	if (WC_MODE_PMEM(wc))
		writecache_commit_flushed(wc, false);
	else
		ssd_commit_superblock(wc);

	wc->overwrote_committed = false;

	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
		cond_resched();
	}

	if (need_flush_after_free)
		writecache_commit_flushed(wc, false);
}
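/*
 * Commit sequence above: flush the metadata for all uncommitted entries,
 * commit them, bump the superblock seq_count (which is what makes the
 * entries "committed" per writecache_entry_is_committed()), and finally
 * free any older duplicate entries that the newly committed ones
 * supersede.
 */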
static void writecache_flush_work(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);

	wc_lock(wc);
	writecache_flush(wc);
	wc_unlock(wc);
}

static void writecache_autocommit_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);

	if (!writecache_has_error(wc))
		queue_work(wc->writeback_wq, &wc->flush_work);
}

static void writecache_schedule_autocommit(struct dm_writecache *wc)
{
	if (!timer_pending(&wc->autocommit_timer))
		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
}
static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
{
	struct wc_entry *e;
	bool discarded_something = false;

	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
	if (unlikely(!e))
		return;

	while (read_original_sector(wc, e) < end) {
		struct rb_node *node = rb_next(&e->rb_node);

		if (likely(!e->write_in_progress)) {
			if (!discarded_something) {
				if (!WC_MODE_PMEM(wc)) {
					writecache_wait_for_ios(wc, READ);
					writecache_wait_for_ios(wc, WRITE);
				}
				discarded_something = true;
			}
			if (!writecache_entry_is_committed(wc, e))
				wc->uncommitted_blocks--;
			writecache_free_entry(wc, e);
		}

		if (unlikely(!node))
			break;

		e = container_of(node, struct wc_entry, rb_node);
	}

	if (discarded_something)
		writecache_commit_flushed(wc, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
{
	if (wc->writeback_size) {
		writecache_wait_on_freelist(wc);
		return true;
	}
	return false;
}
static void writecache_suspend(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	bool flush_on_suspend;

	del_timer_sync(&wc->autocommit_timer);
	del_timer_sync(&wc->max_age_timer);

	wc_lock(wc);
	writecache_flush(wc);
	flush_on_suspend = wc->flush_on_suspend;
	if (flush_on_suspend) {
		wc->flush_on_suspend = false;
		wc->writeback_all++;
		queue_work(wc->writeback_wq, &wc->writeback_work);
	}
	wc_unlock(wc);

	drain_workqueue(wc->writeback_wq);

	wc_lock(wc);
	if (flush_on_suspend)
		wc->writeback_all--;
	while (writecache_wait_for_writeback(wc))
		;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	writecache_poison_lists(wc);

	wc_unlock(wc);
}
static int writecache_alloc_entries(struct dm_writecache *wc)
{
	size_t b;

	if (wc->entries)
		return 0;
	wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
	if (!wc->entries)
		return -ENOMEM;
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		e->index = b;
		e->write_in_progress = false;
		cond_resched();
	}

	return 0;
}
static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
{
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = wc->start_sector;
	region.count = n_sectors;
	req.bi_opf = REQ_OP_READ | REQ_SYNC;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
}
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;

			writecache_error(wc, r, "unable to read metadata: %d", r);
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;

		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		if (!writecache_entry_is_committed(wc, e)) {
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}
static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	if (dm_suspended(wc->ti)) {
		wc_unlock(wc);
		return -EBUSY;
	}
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}

	writecache_flush(wc);
	wc->writeback_all++;
	queue_work(wc->writeback_wq, &wc->writeback_work);
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	wc->writeback_all--;
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}
	wc_unlock(wc);

	return 0;
}

static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	wc->flush_on_suspend = true;
	wc_unlock(wc);

	return 0;
}

static void activate_cleaner(struct dm_writecache *wc)
{
	wc->flush_on_suspend = true;
	wc->cleaner = true;
	wc->freelist_high_watermark = wc->n_blocks;
	wc->freelist_low_watermark = wc->n_blocks;
}

static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	activate_cleaner(wc);
	if (!dm_suspended(wc->ti))
		writecache_verify_watermark(wc);
	wc_unlock(wc);

	return 0;
}

static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	memset(&wc->stats, 0, sizeof(wc->stats));
	wc_unlock(wc);

	return 0;
}
static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
			      char *result, unsigned int maxlen)
{
	int r = -EINVAL;
	struct dm_writecache *wc = ti->private;

	if (!strcasecmp(argv[0], "flush"))
		r = process_flush_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "flush_on_suspend"))
		r = process_flush_on_suspend_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "cleaner"))
		r = process_cleaner_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "clear_stats"))
		r = process_clear_stats_mesg(argc, argv, wc);
	else
		DMERR("unrecognised message received: %s", argv[0]);

	return r;
}
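/*
 * Illustrative usage (device name is a placeholder, not taken from this
 * file): messages are delivered through the device-mapper message
 * interface, e.g.
 *	dmsetup message <mapped-device> 0 flush
 *	dmsetup message <mapped-device> 0 cleaner
 */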
static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
{
	/*
	 * clflushopt performs better with block size 1024, 2048, 4096
	 * non-temporal stores perform better with block size 512
	 *
	 * block size	512		1024		2048		4096
	 * movnti	496 MB/s	642 MB/s	725 MB/s	744 MB/s
	 * clflushopt	373 MB/s	688 MB/s	1.1 GB/s	1.2 GB/s
	 *
	 * We see that movnti performs better for 512-byte blocks, and
	 * clflushopt performs better for 1024-byte and larger blocks. So, we
	 * prefer clflushopt for sizes >= 768.
	 *
	 * NOTE: this happens to be the case now (with dm-writecache's single
	 * threaded model) but re-evaluate this once memcpy_flushcache() is
	 * enabled to use movdir64b which might invalidate this performance
	 * advantage seen with cache-allocating-writes plus flushing.
	 */
#ifdef CONFIG_X86
	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
	    likely(boot_cpu_data.x86_clflush_size == 64) &&
	    likely(size >= 768)) {
		do {
			memcpy((void *)dest, (void *)source, 64);
			clflushopt((void *)dest);
			dest += 64;
			source += 64;
			size -= 64;
		} while (size >= 64);
		return;
	}
#endif
	memcpy_flushcache(dest, source, size);
}
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
	void *buf;
	unsigned int size;
	int rw = bio_data_dir(bio);
	unsigned int remaining_size = wc->block_size;

	do {
		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);

		buf = bvec_kmap_local(&bv);
		size = bv.bv_len;
		if (unlikely(size > remaining_size))
			size = remaining_size;

		if (rw == READ) {
			int r;

			r = copy_mc_to_kernel(buf, data, size);
			flush_dcache_page(bio_page(bio));
			if (unlikely(r)) {
				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
				bio->bi_status = BLK_STS_IOERR;
			}
		} else {
			flush_dcache_page(bio_page(bio));
			memcpy_flushcache_optimized(data, buf, size);
		}

		kunmap_local(buf);

		data = (char *)data + size;
		remaining_size -= size;
		bio_advance(bio, size);
	} while (unlikely(remaining_size));
}
static int writecache_flush_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct bio *bio;

		wc_lock(wc);
		bio = bio_list_pop(&wc->flush_list);
		if (!bio) {
			set_current_state(TASK_INTERRUPTIBLE);
			wc_unlock(wc);

			if (unlikely(kthread_should_stop())) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}

		if (bio_op(bio) == REQ_OP_DISCARD) {
			writecache_discard(wc, bio->bi_iter.bi_sector,
					   bio_end_sector(bio));
			wc_unlock(wc);
			bio_set_dev(bio, wc->dev->bdev);
			submit_bio_noacct(bio);
		} else {
			writecache_flush(wc);
			wc_unlock(wc);
			if (writecache_has_error(wc))
				bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		}
	}

	return 0;
}
static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
{
	if (bio_list_empty(&wc->flush_list))
		wake_up_process(wc->flush_thread);
	bio_list_add(&wc->flush_list, bio);
}
enum wc_map_op {
	WC_MAP_SUBMIT,
	WC_MAP_REMAP,
	WC_MAP_REMAP_ORIGIN,
	WC_MAP_RETURN,
	WC_MAP_ERROR,
};

static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
					struct wc_entry *e)
{
	if (e) {
		sector_t next_boundary =
			read_original_sector(wc, e) - bio->bi_iter.bi_sector;
		if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
			dm_accept_partial_bio(bio, next_boundary);
	}
}
static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
{
	enum wc_map_op map_op;
	struct wc_entry *e;

read_next_block:
	wc->stats.reads++;
	e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
	if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
		wc->stats.read_hits++;
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			if (bio->bi_iter.bi_size)
				goto read_next_block;
			map_op = WC_MAP_SUBMIT;
		} else {
			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
			bio_set_dev(bio, wc->ssd_dev->bdev);
			bio->bi_iter.bi_sector = cache_sector(wc, e);
			if (!writecache_entry_is_committed(wc, e))
				writecache_wait_for_ios(wc, WRITE);
			map_op = WC_MAP_REMAP;
		}
	} else {
		writecache_map_remap_origin(wc, bio, e);
		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
		map_op = WC_MAP_REMAP_ORIGIN;
	}

	return map_op;
}
static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
				    struct wc_entry *e, bool search_used)
{
	unsigned int bio_size = wc->block_size;
	sector_t start_cache_sec = cache_sector(wc, e);
	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);

	while (bio_size < bio->bi_iter.bi_size) {
		if (!search_used) {
			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);

			if (!f)
				break;
			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
							(bio_size >> SECTOR_SHIFT), wc->seq_count);
			writecache_insert_entry(wc, f);
			wc->uncommitted_blocks++;
		} else {
			struct wc_entry *f;
			struct rb_node *next = rb_next(&e->rb_node);

			if (!next)
				break;
			f = container_of(next, struct wc_entry, rb_node);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(f->write_in_progress))
				break;
			if (writecache_entry_is_committed(wc, f))
				wc->overwrote_committed = true;
			e = f;
		}
		bio_size += wc->block_size;
		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
	}

	bio_set_dev(bio, wc->ssd_dev->bdev);
	bio->bi_iter.bi_sector = start_cache_sec;
	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);

	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;

	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
		wc->uncommitted_blocks = 0;
		queue_work(wc->writeback_wq, &wc->flush_work);
	} else {
		writecache_schedule_autocommit(wc);
	}
}
static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		bool found_entry = false;
		bool search_used = false;

		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			if (!writecache_entry_is_committed(wc, e)) {
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			continue;
		}
		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
		writecache_insert_entry(wc, e);
		wc->uncommitted_blocks++;
		wc->stats.writes_allocate++;
bio_copy:
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			wc->stats.writes++;
		} else {
			writecache_bio_copy_ssd(wc, bio, e, search_used);
			return WC_MAP_REMAP;
		}
	} while (bio->bi_iter.bi_size);

	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);
	else
		writecache_schedule_autocommit(wc);

	return WC_MAP_SUBMIT;
}
static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
{
	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		wc->stats.flushes++;
		writecache_flush(wc);
		if (writecache_has_error(wc))
			return WC_MAP_ERROR;
		else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
			return WC_MAP_REMAP_ORIGIN;
		return WC_MAP_SUBMIT;
	}
	/* SSD: */
	if (dm_bio_get_target_bio_nr(bio))
		return WC_MAP_REMAP_ORIGIN;
	wc->stats.flushes++;
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}
static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
{
	wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;

	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
		return WC_MAP_REMAP_ORIGIN;
	}
	/* SSD: */
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}
static int writecache_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_writecache *wc = ti->private;
	enum wc_map_op map_op;

	bio->bi_private = NULL;

	wc_lock(wc);

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		map_op = writecache_map_flush(wc, bio);
		goto done;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
				(wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		map_op = WC_MAP_ERROR;
		goto done;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		map_op = writecache_map_discard(wc, bio);
		goto done;
	}

	if (bio_data_dir(bio) == READ)
		map_op = writecache_map_read(wc, bio);
	else
		map_op = writecache_map_write(wc, bio);
done:
	switch (map_op) {
	case WC_MAP_REMAP_ORIGIN:
		if (likely(wc->pause != 0)) {
			if (bio_op(bio) == REQ_OP_WRITE) {
				dm_iot_io_begin(&wc->iot, 1);
				bio->bi_private = (void *)2;
			}
		}
		bio_set_dev(bio, wc->dev->bdev);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_REMAP:
		/* make sure that writecache_end_io decrements bio_in_progress: */
		bio->bi_private = (void *)1;
		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_SUBMIT:
		wc_unlock(wc);
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_RETURN:
		wc_unlock(wc);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_ERROR:
		wc_unlock(wc);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;

	default:
		BUG();
		wc_unlock(wc);
		return DM_MAPIO_KILL;
	}
}
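/*
 * writecache_map() translates the per-request wc_map_op into device-mapper
 * return codes: REMAP/REMAP_ORIGIN hand the bio to the cache or origin
 * device, SUBMIT completes it immediately (data already copied to pmem),
 * RETURN means the bio was queued for the flush thread, and ERROR fails
 * it with -EIO.
 */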
static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct dm_writecache *wc = ti->private;

	if (bio->bi_private == (void *)1) {
		int dir = bio_data_dir(bio);

		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
				wake_up(&wc->bio_in_progress_wait[dir]);
	} else if (bio->bi_private == (void *)2) {
		dm_iot_io_end(&wc->iot, 1);
	}
	return 0;
}
static int writecache_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	struct dm_writecache *wc = ti->private;

	return fn(ti, wc->dev, 0, ti->len, data);
}
static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dm_writecache *wc = ti->private;

	if (limits->logical_block_size < wc->block_size)
		limits->logical_block_size = wc->block_size;

	if (limits->physical_block_size < wc->block_size)
		limits->physical_block_size = wc->block_size;

	if (limits->io_min < wc->block_size)
		limits->io_min = wc->block_size;
}
static void writecache_writeback_endio(struct bio *bio)
{
	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
	struct dm_writecache *wc = wb->wc;
	unsigned long flags;

	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&wb->endio_entry, &wc->endio_list);
	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
}
static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
{
	struct copy_struct *c = ptr;
	struct dm_writecache *wc = c->wc;

	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;

	raw_spin_lock_irq(&wc->endio_list_lock);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&c->endio_entry, &wc->endio_list);
	raw_spin_unlock_irq(&wc->endio_list_lock);
}
static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
	unsigned int i;
	struct writeback_struct *wb;
	struct wc_entry *e;
	unsigned long n_walked = 0;

	do {
		wb = list_entry(list->next, struct writeback_struct, endio_entry);
		list_del(&wb->endio_entry);

		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
					 "write error %d", wb->bio.bi_status);
		i = 0;
		do {
			e = wb->wc_list[i];
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);
			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			n_walked++;
			if (unlikely(n_walked >= ENDIO_LATENCY)) {
				writecache_commit_flushed(wc, false);
				wc_unlock(wc);
				wc_lock(wc);
				n_walked = 0;
			}
		} while (++i < wb->wc_list_n);

		if (wb->wc_list != wb->wc_list_inline)
			kfree(wb->wc_list);
		bio_put(&wb->bio);
	} while (!list_empty(list));
}
static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
{
	struct copy_struct *c;
	struct wc_entry *e;

	do {
		c = list_entry(list->next, struct copy_struct, endio_entry);
		list_del(&c->endio_entry);

		if (unlikely(c->error))
			writecache_error(wc, c->error, "copy error");

		e = c->e;
		do {
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);

			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			e++;
		} while (--c->n_entries);
		mempool_free(c, &wc->copy_pool);
	} while (!list_empty(list));
}
static int writecache_endio_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct list_head list;

		raw_spin_lock_irq(&wc->endio_list_lock);
		if (!list_empty(&wc->endio_list))
			goto pop_from_list;
		set_current_state(TASK_INTERRUPTIBLE);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (unlikely(kthread_should_stop())) {
			set_current_state(TASK_RUNNING);
			break;
		}

		schedule();

		continue;

pop_from_list:
		list = wc->endio_list;
		list.next->prev = list.prev->next = &list;
		INIT_LIST_HEAD(&wc->endio_list);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (!WC_MODE_FUA(wc))
			writecache_disk_flush(wc, wc->dev);

		wc_lock(wc);

		if (WC_MODE_PMEM(wc)) {
			__writecache_endio_pmem(wc, &list);
		} else {
			__writecache_endio_ssd(wc, &list);
			writecache_wait_for_ios(wc, READ);
		}

		writecache_commit_flushed(wc, false);

		wc_unlock(wc);
	}

	return 0;
}
static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
	struct dm_writecache *wc = wb->wc;
	unsigned int block_size = wc->block_size;
	void *address = memory_data(wc, e);

	persistent_memory_flush_cache(address, block_size);

	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
		return true;

	return bio_add_page(&wb->bio, persistent_memory_page(address),
			    block_size, persistent_memory_page_offset(address)) != 0;
}
struct writeback_list {
	struct list_head list;
	size_t size;
};

static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
{
	if (unlikely(wc->max_writeback_jobs)) {
		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
			wc_lock(wc);
			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
				writecache_wait_on_freelist(wc);
			wc_unlock(wc);
		}
	}
	cond_resched();
}
static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct bio *bio;
	struct writeback_struct *wb;
	unsigned int max_pages;

	while (wbl->size) {
		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		max_pages = e->wc_list_contiguous;

		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
				       GFP_NOIO, &wc->bio_set);
		wb = container_of(bio, struct writeback_struct, bio);
		wb->wc = wc;
		bio->bi_end_io = writecache_writeback_endio;
		bio->bi_iter.bi_sector = read_original_sector(wc, e);

		if (unlikely(max_pages > WB_LIST_INLINE))
			wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
						    GFP_NOIO | __GFP_NORETRY |
						    __GFP_NOMEMALLOC | __GFP_NOWARN);

		if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
			wb->wc_list = wb->wc_list_inline;
			max_pages = WB_LIST_INLINE;
		}

		BUG_ON(!wc_add_block(wb, e));

		wb->wc_list[0] = e;
		wb->wc_list_n = 1;

		while (wbl->size && wb->wc_list_n < max_pages) {
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (!wc_add_block(wb, f))
				break;
			wbl->size--;
			list_del(&f->lru);
			wb->wc_list[wb->wc_list_n++] = f;
			e = f;
		}
		if (WC_MODE_FUA(wc))
			bio->bi_opf |= REQ_FUA;
		if (writecache_has_error(wc)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		} else if (unlikely(!bio_sectors(bio))) {
			bio->bi_status = BLK_STS_OK;
			bio_endio(bio);
		} else {
			submit_bio(bio);
		}

		__writeback_throttle(wc, wbl);
	}
}
static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct dm_io_region from, to;
	struct copy_struct *c;

	while (wbl->size) {
		unsigned int n_sectors;

		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);

		from.bdev = wc->ssd_dev->bdev;
		from.sector = cache_sector(wc, e);
		from.count = n_sectors;
		to.bdev = wc->dev->bdev;
		to.sector = read_original_sector(wc, e);
		to.count = n_sectors;

		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
		c->wc = wc;
		c->e = e;
		c->n_entries = e->wc_list_contiguous;

		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
			wbl->size--;
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			list_del(&f->lru);
			e = f;
		}

		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
			if (to.sector >= wc->data_device_sectors) {
				writecache_copy_endio(0, 0, c);
				continue;
			}
			from.count = to.count = wc->data_device_sectors - to.sector;
		}

		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);

		__writeback_throttle(wc, wbl);
	}
}
static void writecache_writeback(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
	struct blk_plug plug;
	struct wc_entry *f, *g, *e = NULL;
	struct rb_node *node, *next_node;
	struct list_head skipped;
	struct writeback_list wbl;
	unsigned long n_walked;

	if (!WC_MODE_PMEM(wc)) {
		/* Wait for any active kcopyd work on behalf of ssd writeback */
		dm_kcopyd_client_flush(wc->dm_kcopyd);
	}

	if (likely(wc->pause != 0)) {
		while (1) {
			unsigned long idle;

			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
			    unlikely(dm_suspended(wc->ti)))
				break;
			idle = dm_iot_idle_time(&wc->iot);
			if (idle >= wc->pause)
				break;
			idle = wc->pause - idle;
			if (idle > HZ)
				idle = HZ;
			schedule_timeout_idle(idle);
		}
	}

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed)
		writecache_wait_for_ios(wc, WRITE);

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {

		n_walked++;
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all)) {
			if (likely(!dm_suspended(wc->ti)))
				queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		if (unlikely(wc->writeback_all)) {
			if (unlikely(!e)) {
				writecache_flush(wc);
				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
			} else
				e = g;
		} else
			e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e)))
			writecache_flush(wc);

		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_move(&e->lru, &skipped);
				cond_resched();
				continue;
			}
		}
		wc->writeback_size++;
		list_move(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, g) ==
			    read_original_sector(wc, f))) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_move(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_VECS;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
				if (unlikely(wc->writeback_all)) {
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc))
			;
		wc_unlock(wc);
	}
}
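/*
 * Writeback policy above: entries are drained from the tail of the LRU
 * (oldest first) whenever the low watermark, max_age or an explicit
 * "flush"/"cleaner" request demands it, and runs of blocks that are
 * contiguous on the origin device are coalesced into a single bio or
 * kcopyd job, capped at BIO_MAX_VECS blocks.
 */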
static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}
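/*
 * Rough sizing example (illustrative, not from the original source):
 * with a 4096-byte block size each cache block consumes
 * 4096 + sizeof(struct wc_memory_entry) = 4096 + 16 bytes, so a 1 GiB
 * cache device yields on the order of 2^30 / 4112 ~ 261,000 blocks
 * before the metadata area is rounded up to a block-size boundary.
 */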
static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	for (b = 0; b < wc->n_blocks; b++) {
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
		cond_resched();
	}

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc, false);
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
	writecache_commit_flushed(wc, false);

	return 0;
}
static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	vfree(wc->dirty_bitmap);

	kfree(wc);
}
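/*
 * Constructor table line: <type p|s> <origin dev> <cache dev> <block size>
 * <#optional args> [optional args...], matching the parsing below.
 * Illustrative example (device paths and sizes are placeholders):
 *	dmsetup create wc --table "0 <origin sectors> writecache s \
 *		/dev/mapper/origin /dev/nvme0n1p1 4096 4 high_watermark 50 low_watermark 45"
 */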
2229 static int writecache_ctr(struct dm_target
*ti
, unsigned int argc
, char **argv
)
2231 struct dm_writecache
*wc
;
2232 struct dm_arg_set as
;
2234 unsigned int opt_params
;
2235 size_t offset
, data_size
;
2238 int high_wm_percent
= HIGH_WATERMARK
;
2239 int low_wm_percent
= LOW_WATERMARK
;
2241 struct wc_memory_superblock s
;
2243 static struct dm_arg _args
[] = {
2244 {0, 18, "Invalid number of feature args"},
2250 wc
= kzalloc(sizeof(struct dm_writecache
), GFP_KERNEL
);
2252 ti
->error
= "Cannot allocate writecache structure";
2259 mutex_init(&wc
->lock
);
2260 wc
->max_age
= MAX_AGE_UNSPECIFIED
;
2261 writecache_poison_lists(wc
);
2262 init_waitqueue_head(&wc
->freelist_wait
);
2263 timer_setup(&wc
->autocommit_timer
, writecache_autocommit_timer
, 0);
2264 timer_setup(&wc
->max_age_timer
, writecache_max_age_timer
, 0);
2266 for (i
= 0; i
< 2; i
++) {
2267 atomic_set(&wc
->bio_in_progress
[i
], 0);
2268 init_waitqueue_head(&wc
->bio_in_progress_wait
[i
]);
2271 wc
->dm_io
= dm_io_client_create();
2272 if (IS_ERR(wc
->dm_io
)) {
2273 r
= PTR_ERR(wc
->dm_io
);
2274 ti
->error
= "Unable to allocate dm-io client";
2279 wc
->writeback_wq
= alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM
, 1);
2280 if (!wc
->writeback_wq
) {
2282 ti
->error
= "Could not allocate writeback workqueue";
2285 INIT_WORK(&wc
->writeback_work
, writecache_writeback
);
2286 INIT_WORK(&wc
->flush_work
, writecache_flush_work
);
2288 dm_iot_init(&wc
->iot
);
2290 raw_spin_lock_init(&wc
->endio_list_lock
);
2291 INIT_LIST_HEAD(&wc
->endio_list
);
2292 wc
->endio_thread
= kthread_run(writecache_endio_thread
, wc
, "writecache_endio");
2293 if (IS_ERR(wc
->endio_thread
)) {
2294 r
= PTR_ERR(wc
->endio_thread
);
2295 wc
->endio_thread
= NULL
;
2296 ti
->error
= "Couldn't spawn endio thread";
2301 * Parse the mode (pmem or ssd)
2303 string
= dm_shift_arg(&as
);
2307 if (!strcasecmp(string
, "s")) {
2308 wc
->pmem_mode
= false;
2309 } else if (!strcasecmp(string
, "p")) {
2310 #ifdef DM_WRITECACHE_HAS_PMEM
2311 wc
->pmem_mode
= true;
2312 wc
->writeback_fua
= true;
2315 * If the architecture doesn't support persistent memory or
2316 * the kernel doesn't support any DAX drivers, this driver can
2317 * only be used in SSD-only mode.
2320 ti
->error
= "Persistent memory or DAX not supported on this system";
2327 if (WC_MODE_PMEM(wc
)) {
2328 r
= bioset_init(&wc
->bio_set
, BIO_POOL_SIZE
,
2329 offsetof(struct writeback_struct
, bio
),
2332 ti
->error
= "Could not allocate bio set";
2336 wc
->pause
= PAUSE_WRITEBACK
;
2337 r
= mempool_init_kmalloc_pool(&wc
->copy_pool
, 1, sizeof(struct copy_struct
));
2339 ti
->error
= "Could not allocate mempool";
2345 * Parse the origin data device
2347 string
= dm_shift_arg(&as
);
2350 r
= dm_get_device(ti
, string
, dm_table_get_mode(ti
->table
), &wc
->dev
);
2352 ti
->error
= "Origin data device lookup failed";
2357 * Parse cache data device (be it pmem or ssd)
2359 string
= dm_shift_arg(&as
);
2363 r
= dm_get_device(ti
, string
, dm_table_get_mode(ti
->table
), &wc
->ssd_dev
);
2365 ti
->error
= "Cache data device lookup failed";
2368 wc
->memory_map_size
= bdev_nr_bytes(wc
->ssd_dev
->bdev
);
2371 * Parse the cache block size
2373 string
= dm_shift_arg(&as
);
2376 if (sscanf(string
, "%u%c", &wc
->block_size
, &dummy
) != 1 ||
2377 wc
->block_size
< 512 || wc
->block_size
> PAGE_SIZE
||
2378 (wc
->block_size
& (wc
->block_size
- 1))) {
2380 ti
->error
= "Invalid block size";
2383 if (wc
->block_size
< bdev_logical_block_size(wc
->dev
->bdev
) ||
2384 wc
->block_size
< bdev_logical_block_size(wc
->ssd_dev
->bdev
)) {
2386 ti
->error
= "Block size is smaller than device logical block size";
2389 wc
->block_size_bits
= __ffs(wc
->block_size
);
2391 wc
->max_writeback_jobs
= MAX_WRITEBACK_JOBS
;
2392 wc
->autocommit_blocks
= !WC_MODE_PMEM(wc
) ? AUTOCOMMIT_BLOCKS_SSD
: AUTOCOMMIT_BLOCKS_PMEM
;
2393 wc
->autocommit_jiffies
= msecs_to_jiffies(AUTOCOMMIT_MSEC
);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			wc->start_sector_set = true;
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_value = high_wm_percent;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_value = low_wm_percent;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned int autocommit_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			if (autocommit_msecs > 3600000)
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_value = autocommit_msecs;
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
			unsigned int max_age_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
				goto invalid_optional;
			if (max_age_msecs > 86400000)
				goto invalid_optional;
			wc->max_age = msecs_to_jiffies(max_age_msecs);
			wc->max_age_set = true;
			wc->max_age_value = max_age_msecs;
		} else if (!strcasecmp(string, "cleaner")) {
			wc->cleaner_set = true;
			wc->cleaner = true;
		} else if (!strcasecmp(string, "fua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "metadata_only")) {
			wc->metadata_only = true;
		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
			unsigned int pause_msecs;

			if (WC_MODE_PMEM(wc))
				goto invalid_optional;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
				goto invalid_optional;
			if (pause_msecs > 60000)
				goto invalid_optional;
			wc->pause = msecs_to_jiffies(pause_msecs);
			wc->pause_set = true;
			wc->pause_value = pause_msecs;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
			r = -EOPNOTSUPP;
			ti->error = "Asynchronous persistent memory not supported as pmem cache";
			goto bad;
		}

		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
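		/*
		 * One dirty-bitmap bit covers BITMAP_GRANULARITY bytes of the
		 * in-core metadata copy, rounded up. Illustrative numbers only:
		 * 64 MiB of metadata at 64 KiB granularity needs 1024 bits.
		 */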
		/* this is a limitation of the test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
		if (r) {
			ti->error = "Unable to read first block of metadata";
			goto bad;
		}
	}

	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = copy_mc_to_kernel(&s, sb(wc),
				      sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;
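
	/*
	 * Cache layout derived above: the superblock, one wc_memory_entry per
	 * cache block, padding up to the next block-size boundary, and then the
	 * data blocks; block_start and metadata_sectors record where the data
	 * region begins.
	 */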

	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;
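
	/*
	 * The watermarks are stored as free-block counts, hence (100 - percent),
	 * rounded to the nearest block. Illustrative numbers only: with 1000
	 * cache blocks and high_watermark 50, freelist_high_watermark is 500,
	 * so writeback is triggered once about 500 or fewer blocks remain free.
	 */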

	if (wc->cleaner)
		activate_cleaner(wc);

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}

static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned int extra_args;
	unsigned int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		       writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size,
		       wc->stats.reads,
		       wc->stats.read_hits,
		       wc->stats.writes,
		       wc->stats.write_hits_uncommitted,
		       wc->stats.write_hits_committed,
		       wc->stats.writes_around,
		       wc->stats.writes_allocate,
		       wc->stats.writes_blocked_on_freelist,
		       wc->stats.flushes,
		       wc->stats.discards);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
		       wc->dev->name, wc->ssd_dev->name, wc->block_size);
		extra_args = 0;
		if (wc->start_sector_set)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->max_age_set)
			extra_args += 2;
		if (wc->cleaner_set)
			extra_args++;
		if (wc->writeback_fua_set)
			extra_args++;
		if (wc->metadata_only)
			extra_args++;
		if (wc->pause_set)
			extra_args += 2;

		DMEMIT("%u", extra_args);
		if (wc->start_sector_set)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set)
			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
		if (wc->low_wm_percent_set)
			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
		if (wc->max_age_set)
			DMEMIT(" max_age %u", wc->max_age_value);
		if (wc->cleaner_set)
			DMEMIT(" cleaner");
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		if (wc->metadata_only)
			DMEMIT(" metadata_only");
		if (wc->pause_set)
			DMEMIT(" pause_writeback %u", wc->pause_value);
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}
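
/*
 * Illustrative "dmsetup table" line produced by the STATUSTYPE_TABLE code
 * above (values are made up):
 *
 *   s 254:3 254:4 4096 4 high_watermark 60 low_watermark 40
 */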

static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 6, 0},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};
module_dm(writecache);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");
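
/*
 * Illustrative runtime interaction with this target via the message interface
 * (the device name is made up):
 *
 *   dmsetup message wc 0 flush             - commit all cached data now
 *   dmsetup message wc 0 flush_on_suspend  - commit when the device is suspended
 *
 * See writecache_message() earlier in this file for the supported messages.
 */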