// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include <linux/dm-bufio.h>

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>
#include <linux/jump_label.h>

#define DM_MSG_PREFIX "bufio"
/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_RATIO	3
#define DM_BUFIO_LOW_WATERMARK_RATIO	16

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The nr of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

/*
 * Align buffer writes to this boundary.
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
#define DM_BUFIO_WRITE_ALIGN		4096

/*
 * dm_buffer->list_mode
 */

/*--------------------------------------------------------------*/
/*
 * Rather than use an LRU list, we use a clock algorithm where entries
 * are held in a circular list.  When an entry is 'hit' a reference bit
 * is set.  The least recently used entry is approximated by running a
 * cursor around the list selecting unreferenced entries.  Referenced
 * entries have their reference bit cleared as the cursor passes them.
 */
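/*
 * Illustrative sketch (not part of dm-bufio): a minimal user-space model of
 * the second-chance scan described above, assuming entries carry only a
 * 'referenced' flag.  The hypothetical evict_one() walks the ring from the
 * cursor, clearing reference bits until it reaches an unreferenced entry,
 * which approximates the least recently used one.
 *
 *	struct entry { struct entry *next; bool referenced; };
 *
 *	static struct entry *evict_one(struct entry **cursor)
 *	{
 *		struct entry *e = *cursor;
 *
 *		while (e->referenced) {
 *			e->referenced = false;	// give it a second chance
 *			e = e->next;
 *		}
 *		*cursor = e->next;		// resume the next scan here
 *		return e;
 *	}
 *
 * The real lru_evict() below additionally consults a predicate and bounds the
 * scan, but the cursor/reference-bit mechanism is the same.
 */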
struct lru_entry {
	struct list_head list;
	atomic_t referenced;
};

struct lru_iter {
	struct lru *lru;
	struct list_head list;
	struct lru_entry *stop;
	struct lru_entry *e;
};

struct lru {
	struct list_head *cursor;
	unsigned long count;

	struct list_head iterators;
};
static void lru_init(struct lru *lru)
{
	lru->cursor = NULL;
	lru->count = 0;
	INIT_LIST_HEAD(&lru->iterators);
}

static void lru_destroy(struct lru *lru)
{
	WARN_ON_ONCE(lru->cursor);
	WARN_ON_ONCE(!list_empty(&lru->iterators));
}
/*
 * Insert a new entry into the lru.
 */
static void lru_insert(struct lru *lru, struct lru_entry *le)
{
	/*
	 * Don't be tempted to set this to 1; it makes the lru aspect
	 * perform poorly.
	 */
	atomic_set(&le->referenced, 0);

	if (lru->cursor) {
		list_add_tail(&le->list, lru->cursor);
	} else {
		INIT_LIST_HEAD(&le->list);
		lru->cursor = &le->list;
	}
	lru->count++;
}
/*
 * Convert a list_head pointer to an lru_entry pointer.
 */
static inline struct lru_entry *to_le(struct list_head *l)
{
	return container_of(l, struct lru_entry, list);
}
/*
 * Initialize an lru_iter and add it to the list of cursors in the lru.
 */
static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
{
	it->lru = lru;
	it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
	it->e = lru->cursor ? to_le(lru->cursor) : NULL;
	list_add(&it->list, &lru->iterators);
}

/*
 * Remove an lru_iter from the list of cursors in the lru.
 */
static inline void lru_iter_end(struct lru_iter *it)
{
	list_del(&it->list);
}
/* Predicate function type to be used with lru_iter_next */
typedef bool (*iter_predicate)(struct lru_entry *le, void *context);

/*
 * Advance the cursor to the next entry that passes the
 * predicate, and return that entry.  Returns NULL if the
 * iteration is complete.
 */
static struct lru_entry *lru_iter_next(struct lru_iter *it,
				       iter_predicate pred, void *context)
{
	struct lru_entry *e;

	while (it->e) {
		e = it->e;

		/* advance the cursor */
		if (it->e == it->stop)
			it->e = NULL;
		else
			it->e = to_le(it->e->list.next);

		if (pred(e, context))
			return e;
	}

	return NULL;
}
/*
 * Invalidate a specific lru_entry and update all cursors in
 * the lru accordingly.
 */
static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
{
	struct lru_iter *it;

	list_for_each_entry(it, &lru->iterators, list) {
		/* Move it->e forwards if necessary. */
		if (it->e == e) {
			it->e = to_le(it->e->list.next);
			if (it->e == e)
				it->e = NULL;
		}

		/* Move it->stop backwards if necessary. */
		if (it->stop == e) {
			it->stop = to_le(it->stop->list.prev);
			if (it->stop == e)
				it->stop = NULL;
		}
	}
}
/*
 * Remove a specific entry from the lru.
 */
static void lru_remove(struct lru *lru, struct lru_entry *le)
{
	lru_iter_invalidate(lru, le);
	if (lru->count == 1) {
		lru->cursor = NULL;
	} else {
		if (lru->cursor == &le->list)
			lru->cursor = lru->cursor->next;
		list_del(&le->list);
	}
	lru->count--;
}

/*
 * Mark as referenced.
 */
static inline void lru_reference(struct lru_entry *le)
{
	atomic_set(&le->referenced, 1);
}
/*
 * Remove the least recently used entry (approx), that passes the predicate.
 * Returns NULL on failure.
 */
enum evict_result {
	ER_EVICT,
	ER_DONT_EVICT,
	ER_STOP, /* stop looking for something to evict */
};

typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);

static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep)
{
	unsigned long tested = 0;
	struct list_head *h = lru->cursor;
	struct lru_entry *le;

	if (!h)
		return NULL;
	/*
	 * In the worst case we have to loop around twice. Once to clear
	 * the reference flags, and then again to discover the predicate
	 * fails for all entries.
	 */
	while (tested < lru->count) {
		le = container_of(h, struct lru_entry, list);

		if (atomic_read(&le->referenced)) {
			atomic_set(&le->referenced, 0);
		} else {
			tested++;
			switch (pred(le, context)) {
			case ER_EVICT:
				/*
				 * Adjust the cursor, so we start the next
				 * search from here.
				 */
				lru->cursor = le->list.next;
				lru_remove(lru, le);
				return le;

			case ER_DONT_EVICT:
				break;

			case ER_STOP:
				lru->cursor = le->list.next;
				return NULL;
			}
		}

		h = h->next;

		if (!no_sleep)
			cond_resched();
	}

	return NULL;
}
/*--------------------------------------------------------------*/

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	/* protected by the locks in dm_buffer_cache */
	struct rb_node node;

	/* immutable, so don't need protecting */
	sector_t block;
	void *data;
	unsigned char data_mode;		/* DATA_MODE_* */

	/*
	 * These two fields are used in isolation, so do not need
	 * a surrounding lock.
	 */
	atomic_t hold_count;
	unsigned long last_accessed;

	/*
	 * Everything else is protected by the mutex in
	 * dm_bufio_client.
	 */
	unsigned long state;
	struct lru_entry lru;
	unsigned char list_mode;		/* LIST_* */
	blk_status_t read_error;
	blk_status_t write_error;
	unsigned int dirty_start;
	unsigned int dirty_end;
	unsigned int write_start;
	unsigned int write_end;
	struct list_head write_list;
	struct dm_bufio_client *c;
	void (*end_io)(struct dm_buffer *b, blk_status_t bs);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	unsigned int stack_len;
	unsigned long stack_entries[MAX_STACK];
#endif
};
/*--------------------------------------------------------------*/

/*
 * The buffer cache manages buffers, particularly:
 *  - inc/dec of holder count
 *  - setting the last_accessed field
 *  - maintains clean/dirty state along with lru
 *  - selecting buffers that match predicates
 *
 * It does *not* handle:
 *  - allocation/freeing of buffers.
 *  - Eviction or cache sizing.
 *
 * cache_get() and cache_put() are threadsafe, you do not need to
 * protect these calls with a surrounding mutex.  All the other
 * methods are not threadsafe; they do use locking primitives, but
 * only enough to ensure get/put are threadsafe.
 */
struct buffer_tree {
	union {
		struct rw_semaphore lock;
		rwlock_t spinlock;
	} u;
	struct rb_root root;
} ____cacheline_aligned_in_smp;

struct dm_buffer_cache {
	struct lru lru[LIST_SIZE];
	/*
	 * We spread entries across multiple trees to reduce contention.
	 */
	unsigned int num_locks;
	bool no_sleep;
	struct buffer_tree trees[];
};
static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);

static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
{
	return dm_hash_locks_index(block, num_locks);
}

static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}
/*
 * Sometimes we want to repeatedly get and drop locks as part of an iteration.
 * This struct helps avoid redundant drop and gets of the same lock.
 */
struct lock_history {
	struct dm_buffer_cache *cache;
	bool write;
	unsigned int previous;
	unsigned int no_previous;
};

static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
{
	lh->cache = cache;
	lh->write = write;
	lh->no_previous = cache->num_locks;
	lh->previous = lh->no_previous;
}

static void __lh_lock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_read(&lh->cache->trees[index].u.lock);
	}
}

static void __lh_unlock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_read(&lh->cache->trees[index].u.lock);
	}
}

/*
 * Make sure you call this since it will unlock the final lock.
 */
static void lh_exit(struct lock_history *lh)
{
	if (lh->previous != lh->no_previous) {
		__lh_unlock(lh, lh->previous);
		lh->previous = lh->no_previous;
	}
}

/*
 * Named 'next' because there is no corresponding
 * 'up/unlock' call since it's done automatically.
 */
static void lh_next(struct lock_history *lh, sector_t b)
{
	unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */

	if (lh->previous != lh->no_previous) {
		if (lh->previous != index) {
			__lh_unlock(lh, lh->previous);
			__lh_lock(lh, index);
			lh->previous = index;
		}
	} else {
		__lh_lock(lh, index);
		lh->previous = index;
	}
}
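/*
 * Illustrative usage of the lock_history helper (a sketch, mirroring how
 * __cache_evict() and __cache_mark_many() below drive it): take the tree lock
 * lazily per block and let lh_next() drop and retake it only when the block
 * hashes to a different tree.
 *
 *	struct lock_history lh;
 *
 *	lh_init(&lh, bc, true);		// true = take write locks
 *	for each block b of interest:
 *		lh_next(&lh, b);	// lock b's tree, reusing the previous lock when possible
 *		... operate on the tree that covers b ...
 *	lh_exit(&lh);			// release whatever lock is still held
 */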
static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
{
	return container_of(le, struct dm_buffer, lru);
}

static struct dm_buffer *list_to_buffer(struct list_head *l)
{
	struct lru_entry *le = list_entry(l, struct lru_entry, list);

	return le_to_buffer(le);
}
static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep)
{
	unsigned int i;

	bc->num_locks = num_locks;
	bc->no_sleep = no_sleep;

	for (i = 0; i < bc->num_locks; i++) {
		if (no_sleep)
			rwlock_init(&bc->trees[i].u.spinlock);
		else
			init_rwsem(&bc->trees[i].u.lock);
		bc->trees[i].root = RB_ROOT;
	}

	lru_init(&bc->lru[LIST_CLEAN]);
	lru_init(&bc->lru[LIST_DIRTY]);
}

static void cache_destroy(struct dm_buffer_cache *bc)
{
	unsigned int i;

	for (i = 0; i < bc->num_locks; i++)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));

	lru_destroy(&bc->lru[LIST_CLEAN]);
	lru_destroy(&bc->lru[LIST_DIRTY]);
}

/*
 * not threadsafe, or racy depending how you look at it
 */
static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
{
	return bc->lru[list_mode].count;
}

static inline unsigned long cache_total(struct dm_buffer_cache *bc)
{
	return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
}
583 * Gets a specific buffer, indexed by block.
584 * If the buffer is found then its holder count will be incremented and
585 * lru_reference will be called.
589 static struct dm_buffer
*__cache_get(const struct rb_root
*root
, sector_t block
)
591 struct rb_node
*n
= root
->rb_node
;
595 b
= container_of(n
, struct dm_buffer
, node
);
597 if (b
->block
== block
)
600 n
= block
< b
->block
? n
->rb_left
: n
->rb_right
;
606 static void __cache_inc_buffer(struct dm_buffer
*b
)
608 atomic_inc(&b
->hold_count
);
609 WRITE_ONCE(b
->last_accessed
, jiffies
);
612 static struct dm_buffer
*cache_get(struct dm_buffer_cache
*bc
, sector_t block
)
616 cache_read_lock(bc
, block
);
617 b
= __cache_get(&bc
->trees
[cache_index(block
, bc
->num_locks
)].root
, block
);
619 lru_reference(&b
->lru
);
620 __cache_inc_buffer(b
);
622 cache_read_unlock(bc
, block
);
630 * Returns true if the hold count hits zero.
633 static bool cache_put(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
)
637 cache_read_lock(bc
, b
->block
);
638 BUG_ON(!atomic_read(&b
->hold_count
));
639 r
= atomic_dec_and_test(&b
->hold_count
);
640 cache_read_unlock(bc
, b
->block
);
647 typedef enum evict_result (*b_predicate
)(struct dm_buffer
*, void *);
650 * Evicts a buffer based on a predicate. The oldest buffer that
651 * matches the predicate will be selected. In addition to the
652 * predicate the hold_count of the selected buffer will be zero.
654 struct evict_wrapper
{
655 struct lock_history
*lh
;
661 * Wraps the buffer predicate turning it into an lru predicate. Adds
662 * extra test for hold_count.
664 static enum evict_result
__evict_pred(struct lru_entry
*le
, void *context
)
666 struct evict_wrapper
*w
= context
;
667 struct dm_buffer
*b
= le_to_buffer(le
);
669 lh_next(w
->lh
, b
->block
);
671 if (atomic_read(&b
->hold_count
))
672 return ER_DONT_EVICT
;
674 return w
->pred(b
, w
->context
);
677 static struct dm_buffer
*__cache_evict(struct dm_buffer_cache
*bc
, int list_mode
,
678 b_predicate pred
, void *context
,
679 struct lock_history
*lh
)
681 struct evict_wrapper w
= {.lh
= lh
, .pred
= pred
, .context
= context
};
682 struct lru_entry
*le
;
685 le
= lru_evict(&bc
->lru
[list_mode
], __evict_pred
, &w
, bc
->no_sleep
);
689 b
= le_to_buffer(le
);
690 /* __evict_pred will have locked the appropriate tree. */
691 rb_erase(&b
->node
, &bc
->trees
[cache_index(b
->block
, bc
->num_locks
)].root
);
696 static struct dm_buffer
*cache_evict(struct dm_buffer_cache
*bc
, int list_mode
,
697 b_predicate pred
, void *context
)
700 struct lock_history lh
;
702 lh_init(&lh
, bc
, true);
703 b
= __cache_evict(bc
, list_mode
, pred
, context
, &lh
);
712 * Mark a buffer as clean or dirty. Not threadsafe.
714 static void cache_mark(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
, int list_mode
)
716 cache_write_lock(bc
, b
->block
);
717 if (list_mode
!= b
->list_mode
) {
718 lru_remove(&bc
->lru
[b
->list_mode
], &b
->lru
);
719 b
->list_mode
= list_mode
;
720 lru_insert(&bc
->lru
[b
->list_mode
], &b
->lru
);
722 cache_write_unlock(bc
, b
->block
);
728 * Runs through the lru associated with 'old_mode', if the predicate matches then
729 * it moves them to 'new_mode'. Not threadsafe.
731 static void __cache_mark_many(struct dm_buffer_cache
*bc
, int old_mode
, int new_mode
,
732 b_predicate pred
, void *context
, struct lock_history
*lh
)
734 struct lru_entry
*le
;
736 struct evict_wrapper w
= {.lh
= lh
, .pred
= pred
, .context
= context
};
739 le
= lru_evict(&bc
->lru
[old_mode
], __evict_pred
, &w
, bc
->no_sleep
);
743 b
= le_to_buffer(le
);
744 b
->list_mode
= new_mode
;
745 lru_insert(&bc
->lru
[b
->list_mode
], &b
->lru
);
749 static void cache_mark_many(struct dm_buffer_cache
*bc
, int old_mode
, int new_mode
,
750 b_predicate pred
, void *context
)
752 struct lock_history lh
;
754 lh_init(&lh
, bc
, true);
755 __cache_mark_many(bc
, old_mode
, new_mode
, pred
, context
, &lh
);
762 * Iterates through all clean or dirty entries calling a function for each
763 * entry. The callback may terminate the iteration early. Not threadsafe.
767 * Iterator functions should return one of these actions to indicate
768 * how the iteration should proceed.
775 typedef enum it_action (*iter_fn
)(struct dm_buffer
*b
, void *context
);
777 static void __cache_iterate(struct dm_buffer_cache
*bc
, int list_mode
,
778 iter_fn fn
, void *context
, struct lock_history
*lh
)
780 struct lru
*lru
= &bc
->lru
[list_mode
];
781 struct lru_entry
*le
, *first
;
786 first
= le
= to_le(lru
->cursor
);
788 struct dm_buffer
*b
= le_to_buffer(le
);
790 lh_next(lh
, b
->block
);
792 switch (fn(b
, context
)) {
801 le
= to_le(le
->list
.next
);
802 } while (le
!= first
);
805 static void cache_iterate(struct dm_buffer_cache
*bc
, int list_mode
,
806 iter_fn fn
, void *context
)
808 struct lock_history lh
;
810 lh_init(&lh
, bc
, false);
811 __cache_iterate(bc
, list_mode
, fn
, context
, &lh
);
818 * Passes ownership of the buffer to the cache. Returns false if the
819 * buffer was already present (in which case ownership does not pass).
820 * eg, a race with another thread.
822 * Holder count should be 1 on insertion.
826 static bool __cache_insert(struct rb_root
*root
, struct dm_buffer
*b
)
828 struct rb_node
**new = &root
->rb_node
, *parent
= NULL
;
829 struct dm_buffer
*found
;
832 found
= container_of(*new, struct dm_buffer
, node
);
834 if (found
->block
== b
->block
)
838 new = b
->block
< found
->block
?
839 &found
->node
.rb_left
: &found
->node
.rb_right
;
842 rb_link_node(&b
->node
, parent
, new);
843 rb_insert_color(&b
->node
, root
);
848 static bool cache_insert(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
)
852 if (WARN_ON_ONCE(b
->list_mode
>= LIST_SIZE
))
855 cache_write_lock(bc
, b
->block
);
856 BUG_ON(atomic_read(&b
->hold_count
) != 1);
857 r
= __cache_insert(&bc
->trees
[cache_index(b
->block
, bc
->num_locks
)].root
, b
);
859 lru_insert(&bc
->lru
[b
->list_mode
], &b
->lru
);
860 cache_write_unlock(bc
, b
->block
);
868 * Removes buffer from cache, ownership of the buffer passes back to the caller.
869 * Fails if the hold_count is not one (ie. the caller holds the only reference).
873 static bool cache_remove(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
)
877 cache_write_lock(bc
, b
->block
);
879 if (atomic_read(&b
->hold_count
) != 1) {
883 rb_erase(&b
->node
, &bc
->trees
[cache_index(b
->block
, bc
->num_locks
)].root
);
884 lru_remove(&bc
->lru
[b
->list_mode
], &b
->lru
);
887 cache_write_unlock(bc
, b
->block
);
894 typedef void (*b_release
)(struct dm_buffer
*);
896 static struct dm_buffer
*__find_next(struct rb_root
*root
, sector_t block
)
898 struct rb_node
*n
= root
->rb_node
;
900 struct dm_buffer
*best
= NULL
;
903 b
= container_of(n
, struct dm_buffer
, node
);
905 if (b
->block
== block
)
908 if (block
<= b
->block
) {
919 static void __remove_range(struct dm_buffer_cache
*bc
,
920 struct rb_root
*root
,
921 sector_t begin
, sector_t end
,
922 b_predicate pred
, b_release release
)
929 b
= __find_next(root
, begin
);
930 if (!b
|| (b
->block
>= end
))
933 begin
= b
->block
+ 1;
935 if (atomic_read(&b
->hold_count
))
938 if (pred(b
, NULL
) == ER_EVICT
) {
939 rb_erase(&b
->node
, root
);
940 lru_remove(&bc
->lru
[b
->list_mode
], &b
->lru
);
946 static void cache_remove_range(struct dm_buffer_cache
*bc
,
947 sector_t begin
, sector_t end
,
948 b_predicate pred
, b_release release
)
952 BUG_ON(bc
->no_sleep
);
953 for (i
= 0; i
< bc
->num_locks
; i
++) {
954 down_write(&bc
->trees
[i
].u
.lock
);
955 __remove_range(bc
, &bc
->trees
[i
].root
, begin
, end
, pred
, release
);
956 up_write(&bc
->trees
[i
].u
.lock
);
/*----------------------------------------------------------------*/

/*
 * Linking of buffers:
 *	All buffers are linked to buffer_cache with their node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too.  They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct block_device *bdev;
	unsigned int block_size;
	s8 sectors_per_block_bits;

	bool no_sleep;
	struct mutex lock;
	spinlock_t spinlock;

	int async_write_error;

	void (*alloc_callback)(struct dm_buffer *buf);
	void (*write_callback)(struct dm_buffer *buf);
	struct kmem_cache *slab_buffer;
	struct kmem_cache *slab_cache;
	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned int need_reserved_buffers;

	unsigned int minimum_buffers;

	struct shrinker *shrinker;
	struct work_struct shrink_work;
	atomic_long_t need_shrink;

	wait_queue_head_t free_buffer_wait;

	struct list_head client_list;

	/*
	 * Used by global_cleanup to sort the clients list.
	 */
	unsigned long oldest_buffer;

	struct dm_buffer_cache cache; /* must be last member */
};

/*----------------------------------------------------------------*/
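/*
 * Illustrative client usage (a sketch, not taken from a particular target;
 * the block size, reserve count and error handling shown here are
 * assumptions, not requirements):
 *
 *	struct dm_bufio_client *c;
 *	struct dm_buffer *b;
 *	void *data;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 *	data = dm_bufio_read(c, block, &b);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... modify data ...
 *	dm_bufio_mark_buffer_dirty(b);
 *	dm_bufio_release(b);
 *
 *	dm_bufio_write_dirty_buffers(c);	// write back and flush
 *	dm_bufio_client_destroy(c);
 */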
#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_lock_bh(&c->spinlock);
	else
		mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_unlock_bh(&c->spinlock);
	else
		mutex_unlock(&c->lock);
}

/*----------------------------------------------------------------*/
/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(global_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_cleanup_old_work;
static struct work_struct dm_bufio_replacement_work;
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif

/*----------------------------------------------------------------*/
1100 static void adjust_total_allocated(struct dm_buffer
*b
, bool unlink
)
1102 unsigned char data_mode
;
1105 static unsigned long * const class_ptr
[DATA_MODE_LIMIT
] = {
1106 &dm_bufio_allocated_kmem_cache
,
1107 &dm_bufio_allocated_get_free_pages
,
1108 &dm_bufio_allocated_vmalloc
,
1111 data_mode
= b
->data_mode
;
1112 diff
= (long)b
->c
->block_size
;
1116 spin_lock(&global_spinlock
);
1118 *class_ptr
[data_mode
] += diff
;
1120 dm_bufio_current_allocated
+= diff
;
1122 if (dm_bufio_current_allocated
> dm_bufio_peak_allocated
)
1123 dm_bufio_peak_allocated
= dm_bufio_current_allocated
;
1126 if (dm_bufio_current_allocated
> dm_bufio_cache_size
)
1127 queue_work(dm_bufio_wq
, &dm_bufio_replacement_work
);
1130 spin_unlock(&global_spinlock
);
1134 * Change the number of clients and recalculate per-client limit.
1136 static void __cache_size_refresh(void)
1138 if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock
)))
1140 if (WARN_ON(dm_bufio_client_count
< 0))
1143 dm_bufio_cache_size_latch
= READ_ONCE(dm_bufio_cache_size
);
1146 * Use default if set to 0 and report the actual cache size used.
1148 if (!dm_bufio_cache_size_latch
) {
1149 (void)cmpxchg(&dm_bufio_cache_size
, 0,
1150 dm_bufio_default_cache_size
);
1151 dm_bufio_cache_size_latch
= dm_bufio_default_cache_size
;
/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order > MAX_PAGE_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       unsigned char *data_mode)
{
	if (unlikely(c->slab_cache != NULL)) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(c->slab_cache, gfp_mask);
	}

	if (c->block_size <= KMALLOC_MAX_SIZE &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
				c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
	}

	*data_mode = DATA_MODE_VMALLOC;

	return __vmalloc(c->block_size, gfp_mask);
}
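/*
 * Worked example of the __get_free_pages() path above (an illustration, not
 * extra driver logic): with 4 KiB pages, PAGE_SHIFT - SECTOR_SHIFT is
 * 12 - 9 = 3.  A 4 KiB block has sectors_per_block_bits == 3, so the order
 * passed is 3 - 3 = 0 (a single page); a 64 KiB block has
 * sectors_per_block_bits == 7 and therefore requests an order-4 (16-page)
 * allocation.
 */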
1197 * Free buffer's data.
1199 static void free_buffer_data(struct dm_bufio_client
*c
,
1200 void *data
, unsigned char data_mode
)
1202 switch (data_mode
) {
1203 case DATA_MODE_SLAB
:
1204 kmem_cache_free(c
->slab_cache
, data
);
1207 case DATA_MODE_GET_FREE_PAGES
:
1208 free_pages((unsigned long)data
,
1209 c
->sectors_per_block_bits
- (PAGE_SHIFT
- SECTOR_SHIFT
));
1212 case DATA_MODE_VMALLOC
:
1217 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
1224 * Allocate buffer and its data.
1226 static struct dm_buffer
*alloc_buffer(struct dm_bufio_client
*c
, gfp_t gfp_mask
)
1228 struct dm_buffer
*b
= kmem_cache_alloc(c
->slab_buffer
, gfp_mask
);
1235 b
->data
= alloc_buffer_data(c
, gfp_mask
, &b
->data_mode
);
1237 kmem_cache_free(c
->slab_buffer
, b
);
1240 adjust_total_allocated(b
, false);
1242 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1249 * Free buffer and its data.
1251 static void free_buffer(struct dm_buffer
*b
)
1253 struct dm_bufio_client
*c
= b
->c
;
1255 adjust_total_allocated(b
, true);
1256 free_buffer_data(c
, b
->data
, b
->data_mode
);
1257 kmem_cache_free(c
->slab_buffer
, b
);
/*
 *--------------------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------------------
 */

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
}
1291 static void use_dmio(struct dm_buffer
*b
, enum req_op op
, sector_t sector
,
1292 unsigned int n_sectors
, unsigned int offset
,
1293 unsigned short ioprio
)
1296 struct dm_io_request io_req
= {
1298 .notify
.fn
= dmio_complete
,
1299 .notify
.context
= b
,
1300 .client
= b
->c
->dm_io
,
1302 struct dm_io_region region
= {
1308 if (b
->data_mode
!= DATA_MODE_VMALLOC
) {
1309 io_req
.mem
.type
= DM_IO_KMEM
;
1310 io_req
.mem
.ptr
.addr
= (char *)b
->data
+ offset
;
1312 io_req
.mem
.type
= DM_IO_VMA
;
1313 io_req
.mem
.ptr
.vma
= (char *)b
->data
+ offset
;
1316 r
= dm_io(&io_req
, 1, ®ion
, NULL
, ioprio
);
1318 b
->end_io(b
, errno_to_blk_status(r
));
1321 static void bio_complete(struct bio
*bio
)
1323 struct dm_buffer
*b
= bio
->bi_private
;
1324 blk_status_t status
= bio
->bi_status
;
1328 b
->end_io(b
, status
);
1331 static void use_bio(struct dm_buffer
*b
, enum req_op op
, sector_t sector
,
1332 unsigned int n_sectors
, unsigned int offset
,
1333 unsigned short ioprio
)
1339 bio
= bio_kmalloc(1, GFP_NOWAIT
| __GFP_NORETRY
| __GFP_NOWARN
);
1341 use_dmio(b
, op
, sector
, n_sectors
, offset
, ioprio
);
1344 bio_init(bio
, b
->c
->bdev
, bio
->bi_inline_vecs
, 1, op
);
1345 bio
->bi_iter
.bi_sector
= sector
;
1346 bio
->bi_end_io
= bio_complete
;
1347 bio
->bi_private
= b
;
1348 bio
->bi_ioprio
= ioprio
;
1350 ptr
= (char *)b
->data
+ offset
;
1351 len
= n_sectors
<< SECTOR_SHIFT
;
1353 __bio_add_page(bio
, virt_to_page(ptr
), len
, offset_in_page(ptr
));
static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
{
	sector_t sector;

	if (likely(c->sectors_per_block_bits >= 0))
		sector = block << c->sectors_per_block_bits;
	else
		sector = block * (c->block_size >> SECTOR_SHIFT);

	return sector;
}
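/*
 * Worked example (illustrative only): with 4 KiB blocks,
 * sectors_per_block_bits is __ffs(4096) - SECTOR_SHIFT = 12 - 9 = 3, so block
 * 10 starts at sector 10 << 3 = 80.  For a non-power-of-two block size such
 * as 3072 bytes, sectors_per_block_bits is -1 and the multiply path is used:
 * block 10 starts at sector 10 * (3072 >> 9) = 60.
 */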
static void submit_io(struct dm_buffer *b, enum req_op op, unsigned short ioprio,
		      void (*end_io)(struct dm_buffer *, blk_status_t))
{
	unsigned int n_sectors;
	sector_t sector;
	unsigned int offset, end;

	b->end_io = end_io;

	sector = block_to_sector(b->c, b->block);

	if (op != REQ_OP_WRITE) {
		n_sectors = b->c->block_size >> SECTOR_SHIFT;
		offset = 0;
	} else {
		if (b->c->write_callback)
			b->c->write_callback(b);
		offset = b->write_start;
		end = b->write_end;
		offset &= -DM_BUFIO_WRITE_ALIGN;
		end += DM_BUFIO_WRITE_ALIGN - 1;
		end &= -DM_BUFIO_WRITE_ALIGN;
		if (unlikely(end > b->c->block_size))
			end = b->c->block_size;

		sector += offset >> SECTOR_SHIFT;
		n_sectors = (end - offset) >> SECTOR_SHIFT;
	}

	if (b->data_mode != DATA_MODE_VMALLOC)
		use_bio(b, op, sector, n_sectors, offset, ioprio);
	else
		use_dmio(b, op, sector, n_sectors, offset, ioprio);
}
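/*
 * Worked example of the DM_BUFIO_WRITE_ALIGN rounding above (illustration,
 * not extra logic): with a 16 KiB block whose dirty range is bytes
 * [100, 5000), offset is rounded down to 0 and end is rounded up to 8192,
 * so the write covers sectors [0, 16) of the block instead of all 32
 * sectors.
 */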
1407 *--------------------------------------------------------------
1408 * Writing dirty buffers
1409 *--------------------------------------------------------------
1413 * The endio routine for write.
1415 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
1418 static void write_endio(struct dm_buffer
*b
, blk_status_t status
)
1420 b
->write_error
= status
;
1421 if (unlikely(status
)) {
1422 struct dm_bufio_client
*c
= b
->c
;
1424 (void)cmpxchg(&c
->async_write_error
, 0,
1425 blk_status_to_errno(status
));
1428 BUG_ON(!test_bit(B_WRITING
, &b
->state
));
1430 smp_mb__before_atomic();
1431 clear_bit(B_WRITING
, &b
->state
);
1432 smp_mb__after_atomic();
1434 wake_up_bit(&b
->state
, B_WRITING
);
1438 * Initiate a write on a dirty buffer, but don't wait for it.
1440 * - If the buffer is not dirty, exit.
1441 * - If there some previous write going on, wait for it to finish (we can't
1442 * have two writes on the same buffer simultaneously).
1443 * - Submit our write and don't wait on it. We set B_WRITING indicating
1444 * that there is a write in progress.
1446 static void __write_dirty_buffer(struct dm_buffer
*b
,
1447 struct list_head
*write_list
)
1449 if (!test_bit(B_DIRTY
, &b
->state
))
1452 clear_bit(B_DIRTY
, &b
->state
);
1453 wait_on_bit_lock_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
1455 b
->write_start
= b
->dirty_start
;
1456 b
->write_end
= b
->dirty_end
;
1459 submit_io(b
, REQ_OP_WRITE
, IOPRIO_DEFAULT
, write_endio
);
1461 list_add_tail(&b
->write_list
, write_list
);
1464 static void __flush_write_list(struct list_head
*write_list
)
1466 struct blk_plug plug
;
1468 blk_start_plug(&plug
);
1469 while (!list_empty(write_list
)) {
1470 struct dm_buffer
*b
=
1471 list_entry(write_list
->next
, struct dm_buffer
, write_list
);
1472 list_del(&b
->write_list
);
1473 submit_io(b
, REQ_OP_WRITE
, IOPRIO_DEFAULT
, write_endio
);
1476 blk_finish_plug(&plug
);
1480 * Wait until any activity on the buffer finishes. Possibly write the
1481 * buffer if it is dirty. When this function finishes, there is no I/O
1482 * running on the buffer and the buffer is not dirty.
1484 static void __make_buffer_clean(struct dm_buffer
*b
)
1486 BUG_ON(atomic_read(&b
->hold_count
));
1488 /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
1489 if (!smp_load_acquire(&b
->state
)) /* fast case */
1492 wait_on_bit_io(&b
->state
, B_READING
, TASK_UNINTERRUPTIBLE
);
1493 __write_dirty_buffer(b
, NULL
);
1494 wait_on_bit_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
1497 static enum evict_result
is_clean(struct dm_buffer
*b
, void *context
)
1499 struct dm_bufio_client
*c
= context
;
1501 /* These should never happen */
1502 if (WARN_ON_ONCE(test_bit(B_WRITING
, &b
->state
)))
1503 return ER_DONT_EVICT
;
1504 if (WARN_ON_ONCE(test_bit(B_DIRTY
, &b
->state
)))
1505 return ER_DONT_EVICT
;
1506 if (WARN_ON_ONCE(b
->list_mode
!= LIST_CLEAN
))
1507 return ER_DONT_EVICT
;
1509 if (static_branch_unlikely(&no_sleep_enabled
) && c
->no_sleep
&&
1510 unlikely(test_bit(B_READING
, &b
->state
)))
1511 return ER_DONT_EVICT
;
1516 static enum evict_result
is_dirty(struct dm_buffer
*b
, void *context
)
1518 /* These should never happen */
1519 if (WARN_ON_ONCE(test_bit(B_READING
, &b
->state
)))
1520 return ER_DONT_EVICT
;
1521 if (WARN_ON_ONCE(b
->list_mode
!= LIST_DIRTY
))
1522 return ER_DONT_EVICT
;
1528 * Find some buffer that is not held by anybody, clean it, unlink it and
1531 static struct dm_buffer
*__get_unclaimed_buffer(struct dm_bufio_client
*c
)
1533 struct dm_buffer
*b
;
1535 b
= cache_evict(&c
->cache
, LIST_CLEAN
, is_clean
, c
);
1537 /* this also waits for pending reads */
1538 __make_buffer_clean(b
);
1542 if (static_branch_unlikely(&no_sleep_enabled
) && c
->no_sleep
)
1545 b
= cache_evict(&c
->cache
, LIST_DIRTY
, is_dirty
, NULL
);
1547 __make_buffer_clean(b
);
1555 * Wait until some other threads free some buffer or release hold count on
1558 * This function is entered with c->lock held, drops it and regains it
1561 static void __wait_for_free_buffer(struct dm_bufio_client
*c
)
1563 DECLARE_WAITQUEUE(wait
, current
);
1565 add_wait_queue(&c
->free_buffer_wait
, &wait
);
1566 set_current_state(TASK_UNINTERRUPTIBLE
);
1570 * It's possible to miss a wake up event since we don't always
1571 * hold c->lock when wake_up is called. So we have a timeout here,
1574 io_schedule_timeout(5 * HZ
);
1576 remove_wait_queue(&c
->free_buffer_wait
, &wait
);
1589 * Allocate a new buffer. If the allocation is not possible, wait until
1590 * some other thread frees a buffer.
1592 * May drop the lock and regain it.
1594 static struct dm_buffer
*__alloc_buffer_wait_no_callback(struct dm_bufio_client
*c
, enum new_flag nf
)
1596 struct dm_buffer
*b
;
1597 bool tried_noio_alloc
= false;
1600 * dm-bufio is resistant to allocation failures (it just keeps
1601 * one buffer reserved in cases all the allocations fail).
1602 * So set flags to not try too hard:
1603 * GFP_NOWAIT: don't wait; if we need to sleep we'll release our
1604 * mutex and wait ourselves.
1605 * __GFP_NORETRY: don't retry and rather return failure
1606 * __GFP_NOMEMALLOC: don't use emergency reserves
1607 * __GFP_NOWARN: don't print a warning in case of failure
1609 * For debugging, if we set the cache size to 1, no new buffers will
1613 if (dm_bufio_cache_size_latch
!= 1) {
1614 b
= alloc_buffer(c
, GFP_NOWAIT
| __GFP_NORETRY
| __GFP_NOMEMALLOC
| __GFP_NOWARN
);
1619 if (nf
== NF_PREFETCH
)
1622 if (dm_bufio_cache_size_latch
!= 1 && !tried_noio_alloc
) {
1624 b
= alloc_buffer(c
, GFP_NOIO
| __GFP_NORETRY
| __GFP_NOMEMALLOC
| __GFP_NOWARN
);
1628 tried_noio_alloc
= true;
1631 if (!list_empty(&c
->reserved_buffers
)) {
1632 b
= list_to_buffer(c
->reserved_buffers
.next
);
1633 list_del(&b
->lru
.list
);
1634 c
->need_reserved_buffers
++;
1639 b
= __get_unclaimed_buffer(c
);
1643 __wait_for_free_buffer(c
);
1647 static struct dm_buffer
*__alloc_buffer_wait(struct dm_bufio_client
*c
, enum new_flag nf
)
1649 struct dm_buffer
*b
= __alloc_buffer_wait_no_callback(c
, nf
);
1654 if (c
->alloc_callback
)
1655 c
->alloc_callback(b
);
1661 * Free a buffer and wake other threads waiting for free buffers.
1663 static void __free_buffer_wake(struct dm_buffer
*b
)
1665 struct dm_bufio_client
*c
= b
->c
;
1668 if (!c
->need_reserved_buffers
)
1671 list_add(&b
->lru
.list
, &c
->reserved_buffers
);
1672 c
->need_reserved_buffers
--;
1676 * We hold the bufio lock here, so no one can add entries to the
1677 * wait queue anyway.
1679 if (unlikely(waitqueue_active(&c
->free_buffer_wait
)))
1680 wake_up(&c
->free_buffer_wait
);
1683 static enum evict_result
cleaned(struct dm_buffer
*b
, void *context
)
1685 if (WARN_ON_ONCE(test_bit(B_READING
, &b
->state
)))
1686 return ER_DONT_EVICT
; /* should never happen */
1688 if (test_bit(B_DIRTY
, &b
->state
) || test_bit(B_WRITING
, &b
->state
))
1689 return ER_DONT_EVICT
;
1694 static void __move_clean_buffers(struct dm_bufio_client
*c
)
1696 cache_mark_many(&c
->cache
, LIST_DIRTY
, LIST_CLEAN
, cleaned
, NULL
);
1699 struct write_context
{
1701 struct list_head
*write_list
;
1704 static enum it_action
write_one(struct dm_buffer
*b
, void *context
)
1706 struct write_context
*wc
= context
;
1708 if (wc
->no_wait
&& test_bit(B_WRITING
, &b
->state
))
1711 __write_dirty_buffer(b
, wc
->write_list
);
1715 static void __write_dirty_buffers_async(struct dm_bufio_client
*c
, int no_wait
,
1716 struct list_head
*write_list
)
1718 struct write_context wc
= {.no_wait
= no_wait
, .write_list
= write_list
};
1720 __move_clean_buffers(c
);
1721 cache_iterate(&c
->cache
, LIST_DIRTY
, write_one
, &wc
);
/*
 * Check if we're over watermark.
 * If we are over threshold_buffers, start freeing buffers.
 * If we're over "limit_buffers", block until we get under the limit.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	if (cache_count(&c->cache, LIST_DIRTY) >
	    cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
		__write_dirty_buffers_async(c, 1, write_list);
}
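/*
 * Example of the ratio check above (illustrative numbers): with
 * DM_BUFIO_WRITEBACK_RATIO == 3, a client holding 100 clean and 301 dirty
 * buffers exceeds the watermark and the dirty buffers are queued for
 * asynchronous writeback; at 300 dirty buffers nothing is started.
 */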
/*
 *--------------------------------------------------------------
 *--------------------------------------------------------------
 */

static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
{
	/*
	 * Relying on waitqueue_active() is racy, but we sleep
	 * with schedule_timeout anyway.
	 */
	if (cache_put(&c->cache, b) &&
	    unlikely(waitqueue_active(&c->free_buffer_wait)))
		wake_up(&c->free_buffer_wait);
}
1755 * This assumes you have already checked the cache to see if the buffer
1756 * is already present (it will recheck after dropping the lock for allocation).
1758 static struct dm_buffer
*__bufio_new(struct dm_bufio_client
*c
, sector_t block
,
1759 enum new_flag nf
, int *need_submit
,
1760 struct list_head
*write_list
)
1762 struct dm_buffer
*b
, *new_b
= NULL
;
1766 /* This can't be called with NF_GET */
1767 if (WARN_ON_ONCE(nf
== NF_GET
))
1770 new_b
= __alloc_buffer_wait(c
, nf
);
1775 * We've had a period where the mutex was unlocked, so need to
1776 * recheck the buffer tree.
1778 b
= cache_get(&c
->cache
, block
);
1780 __free_buffer_wake(new_b
);
1784 __check_watermark(c
, write_list
);
1787 atomic_set(&b
->hold_count
, 1);
1788 WRITE_ONCE(b
->last_accessed
, jiffies
);
1792 b
->list_mode
= LIST_CLEAN
;
1797 b
->state
= 1 << B_READING
;
1802 * We mustn't insert into the cache until the B_READING state
1803 * is set. Otherwise another thread could get it and use
1804 * it before it had been read.
1806 cache_insert(&c
->cache
, b
);
1811 if (nf
== NF_PREFETCH
) {
1812 cache_put_and_wake(c
, b
);
1817 * Note: it is essential that we don't wait for the buffer to be
1818 * read if dm_bufio_get function is used. Both dm_bufio_get and
1819 * dm_bufio_prefetch can be used in the driver request routine.
1820 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1821 * the same buffer, it would deadlock if we waited.
1823 if (nf
== NF_GET
&& unlikely(test_bit_acquire(B_READING
, &b
->state
))) {
1824 cache_put_and_wake(c
, b
);
1832 * The endio routine for reading: set the error, clear the bit and wake up
1833 * anyone waiting on the buffer.
1835 static void read_endio(struct dm_buffer
*b
, blk_status_t status
)
1837 b
->read_error
= status
;
1839 BUG_ON(!test_bit(B_READING
, &b
->state
));
1841 smp_mb__before_atomic();
1842 clear_bit(B_READING
, &b
->state
);
1843 smp_mb__after_atomic();
1845 wake_up_bit(&b
->state
, B_READING
);
1849 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
1850 * functions is similar except that dm_bufio_new doesn't read the
1851 * buffer from the disk (assuming that the caller overwrites all the data
1852 * and uses dm_bufio_mark_buffer_dirty to write new data back).
1854 static void *new_read(struct dm_bufio_client
*c
, sector_t block
,
1855 enum new_flag nf
, struct dm_buffer
**bp
,
1856 unsigned short ioprio
)
1858 int need_submit
= 0;
1859 struct dm_buffer
*b
;
1861 LIST_HEAD(write_list
);
1866 * Fast path, hopefully the block is already in the cache. No need
1867 * to get the client lock for this.
1869 b
= cache_get(&c
->cache
, block
);
1871 if (nf
== NF_PREFETCH
) {
1872 cache_put_and_wake(c
, b
);
1877 * Note: it is essential that we don't wait for the buffer to be
1878 * read if dm_bufio_get function is used. Both dm_bufio_get and
1879 * dm_bufio_prefetch can be used in the driver request routine.
1880 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1881 * the same buffer, it would deadlock if we waited.
1883 if (nf
== NF_GET
&& unlikely(test_bit_acquire(B_READING
, &b
->state
))) {
1884 cache_put_and_wake(c
, b
);
1894 b
= __bufio_new(c
, block
, nf
, &need_submit
, &write_list
);
1898 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1899 if (b
&& (atomic_read(&b
->hold_count
) == 1))
1900 buffer_record_stack(b
);
1903 __flush_write_list(&write_list
);
1909 submit_io(b
, REQ_OP_READ
, ioprio
, read_endio
);
1911 if (nf
!= NF_GET
) /* we already tested this condition above */
1912 wait_on_bit_io(&b
->state
, B_READING
, TASK_UNINTERRUPTIBLE
);
1914 if (b
->read_error
) {
1915 int error
= blk_status_to_errno(b
->read_error
);
1917 dm_bufio_release(b
);
1919 return ERR_PTR(error
);
void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp, IOPRIO_DEFAULT);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

static void *__dm_bufio_read(struct dm_bufio_client *c, sector_t block,
			     struct dm_buffer **bp, unsigned short ioprio)
{
	if (WARN_ON_ONCE(dm_bufio_in_request()))
		return ERR_PTR(-EINVAL);

	return new_read(c, block, NF_READ, bp, ioprio);
}

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	return __dm_bufio_read(c, block, bp, IOPRIO_DEFAULT);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);
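/*
 * Illustrative error handling for the read interface (a sketch; the caller
 * and block number are assumptions): dm_bufio_read() returns the data
 * pointer on success and an ERR_PTR() on failure, with *bp only set on
 * success.
 *
 *	struct dm_buffer *b;
 *	void *data = dm_bufio_read(c, block, &b);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);	// e.g. -EIO after a failed read
 *	... use data ...
 *	dm_bufio_release(b);
 */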
1950 void *dm_bufio_read_with_ioprio(struct dm_bufio_client
*c
, sector_t block
,
1951 struct dm_buffer
**bp
, unsigned short ioprio
)
1953 return __dm_bufio_read(c
, block
, bp
, ioprio
);
1955 EXPORT_SYMBOL_GPL(dm_bufio_read_with_ioprio
);
1957 void *dm_bufio_new(struct dm_bufio_client
*c
, sector_t block
,
1958 struct dm_buffer
**bp
)
1960 if (WARN_ON_ONCE(dm_bufio_in_request()))
1961 return ERR_PTR(-EINVAL
);
1963 return new_read(c
, block
, NF_FRESH
, bp
, IOPRIO_DEFAULT
);
1965 EXPORT_SYMBOL_GPL(dm_bufio_new
);
1967 static void __dm_bufio_prefetch(struct dm_bufio_client
*c
,
1968 sector_t block
, unsigned int n_blocks
,
1969 unsigned short ioprio
)
1971 struct blk_plug plug
;
1973 LIST_HEAD(write_list
);
1975 if (WARN_ON_ONCE(dm_bufio_in_request()))
1976 return; /* should never happen */
1978 blk_start_plug(&plug
);
1980 for (; n_blocks
--; block
++) {
1982 struct dm_buffer
*b
;
1984 b
= cache_get(&c
->cache
, block
);
1986 /* already in cache */
1987 cache_put_and_wake(c
, b
);
1992 b
= __bufio_new(c
, block
, NF_PREFETCH
, &need_submit
,
1994 if (unlikely(!list_empty(&write_list
))) {
1996 blk_finish_plug(&plug
);
1997 __flush_write_list(&write_list
);
1998 blk_start_plug(&plug
);
2001 if (unlikely(b
!= NULL
)) {
2005 submit_io(b
, REQ_OP_READ
, ioprio
, read_endio
);
2006 dm_bufio_release(b
);
2018 blk_finish_plug(&plug
);
2021 void dm_bufio_prefetch(struct dm_bufio_client
*c
, sector_t block
, unsigned int n_blocks
)
2023 return __dm_bufio_prefetch(c
, block
, n_blocks
, IOPRIO_DEFAULT
);
2025 EXPORT_SYMBOL_GPL(dm_bufio_prefetch
);
2027 void dm_bufio_prefetch_with_ioprio(struct dm_bufio_client
*c
, sector_t block
,
2028 unsigned int n_blocks
, unsigned short ioprio
)
2030 return __dm_bufio_prefetch(c
, block
, n_blocks
, ioprio
);
2032 EXPORT_SYMBOL_GPL(dm_bufio_prefetch_with_ioprio
);
2034 void dm_bufio_release(struct dm_buffer
*b
)
2036 struct dm_bufio_client
*c
= b
->c
;
2039 * If there were errors on the buffer, and the buffer is not
2040 * to be written, free the buffer. There is no point in caching
2043 if ((b
->read_error
|| b
->write_error
) &&
2044 !test_bit_acquire(B_READING
, &b
->state
) &&
2045 !test_bit(B_WRITING
, &b
->state
) &&
2046 !test_bit(B_DIRTY
, &b
->state
)) {
2049 /* cache remove can fail if there are other holders */
2050 if (cache_remove(&c
->cache
, b
)) {
2051 __free_buffer_wake(b
);
2059 cache_put_and_wake(c
, b
);
2061 EXPORT_SYMBOL_GPL(dm_bufio_release
);
2063 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer
*b
,
2064 unsigned int start
, unsigned int end
)
2066 struct dm_bufio_client
*c
= b
->c
;
2068 BUG_ON(start
>= end
);
2069 BUG_ON(end
> b
->c
->block_size
);
2073 BUG_ON(test_bit(B_READING
, &b
->state
));
2075 if (!test_and_set_bit(B_DIRTY
, &b
->state
)) {
2076 b
->dirty_start
= start
;
2078 cache_mark(&c
->cache
, b
, LIST_DIRTY
);
2080 if (start
< b
->dirty_start
)
2081 b
->dirty_start
= start
;
2082 if (end
> b
->dirty_end
)
2088 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty
);
void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
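/*
 * Illustrative use of the partial variant above (the byte range and 'src'
 * are assumptions): if only bytes [512, 1024) of a buffer were modified,
 * marking just that range lets the writeback path shrink the write to the
 * DM_BUFIO_WRITE_ALIGN-rounded region instead of the whole block.
 *
 *	memcpy((char *)dm_bufio_get_block_data(b) + 512, src, 512);
 *	dm_bufio_mark_partial_buffer_dirty(b, 512, 1024);
 */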
2096 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client
*c
)
2098 LIST_HEAD(write_list
);
2100 if (WARN_ON_ONCE(dm_bufio_in_request()))
2101 return; /* should never happen */
2104 __write_dirty_buffers_async(c
, 0, &write_list
);
2106 __flush_write_list(&write_list
);
2108 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async
);
/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
static bool is_writing(struct lru_entry *e, void *context)
{
	struct dm_buffer *b = le_to_buffer(e);

	return test_bit(B_WRITING, &b->state);
}
2124 int dm_bufio_write_dirty_buffers(struct dm_bufio_client
*c
)
2127 unsigned long nr_buffers
;
2128 struct lru_entry
*e
;
2131 LIST_HEAD(write_list
);
2134 __write_dirty_buffers_async(c
, 0, &write_list
);
2136 __flush_write_list(&write_list
);
2139 nr_buffers
= cache_count(&c
->cache
, LIST_DIRTY
);
2140 lru_iter_begin(&c
->cache
.lru
[LIST_DIRTY
], &it
);
2141 while ((e
= lru_iter_next(&it
, is_writing
, c
))) {
2142 struct dm_buffer
*b
= le_to_buffer(e
);
2143 __cache_inc_buffer(b
);
2145 BUG_ON(test_bit(B_READING
, &b
->state
));
2150 wait_on_bit_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
2153 wait_on_bit_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
2156 if (!test_bit(B_DIRTY
, &b
->state
) && !test_bit(B_WRITING
, &b
->state
))
2157 cache_mark(&c
->cache
, b
, LIST_CLEAN
);
2159 cache_put_and_wake(c
, b
);
2165 wake_up(&c
->free_buffer_wait
);
2168 a
= xchg(&c
->async_write_error
, 0);
2169 f
= dm_bufio_issue_flush(c
);
2175 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers
);
2178 * Use dm-io to send an empty barrier to flush the device.
2180 int dm_bufio_issue_flush(struct dm_bufio_client
*c
)
2182 struct dm_io_request io_req
= {
2183 .bi_opf
= REQ_OP_WRITE
| REQ_PREFLUSH
| REQ_SYNC
,
2184 .mem
.type
= DM_IO_KMEM
,
2185 .mem
.ptr
.addr
= NULL
,
2188 struct dm_io_region io_reg
= {
2194 if (WARN_ON_ONCE(dm_bufio_in_request()))
2197 return dm_io(&io_req
, 1, &io_reg
, NULL
, IOPRIO_DEFAULT
);
2199 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush
);
2202 * Use dm-io to send a discard request to flush the device.
2204 int dm_bufio_issue_discard(struct dm_bufio_client
*c
, sector_t block
, sector_t count
)
2206 struct dm_io_request io_req
= {
2207 .bi_opf
= REQ_OP_DISCARD
| REQ_SYNC
,
2208 .mem
.type
= DM_IO_KMEM
,
2209 .mem
.ptr
.addr
= NULL
,
2212 struct dm_io_region io_reg
= {
2214 .sector
= block_to_sector(c
, block
),
2215 .count
= block_to_sector(c
, count
),
2218 if (WARN_ON_ONCE(dm_bufio_in_request()))
2219 return -EINVAL
; /* discards are optional */
2221 return dm_io(&io_req
, 1, &io_reg
, NULL
, IOPRIO_DEFAULT
);
2223 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard
);
2225 static bool forget_buffer(struct dm_bufio_client
*c
, sector_t block
)
2227 struct dm_buffer
*b
;
2229 b
= cache_get(&c
->cache
, block
);
2231 if (likely(!smp_load_acquire(&b
->state
))) {
2232 if (cache_remove(&c
->cache
, b
))
2233 __free_buffer_wake(b
);
2235 cache_put_and_wake(c
, b
);
2237 cache_put_and_wake(c
, b
);
2241 return b
? true : false;
2245 * Free the given buffer.
2247 * This is just a hint, if the buffer is in use or dirty, this function
2250 void dm_bufio_forget(struct dm_bufio_client
*c
, sector_t block
)
2253 forget_buffer(c
, block
);
2256 EXPORT_SYMBOL_GPL(dm_bufio_forget
);
2258 static enum evict_result
idle(struct dm_buffer
*b
, void *context
)
2260 return b
->state
? ER_DONT_EVICT
: ER_EVICT
;
2263 void dm_bufio_forget_buffers(struct dm_bufio_client
*c
, sector_t block
, sector_t n_blocks
)
2266 cache_remove_range(&c
->cache
, block
, block
+ n_blocks
, idle
, __free_buffer_wake
);
2269 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers
);
2271 void dm_bufio_set_minimum_buffers(struct dm_bufio_client
*c
, unsigned int n
)
2273 c
->minimum_buffers
= n
;
2275 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers
);
2277 unsigned int dm_bufio_get_block_size(struct dm_bufio_client
*c
)
2279 return c
->block_size
;
2281 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size
);
2283 sector_t
dm_bufio_get_device_size(struct dm_bufio_client
*c
)
2285 sector_t s
= bdev_nr_sectors(c
->bdev
);
2291 if (likely(c
->sectors_per_block_bits
>= 0))
2292 s
>>= c
->sectors_per_block_bits
;
2294 sector_div(s
, c
->block_size
>> SECTOR_SHIFT
);
2297 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size
);
2299 struct dm_io_client
*dm_bufio_get_dm_io_client(struct dm_bufio_client
*c
)
2303 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client
);
2305 sector_t
dm_bufio_get_block_number(struct dm_buffer
*b
)
2309 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number
);
2311 void *dm_bufio_get_block_data(struct dm_buffer
*b
)
2315 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data
);
2317 void *dm_bufio_get_aux_data(struct dm_buffer
*b
)
2321 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data
);
2323 struct dm_bufio_client
*dm_bufio_get_client(struct dm_buffer
*b
)
2327 EXPORT_SYMBOL_GPL(dm_bufio_get_client
);
2329 static enum it_action
warn_leak(struct dm_buffer
*b
, void *context
)
2331 bool *warned
= context
;
2333 WARN_ON(!(*warned
));
2335 DMERR("leaked buffer %llx, hold count %u, list %d",
2336 (unsigned long long)b
->block
, atomic_read(&b
->hold_count
), b
->list_mode
);
2337 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2338 stack_trace_print(b
->stack_entries
, b
->stack_len
, 1);
2339 /* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
2340 atomic_set(&b
->hold_count
, 0);
2345 static void drop_buffers(struct dm_bufio_client
*c
)
2348 struct dm_buffer
*b
;
2350 if (WARN_ON(dm_bufio_in_request()))
2351 return; /* should never happen */
2354 * An optimization so that the buffers are not written one-by-one.
2356 dm_bufio_write_dirty_buffers_async(c
);
2360 while ((b
= __get_unclaimed_buffer(c
)))
2361 __free_buffer_wake(b
);
2363 for (i
= 0; i
< LIST_SIZE
; i
++) {
2364 bool warned
= false;
2366 cache_iterate(&c
->cache
, i
, warn_leak
, &warned
);
2369 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2370 while ((b
= __get_unclaimed_buffer(c
)))
2371 __free_buffer_wake(b
);
2374 for (i
= 0; i
< LIST_SIZE
; i
++)
2375 WARN_ON(cache_count(&c
->cache
, i
));
static unsigned long get_retain_buffers(struct dm_bufio_client *c)
{
	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);

	if (likely(c->sectors_per_block_bits >= 0))
		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
	else
		retain_bytes /= c->block_size;

	return retain_bytes;
}
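/*
 * Worked example (illustrative): with the default
 * DM_BUFIO_DEFAULT_RETAIN_BYTES of 256 KiB and 4 KiB blocks
 * (sectors_per_block_bits == 3), retain_bytes >> (3 + SECTOR_SHIFT) is
 * 262144 >> 12 = 64 buffers, which __scan() and dm_bufio_shrink_count()
 * treat as a floor below which they stop reclaiming.
 */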
2392 static void __scan(struct dm_bufio_client
*c
)
2395 struct dm_buffer
*b
;
2396 unsigned long freed
= 0;
2397 unsigned long retain_target
= get_retain_buffers(c
);
2398 unsigned long count
= cache_total(&c
->cache
);
2400 for (l
= 0; l
< LIST_SIZE
; l
++) {
2402 if (count
- freed
<= retain_target
)
2403 atomic_long_set(&c
->need_shrink
, 0);
2404 if (!atomic_long_read(&c
->need_shrink
))
2407 b
= cache_evict(&c
->cache
, l
,
2408 l
== LIST_CLEAN
? is_clean
: is_dirty
, c
);
2412 __make_buffer_clean(b
);
2413 __free_buffer_wake(b
);
2415 atomic_long_dec(&c
->need_shrink
);
2422 static void shrink_work(struct work_struct
*w
)
2424 struct dm_bufio_client
*c
= container_of(w
, struct dm_bufio_client
, shrink_work
);
2431 static unsigned long dm_bufio_shrink_scan(struct shrinker
*shrink
, struct shrink_control
*sc
)
2433 struct dm_bufio_client
*c
;
2435 c
= shrink
->private_data
;
2436 atomic_long_add(sc
->nr_to_scan
, &c
->need_shrink
);
2437 queue_work(dm_bufio_wq
, &c
->shrink_work
);
2439 return sc
->nr_to_scan
;
2442 static unsigned long dm_bufio_shrink_count(struct shrinker
*shrink
, struct shrink_control
*sc
)
2444 struct dm_bufio_client
*c
= shrink
->private_data
;
2445 unsigned long count
= cache_total(&c
->cache
);
2446 unsigned long retain_target
= get_retain_buffers(c
);
2447 unsigned long queued_for_cleanup
= atomic_long_read(&c
->need_shrink
);
2449 if (unlikely(count
< retain_target
))
2452 count
-= retain_target
;
2454 if (unlikely(count
< queued_for_cleanup
))
2457 count
-= queued_for_cleanup
;
/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
                                               unsigned int reserved_buffers, unsigned int aux_size,
                                               void (*alloc_callback)(struct dm_buffer *),
                                               void (*write_callback)(struct dm_buffer *),
                                               unsigned int flags)
{
        int r;
        unsigned int num_locks;
        struct dm_bufio_client *c;
        char slab_name[64];

        if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
                DMERR("%s: block size not specified or is not multiple of 512b", __func__);
                r = -EINVAL;
                goto bad_client;
        }

        num_locks = dm_num_hash_locks();
        c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
        if (!c) {
                r = -ENOMEM;
                goto bad_client;
        }
        cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0);

        c->bdev = bdev;
        c->block_size = block_size;
        if (is_power_of_2(block_size))
                c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
        else
                c->sectors_per_block_bits = -1;

        c->alloc_callback = alloc_callback;
        c->write_callback = write_callback;

        if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
                c->no_sleep = true;
                static_branch_inc(&no_sleep_enabled);
        }

        mutex_init(&c->lock);
        spin_lock_init(&c->spinlock);
        INIT_LIST_HEAD(&c->reserved_buffers);
        c->need_reserved_buffers = reserved_buffers;

        dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);

        init_waitqueue_head(&c->free_buffer_wait);
        c->async_write_error = 0;

        c->dm_io = dm_io_client_create();
        if (IS_ERR(c->dm_io)) {
                r = PTR_ERR(c->dm_io);
                goto bad_dm_io;
        }

        if (block_size <= KMALLOC_MAX_SIZE &&
            (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
                unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);

                snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
                c->slab_cache = kmem_cache_create(slab_name, block_size, align,
                                                  SLAB_RECLAIM_ACCOUNT, NULL);
                if (!c->slab_cache) {
                        r = -ENOMEM;
                        goto bad;
                }
        }
        if (aux_size)
                snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
        else
                snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
        c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
                                           0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (!c->slab_buffer) {
                r = -ENOMEM;
                goto bad;
        }

        while (c->need_reserved_buffers) {
                struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

                if (!b) {
                        r = -ENOMEM;
                        goto bad;
                }
                __free_buffer_wake(b);
        }

        INIT_WORK(&c->shrink_work, shrink_work);
        atomic_long_set(&c->need_shrink, 0);

        c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)",
                                     MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
        if (!c->shrinker) {
                r = -ENOMEM;
                goto bad;
        }

        c->shrinker->count_objects = dm_bufio_shrink_count;
        c->shrinker->scan_objects = dm_bufio_shrink_scan;
        c->shrinker->seeks = 1;
        c->shrinker->batch = 0;
        c->shrinker->private_data = c;

        shrinker_register(c->shrinker);

        mutex_lock(&dm_bufio_clients_lock);
        dm_bufio_client_count++;
        list_add(&c->client_list, &dm_bufio_all_clients);
        __cache_size_refresh();
        mutex_unlock(&dm_bufio_clients_lock);

        return c;

bad:
        while (!list_empty(&c->reserved_buffers)) {
                struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

                list_del(&b->lru.list);
                free_buffer(b);
        }
        kmem_cache_destroy(c->slab_cache);
        kmem_cache_destroy(c->slab_buffer);
        dm_io_client_destroy(c->dm_io);
bad_dm_io:
        mutex_destroy(&c->lock);
        if (c->no_sleep)
                static_branch_dec(&no_sleep_enabled);
        kfree(c);
bad_client:
        return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
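
/*
 * A minimal usage sketch (illustrative only; the bdev, block size and
 * block number are placeholders and error handling is abbreviated):
 *
 *	struct dm_bufio_client *c;
 *	struct dm_buffer *b;
 *	void *data;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 *	data = dm_bufio_read(c, 0, &b);
 *	if (!IS_ERR(data)) {
 *		memset(data, 0, 4096);
 *		dm_bufio_mark_buffer_dirty(b);
 *		dm_bufio_release(b);
 *	}
 *	dm_bufio_write_dirty_buffers(c);
 *
 *	dm_bufio_client_destroy(c);
 */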
/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
        unsigned int i;

        drop_buffers(c);

        shrinker_free(c->shrinker);
        flush_work(&c->shrink_work);

        mutex_lock(&dm_bufio_clients_lock);

        list_del(&c->client_list);
        dm_bufio_client_count--;
        __cache_size_refresh();

        mutex_unlock(&dm_bufio_clients_lock);

        WARN_ON(c->need_reserved_buffers);

        while (!list_empty(&c->reserved_buffers)) {
                struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

                list_del(&b->lru.list);
                free_buffer(b);
        }

        for (i = 0; i < LIST_SIZE; i++)
                if (cache_count(&c->cache, i))
                        DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));

        for (i = 0; i < LIST_SIZE; i++)
                WARN_ON(cache_count(&c->cache, i));

        cache_destroy(&c->cache);
        kmem_cache_destroy(c->slab_cache);
        kmem_cache_destroy(c->slab_buffer);
        dm_io_client_destroy(c->dm_io);
        mutex_destroy(&c->lock);
        if (c->no_sleep)
                static_branch_dec(&no_sleep_enabled);
        kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
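
/*
 * Teardown note: the shrinker is freed and shrink_work flushed before the
 * client is unlinked and its slab caches are destroyed, so no shrink can
 * still be running against a partially torn-down client.
 */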
void dm_bufio_client_reset(struct dm_bufio_client *c)
{
        drop_buffers(c);
        flush_work(&c->shrink_work);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
{
        c->start = start;
}
EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
/*--------------------------------------------------------------*/
static unsigned int get_max_age_hz(void)
{
        unsigned int max_age = READ_ONCE(dm_bufio_max_age);

        if (max_age > UINT_MAX / HZ)
                max_age = UINT_MAX / HZ;

        return max_age * HZ;
}
static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
        return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
}
struct evict_params {
        gfp_t gfp;
        unsigned long age_hz;

        /*
         * This gets updated with the largest last_accessed (ie. most
         * recently used) of the evicted buffers. It will not be reinitialised
         * by __evict_many(), so you can use it across multiple invocations.
         */
        unsigned long last_accessed;
};
/*
 * We may not be able to evict this buffer if IO pending or the client
 * is still using it.
 *
 * And if GFP_NOFS is used, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to different bufio client.
 */
static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
{
        struct evict_params *params = context;

        if (!(params->gfp & __GFP_FS) ||
            (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
                if (test_bit_acquire(B_READING, &b->state) ||
                    test_bit(B_WRITING, &b->state) ||
                    test_bit(B_DIRTY, &b->state))
                        return ER_DONT_EVICT;
        }

        return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
}
static unsigned long __evict_many(struct dm_bufio_client *c,
                                  struct evict_params *params,
                                  int list_mode, unsigned long max_count)
{
        unsigned long count;
        unsigned long last_accessed;
        struct dm_buffer *b;

        for (count = 0; count < max_count; count++) {
                b = cache_evict(&c->cache, list_mode, select_for_evict, params);
                if (!b)
                        break;

                last_accessed = READ_ONCE(b->last_accessed);
                if (time_after_eq(params->last_accessed, last_accessed))
                        params->last_accessed = last_accessed;

                __make_buffer_clean(b);
                __free_buffer_wake(b);

                cond_resched();
        }

        return count;
}
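
/*
 * The loop above exits as soon as cache_evict() yields no buffer, so the
 * return value may be smaller than max_count; callers only learn how many
 * buffers were actually cleaned and freed.
 */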
static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
        struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
        unsigned long retain = get_retain_buffers(c);
        unsigned long count;
        LIST_HEAD(write_list);

        dm_bufio_lock(c);

        __check_watermark(c, &write_list);
        if (unlikely(!list_empty(&write_list))) {
                dm_bufio_unlock(c);
                __flush_write_list(&write_list);
                dm_bufio_lock(c);
        }

        count = cache_total(&c->cache);
        if (count > retain)
                __evict_many(c, &params, LIST_CLEAN, count - retain);

        dm_bufio_unlock(c);
}
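
/*
 * Only the clean list is aged out here; __check_watermark() above may have
 * queued dirty buffers for writeback, and that write list is flushed with
 * the client lock temporarily dropped.
 */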
static void cleanup_old_buffers(void)
{
        unsigned long max_age_hz = get_max_age_hz();
        struct dm_bufio_client *c;

        mutex_lock(&dm_bufio_clients_lock);

        __cache_size_refresh();

        list_for_each_entry(c, &dm_bufio_all_clients, client_list)
                evict_old_buffers(c, max_age_hz);

        mutex_unlock(&dm_bufio_clients_lock);
}
static void work_fn(struct work_struct *w)
{
        cleanup_old_buffers();

        queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
                           DM_BUFIO_WORK_TIMER_SECS * HZ);
}
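
/*
 * work_fn() re-arms itself, so once dm_bufio_init() queues
 * dm_bufio_cleanup_old_work the ageing pass repeats every
 * DM_BUFIO_WORK_TIMER_SECS seconds until dm_bufio_exit() cancels it.
 */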
/*--------------------------------------------------------------*/
/*
 * Global cleanup tries to evict the oldest buffers from across _all_
 * the clients. It does this by repeatedly evicting a few buffers from
 * the client that holds the oldest buffer. It's approximate, but hopefully
 * good enough.
 */
static struct dm_bufio_client *__pop_client(void)
{
        struct list_head *h;

        if (list_empty(&dm_bufio_all_clients))
                return NULL;

        h = dm_bufio_all_clients.next;
        list_del(h);
        return container_of(h, struct dm_bufio_client, client_list);
}
/*
 * Inserts the client in the global client list based on its
 * 'oldest_buffer' field.
 */
static void __insert_client(struct dm_bufio_client *new_client)
{
        struct dm_bufio_client *c;
        struct list_head *h = dm_bufio_all_clients.next;

        while (h != &dm_bufio_all_clients) {
                c = container_of(h, struct dm_bufio_client, client_list);
                if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
                        break;

                h = h->next;
        }

        list_add_tail(&new_client->client_list, h);
}
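
/*
 * Together, __pop_client() and __insert_client() keep dm_bufio_all_clients
 * sorted by 'oldest_buffer', so the head of the list is (approximately)
 * the client holding the oldest cached data.
 */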
static unsigned long __evict_a_few(unsigned long nr_buffers)
{
        unsigned long count;
        struct dm_bufio_client *c;
        struct evict_params params = {
                .gfp = GFP_KERNEL,
                .age_hz = 0,
                /* set to jiffies in case there are no buffers in this client */
                .last_accessed = jiffies
        };

        c = __pop_client();
        if (!c)
                return 0;

        dm_bufio_lock(c);
        count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
        dm_bufio_unlock(c);

        if (count)
                c->oldest_buffer = params.last_accessed;
        __insert_client(c);

        return count;
}
static void check_watermarks(void)
{
        LIST_HEAD(write_list);
        struct dm_bufio_client *c;

        mutex_lock(&dm_bufio_clients_lock);
        list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
                dm_bufio_lock(c);
                __check_watermark(c, &write_list);
                dm_bufio_unlock(c);
        }
        mutex_unlock(&dm_bufio_clients_lock);

        __flush_write_list(&write_list);
}
static void evict_old(void)
{
        unsigned long threshold = dm_bufio_cache_size -
                dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;

        mutex_lock(&dm_bufio_clients_lock);
        while (dm_bufio_current_allocated > threshold) {
                if (!__evict_a_few(64))
                        break;
                cond_resched();
        }
        mutex_unlock(&dm_bufio_clients_lock);
}
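
/*
 * The threshold above is dm_bufio_cache_size minus a
 * 1/DM_BUFIO_LOW_WATERMARK_RATIO slice of it (e.g. a ratio of 16 leaves
 * 1/16 headroom); buffers are evicted 64 at a time until the allocation
 * drops back under that watermark or nothing more can be evicted.
 */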
static void do_global_cleanup(struct work_struct *w)
{
        check_watermarks();
        evict_old();
}

/*
 *--------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------
 */
/*
 * This is called only once for the whole dm_bufio module.
 * It initializes memory limit.
 */
static int __init dm_bufio_init(void)
{
        __u64 mem;

        dm_bufio_allocated_kmem_cache = 0;
        dm_bufio_allocated_get_free_pages = 0;
        dm_bufio_allocated_vmalloc = 0;
        dm_bufio_current_allocated = 0;

        mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
                               DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;

        if (mem > ULONG_MAX)
                mem = ULONG_MAX;

#ifdef CONFIG_MMU
        if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
                mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
#endif

        dm_bufio_default_cache_size = mem;

        mutex_lock(&dm_bufio_clients_lock);
        __cache_size_refresh();
        mutex_unlock(&dm_bufio_clients_lock);

        dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
        if (!dm_bufio_wq)
                return -ENOMEM;

        INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
        INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
        queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
                           DM_BUFIO_WORK_TIMER_SECS * HZ);

        return 0;
}
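
/*
 * Illustrative sizing: with a memory percentage of 2, a machine with 16 GiB
 * of low memory gets a default cache limit of roughly 328 MiB, further
 * capped by the vmalloc-space percentage (which mostly matters on 32-bit).
 */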
/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
        int bug = 0;

        cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
        destroy_workqueue(dm_bufio_wq);

        if (dm_bufio_client_count) {
                DMCRIT("%s: dm_bufio_client_count leaked: %d",
                       __func__, dm_bufio_client_count);
                bug = 1;
        }

        if (dm_bufio_current_allocated) {
                DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
                       __func__, dm_bufio_current_allocated);
                bug = 1;
        }

        if (dm_bufio_allocated_get_free_pages) {
                DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
                       __func__, dm_bufio_allocated_get_free_pages);
                bug = 1;
        }

        if (dm_bufio_allocated_vmalloc) {
                DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
                       __func__, dm_bufio_allocated_vmalloc);
                bug = 1;
        }

        WARN_ON(bug); /* leaks are not worth crashing the system */
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)
module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");
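
/*
 * When built as a module, the parameters above are also visible under
 * /sys/module/dm_bufio/parameters/; the 0644 ones (e.g. max_cache_size_bytes)
 * can be tuned at runtime, while the 0444 ones are read-only statistics.
 */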