1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2009-2011 Red Hat, Inc.
5 * Author: Mikulas Patocka <mpatocka@redhat.com>
7 * This file is released under the GPL.
10 #include <linux/dm-bufio.h>
12 #include <linux/device-mapper.h>
13 #include <linux/dm-io.h>
14 #include <linux/slab.h>
15 #include <linux/sched/mm.h>
16 #include <linux/jiffies.h>
17 #include <linux/vmalloc.h>
18 #include <linux/shrinker.h>
19 #include <linux/module.h>
20 #include <linux/rbtree.h>
21 #include <linux/stacktrace.h>
22 #include <linux/jump_label.h>
26 #define DM_MSG_PREFIX "bufio"
29 * Memory management policy:
30 * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
31 * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
32 * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
33 * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
36 #define DM_BUFIO_MIN_BUFFERS 8
38 #define DM_BUFIO_MEMORY_PERCENT 2
39 #define DM_BUFIO_VMALLOC_PERCENT 25
40 #define DM_BUFIO_WRITEBACK_RATIO 3
41 #define DM_BUFIO_LOW_WATERMARK_RATIO 16
44 * Check buffer ages in this interval (seconds)
46 #define DM_BUFIO_WORK_TIMER_SECS 30
49 * Free buffers when they are older than this (seconds)
51 #define DM_BUFIO_DEFAULT_AGE_SECS 300
54 * The nr of bytes of cached data to keep around.
56 #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
59 * Align buffer writes to this boundary.
60 * Tests show that SSDs have the highest IOPS when using 4k writes.
62 #define DM_BUFIO_WRITE_ALIGN 4096
65 * dm_buffer->list_mode
71 /*--------------------------------------------------------------*/
74 * Rather than use an LRU list, we use a clock algorithm where entries
75 * are held in a circular list. When an entry is 'hit' a reference bit
76 * is set. The least recently used entry is approximated by running a
77 * cursor around the list selecting unreferenced entries. Referenced
78 * entries have their reference bit cleared as the cursor passes them.
81 struct list_head list
;
87 struct list_head list
;
88 struct lru_entry
*stop
;
93 struct list_head
*cursor
;
96 struct list_head iterators
;
101 static void lru_init(struct lru
*lru
)
105 INIT_LIST_HEAD(&lru
->iterators
);
108 static void lru_destroy(struct lru
*lru
)
110 WARN_ON_ONCE(lru
->cursor
);
111 WARN_ON_ONCE(!list_empty(&lru
->iterators
));
115 * Insert a new entry into the lru.
117 static void lru_insert(struct lru
*lru
, struct lru_entry
*le
)
120 * Don't be tempted to set to 1, makes the lru aspect
123 atomic_set(&le
->referenced
, 0);
126 list_add_tail(&le
->list
, lru
->cursor
);
128 INIT_LIST_HEAD(&le
->list
);
129 lru
->cursor
= &le
->list
;
137 * Convert a list_head pointer to an lru_entry pointer.
139 static inline struct lru_entry
*to_le(struct list_head
*l
)
141 return container_of(l
, struct lru_entry
, list
);
145 * Initialize an lru_iter and add it to the list of cursors in the lru.
147 static void lru_iter_begin(struct lru
*lru
, struct lru_iter
*it
)
150 it
->stop
= lru
->cursor
? to_le(lru
->cursor
->prev
) : NULL
;
151 it
->e
= lru
->cursor
? to_le(lru
->cursor
) : NULL
;
152 list_add(&it
->list
, &lru
->iterators
);
156 * Remove an lru_iter from the list of cursors in the lru.
158 static inline void lru_iter_end(struct lru_iter
*it
)
163 /* Predicate function type to be used with lru_iter_next */
164 typedef bool (*iter_predicate
)(struct lru_entry
*le
, void *context
);
167 * Advance the cursor to the next entry that passes the
168 * predicate, and return that entry. Returns NULL if the
169 * iteration is complete.
171 static struct lru_entry
*lru_iter_next(struct lru_iter
*it
,
172 iter_predicate pred
, void *context
)
179 /* advance the cursor */
180 if (it
->e
== it
->stop
)
183 it
->e
= to_le(it
->e
->list
.next
);
185 if (pred(e
, context
))
193 * Invalidate a specific lru_entry and update all cursors in
194 * the lru accordingly.
196 static void lru_iter_invalidate(struct lru
*lru
, struct lru_entry
*e
)
200 list_for_each_entry(it
, &lru
->iterators
, list
) {
201 /* Move c->e forwards if necc. */
203 it
->e
= to_le(it
->e
->list
.next
);
208 /* Move it->stop backwards if necc. */
210 it
->stop
= to_le(it
->stop
->list
.prev
);
220 * Remove a specific entry from the lru.
222 static void lru_remove(struct lru
*lru
, struct lru_entry
*le
)
224 lru_iter_invalidate(lru
, le
);
225 if (lru
->count
== 1) {
228 if (lru
->cursor
== &le
->list
)
229 lru
->cursor
= lru
->cursor
->next
;
236 * Mark as referenced.
238 static inline void lru_reference(struct lru_entry
*le
)
240 atomic_set(&le
->referenced
, 1);
246 * Remove the least recently used entry (approx), that passes the predicate.
247 * Returns NULL on failure.
252 ER_STOP
, /* stop looking for something to evict */
255 typedef enum evict_result (*le_predicate
)(struct lru_entry
*le
, void *context
);
257 static struct lru_entry
*lru_evict(struct lru
*lru
, le_predicate pred
, void *context
, bool no_sleep
)
259 unsigned long tested
= 0;
260 struct list_head
*h
= lru
->cursor
;
261 struct lru_entry
*le
;
266 * In the worst case we have to loop around twice. Once to clear
267 * the reference flags, and then again to discover the predicate
268 * fails for all entries.
270 while (tested
< lru
->count
) {
271 le
= container_of(h
, struct lru_entry
, list
);
273 if (atomic_read(&le
->referenced
)) {
274 atomic_set(&le
->referenced
, 0);
277 switch (pred(le
, context
)) {
280 * Adjust the cursor, so we start the next
283 lru
->cursor
= le
->list
.next
;
291 lru
->cursor
= le
->list
.next
;
305 /*--------------------------------------------------------------*/
315 * Describes how the block was allocated:
316 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
317 * See the comment at alloc_buffer_data.
321 DATA_MODE_KMALLOC
= 1,
322 DATA_MODE_GET_FREE_PAGES
= 2,
323 DATA_MODE_VMALLOC
= 3,
328 /* protected by the locks in dm_buffer_cache */
331 /* immutable, so don't need protecting */
334 unsigned char data_mode
; /* DATA_MODE_* */
337 * These two fields are used in isolation, so do not need
338 * a surrounding lock.
341 unsigned long last_accessed
;
344 * Everything else is protected by the mutex in
348 struct lru_entry lru
;
349 unsigned char list_mode
; /* LIST_* */
350 blk_status_t read_error
;
351 blk_status_t write_error
;
352 unsigned int dirty_start
;
353 unsigned int dirty_end
;
354 unsigned int write_start
;
355 unsigned int write_end
;
356 struct list_head write_list
;
357 struct dm_bufio_client
*c
;
358 void (*end_io
)(struct dm_buffer
*b
, blk_status_t bs
);
359 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
361 unsigned int stack_len
;
362 unsigned long stack_entries
[MAX_STACK
];
366 /*--------------------------------------------------------------*/
369 * The buffer cache manages buffers, particularly:
370 * - inc/dec of holder count
371 * - setting the last_accessed field
372 * - maintains clean/dirty state along with lru
373 * - selecting buffers that match predicates
375 * It does *not* handle:
376 * - allocation/freeing of buffers.
378 * - Eviction or cache sizing.
380 * cache_get() and cache_put() are threadsafe, you do not need to
381 * protect these calls with a surrounding mutex. All the other
382 * methods are not threadsafe; they do use locking primitives, but
383 * only enough to ensure get/put are threadsafe.
388 struct rw_semaphore lock
;
392 } ____cacheline_aligned_in_smp
;
394 struct dm_buffer_cache
{
395 struct lru lru
[LIST_SIZE
];
397 * We spread entries across multiple trees to reduce contention
400 unsigned int num_locks
;
402 struct buffer_tree trees
[];
405 static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled
);
407 static inline unsigned int cache_index(sector_t block
, unsigned int num_locks
)
409 return dm_hash_locks_index(block
, num_locks
);
412 static inline void cache_read_lock(struct dm_buffer_cache
*bc
, sector_t block
)
414 if (static_branch_unlikely(&no_sleep_enabled
) && bc
->no_sleep
)
415 read_lock_bh(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.spinlock
);
417 down_read(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.lock
);
420 static inline void cache_read_unlock(struct dm_buffer_cache
*bc
, sector_t block
)
422 if (static_branch_unlikely(&no_sleep_enabled
) && bc
->no_sleep
)
423 read_unlock_bh(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.spinlock
);
425 up_read(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.lock
);
428 static inline void cache_write_lock(struct dm_buffer_cache
*bc
, sector_t block
)
430 if (static_branch_unlikely(&no_sleep_enabled
) && bc
->no_sleep
)
431 write_lock_bh(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.spinlock
);
433 down_write(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.lock
);
436 static inline void cache_write_unlock(struct dm_buffer_cache
*bc
, sector_t block
)
438 if (static_branch_unlikely(&no_sleep_enabled
) && bc
->no_sleep
)
439 write_unlock_bh(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.spinlock
);
441 up_write(&bc
->trees
[cache_index(block
, bc
->num_locks
)].u
.lock
);
445 * Sometimes we want to repeatedly get and drop locks as part of an iteration.
446 * This struct helps avoid redundant drop and gets of the same lock.
448 struct lock_history
{
449 struct dm_buffer_cache
*cache
;
451 unsigned int previous
;
452 unsigned int no_previous
;
455 static void lh_init(struct lock_history
*lh
, struct dm_buffer_cache
*cache
, bool write
)
459 lh
->no_previous
= cache
->num_locks
;
460 lh
->previous
= lh
->no_previous
;
463 static void __lh_lock(struct lock_history
*lh
, unsigned int index
)
466 if (static_branch_unlikely(&no_sleep_enabled
) && lh
->cache
->no_sleep
)
467 write_lock_bh(&lh
->cache
->trees
[index
].u
.spinlock
);
469 down_write(&lh
->cache
->trees
[index
].u
.lock
);
471 if (static_branch_unlikely(&no_sleep_enabled
) && lh
->cache
->no_sleep
)
472 read_lock_bh(&lh
->cache
->trees
[index
].u
.spinlock
);
474 down_read(&lh
->cache
->trees
[index
].u
.lock
);
478 static void __lh_unlock(struct lock_history
*lh
, unsigned int index
)
481 if (static_branch_unlikely(&no_sleep_enabled
) && lh
->cache
->no_sleep
)
482 write_unlock_bh(&lh
->cache
->trees
[index
].u
.spinlock
);
484 up_write(&lh
->cache
->trees
[index
].u
.lock
);
486 if (static_branch_unlikely(&no_sleep_enabled
) && lh
->cache
->no_sleep
)
487 read_unlock_bh(&lh
->cache
->trees
[index
].u
.spinlock
);
489 up_read(&lh
->cache
->trees
[index
].u
.lock
);
494 * Make sure you call this since it will unlock the final lock.
496 static void lh_exit(struct lock_history
*lh
)
498 if (lh
->previous
!= lh
->no_previous
) {
499 __lh_unlock(lh
, lh
->previous
);
500 lh
->previous
= lh
->no_previous
;
505 * Named 'next' because there is no corresponding
506 * 'up/unlock' call since it's done automatically.
508 static void lh_next(struct lock_history
*lh
, sector_t b
)
510 unsigned int index
= cache_index(b
, lh
->no_previous
); /* no_previous is num_locks */
512 if (lh
->previous
!= lh
->no_previous
) {
513 if (lh
->previous
!= index
) {
514 __lh_unlock(lh
, lh
->previous
);
515 __lh_lock(lh
, index
);
516 lh
->previous
= index
;
519 __lh_lock(lh
, index
);
520 lh
->previous
= index
;
524 static inline struct dm_buffer
*le_to_buffer(struct lru_entry
*le
)
526 return container_of(le
, struct dm_buffer
, lru
);
529 static struct dm_buffer
*list_to_buffer(struct list_head
*l
)
531 struct lru_entry
*le
= list_entry(l
, struct lru_entry
, list
);
533 return le_to_buffer(le
);
536 static void cache_init(struct dm_buffer_cache
*bc
, unsigned int num_locks
, bool no_sleep
)
540 bc
->num_locks
= num_locks
;
541 bc
->no_sleep
= no_sleep
;
543 for (i
= 0; i
< bc
->num_locks
; i
++) {
545 rwlock_init(&bc
->trees
[i
].u
.spinlock
);
547 init_rwsem(&bc
->trees
[i
].u
.lock
);
548 bc
->trees
[i
].root
= RB_ROOT
;
551 lru_init(&bc
->lru
[LIST_CLEAN
]);
552 lru_init(&bc
->lru
[LIST_DIRTY
]);
555 static void cache_destroy(struct dm_buffer_cache
*bc
)
559 for (i
= 0; i
< bc
->num_locks
; i
++)
560 WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc
->trees
[i
].root
));
562 lru_destroy(&bc
->lru
[LIST_CLEAN
]);
563 lru_destroy(&bc
->lru
[LIST_DIRTY
]);
569 * not threadsafe, or racey depending how you look at it
571 static inline unsigned long cache_count(struct dm_buffer_cache
*bc
, int list_mode
)
573 return bc
->lru
[list_mode
].count
;
576 static inline unsigned long cache_total(struct dm_buffer_cache
*bc
)
578 return cache_count(bc
, LIST_CLEAN
) + cache_count(bc
, LIST_DIRTY
);
584 * Gets a specific buffer, indexed by block.
585 * If the buffer is found then its holder count will be incremented and
586 * lru_reference will be called.
590 static struct dm_buffer
*__cache_get(const struct rb_root
*root
, sector_t block
)
592 struct rb_node
*n
= root
->rb_node
;
596 b
= container_of(n
, struct dm_buffer
, node
);
598 if (b
->block
== block
)
601 n
= block
< b
->block
? n
->rb_left
: n
->rb_right
;
607 static void __cache_inc_buffer(struct dm_buffer
*b
)
609 atomic_inc(&b
->hold_count
);
610 WRITE_ONCE(b
->last_accessed
, jiffies
);
613 static struct dm_buffer
*cache_get(struct dm_buffer_cache
*bc
, sector_t block
)
617 cache_read_lock(bc
, block
);
618 b
= __cache_get(&bc
->trees
[cache_index(block
, bc
->num_locks
)].root
, block
);
620 lru_reference(&b
->lru
);
621 __cache_inc_buffer(b
);
623 cache_read_unlock(bc
, block
);
631 * Returns true if the hold count hits zero.
634 static bool cache_put(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
)
638 cache_read_lock(bc
, b
->block
);
639 BUG_ON(!atomic_read(&b
->hold_count
));
640 r
= atomic_dec_and_test(&b
->hold_count
);
641 cache_read_unlock(bc
, b
->block
);
648 typedef enum evict_result (*b_predicate
)(struct dm_buffer
*, void *);
651 * Evicts a buffer based on a predicate. The oldest buffer that
652 * matches the predicate will be selected. In addition to the
653 * predicate the hold_count of the selected buffer will be zero.
655 struct evict_wrapper
{
656 struct lock_history
*lh
;
662 * Wraps the buffer predicate turning it into an lru predicate. Adds
663 * extra test for hold_count.
665 static enum evict_result
__evict_pred(struct lru_entry
*le
, void *context
)
667 struct evict_wrapper
*w
= context
;
668 struct dm_buffer
*b
= le_to_buffer(le
);
670 lh_next(w
->lh
, b
->block
);
672 if (atomic_read(&b
->hold_count
))
673 return ER_DONT_EVICT
;
675 return w
->pred(b
, w
->context
);
678 static struct dm_buffer
*__cache_evict(struct dm_buffer_cache
*bc
, int list_mode
,
679 b_predicate pred
, void *context
,
680 struct lock_history
*lh
)
682 struct evict_wrapper w
= {.lh
= lh
, .pred
= pred
, .context
= context
};
683 struct lru_entry
*le
;
686 le
= lru_evict(&bc
->lru
[list_mode
], __evict_pred
, &w
, bc
->no_sleep
);
690 b
= le_to_buffer(le
);
691 /* __evict_pred will have locked the appropriate tree. */
692 rb_erase(&b
->node
, &bc
->trees
[cache_index(b
->block
, bc
->num_locks
)].root
);
697 static struct dm_buffer
*cache_evict(struct dm_buffer_cache
*bc
, int list_mode
,
698 b_predicate pred
, void *context
)
701 struct lock_history lh
;
703 lh_init(&lh
, bc
, true);
704 b
= __cache_evict(bc
, list_mode
, pred
, context
, &lh
);
713 * Mark a buffer as clean or dirty. Not threadsafe.
715 static void cache_mark(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
, int list_mode
)
717 cache_write_lock(bc
, b
->block
);
718 if (list_mode
!= b
->list_mode
) {
719 lru_remove(&bc
->lru
[b
->list_mode
], &b
->lru
);
720 b
->list_mode
= list_mode
;
721 lru_insert(&bc
->lru
[b
->list_mode
], &b
->lru
);
723 cache_write_unlock(bc
, b
->block
);
729 * Runs through the lru associated with 'old_mode', if the predicate matches then
730 * it moves them to 'new_mode'. Not threadsafe.
732 static void __cache_mark_many(struct dm_buffer_cache
*bc
, int old_mode
, int new_mode
,
733 b_predicate pred
, void *context
, struct lock_history
*lh
)
735 struct lru_entry
*le
;
737 struct evict_wrapper w
= {.lh
= lh
, .pred
= pred
, .context
= context
};
740 le
= lru_evict(&bc
->lru
[old_mode
], __evict_pred
, &w
, bc
->no_sleep
);
744 b
= le_to_buffer(le
);
745 b
->list_mode
= new_mode
;
746 lru_insert(&bc
->lru
[b
->list_mode
], &b
->lru
);
750 static void cache_mark_many(struct dm_buffer_cache
*bc
, int old_mode
, int new_mode
,
751 b_predicate pred
, void *context
)
753 struct lock_history lh
;
755 lh_init(&lh
, bc
, true);
756 __cache_mark_many(bc
, old_mode
, new_mode
, pred
, context
, &lh
);
763 * Iterates through all clean or dirty entries calling a function for each
764 * entry. The callback may terminate the iteration early. Not threadsafe.
768 * Iterator functions should return one of these actions to indicate
769 * how the iteration should proceed.
776 typedef enum it_action (*iter_fn
)(struct dm_buffer
*b
, void *context
);
778 static void __cache_iterate(struct dm_buffer_cache
*bc
, int list_mode
,
779 iter_fn fn
, void *context
, struct lock_history
*lh
)
781 struct lru
*lru
= &bc
->lru
[list_mode
];
782 struct lru_entry
*le
, *first
;
787 first
= le
= to_le(lru
->cursor
);
789 struct dm_buffer
*b
= le_to_buffer(le
);
791 lh_next(lh
, b
->block
);
793 switch (fn(b
, context
)) {
802 le
= to_le(le
->list
.next
);
803 } while (le
!= first
);
806 static void cache_iterate(struct dm_buffer_cache
*bc
, int list_mode
,
807 iter_fn fn
, void *context
)
809 struct lock_history lh
;
811 lh_init(&lh
, bc
, false);
812 __cache_iterate(bc
, list_mode
, fn
, context
, &lh
);
819 * Passes ownership of the buffer to the cache. Returns false if the
820 * buffer was already present (in which case ownership does not pass).
821 * eg, a race with another thread.
823 * Holder count should be 1 on insertion.
827 static bool __cache_insert(struct rb_root
*root
, struct dm_buffer
*b
)
829 struct rb_node
**new = &root
->rb_node
, *parent
= NULL
;
830 struct dm_buffer
*found
;
833 found
= container_of(*new, struct dm_buffer
, node
);
835 if (found
->block
== b
->block
)
839 new = b
->block
< found
->block
?
840 &found
->node
.rb_left
: &found
->node
.rb_right
;
843 rb_link_node(&b
->node
, parent
, new);
844 rb_insert_color(&b
->node
, root
);
849 static bool cache_insert(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
)
853 if (WARN_ON_ONCE(b
->list_mode
>= LIST_SIZE
))
856 cache_write_lock(bc
, b
->block
);
857 BUG_ON(atomic_read(&b
->hold_count
) != 1);
858 r
= __cache_insert(&bc
->trees
[cache_index(b
->block
, bc
->num_locks
)].root
, b
);
860 lru_insert(&bc
->lru
[b
->list_mode
], &b
->lru
);
861 cache_write_unlock(bc
, b
->block
);
869 * Removes buffer from cache, ownership of the buffer passes back to the caller.
870 * Fails if the hold_count is not one (ie. the caller holds the only reference).
874 static bool cache_remove(struct dm_buffer_cache
*bc
, struct dm_buffer
*b
)
878 cache_write_lock(bc
, b
->block
);
880 if (atomic_read(&b
->hold_count
) != 1) {
884 rb_erase(&b
->node
, &bc
->trees
[cache_index(b
->block
, bc
->num_locks
)].root
);
885 lru_remove(&bc
->lru
[b
->list_mode
], &b
->lru
);
888 cache_write_unlock(bc
, b
->block
);
895 typedef void (*b_release
)(struct dm_buffer
*);
897 static struct dm_buffer
*__find_next(struct rb_root
*root
, sector_t block
)
899 struct rb_node
*n
= root
->rb_node
;
901 struct dm_buffer
*best
= NULL
;
904 b
= container_of(n
, struct dm_buffer
, node
);
906 if (b
->block
== block
)
909 if (block
<= b
->block
) {
920 static void __remove_range(struct dm_buffer_cache
*bc
,
921 struct rb_root
*root
,
922 sector_t begin
, sector_t end
,
923 b_predicate pred
, b_release release
)
930 b
= __find_next(root
, begin
);
931 if (!b
|| (b
->block
>= end
))
934 begin
= b
->block
+ 1;
936 if (atomic_read(&b
->hold_count
))
939 if (pred(b
, NULL
) == ER_EVICT
) {
940 rb_erase(&b
->node
, root
);
941 lru_remove(&bc
->lru
[b
->list_mode
], &b
->lru
);
947 static void cache_remove_range(struct dm_buffer_cache
*bc
,
948 sector_t begin
, sector_t end
,
949 b_predicate pred
, b_release release
)
953 BUG_ON(bc
->no_sleep
);
954 for (i
= 0; i
< bc
->num_locks
; i
++) {
955 down_write(&bc
->trees
[i
].u
.lock
);
956 __remove_range(bc
, &bc
->trees
[i
].root
, begin
, end
, pred
, release
);
957 up_write(&bc
->trees
[i
].u
.lock
);
961 /*----------------------------------------------------------------*/
964 * Linking of buffers:
965 * All buffers are linked to buffer_cache with their node field.
967 * Clean buffers that are not being written (B_WRITING not set)
968 * are linked to lru[LIST_CLEAN] with their lru_list field.
970 * Dirty and clean buffers that are being written are linked to
971 * lru[LIST_DIRTY] with their lru_list field. When the write
972 * finishes, the buffer cannot be relinked immediately (because we
973 * are in an interrupt context and relinking requires process
974 * context), so some clean-not-writing buffers can be held on
975 * dirty_lru too. They are later added to lru in the process
978 struct dm_bufio_client
{
979 struct block_device
*bdev
;
980 unsigned int block_size
;
981 s8 sectors_per_block_bits
;
987 int async_write_error
;
989 void (*alloc_callback
)(struct dm_buffer
*buf
);
990 void (*write_callback
)(struct dm_buffer
*buf
);
991 struct kmem_cache
*slab_buffer
;
992 struct kmem_cache
*slab_cache
;
993 struct dm_io_client
*dm_io
;
995 struct list_head reserved_buffers
;
996 unsigned int need_reserved_buffers
;
998 unsigned int minimum_buffers
;
1002 struct shrinker
*shrinker
;
1003 struct work_struct shrink_work
;
1004 atomic_long_t need_shrink
;
1006 wait_queue_head_t free_buffer_wait
;
1008 struct list_head client_list
;
1011 * Used by global_cleanup to sort the clients list.
1013 unsigned long oldest_buffer
;
1015 struct dm_buffer_cache cache
; /* must be last member */
1018 /*----------------------------------------------------------------*/
1020 #define dm_bufio_in_request() (!!current->bio_list)
1022 static void dm_bufio_lock(struct dm_bufio_client
*c
)
1024 if (static_branch_unlikely(&no_sleep_enabled
) && c
->no_sleep
)
1025 spin_lock_bh(&c
->spinlock
);
1027 mutex_lock_nested(&c
->lock
, dm_bufio_in_request());
1030 static void dm_bufio_unlock(struct dm_bufio_client
*c
)
1032 if (static_branch_unlikely(&no_sleep_enabled
) && c
->no_sleep
)
1033 spin_unlock_bh(&c
->spinlock
);
1035 mutex_unlock(&c
->lock
);
1038 /*----------------------------------------------------------------*/
1041 * Default cache size: available memory divided by the ratio.
1043 static unsigned long dm_bufio_default_cache_size
;
1046 * Total cache size set by the user.
1048 static unsigned long dm_bufio_cache_size
;
1051 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
1052 * at any time. If it disagrees, the user has changed cache size.
1054 static unsigned long dm_bufio_cache_size_latch
;
1056 static DEFINE_SPINLOCK(global_spinlock
);
1059 * Buffers are freed after this timeout
1061 static unsigned int dm_bufio_max_age
= DM_BUFIO_DEFAULT_AGE_SECS
;
1062 static unsigned long dm_bufio_retain_bytes
= DM_BUFIO_DEFAULT_RETAIN_BYTES
;
1064 static unsigned long dm_bufio_peak_allocated
;
1065 static unsigned long dm_bufio_allocated_kmem_cache
;
1066 static unsigned long dm_bufio_allocated_kmalloc
;
1067 static unsigned long dm_bufio_allocated_get_free_pages
;
1068 static unsigned long dm_bufio_allocated_vmalloc
;
1069 static unsigned long dm_bufio_current_allocated
;
1071 /*----------------------------------------------------------------*/
1074 * The current number of clients.
1076 static int dm_bufio_client_count
;
1079 * The list of all clients.
1081 static LIST_HEAD(dm_bufio_all_clients
);
1084 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
1086 static DEFINE_MUTEX(dm_bufio_clients_lock
);
1088 static struct workqueue_struct
*dm_bufio_wq
;
1089 static struct delayed_work dm_bufio_cleanup_old_work
;
1090 static struct work_struct dm_bufio_replacement_work
;
1093 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1094 static void buffer_record_stack(struct dm_buffer
*b
)
1096 b
->stack_len
= stack_trace_save(b
->stack_entries
, MAX_STACK
, 2);
1100 /*----------------------------------------------------------------*/
1102 static void adjust_total_allocated(struct dm_buffer
*b
, bool unlink
)
1104 unsigned char data_mode
;
1107 static unsigned long * const class_ptr
[DATA_MODE_LIMIT
] = {
1108 &dm_bufio_allocated_kmem_cache
,
1109 &dm_bufio_allocated_kmalloc
,
1110 &dm_bufio_allocated_get_free_pages
,
1111 &dm_bufio_allocated_vmalloc
,
1114 data_mode
= b
->data_mode
;
1115 diff
= (long)b
->c
->block_size
;
1119 spin_lock(&global_spinlock
);
1121 *class_ptr
[data_mode
] += diff
;
1123 dm_bufio_current_allocated
+= diff
;
1125 if (dm_bufio_current_allocated
> dm_bufio_peak_allocated
)
1126 dm_bufio_peak_allocated
= dm_bufio_current_allocated
;
1129 if (dm_bufio_current_allocated
> dm_bufio_cache_size
)
1130 queue_work(dm_bufio_wq
, &dm_bufio_replacement_work
);
1133 spin_unlock(&global_spinlock
);
1137 * Change the number of clients and recalculate per-client limit.
1139 static void __cache_size_refresh(void)
1141 if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock
)))
1143 if (WARN_ON(dm_bufio_client_count
< 0))
1146 dm_bufio_cache_size_latch
= READ_ONCE(dm_bufio_cache_size
);
1149 * Use default if set to 0 and report the actual cache size used.
1151 if (!dm_bufio_cache_size_latch
) {
1152 (void)cmpxchg(&dm_bufio_cache_size
, 0,
1153 dm_bufio_default_cache_size
);
1154 dm_bufio_cache_size_latch
= dm_bufio_default_cache_size
;
1159 * Allocating buffer data.
1161 * Small buffers are allocated with kmem_cache, to use space optimally.
1163 * For large buffers, we choose between get_free_pages and vmalloc.
1164 * Each has advantages and disadvantages.
1166 * __get_free_pages can randomly fail if the memory is fragmented.
1167 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
1168 * as low as 128M) so using it for caching is not appropriate.
1170 * If the allocation may fail we use __get_free_pages. Memory fragmentation
1171 * won't have a fatal effect here, but it just causes flushes of some other
1172 * buffers and more I/O will be performed. Don't use __get_free_pages if it
1173 * always fails (i.e. order > MAX_PAGE_ORDER).
1175 * If the allocation shouldn't fail we use __vmalloc. This is only for the
1176 * initial reserve allocation, so there's no risk of wasting all vmalloc
1179 static void *alloc_buffer_data(struct dm_bufio_client
*c
, gfp_t gfp_mask
,
1180 unsigned char *data_mode
)
1182 if (unlikely(c
->slab_cache
!= NULL
)) {
1183 *data_mode
= DATA_MODE_SLAB
;
1184 return kmem_cache_alloc(c
->slab_cache
, gfp_mask
);
1187 if (unlikely(c
->block_size
< PAGE_SIZE
)) {
1188 *data_mode
= DATA_MODE_KMALLOC
;
1189 return kmalloc(c
->block_size
, gfp_mask
| __GFP_RECLAIMABLE
);
1192 if (c
->block_size
<= KMALLOC_MAX_SIZE
&&
1193 gfp_mask
& __GFP_NORETRY
) {
1194 *data_mode
= DATA_MODE_GET_FREE_PAGES
;
1195 return (void *)__get_free_pages(gfp_mask
,
1196 c
->sectors_per_block_bits
- (PAGE_SHIFT
- SECTOR_SHIFT
));
1199 *data_mode
= DATA_MODE_VMALLOC
;
1201 return __vmalloc(c
->block_size
, gfp_mask
);
1205 * Free buffer's data.
1207 static void free_buffer_data(struct dm_bufio_client
*c
,
1208 void *data
, unsigned char data_mode
)
1210 switch (data_mode
) {
1211 case DATA_MODE_SLAB
:
1212 kmem_cache_free(c
->slab_cache
, data
);
1215 case DATA_MODE_KMALLOC
:
1219 case DATA_MODE_GET_FREE_PAGES
:
1220 free_pages((unsigned long)data
,
1221 c
->sectors_per_block_bits
- (PAGE_SHIFT
- SECTOR_SHIFT
));
1224 case DATA_MODE_VMALLOC
:
1229 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
1236 * Allocate buffer and its data.
1238 static struct dm_buffer
*alloc_buffer(struct dm_bufio_client
*c
, gfp_t gfp_mask
)
1240 struct dm_buffer
*b
= kmem_cache_alloc(c
->slab_buffer
, gfp_mask
);
1247 b
->data
= alloc_buffer_data(c
, gfp_mask
, &b
->data_mode
);
1249 kmem_cache_free(c
->slab_buffer
, b
);
1252 adjust_total_allocated(b
, false);
1254 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1261 * Free buffer and its data.
1263 static void free_buffer(struct dm_buffer
*b
)
1265 struct dm_bufio_client
*c
= b
->c
;
1267 adjust_total_allocated(b
, true);
1268 free_buffer_data(c
, b
->data
, b
->data_mode
);
1269 kmem_cache_free(c
->slab_buffer
, b
);
1273 *--------------------------------------------------------------------------
1274 * Submit I/O on the buffer.
1276 * Bio interface is faster but it has some problems:
1277 * the vector list is limited (increasing this limit increases
1278 * memory-consumption per buffer, so it is not viable);
1280 * the memory must be direct-mapped, not vmalloced;
1282 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
1283 * it is not vmalloced, try using the bio interface.
1285 * If the buffer is big, if it is vmalloced or if the underlying device
1286 * rejects the bio because it is too large, use dm-io layer to do the I/O.
1287 * The dm-io layer splits the I/O into multiple requests, avoiding the above
1289 *--------------------------------------------------------------------------
1293 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
1294 * that the request was handled directly with bio interface.
1296 static void dmio_complete(unsigned long error
, void *context
)
1298 struct dm_buffer
*b
= context
;
1300 b
->end_io(b
, unlikely(error
!= 0) ? BLK_STS_IOERR
: 0);
1303 static void use_dmio(struct dm_buffer
*b
, enum req_op op
, sector_t sector
,
1304 unsigned int n_sectors
, unsigned int offset
,
1305 unsigned short ioprio
)
1308 struct dm_io_request io_req
= {
1310 .notify
.fn
= dmio_complete
,
1311 .notify
.context
= b
,
1312 .client
= b
->c
->dm_io
,
1314 struct dm_io_region region
= {
1320 if (b
->data_mode
!= DATA_MODE_VMALLOC
) {
1321 io_req
.mem
.type
= DM_IO_KMEM
;
1322 io_req
.mem
.ptr
.addr
= (char *)b
->data
+ offset
;
1324 io_req
.mem
.type
= DM_IO_VMA
;
1325 io_req
.mem
.ptr
.vma
= (char *)b
->data
+ offset
;
1328 r
= dm_io(&io_req
, 1, ®ion
, NULL
, ioprio
);
1330 b
->end_io(b
, errno_to_blk_status(r
));
1333 static void bio_complete(struct bio
*bio
)
1335 struct dm_buffer
*b
= bio
->bi_private
;
1336 blk_status_t status
= bio
->bi_status
;
1340 b
->end_io(b
, status
);
1343 static void use_bio(struct dm_buffer
*b
, enum req_op op
, sector_t sector
,
1344 unsigned int n_sectors
, unsigned int offset
,
1345 unsigned short ioprio
)
1351 bio
= bio_kmalloc(1, GFP_NOWAIT
| __GFP_NORETRY
| __GFP_NOWARN
);
1353 use_dmio(b
, op
, sector
, n_sectors
, offset
, ioprio
);
1356 bio_init(bio
, b
->c
->bdev
, bio
->bi_inline_vecs
, 1, op
);
1357 bio
->bi_iter
.bi_sector
= sector
;
1358 bio
->bi_end_io
= bio_complete
;
1359 bio
->bi_private
= b
;
1360 bio
->bi_ioprio
= ioprio
;
1362 ptr
= (char *)b
->data
+ offset
;
1363 len
= n_sectors
<< SECTOR_SHIFT
;
1365 __bio_add_page(bio
, virt_to_page(ptr
), len
, offset_in_page(ptr
));
1370 static inline sector_t
block_to_sector(struct dm_bufio_client
*c
, sector_t block
)
1374 if (likely(c
->sectors_per_block_bits
>= 0))
1375 sector
= block
<< c
->sectors_per_block_bits
;
1377 sector
= block
* (c
->block_size
>> SECTOR_SHIFT
);
1383 static void submit_io(struct dm_buffer
*b
, enum req_op op
, unsigned short ioprio
,
1384 void (*end_io
)(struct dm_buffer
*, blk_status_t
))
1386 unsigned int n_sectors
;
1388 unsigned int offset
, end
;
1392 sector
= block_to_sector(b
->c
, b
->block
);
1394 if (op
!= REQ_OP_WRITE
) {
1395 n_sectors
= b
->c
->block_size
>> SECTOR_SHIFT
;
1398 if (b
->c
->write_callback
)
1399 b
->c
->write_callback(b
);
1400 offset
= b
->write_start
;
1402 offset
&= -DM_BUFIO_WRITE_ALIGN
;
1403 end
+= DM_BUFIO_WRITE_ALIGN
- 1;
1404 end
&= -DM_BUFIO_WRITE_ALIGN
;
1405 if (unlikely(end
> b
->c
->block_size
))
1406 end
= b
->c
->block_size
;
1408 sector
+= offset
>> SECTOR_SHIFT
;
1409 n_sectors
= (end
- offset
) >> SECTOR_SHIFT
;
1412 if (b
->data_mode
!= DATA_MODE_VMALLOC
)
1413 use_bio(b
, op
, sector
, n_sectors
, offset
, ioprio
);
1415 use_dmio(b
, op
, sector
, n_sectors
, offset
, ioprio
);
1419 *--------------------------------------------------------------
1420 * Writing dirty buffers
1421 *--------------------------------------------------------------
1425 * The endio routine for write.
1427 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
1430 static void write_endio(struct dm_buffer
*b
, blk_status_t status
)
1432 b
->write_error
= status
;
1433 if (unlikely(status
)) {
1434 struct dm_bufio_client
*c
= b
->c
;
1436 (void)cmpxchg(&c
->async_write_error
, 0,
1437 blk_status_to_errno(status
));
1440 BUG_ON(!test_bit(B_WRITING
, &b
->state
));
1442 smp_mb__before_atomic();
1443 clear_bit(B_WRITING
, &b
->state
);
1444 smp_mb__after_atomic();
1446 wake_up_bit(&b
->state
, B_WRITING
);
1450 * Initiate a write on a dirty buffer, but don't wait for it.
1452 * - If the buffer is not dirty, exit.
1453 * - If there some previous write going on, wait for it to finish (we can't
1454 * have two writes on the same buffer simultaneously).
1455 * - Submit our write and don't wait on it. We set B_WRITING indicating
1456 * that there is a write in progress.
1458 static void __write_dirty_buffer(struct dm_buffer
*b
,
1459 struct list_head
*write_list
)
1461 if (!test_bit(B_DIRTY
, &b
->state
))
1464 clear_bit(B_DIRTY
, &b
->state
);
1465 wait_on_bit_lock_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
1467 b
->write_start
= b
->dirty_start
;
1468 b
->write_end
= b
->dirty_end
;
1471 submit_io(b
, REQ_OP_WRITE
, IOPRIO_DEFAULT
, write_endio
);
1473 list_add_tail(&b
->write_list
, write_list
);
1476 static void __flush_write_list(struct list_head
*write_list
)
1478 struct blk_plug plug
;
1480 blk_start_plug(&plug
);
1481 while (!list_empty(write_list
)) {
1482 struct dm_buffer
*b
=
1483 list_entry(write_list
->next
, struct dm_buffer
, write_list
);
1484 list_del(&b
->write_list
);
1485 submit_io(b
, REQ_OP_WRITE
, IOPRIO_DEFAULT
, write_endio
);
1488 blk_finish_plug(&plug
);
1492 * Wait until any activity on the buffer finishes. Possibly write the
1493 * buffer if it is dirty. When this function finishes, there is no I/O
1494 * running on the buffer and the buffer is not dirty.
1496 static void __make_buffer_clean(struct dm_buffer
*b
)
1498 BUG_ON(atomic_read(&b
->hold_count
));
1500 /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
1501 if (!smp_load_acquire(&b
->state
)) /* fast case */
1504 wait_on_bit_io(&b
->state
, B_READING
, TASK_UNINTERRUPTIBLE
);
1505 __write_dirty_buffer(b
, NULL
);
1506 wait_on_bit_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
1509 static enum evict_result
is_clean(struct dm_buffer
*b
, void *context
)
1511 struct dm_bufio_client
*c
= context
;
1513 /* These should never happen */
1514 if (WARN_ON_ONCE(test_bit(B_WRITING
, &b
->state
)))
1515 return ER_DONT_EVICT
;
1516 if (WARN_ON_ONCE(test_bit(B_DIRTY
, &b
->state
)))
1517 return ER_DONT_EVICT
;
1518 if (WARN_ON_ONCE(b
->list_mode
!= LIST_CLEAN
))
1519 return ER_DONT_EVICT
;
1521 if (static_branch_unlikely(&no_sleep_enabled
) && c
->no_sleep
&&
1522 unlikely(test_bit(B_READING
, &b
->state
)))
1523 return ER_DONT_EVICT
;
1528 static enum evict_result
is_dirty(struct dm_buffer
*b
, void *context
)
1530 /* These should never happen */
1531 if (WARN_ON_ONCE(test_bit(B_READING
, &b
->state
)))
1532 return ER_DONT_EVICT
;
1533 if (WARN_ON_ONCE(b
->list_mode
!= LIST_DIRTY
))
1534 return ER_DONT_EVICT
;
1540 * Find some buffer that is not held by anybody, clean it, unlink it and
1543 static struct dm_buffer
*__get_unclaimed_buffer(struct dm_bufio_client
*c
)
1545 struct dm_buffer
*b
;
1547 b
= cache_evict(&c
->cache
, LIST_CLEAN
, is_clean
, c
);
1549 /* this also waits for pending reads */
1550 __make_buffer_clean(b
);
1554 if (static_branch_unlikely(&no_sleep_enabled
) && c
->no_sleep
)
1557 b
= cache_evict(&c
->cache
, LIST_DIRTY
, is_dirty
, NULL
);
1559 __make_buffer_clean(b
);
1567 * Wait until some other threads free some buffer or release hold count on
1570 * This function is entered with c->lock held, drops it and regains it
1573 static void __wait_for_free_buffer(struct dm_bufio_client
*c
)
1575 DECLARE_WAITQUEUE(wait
, current
);
1577 add_wait_queue(&c
->free_buffer_wait
, &wait
);
1578 set_current_state(TASK_UNINTERRUPTIBLE
);
1582 * It's possible to miss a wake up event since we don't always
1583 * hold c->lock when wake_up is called. So we have a timeout here,
1586 io_schedule_timeout(5 * HZ
);
1588 remove_wait_queue(&c
->free_buffer_wait
, &wait
);
1601 * Allocate a new buffer. If the allocation is not possible, wait until
1602 * some other thread frees a buffer.
1604 * May drop the lock and regain it.
1606 static struct dm_buffer
*__alloc_buffer_wait_no_callback(struct dm_bufio_client
*c
, enum new_flag nf
)
1608 struct dm_buffer
*b
;
1609 bool tried_noio_alloc
= false;
1612 * dm-bufio is resistant to allocation failures (it just keeps
1613 * one buffer reserved in cases all the allocations fail).
1614 * So set flags to not try too hard:
1615 * GFP_NOWAIT: don't wait; if we need to sleep we'll release our
1616 * mutex and wait ourselves.
1617 * __GFP_NORETRY: don't retry and rather return failure
1618 * __GFP_NOMEMALLOC: don't use emergency reserves
1619 * __GFP_NOWARN: don't print a warning in case of failure
1621 * For debugging, if we set the cache size to 1, no new buffers will
1625 if (dm_bufio_cache_size_latch
!= 1) {
1626 b
= alloc_buffer(c
, GFP_NOWAIT
| __GFP_NORETRY
| __GFP_NOMEMALLOC
| __GFP_NOWARN
);
1631 if (nf
== NF_PREFETCH
)
1634 if (dm_bufio_cache_size_latch
!= 1 && !tried_noio_alloc
) {
1636 b
= alloc_buffer(c
, GFP_NOIO
| __GFP_NORETRY
| __GFP_NOMEMALLOC
| __GFP_NOWARN
);
1640 tried_noio_alloc
= true;
1643 if (!list_empty(&c
->reserved_buffers
)) {
1644 b
= list_to_buffer(c
->reserved_buffers
.next
);
1645 list_del(&b
->lru
.list
);
1646 c
->need_reserved_buffers
++;
1651 b
= __get_unclaimed_buffer(c
);
1655 __wait_for_free_buffer(c
);
1659 static struct dm_buffer
*__alloc_buffer_wait(struct dm_bufio_client
*c
, enum new_flag nf
)
1661 struct dm_buffer
*b
= __alloc_buffer_wait_no_callback(c
, nf
);
1666 if (c
->alloc_callback
)
1667 c
->alloc_callback(b
);
1673 * Free a buffer and wake other threads waiting for free buffers.
1675 static void __free_buffer_wake(struct dm_buffer
*b
)
1677 struct dm_bufio_client
*c
= b
->c
;
1680 if (!c
->need_reserved_buffers
)
1683 list_add(&b
->lru
.list
, &c
->reserved_buffers
);
1684 c
->need_reserved_buffers
--;
1688 * We hold the bufio lock here, so no one can add entries to the
1689 * wait queue anyway.
1691 if (unlikely(waitqueue_active(&c
->free_buffer_wait
)))
1692 wake_up(&c
->free_buffer_wait
);
1695 static enum evict_result
cleaned(struct dm_buffer
*b
, void *context
)
1697 if (WARN_ON_ONCE(test_bit(B_READING
, &b
->state
)))
1698 return ER_DONT_EVICT
; /* should never happen */
1700 if (test_bit(B_DIRTY
, &b
->state
) || test_bit(B_WRITING
, &b
->state
))
1701 return ER_DONT_EVICT
;
1706 static void __move_clean_buffers(struct dm_bufio_client
*c
)
1708 cache_mark_many(&c
->cache
, LIST_DIRTY
, LIST_CLEAN
, cleaned
, NULL
);
1711 struct write_context
{
1713 struct list_head
*write_list
;
1716 static enum it_action
write_one(struct dm_buffer
*b
, void *context
)
1718 struct write_context
*wc
= context
;
1720 if (wc
->no_wait
&& test_bit(B_WRITING
, &b
->state
))
1723 __write_dirty_buffer(b
, wc
->write_list
);
1727 static void __write_dirty_buffers_async(struct dm_bufio_client
*c
, int no_wait
,
1728 struct list_head
*write_list
)
1730 struct write_context wc
= {.no_wait
= no_wait
, .write_list
= write_list
};
1732 __move_clean_buffers(c
);
1733 cache_iterate(&c
->cache
, LIST_DIRTY
, write_one
, &wc
);
1737 * Check if we're over watermark.
1738 * If we are over threshold_buffers, start freeing buffers.
1739 * If we're over "limit_buffers", block until we get under the limit.
1741 static void __check_watermark(struct dm_bufio_client
*c
,
1742 struct list_head
*write_list
)
1744 if (cache_count(&c
->cache
, LIST_DIRTY
) >
1745 cache_count(&c
->cache
, LIST_CLEAN
) * DM_BUFIO_WRITEBACK_RATIO
)
1746 __write_dirty_buffers_async(c
, 1, write_list
);
1750 *--------------------------------------------------------------
1752 *--------------------------------------------------------------
1755 static void cache_put_and_wake(struct dm_bufio_client
*c
, struct dm_buffer
*b
)
1758 * Relying on waitqueue_active() is racey, but we sleep
1759 * with schedule_timeout anyway.
1761 if (cache_put(&c
->cache
, b
) &&
1762 unlikely(waitqueue_active(&c
->free_buffer_wait
)))
1763 wake_up(&c
->free_buffer_wait
);
1767 * This assumes you have already checked the cache to see if the buffer
1768 * is already present (it will recheck after dropping the lock for allocation).
1770 static struct dm_buffer
*__bufio_new(struct dm_bufio_client
*c
, sector_t block
,
1771 enum new_flag nf
, int *need_submit
,
1772 struct list_head
*write_list
)
1774 struct dm_buffer
*b
, *new_b
= NULL
;
1778 /* This can't be called with NF_GET */
1779 if (WARN_ON_ONCE(nf
== NF_GET
))
1782 new_b
= __alloc_buffer_wait(c
, nf
);
1787 * We've had a period where the mutex was unlocked, so need to
1788 * recheck the buffer tree.
1790 b
= cache_get(&c
->cache
, block
);
1792 __free_buffer_wake(new_b
);
1796 __check_watermark(c
, write_list
);
1799 atomic_set(&b
->hold_count
, 1);
1800 WRITE_ONCE(b
->last_accessed
, jiffies
);
1804 b
->list_mode
= LIST_CLEAN
;
1809 b
->state
= 1 << B_READING
;
1814 * We mustn't insert into the cache until the B_READING state
1815 * is set. Otherwise another thread could get it and use
1816 * it before it had been read.
1818 cache_insert(&c
->cache
, b
);
1823 if (nf
== NF_PREFETCH
) {
1824 cache_put_and_wake(c
, b
);
1829 * Note: it is essential that we don't wait for the buffer to be
1830 * read if dm_bufio_get function is used. Both dm_bufio_get and
1831 * dm_bufio_prefetch can be used in the driver request routine.
1832 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1833 * the same buffer, it would deadlock if we waited.
1835 if (nf
== NF_GET
&& unlikely(test_bit_acquire(B_READING
, &b
->state
))) {
1836 cache_put_and_wake(c
, b
);
1844 * The endio routine for reading: set the error, clear the bit and wake up
1845 * anyone waiting on the buffer.
1847 static void read_endio(struct dm_buffer
*b
, blk_status_t status
)
1849 b
->read_error
= status
;
1851 BUG_ON(!test_bit(B_READING
, &b
->state
));
1853 smp_mb__before_atomic();
1854 clear_bit(B_READING
, &b
->state
);
1855 smp_mb__after_atomic();
1857 wake_up_bit(&b
->state
, B_READING
);
1861 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
1862 * functions is similar except that dm_bufio_new doesn't read the
1863 * buffer from the disk (assuming that the caller overwrites all the data
1864 * and uses dm_bufio_mark_buffer_dirty to write new data back).
1866 static void *new_read(struct dm_bufio_client
*c
, sector_t block
,
1867 enum new_flag nf
, struct dm_buffer
**bp
,
1868 unsigned short ioprio
)
1870 int need_submit
= 0;
1871 struct dm_buffer
*b
;
1873 LIST_HEAD(write_list
);
1878 * Fast path, hopefully the block is already in the cache. No need
1879 * to get the client lock for this.
1881 b
= cache_get(&c
->cache
, block
);
1883 if (nf
== NF_PREFETCH
) {
1884 cache_put_and_wake(c
, b
);
1889 * Note: it is essential that we don't wait for the buffer to be
1890 * read if dm_bufio_get function is used. Both dm_bufio_get and
1891 * dm_bufio_prefetch can be used in the driver request routine.
1892 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1893 * the same buffer, it would deadlock if we waited.
1895 if (nf
== NF_GET
&& unlikely(test_bit_acquire(B_READING
, &b
->state
))) {
1896 cache_put_and_wake(c
, b
);
1906 b
= __bufio_new(c
, block
, nf
, &need_submit
, &write_list
);
1910 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1911 if (b
&& (atomic_read(&b
->hold_count
) == 1))
1912 buffer_record_stack(b
);
1915 __flush_write_list(&write_list
);
1921 submit_io(b
, REQ_OP_READ
, ioprio
, read_endio
);
1923 if (nf
!= NF_GET
) /* we already tested this condition above */
1924 wait_on_bit_io(&b
->state
, B_READING
, TASK_UNINTERRUPTIBLE
);
1926 if (b
->read_error
) {
1927 int error
= blk_status_to_errno(b
->read_error
);
1929 dm_bufio_release(b
);
1931 return ERR_PTR(error
);
1939 void *dm_bufio_get(struct dm_bufio_client
*c
, sector_t block
,
1940 struct dm_buffer
**bp
)
1942 return new_read(c
, block
, NF_GET
, bp
, IOPRIO_DEFAULT
);
1944 EXPORT_SYMBOL_GPL(dm_bufio_get
);
1946 static void *__dm_bufio_read(struct dm_bufio_client
*c
, sector_t block
,
1947 struct dm_buffer
**bp
, unsigned short ioprio
)
1949 if (WARN_ON_ONCE(dm_bufio_in_request()))
1950 return ERR_PTR(-EINVAL
);
1952 return new_read(c
, block
, NF_READ
, bp
, ioprio
);
1955 void *dm_bufio_read(struct dm_bufio_client
*c
, sector_t block
,
1956 struct dm_buffer
**bp
)
1958 return __dm_bufio_read(c
, block
, bp
, IOPRIO_DEFAULT
);
1960 EXPORT_SYMBOL_GPL(dm_bufio_read
);
1962 void *dm_bufio_read_with_ioprio(struct dm_bufio_client
*c
, sector_t block
,
1963 struct dm_buffer
**bp
, unsigned short ioprio
)
1965 return __dm_bufio_read(c
, block
, bp
, ioprio
);
1967 EXPORT_SYMBOL_GPL(dm_bufio_read_with_ioprio
);
1969 void *dm_bufio_new(struct dm_bufio_client
*c
, sector_t block
,
1970 struct dm_buffer
**bp
)
1972 if (WARN_ON_ONCE(dm_bufio_in_request()))
1973 return ERR_PTR(-EINVAL
);
1975 return new_read(c
, block
, NF_FRESH
, bp
, IOPRIO_DEFAULT
);
1977 EXPORT_SYMBOL_GPL(dm_bufio_new
);
1979 static void __dm_bufio_prefetch(struct dm_bufio_client
*c
,
1980 sector_t block
, unsigned int n_blocks
,
1981 unsigned short ioprio
)
1983 struct blk_plug plug
;
1985 LIST_HEAD(write_list
);
1987 if (WARN_ON_ONCE(dm_bufio_in_request()))
1988 return; /* should never happen */
1990 blk_start_plug(&plug
);
1992 for (; n_blocks
--; block
++) {
1994 struct dm_buffer
*b
;
1996 b
= cache_get(&c
->cache
, block
);
1998 /* already in cache */
1999 cache_put_and_wake(c
, b
);
2004 b
= __bufio_new(c
, block
, NF_PREFETCH
, &need_submit
,
2006 if (unlikely(!list_empty(&write_list
))) {
2008 blk_finish_plug(&plug
);
2009 __flush_write_list(&write_list
);
2010 blk_start_plug(&plug
);
2013 if (unlikely(b
!= NULL
)) {
2017 submit_io(b
, REQ_OP_READ
, ioprio
, read_endio
);
2018 dm_bufio_release(b
);
2030 blk_finish_plug(&plug
);
2033 void dm_bufio_prefetch(struct dm_bufio_client
*c
, sector_t block
, unsigned int n_blocks
)
2035 return __dm_bufio_prefetch(c
, block
, n_blocks
, IOPRIO_DEFAULT
);
2037 EXPORT_SYMBOL_GPL(dm_bufio_prefetch
);
2039 void dm_bufio_prefetch_with_ioprio(struct dm_bufio_client
*c
, sector_t block
,
2040 unsigned int n_blocks
, unsigned short ioprio
)
2042 return __dm_bufio_prefetch(c
, block
, n_blocks
, ioprio
);
2044 EXPORT_SYMBOL_GPL(dm_bufio_prefetch_with_ioprio
);
2046 void dm_bufio_release(struct dm_buffer
*b
)
2048 struct dm_bufio_client
*c
= b
->c
;
2051 * If there were errors on the buffer, and the buffer is not
2052 * to be written, free the buffer. There is no point in caching
2055 if ((b
->read_error
|| b
->write_error
) &&
2056 !test_bit_acquire(B_READING
, &b
->state
) &&
2057 !test_bit(B_WRITING
, &b
->state
) &&
2058 !test_bit(B_DIRTY
, &b
->state
)) {
2061 /* cache remove can fail if there are other holders */
2062 if (cache_remove(&c
->cache
, b
)) {
2063 __free_buffer_wake(b
);
2071 cache_put_and_wake(c
, b
);
2073 EXPORT_SYMBOL_GPL(dm_bufio_release
);
2075 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer
*b
,
2076 unsigned int start
, unsigned int end
)
2078 struct dm_bufio_client
*c
= b
->c
;
2080 BUG_ON(start
>= end
);
2081 BUG_ON(end
> b
->c
->block_size
);
2085 BUG_ON(test_bit(B_READING
, &b
->state
));
2087 if (!test_and_set_bit(B_DIRTY
, &b
->state
)) {
2088 b
->dirty_start
= start
;
2090 cache_mark(&c
->cache
, b
, LIST_DIRTY
);
2092 if (start
< b
->dirty_start
)
2093 b
->dirty_start
= start
;
2094 if (end
> b
->dirty_end
)
2100 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty
);
2102 void dm_bufio_mark_buffer_dirty(struct dm_buffer
*b
)
2104 dm_bufio_mark_partial_buffer_dirty(b
, 0, b
->c
->block_size
);
2106 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty
);
2108 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client
*c
)
2110 LIST_HEAD(write_list
);
2112 if (WARN_ON_ONCE(dm_bufio_in_request()))
2113 return; /* should never happen */
2116 __write_dirty_buffers_async(c
, 0, &write_list
);
2118 __flush_write_list(&write_list
);
2120 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async
);
2123 * For performance, it is essential that the buffers are written asynchronously
2124 * and simultaneously (so that the block layer can merge the writes) and then
2127 * Finally, we flush hardware disk cache.
2129 static bool is_writing(struct lru_entry
*e
, void *context
)
2131 struct dm_buffer
*b
= le_to_buffer(e
);
2133 return test_bit(B_WRITING
, &b
->state
);
2136 int dm_bufio_write_dirty_buffers(struct dm_bufio_client
*c
)
2139 unsigned long nr_buffers
;
2140 struct lru_entry
*e
;
2143 LIST_HEAD(write_list
);
2146 __write_dirty_buffers_async(c
, 0, &write_list
);
2148 __flush_write_list(&write_list
);
2151 nr_buffers
= cache_count(&c
->cache
, LIST_DIRTY
);
2152 lru_iter_begin(&c
->cache
.lru
[LIST_DIRTY
], &it
);
2153 while ((e
= lru_iter_next(&it
, is_writing
, c
))) {
2154 struct dm_buffer
*b
= le_to_buffer(e
);
2155 __cache_inc_buffer(b
);
2157 BUG_ON(test_bit(B_READING
, &b
->state
));
2162 wait_on_bit_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
2165 wait_on_bit_io(&b
->state
, B_WRITING
, TASK_UNINTERRUPTIBLE
);
2168 if (!test_bit(B_DIRTY
, &b
->state
) && !test_bit(B_WRITING
, &b
->state
))
2169 cache_mark(&c
->cache
, b
, LIST_CLEAN
);
2171 cache_put_and_wake(c
, b
);
2177 wake_up(&c
->free_buffer_wait
);
2180 a
= xchg(&c
->async_write_error
, 0);
2181 f
= dm_bufio_issue_flush(c
);
2187 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers
);
2190 * Use dm-io to send an empty barrier to flush the device.
2192 int dm_bufio_issue_flush(struct dm_bufio_client
*c
)
2194 struct dm_io_request io_req
= {
2195 .bi_opf
= REQ_OP_WRITE
| REQ_PREFLUSH
| REQ_SYNC
,
2196 .mem
.type
= DM_IO_KMEM
,
2197 .mem
.ptr
.addr
= NULL
,
2200 struct dm_io_region io_reg
= {
2206 if (WARN_ON_ONCE(dm_bufio_in_request()))
2209 return dm_io(&io_req
, 1, &io_reg
, NULL
, IOPRIO_DEFAULT
);
2211 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush
);
2214 * Use dm-io to send a discard request to flush the device.
2216 int dm_bufio_issue_discard(struct dm_bufio_client
*c
, sector_t block
, sector_t count
)
2218 struct dm_io_request io_req
= {
2219 .bi_opf
= REQ_OP_DISCARD
| REQ_SYNC
,
2220 .mem
.type
= DM_IO_KMEM
,
2221 .mem
.ptr
.addr
= NULL
,
2224 struct dm_io_region io_reg
= {
2226 .sector
= block_to_sector(c
, block
),
2227 .count
= block_to_sector(c
, count
),
2230 if (WARN_ON_ONCE(dm_bufio_in_request()))
2231 return -EINVAL
; /* discards are optional */
2233 return dm_io(&io_req
, 1, &io_reg
, NULL
, IOPRIO_DEFAULT
);
2235 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard
);
2237 static bool forget_buffer(struct dm_bufio_client
*c
, sector_t block
)
2239 struct dm_buffer
*b
;
2241 b
= cache_get(&c
->cache
, block
);
2243 if (likely(!smp_load_acquire(&b
->state
))) {
2244 if (cache_remove(&c
->cache
, b
))
2245 __free_buffer_wake(b
);
2247 cache_put_and_wake(c
, b
);
2249 cache_put_and_wake(c
, b
);
2253 return b
? true : false;
2257 * Free the given buffer.
2259 * This is just a hint, if the buffer is in use or dirty, this function
2262 void dm_bufio_forget(struct dm_bufio_client
*c
, sector_t block
)
2265 forget_buffer(c
, block
);
2268 EXPORT_SYMBOL_GPL(dm_bufio_forget
);
2270 static enum evict_result
idle(struct dm_buffer
*b
, void *context
)
2272 return b
->state
? ER_DONT_EVICT
: ER_EVICT
;
2275 void dm_bufio_forget_buffers(struct dm_bufio_client
*c
, sector_t block
, sector_t n_blocks
)
2278 cache_remove_range(&c
->cache
, block
, block
+ n_blocks
, idle
, __free_buffer_wake
);
2281 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers
);
2283 void dm_bufio_set_minimum_buffers(struct dm_bufio_client
*c
, unsigned int n
)
2285 c
->minimum_buffers
= n
;
2287 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers
);
2289 unsigned int dm_bufio_get_block_size(struct dm_bufio_client
*c
)
2291 return c
->block_size
;
2293 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size
);
2295 sector_t
dm_bufio_get_device_size(struct dm_bufio_client
*c
)
2297 sector_t s
= bdev_nr_sectors(c
->bdev
);
2303 if (likely(c
->sectors_per_block_bits
>= 0))
2304 s
>>= c
->sectors_per_block_bits
;
2306 sector_div(s
, c
->block_size
>> SECTOR_SHIFT
);
2309 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size
);
2311 struct dm_io_client
*dm_bufio_get_dm_io_client(struct dm_bufio_client
*c
)
2315 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client
);
2317 sector_t
dm_bufio_get_block_number(struct dm_buffer
*b
)
2321 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number
);
2323 void *dm_bufio_get_block_data(struct dm_buffer
*b
)
2327 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data
);
2329 void *dm_bufio_get_aux_data(struct dm_buffer
*b
)
2333 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data
);
2335 struct dm_bufio_client
*dm_bufio_get_client(struct dm_buffer
*b
)
2339 EXPORT_SYMBOL_GPL(dm_bufio_get_client
);
2341 static enum it_action
warn_leak(struct dm_buffer
*b
, void *context
)
2343 bool *warned
= context
;
2345 WARN_ON(!(*warned
));
2347 DMERR("leaked buffer %llx, hold count %u, list %d",
2348 (unsigned long long)b
->block
, atomic_read(&b
->hold_count
), b
->list_mode
);
2349 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2350 stack_trace_print(b
->stack_entries
, b
->stack_len
, 1);
2351 /* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
2352 atomic_set(&b
->hold_count
, 0);
2357 static void drop_buffers(struct dm_bufio_client
*c
)
2360 struct dm_buffer
*b
;
2362 if (WARN_ON(dm_bufio_in_request()))
2363 return; /* should never happen */
2366 * An optimization so that the buffers are not written one-by-one.
2368 dm_bufio_write_dirty_buffers_async(c
);
2372 while ((b
= __get_unclaimed_buffer(c
)))
2373 __free_buffer_wake(b
);
2375 for (i
= 0; i
< LIST_SIZE
; i
++) {
2376 bool warned
= false;
2378 cache_iterate(&c
->cache
, i
, warn_leak
, &warned
);
2381 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2382 while ((b
= __get_unclaimed_buffer(c
)))
2383 __free_buffer_wake(b
);
2386 for (i
= 0; i
< LIST_SIZE
; i
++)
2387 WARN_ON(cache_count(&c
->cache
, i
));
2392 static unsigned long get_retain_buffers(struct dm_bufio_client
*c
)
2394 unsigned long retain_bytes
= READ_ONCE(dm_bufio_retain_bytes
);
2396 if (likely(c
->sectors_per_block_bits
>= 0))
2397 retain_bytes
>>= c
->sectors_per_block_bits
+ SECTOR_SHIFT
;
2399 retain_bytes
/= c
->block_size
;
2401 return retain_bytes
;
2404 static void __scan(struct dm_bufio_client
*c
)
2407 struct dm_buffer
*b
;
2408 unsigned long freed
= 0;
2409 unsigned long retain_target
= get_retain_buffers(c
);
2410 unsigned long count
= cache_total(&c
->cache
);
2412 for (l
= 0; l
< LIST_SIZE
; l
++) {
2414 if (count
- freed
<= retain_target
)
2415 atomic_long_set(&c
->need_shrink
, 0);
2416 if (!atomic_long_read(&c
->need_shrink
))
2419 b
= cache_evict(&c
->cache
, l
,
2420 l
== LIST_CLEAN
? is_clean
: is_dirty
, c
);
2424 __make_buffer_clean(b
);
2425 __free_buffer_wake(b
);
2427 atomic_long_dec(&c
->need_shrink
);
2434 static void shrink_work(struct work_struct
*w
)
2436 struct dm_bufio_client
*c
= container_of(w
, struct dm_bufio_client
, shrink_work
);
2443 static unsigned long dm_bufio_shrink_scan(struct shrinker
*shrink
, struct shrink_control
*sc
)
2445 struct dm_bufio_client
*c
;
2447 c
= shrink
->private_data
;
2448 atomic_long_add(sc
->nr_to_scan
, &c
->need_shrink
);
2449 queue_work(dm_bufio_wq
, &c
->shrink_work
);
2451 return sc
->nr_to_scan
;
2454 static unsigned long dm_bufio_shrink_count(struct shrinker
*shrink
, struct shrink_control
*sc
)
2456 struct dm_bufio_client
*c
= shrink
->private_data
;
2457 unsigned long count
= cache_total(&c
->cache
);
2458 unsigned long retain_target
= get_retain_buffers(c
);
2459 unsigned long queued_for_cleanup
= atomic_long_read(&c
->need_shrink
);
2461 if (unlikely(count
< retain_target
))
2464 count
-= retain_target
;
2466 if (unlikely(count
< queued_for_cleanup
))
2469 count
-= queued_for_cleanup
;
/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
					       unsigned int reserved_buffers, unsigned int aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *),
					       unsigned int flags)
{
	int r;
	unsigned int num_locks;
	struct dm_bufio_client *c;
	char slab_name[64];
	static atomic_t seqno = ATOMIC_INIT(0);

	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
		r = -EINVAL;
		goto bad_client;
	}

	num_locks = dm_num_hash_locks();
	c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0);

	c->bdev = bdev;
	c->block_size = block_size;
	if (is_power_of_2(block_size))
		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
	else
		c->sectors_per_block_bits = -1;

	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
		c->no_sleep = true;
		static_branch_inc(&no_sleep_enabled);
	}

	mutex_init(&c->lock);
	spin_lock_init(&c->spinlock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	if (block_size <= KMALLOC_MAX_SIZE && !is_power_of_2(block_size)) {
		unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);

		snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u-%u",
			 block_size, atomic_inc_return(&seqno));
		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
						  SLAB_RECLAIM_ACCOUNT, NULL);
		if (!c->slab_cache) {
			r = -ENOMEM;
			goto bad;
		}
	}
	if (aux_size)
		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u-%u",
			 aux_size, atomic_inc_return(&seqno));
	else
		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u",
			 atomic_inc_return(&seqno));
	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
					   0, SLAB_RECLAIM_ACCOUNT, NULL);
	if (!c->slab_buffer) {
		r = -ENOMEM;
		goto bad;
	}

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad;
		}
		__free_buffer_wake(b);
	}

	INIT_WORK(&c->shrink_work, shrink_work);
	atomic_long_set(&c->need_shrink, 0);

	c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)",
				     MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
	if (!c->shrinker) {
		r = -ENOMEM;
		goto bad;
	}

	c->shrinker->count_objects = dm_bufio_shrink_count;
	c->shrinker->scan_objects = dm_bufio_shrink_scan;
	c->shrinker->seeks = 1;
	c->shrinker->batch = 0;
	c->shrinker->private_data = c;

	shrinker_register(c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	return c;

bad:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

		list_del(&b->lru.list);
		free_buffer(b);
	}
	kmem_cache_destroy(c->slab_cache);
	kmem_cache_destroy(c->slab_buffer);
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	mutex_destroy(&c->lock);
	if (c->no_sleep)
		static_branch_dec(&no_sleep_enabled);
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned int i;

	drop_buffers(c);

	shrinker_free(c->shrinker);
	flush_work(&c->shrink_work);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	WARN_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

		list_del(&b->lru.list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (cache_count(&c->cache, i))
			DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));

	for (i = 0; i < LIST_SIZE; i++)
		WARN_ON(cache_count(&c->cache, i));

	cache_destroy(&c->cache);
	kmem_cache_destroy(c->slab_cache);
	kmem_cache_destroy(c->slab_buffer);
	dm_io_client_destroy(c->dm_io);
	mutex_destroy(&c->lock);
	if (c->no_sleep)
		static_branch_dec(&no_sleep_enabled);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
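
/*
 * Minimal usage sketch (illustrative only, not part of this file): a
 * device-mapper target typically pairs these calls, creating one client per
 * underlying device and releasing every buffer before destroying the client.
 * The block device and block size below are assumptions for the example; see
 * include/linux/dm-bufio.h for the authoritative prototypes.
 *
 *	struct dm_bufio_client *client;
 *	struct dm_buffer *buf;
 *	void *data;
 *
 *	client = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(client))
 *		return PTR_ERR(client);
 *
 *	data = dm_bufio_read(client, 0, &buf);	// read block 0, pins the buffer
 *	if (!IS_ERR(data)) {
 *		// ... use the 4096 bytes at 'data' ...
 *		dm_bufio_release(buf);		// unpin so it can be reclaimed
 *	}
 *
 *	dm_bufio_client_destroy(client);
 */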
void dm_bufio_client_reset(struct dm_bufio_client *c)
{
	drop_buffers(c);
	flush_work(&c->shrink_work);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
{
	c->start = start;
}
EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);

/*--------------------------------------------------------------*/
static unsigned int get_max_age_hz(void)
{
	unsigned int max_age = READ_ONCE(dm_bufio_max_age);

	if (max_age > UINT_MAX / HZ)
		max_age = UINT_MAX / HZ;

	return max_age * HZ;
}
static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
	return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
}
struct evict_params {
	gfp_t gfp;
	unsigned long age_hz;

	/*
	 * This gets updated with the largest last_accessed (ie. most
	 * recently used) of the evicted buffers. It will not be reinitialised
	 * by __evict_many(), so you can use it across multiple invocations.
	 */
	unsigned long last_accessed;
};
/*
 * We may not be able to evict this buffer if IO pending or the client
 * is still using it.
 *
 * And if GFP_NOFS is used, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to different bufio client.
 */
static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
{
	struct evict_params *params = context;

	if (!(params->gfp & __GFP_FS) ||
	    (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
		if (test_bit_acquire(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return ER_DONT_EVICT;
	}

	return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
}
static unsigned long __evict_many(struct dm_bufio_client *c,
				  struct evict_params *params,
				  int list_mode, unsigned long max_count)
{
	unsigned long count;
	unsigned long last_accessed;
	struct dm_buffer *b;

	for (count = 0; count < max_count; count++) {
		b = cache_evict(&c->cache, list_mode, select_for_evict, params);
		if (!b)
			break;

		last_accessed = READ_ONCE(b->last_accessed);
		if (time_after_eq(params->last_accessed, last_accessed))
			params->last_accessed = last_accessed;

		__make_buffer_clean(b);
		__free_buffer_wake(b);

		cond_resched();
	}

	return count;
}
static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
	struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
	unsigned long retain = get_retain_buffers(c);
	unsigned long count;
	LIST_HEAD(write_list);

	dm_bufio_lock(c);

	__check_watermark(c, &write_list);
	if (unlikely(!list_empty(&write_list))) {
		dm_bufio_unlock(c);
		__flush_write_list(&write_list);
		dm_bufio_lock(c);
	}

	count = cache_total(&c->cache);
	if (count > retain)
		__evict_many(c, &params, LIST_CLEAN, count - retain);

	dm_bufio_unlock(c);
}
static void cleanup_old_buffers(void)
{
	unsigned long max_age_hz = get_max_age_hz();
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);

	__cache_size_refresh();

	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
		evict_old_buffers(c, max_age_hz);

	mutex_unlock(&dm_bufio_clients_lock);
}
static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}
/*--------------------------------------------------------------*/

/*
 * Global cleanup tries to evict the oldest buffers from across _all_
 * the clients. It does this by repeatedly evicting a few buffers from
 * the client that holds the oldest buffer. It's approximate, but hopefully
 * good enough.
 */
static struct dm_bufio_client *__pop_client(void)
{
	struct list_head *h;

	if (list_empty(&dm_bufio_all_clients))
		return NULL;

	h = dm_bufio_all_clients.next;
	list_del(h);
	return container_of(h, struct dm_bufio_client, client_list);
}
/*
 * Inserts the client in the global client list based on its
 * 'oldest_buffer' field.
 */
static void __insert_client(struct dm_bufio_client *new_client)
{
	struct dm_bufio_client *c;
	struct list_head *h = dm_bufio_all_clients.next;

	while (h != &dm_bufio_all_clients) {
		c = container_of(h, struct dm_bufio_client, client_list);
		if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
			break;
		h = h->next;
	}

	list_add_tail(&new_client->client_list, h);
}
static unsigned long __evict_a_few(unsigned long nr_buffers)
{
	unsigned long count;
	struct dm_bufio_client *c;
	struct evict_params params = {
		.gfp = GFP_KERNEL,
		.age_hz = 0,
		/* set to jiffies in case there are no buffers in this client */
		.last_accessed = jiffies
	};

	c = __pop_client();
	if (!c)
		return 0;

	dm_bufio_lock(c);
	count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
	dm_bufio_unlock(c);

	if (count)
		c->oldest_buffer = params.last_accessed;
	__insert_client(c);

	return count;
}
static void check_watermarks(void)
{
	LIST_HEAD(write_list);
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		dm_bufio_lock(c);
		__check_watermark(c, &write_list);
		dm_bufio_unlock(c);
	}
	mutex_unlock(&dm_bufio_clients_lock);

	__flush_write_list(&write_list);
}
static void evict_old(void)
{
	unsigned long threshold = dm_bufio_cache_size -
		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;

	mutex_lock(&dm_bufio_clients_lock);
	while (dm_bufio_current_allocated > threshold) {
		if (!__evict_a_few(64))
			break;
		cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}
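
/*
 * Illustrative arithmetic: with DM_BUFIO_LOW_WATERMARK_RATIO of 16, the
 * threshold is cache_size - cache_size/16, so global cleanup keeps evicting
 * in batches of 64 buffers until allocations drop below ~93.75% of the
 * configured cache size.
 */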
static void do_global_cleanup(struct work_struct *w)
{
	check_watermarks();
	evict_old();
}

/*
 *--------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------
 */

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_kmalloc = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}
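
/*
 * Illustrative sizing (assumed machine): on a 64-bit host with 16 GiB of
 * low memory, DM_BUFIO_MEMORY_PERCENT of 2 gives a default cache size of
 * roughly 328 MiB; the DM_BUFIO_VMALLOC_PERCENT cap mostly matters on
 * 32-bit systems, where the vmalloc area is small.
 */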
/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;

	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
	destroy_workqueue(dm_bufio_wq);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	WARN_ON(bug); /* leaks are not worth crashing the system */
}
module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_kmalloc_bytes, dm_bufio_allocated_kmalloc, ulong, 0444);
MODULE_PARM_DESC(allocated_kmalloc_bytes, "Memory allocated with kmalloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");