/*
 * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */
6 #include "dm-block-manager.h"
7 #include "dm-persistent-data-internal.h"
9 #include <linux/dm-io.h>
10 #include <linux/slab.h>
11 #include <linux/device-mapper.h>
12 #include <linux/export.h>
14 #define DM_MSG_PREFIX "block manager"
/*----------------------------------------------------------------*/

#define SECTOR_SIZE (1 << SECTOR_SHIFT)
#define MAX_CACHE_SIZE 16U
enum dm_block_state {
	BS_EMPTY,
	BS_CLEAN,
	BS_READING,
	BS_WRITING,
	BS_READ_LOCKED,
	BS_READ_LOCKED_DIRTY,	/* Block was dirty before it was read locked. */
	BS_WRITE_LOCKED,
	BS_DIRTY,
	BS_ERROR
};
struct dm_block {
	struct list_head list;
	struct hlist_node hlist;

	dm_block_t where;
	struct dm_block_validator *validator;
	void *data;
	wait_queue_head_t io_q;
	unsigned read_lock_count;
	unsigned write_lock_pending;
	enum dm_block_state state;

	/*
	 * Extra flags like REQ_FLUSH and REQ_FUA can be set here.  This is
	 * mainly to avoid a race condition in flush_and_unlock() where
	 * the newly-unlocked superblock may have been submitted for a
	 * write before the write_all_dirty() call is made.
	 */
	int io_flags;

	/*
	 * Sadly we need an up pointer so we can get to the bm on io
	 * completion.
	 */
	struct dm_block_manager *bm;
};
struct dm_block_manager {
	struct block_device *bdev;
	unsigned cache_size;	/* Number of blocks held in the cache */
	unsigned block_size;	/* In bytes */
	dm_block_t nr_blocks;

	/*
	 * This will trigger every time an io completes.
	 */
	wait_queue_head_t io_q;

	struct dm_io_client *io;

	/*
	 * Protects all the lists and the hash table.
	 */
	spinlock_t lock;

	struct list_head empty_list;	/* No block assigned */
	struct list_head clean_list;	/* Unlocked and clean */
	struct list_head dirty_list;	/* Unlocked and dirty */
	struct list_head error_list;

	unsigned available_count;
	unsigned reading_count;
	unsigned writing_count;

	struct kmem_cache *block_cache;		/* struct dm_block */
	struct kmem_cache *buffer_cache;	/* The buffers that store the raw data */

	/*
	 * Hash table of cached blocks, holds everything that isn't in the
	 * BS_EMPTY state.
	 */
	unsigned hash_size;
	unsigned hash_mask;

	struct hlist_head buckets[0];	/* Must be last member of struct. */
};
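/*
 * The bucket array above is a zero-length trailing member, so a manager and
 * its hash buckets come from a single allocation.  A minimal sketch of the
 * sizing, mirroring what dm_block_manager_create() below computes
 * (illustrative, not a separate helper in this file):
 *
 *	size_t len = sizeof(struct dm_block_manager) +
 *		     sizeof(struct hlist_head) * hash_size;
 *	bm = kmalloc(len, GFP_KERNEL);
 */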
dm_block_t dm_block_location(struct dm_block *b)
{
	return b->where;
}
EXPORT_SYMBOL_GPL(dm_block_location);

void *dm_block_data(struct dm_block *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_block_data);
/*----------------------------------------------------------------
 *--------------------------------------------------------------*/
static struct dm_block *__find_block(struct dm_block_manager *bm, dm_block_t b)
{
	unsigned bucket = dm_hash_block(b, bm->hash_mask);
	struct dm_block *blk;
	struct hlist_node *n;

	hlist_for_each_entry(blk, n, bm->buckets + bucket, hlist)
		if (blk->where == b)
			return blk;

	return NULL;
}
static void __insert_block(struct dm_block_manager *bm, struct dm_block *b)
{
	unsigned bucket = dm_hash_block(b->where, bm->hash_mask);

	hlist_add_head(&b->hlist, bm->buckets + bucket);
}
/*----------------------------------------------------------------
 * __transition() handles transition of a block between different states.
 * Study this to understand the state machine.
 *
 * Alternatively install graphviz and run:
 *     grep DOT dm-block-manager.c | grep -v ' ' |
 *     sed -e 's/.*DOT: //' -e 's/\*\///' |
 *     dot -Tps -o states.ps
 *
 * Assumes bm->lock is held.
 *--------------------------------------------------------------*/
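/*
 * For quick reference, the transitions encoded in the DOT annotations in
 * __transition() below are (roughly):
 *
 *	empty -> reading            reading -> clean | error
 *	dirty -> writing            writing -> clean | error
 *	clean -> read_locked | write_locked | empty
 *	dirty -> read_locked_dirty | write_locked
 *	read_locked -> clean        read_locked_dirty -> dirty
 *	write_locked -> dirty       error | clean -> empty
 */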
static void __transition(struct dm_block *b, enum dm_block_state new_state)
{
	/* DOT: digraph BlockStates { */
	struct dm_block_manager *bm = b->bm;

	switch (new_state) {
	case BS_EMPTY:
		/* DOT: error -> empty */
		/* DOT: clean -> empty */
		BUG_ON(!((b->state == BS_ERROR) ||
			 (b->state == BS_CLEAN)));
		hlist_del(&b->hlist);
		list_move(&b->list, &bm->empty_list);
		b->write_lock_pending = 0;
		b->read_lock_count = 0;
		b->io_flags = 0;
		b->validator = NULL;

		if (b->state == BS_ERROR)
			bm->available_count++;
		break;

	case BS_CLEAN:
		/* DOT: reading -> clean */
		/* DOT: writing -> clean */
		/* DOT: read_locked -> clean */
		BUG_ON(!((b->state == BS_READING) ||
			 (b->state == BS_WRITING) ||
			 (b->state == BS_READ_LOCKED)));
		switch (b->state) {
		case BS_READING:
			BUG_ON(!bm->reading_count);
			bm->reading_count--;
			break;

		case BS_WRITING:
			BUG_ON(!bm->writing_count);
			bm->writing_count--;
			b->io_flags = 0;
			break;

		default:
			break;
		}
		list_add_tail(&b->list, &bm->clean_list);
		bm->available_count++;
		break;

	case BS_READING:
		/* DOT: empty -> reading */
		BUG_ON(!(b->state == BS_EMPTY));
		__insert_block(bm, b);
		list_del(&b->list);
		bm->available_count--;
		bm->reading_count++;
		break;

	case BS_WRITING:
		/* DOT: dirty -> writing */
		BUG_ON(!(b->state == BS_DIRTY));
		list_del(&b->list);
		bm->writing_count++;
		break;

	case BS_READ_LOCKED:
		/* DOT: clean -> read_locked */
		BUG_ON(!(b->state == BS_CLEAN));
		list_del(&b->list);
		bm->available_count--;
		break;

	case BS_READ_LOCKED_DIRTY:
		/* DOT: dirty -> read_locked_dirty */
		BUG_ON(!(b->state == BS_DIRTY));
		list_del(&b->list);
		break;

	case BS_WRITE_LOCKED:
		/* DOT: dirty -> write_locked */
		/* DOT: clean -> write_locked */
		BUG_ON(!((b->state == BS_DIRTY) ||
			 (b->state == BS_CLEAN)));
		list_del(&b->list);

		if (b->state == BS_CLEAN)
			bm->available_count--;
		break;

	case BS_DIRTY:
		/* DOT: write_locked -> dirty */
		/* DOT: read_locked_dirty -> dirty */
		BUG_ON(!((b->state == BS_WRITE_LOCKED) ||
			 (b->state == BS_READ_LOCKED_DIRTY)));
		list_add_tail(&b->list, &bm->dirty_list);
		break;

	case BS_ERROR:
		/* DOT: writing -> error */
		/* DOT: reading -> error */
		BUG_ON(!((b->state == BS_WRITING) ||
			 (b->state == BS_READING)));
		list_add_tail(&b->list, &bm->error_list);
		break;
	}
	/* DOT: } */

	b->state = new_state;
}
/*----------------------------------------------------------------
 *--------------------------------------------------------------*/
typedef void (completion_fn)(unsigned long error, struct dm_block *b);
static void submit_io(struct dm_block *b, int rw, completion_fn fn)
{
	struct dm_block_manager *bm = b->bm;
	struct dm_io_request req;
	struct dm_io_region region;
	unsigned sectors_per_block = bm->block_size >> SECTOR_SHIFT;

	region.bdev = bm->bdev;
	region.sector = b->where * sectors_per_block;
	region.count = sectors_per_block;

	req.bi_rw = rw;
	req.mem.type = DM_IO_KMEM;
	req.mem.offset = 0;
	req.mem.ptr.addr = b->data;
	req.notify.fn = (void (*)(unsigned long, void *)) fn;
	req.notify.context = b;
	req.client = bm->io;

	/* Treat a failed submission as an io error. */
	if (dm_io(&req, 1, &region, NULL) < 0)
		fn(1, b);
}
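/*
 * A note on the cast above: dm-io calls req.notify.fn(error_bits, context)
 * asynchronously when the io completes, possibly from interrupt context.
 * The context registered here is always the struct dm_block itself, so a
 * completion_fn can stand in for the notify callback; this is also why
 * complete_io() below takes bm->lock with spin_lock_irqsave().  A sketch of
 * the equivalent un-cast form (the thunk name is illustrative only):
 *
 *	static void notify_thunk(unsigned long error, void *context)
 *	{
 *		complete_io(error, context);
 *	}
 *
 *	req.notify.fn = notify_thunk;
 */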
/*----------------------------------------------------------------
 *--------------------------------------------------------------*/
static void __complete_io(unsigned long error, struct dm_block *b)
{
	struct dm_block_manager *bm = b->bm;

	if (error) {
		DMERR("io error = %lu, block = %llu",
		      error, (unsigned long long)b->where);
		__transition(b, BS_ERROR);
	} else
		__transition(b, BS_CLEAN);

	wake_up(&b->io_q);
	wake_up(&bm->io_q);
}

static void complete_io(unsigned long error, struct dm_block *b)
{
	struct dm_block_manager *bm = b->bm;
	unsigned long flags;

	spin_lock_irqsave(&bm->lock, flags);
	__complete_io(error, b);
	spin_unlock_irqrestore(&bm->lock, flags);
}
static void read_block(struct dm_block *b)
{
	submit_io(b, READ, complete_io);
}

static void write_block(struct dm_block *b)
{
	if (b->validator)
		b->validator->prepare_for_write(b->validator, b,
						b->bm->block_size);

	submit_io(b, WRITE | b->io_flags, complete_io);
}
static void write_dirty(struct dm_block_manager *bm, unsigned count)
{
	struct dm_block *b, *tmp;
	struct list_head dirty;
	unsigned long flags;

	/*
	 * Grab the first @count entries from the dirty list
	 */
	INIT_LIST_HEAD(&dirty);
	spin_lock_irqsave(&bm->lock, flags);
	list_for_each_entry_safe(b, tmp, &bm->dirty_list, list) {
		if (!count--)
			break;
		__transition(b, BS_WRITING);
		list_add_tail(&b->list, &dirty);
	}
	spin_unlock_irqrestore(&bm->lock, flags);

	list_for_each_entry_safe(b, tmp, &dirty, list) {
		list_del(&b->list);
		write_block(b);
	}
}
static void write_all_dirty(struct dm_block_manager *bm)
{
	write_dirty(bm, bm->cache_size);
}
static void __clear_errors(struct dm_block_manager *bm)
{
	struct dm_block *b, *tmp;
	list_for_each_entry_safe(b, tmp, &bm->error_list, list)
		__transition(b, BS_EMPTY);
}
/*----------------------------------------------------------------
 *--------------------------------------------------------------*/
#ifdef __CHECKER__
#  define __retains(x) __attribute__((context(x, 1, 1)))
#else
#  define __retains(x)
#endif

#define __wait_block(wq, lock, flags, sched_fn, condition)	\
do {								\
	int r = 0;						\
	DEFINE_WAIT(wait);					\
								\
	add_wait_queue(wq, &wait);				\
	for (;;) {						\
		prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);	\
		if (condition)					\
			break;					\
								\
		spin_unlock_irqrestore(lock, flags);		\
		if (signal_pending(current)) {			\
			r = -ERESTARTSYS;			\
			spin_lock_irqsave(lock, flags);		\
			break;					\
		}						\
								\
		sched_fn();					\
		spin_lock_irqsave(lock, flags);			\
	}							\
								\
	finish_wait(wq, &wait);					\
	return r;						\
} while (0)
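/*
 * Note that __wait_block() ends with a "return r;", so each __wait_*()
 * helper below returns directly from the macro: 0 once the condition holds,
 * or an error if the sleep was interrupted by a signal.  The spinlock is
 * dropped while sleeping and retaken before returning; the __retains()
 * annotation tells sparse the lock is held on entry and on exit.  A sketch
 * of the calling pattern used throughout this file:
 *
 *	spin_lock_irqsave(&bm->lock, flags);
 *	__wait_all_writes(bm, &flags);	(may drop and retake bm->lock)
 *	spin_unlock_irqrestore(&bm->lock, flags);
 */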
static int __wait_io(struct dm_block *b, unsigned long *flags)
	__retains(&b->bm->lock)
{
	__wait_block(&b->io_q, &b->bm->lock, *flags, io_schedule,
		     ((b->state != BS_READING) && (b->state != BS_WRITING)));
}
static int __wait_unlocked(struct dm_block *b, unsigned long *flags)
	__retains(&b->bm->lock)
{
	__wait_block(&b->io_q, &b->bm->lock, *flags, schedule,
		     ((b->state == BS_CLEAN) || (b->state == BS_DIRTY)));
}
static int __wait_read_lockable(struct dm_block *b, unsigned long *flags)
	__retains(&b->bm->lock)
{
	__wait_block(&b->io_q, &b->bm->lock, *flags, schedule,
		     (!b->write_lock_pending && (b->state == BS_CLEAN ||
						 b->state == BS_DIRTY ||
						 b->state == BS_READ_LOCKED)));
}
static int __wait_all_writes(struct dm_block_manager *bm, unsigned long *flags)
	__retains(&bm->lock)
{
	__wait_block(&bm->io_q, &bm->lock, *flags, io_schedule,
		     !bm->writing_count);
}
static int __wait_all_io(struct dm_block_manager *bm, unsigned long *flags)
	__retains(&bm->lock)
{
	__wait_block(&bm->io_q, &bm->lock, *flags, io_schedule,
		     !bm->writing_count && !bm->reading_count);
}
static int __wait_clean(struct dm_block_manager *bm, unsigned long *flags)
	__retains(&bm->lock)
{
	__wait_block(&bm->io_q, &bm->lock, *flags, io_schedule,
		     (!list_empty(&bm->clean_list) ||
		      (!bm->writing_count)));
}
/*----------------------------------------------------------------
 * Finding a free block to recycle
 *--------------------------------------------------------------*/
static int recycle_block(struct dm_block_manager *bm, dm_block_t where,
			 int need_read, struct dm_block_validator *v,
			 struct dm_block **result)
{
	int r = 0;
	struct dm_block *b;
	unsigned long flags, available;

	/*
	 * Wait for a block to appear on the empty or clean lists.
	 */
	spin_lock_irqsave(&bm->lock, flags);
	while (1) {
		/*
		 * Once we can lock and do io concurrently then we should
		 * probably flush at bm->cache_size / 2 and write _all_
		 * dirty blocks.
		 */
		available = bm->available_count + bm->writing_count;
		if (available < bm->cache_size / 4) {
			spin_unlock_irqrestore(&bm->lock, flags);
			write_dirty(bm, bm->cache_size / 4);
			spin_lock_irqsave(&bm->lock, flags);
		}

		if (!list_empty(&bm->empty_list)) {
			b = list_first_entry(&bm->empty_list, struct dm_block, list);
			break;

		} else if (!list_empty(&bm->clean_list)) {
			b = list_first_entry(&bm->clean_list, struct dm_block, list);
			__transition(b, BS_EMPTY);
			break;
		}

		__wait_clean(bm, &flags);
	}

	b->where = where;
	b->validator = v;
	__transition(b, BS_READING);

	if (!need_read) {
		memset(b->data, 0, bm->block_size);
		__transition(b, BS_CLEAN);
	} else {
		spin_unlock_irqrestore(&bm->lock, flags);
		read_block(b);
		spin_lock_irqsave(&bm->lock, flags);
		__wait_io(b, &flags);

		/* FIXME: Can b have been recycled between io completion and here? */

		/*
		 * Did the io succeed?
		 */
		if (b->state == BS_ERROR) {
			/*
			 * Since this is a read that has failed we can clear the error
			 * immediately.  Failed writes are revealed during a commit.
			 */
			__transition(b, BS_EMPTY);
			r = -EIO;
		}

		if (!r && b->validator) {
			r = b->validator->check(b->validator, b, bm->block_size);
			if (r) {
				DMERR("%s validator check failed for block %llu",
				      b->validator->name, (unsigned long long)b->where);
				__transition(b, BS_EMPTY);
			}
		}
	}
	spin_unlock_irqrestore(&bm->lock, flags);

	if (!r)
		*result = b;

	return r;
}
/*----------------------------------------------------------------
 * Low level block management
 *--------------------------------------------------------------*/
static struct dm_block *alloc_block(struct dm_block_manager *bm)
{
	struct dm_block *b = kmem_cache_alloc(bm->block_cache, GFP_KERNEL);

	if (!b)
		return NULL;

	INIT_LIST_HEAD(&b->list);
	INIT_HLIST_NODE(&b->hlist);

	b->data = kmem_cache_alloc(bm->buffer_cache, GFP_KERNEL);
	if (!b->data) {
		kmem_cache_free(bm->block_cache, b);
		return NULL;
	}

	b->validator = NULL;
	b->state = BS_EMPTY;
	init_waitqueue_head(&b->io_q);
	b->read_lock_count = 0;
	b->write_lock_pending = 0;
	b->io_flags = 0;
	b->bm = bm;

	return b;
}
static void free_block(struct dm_block *b)
{
	kmem_cache_free(b->bm->buffer_cache, b->data);
	kmem_cache_free(b->bm->block_cache, b);
}
static int populate_bm(struct dm_block_manager *bm, unsigned count)
{
	unsigned i;
	LIST_HEAD(bs);

	for (i = 0; i < count; i++) {
		struct dm_block *b = alloc_block(bm);
		if (!b) {
			struct dm_block *tmp;
			list_for_each_entry_safe(b, tmp, &bs, list)
				free_block(b);
			return -ENOMEM;
		}

		list_add(&b->list, &bs);
	}

	list_replace(&bs, &bm->empty_list);
	bm->available_count = count;

	return 0;
}
/*----------------------------------------------------------------
 *--------------------------------------------------------------*/
static unsigned calc_hash_size(unsigned cache_size)
{
	unsigned r = 32;	/* Minimum size is 32 */

	while (r < cache_size)
		r <<= 1;

	return r;
}
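/*
 * calc_hash_size() rounds the requested cache size up to the next power of
 * two (with a floor of 32), so hash_size - 1 can be used as a bucket mask
 * by __find_block() and __insert_block().  Worked examples (illustrative
 * values): a cache_size of 16 gives 32, 100 gives 128 and 4096 gives 4096.
 */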
struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
						 unsigned block_size,
						 unsigned cache_size)
{
	unsigned i;
	unsigned hash_size = calc_hash_size(cache_size);
	size_t len = sizeof(struct dm_block_manager) +
		     sizeof(struct hlist_head) * hash_size;
	struct dm_block_manager *bm;

	bm = kmalloc(len, GFP_KERNEL);
	if (!bm)
		return NULL;

	bm->bdev = bdev;
	bm->cache_size = max(MAX_CACHE_SIZE, cache_size);
	bm->block_size = block_size;
	bm->nr_blocks = i_size_read(bdev->bd_inode);
	do_div(bm->nr_blocks, block_size);
	init_waitqueue_head(&bm->io_q);
	spin_lock_init(&bm->lock);

	INIT_LIST_HEAD(&bm->empty_list);
	INIT_LIST_HEAD(&bm->clean_list);
	INIT_LIST_HEAD(&bm->dirty_list);
	INIT_LIST_HEAD(&bm->error_list);
	bm->available_count = 0;
	bm->reading_count = 0;
	bm->writing_count = 0;

	bm->block_cache = kmem_cache_create("dm-block-manager-blocks",
					    sizeof(struct dm_block),
					    __alignof__(struct dm_block),
					    SLAB_HWCACHE_ALIGN, NULL);
	if (!bm->block_cache)
		goto bad_bm;

	bm->buffer_cache = kmem_cache_create("dm-block-manager-buffers",
					     block_size, SECTOR_SIZE,
					     0, NULL);
	if (!bm->buffer_cache)
		goto bad_block_cache;

	bm->hash_size = hash_size;
	bm->hash_mask = hash_size - 1;
	for (i = 0; i < hash_size; i++)
		INIT_HLIST_HEAD(bm->buckets + i);

	bm->io = dm_io_client_create();
	if (IS_ERR(bm->io))
		goto bad_buffer_cache;

	if (populate_bm(bm, cache_size) < 0)
		goto bad_io_client;

	return bm;

bad_io_client:
	dm_io_client_destroy(bm->io);
bad_buffer_cache:
	kmem_cache_destroy(bm->buffer_cache);
bad_block_cache:
	kmem_cache_destroy(bm->block_cache);
bad_bm:
	kfree(bm);

	return NULL;
}
EXPORT_SYMBOL_GPL(dm_block_manager_create);
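/*
 * A minimal usage sketch for the create/destroy pair.  The 4096-byte block
 * size and 64-block cache below are illustrative values only:
 *
 *	struct dm_block_manager *bm;
 *
 *	bm = dm_block_manager_create(bdev, 4096, 64);
 *	if (!bm)
 *		return -ENOMEM;
 *	...
 *	dm_block_manager_destroy(bm);
 */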
void dm_block_manager_destroy(struct dm_block_manager *bm)
{
	unsigned i;
	struct dm_block *b, *btmp;
	struct hlist_node *n, *tmp;

	dm_io_client_destroy(bm->io);

	for (i = 0; i < bm->hash_size; i++)
		hlist_for_each_entry_safe(b, n, tmp, bm->buckets + i, hlist)
			free_block(b);

	list_for_each_entry_safe(b, btmp, &bm->empty_list, list)
		free_block(b);

	kmem_cache_destroy(bm->buffer_cache);
	kmem_cache_destroy(bm->block_cache);

	kfree(bm);
}
EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
unsigned dm_bm_block_size(struct dm_block_manager *bm)
{
	return bm->block_size;
}
EXPORT_SYMBOL_GPL(dm_bm_block_size);
dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
{
	return bm->nr_blocks;
}
static int lock_internal(struct dm_block_manager *bm, dm_block_t block,
			 int how, int need_read, int can_block,
			 struct dm_block_validator *v,
			 struct dm_block **result)
{
	int r = 0;
	struct dm_block *b;
	unsigned long flags;

	spin_lock_irqsave(&bm->lock, flags);
retry:
	b = __find_block(bm, block);
	if (b) {
		if (b->validator && (v != b->validator)) {
			DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
			      b->validator->name, v ? v->name : "NULL",
			      (unsigned long long)b->where);
			spin_unlock_irqrestore(&bm->lock, flags);
			return -EINVAL;
		}

		if (!b->validator && v) {
			b->validator = v;
			r = b->validator->check(b->validator, b, bm->block_size);
			if (r) {
				DMERR("%s validator check failed for block %llu",
				      b->validator->name,
				      (unsigned long long)b->where);
				spin_unlock_irqrestore(&bm->lock, flags);
				return r;
			}
		}

		switch (how) {
		case READ:
			if (b->write_lock_pending || (b->state != BS_CLEAN &&
						      b->state != BS_DIRTY &&
						      b->state != BS_READ_LOCKED)) {
				if (!can_block) {
					spin_unlock_irqrestore(&bm->lock, flags);
					return -EWOULDBLOCK;
				}

				__wait_read_lockable(b, &flags);

				if (b->where != block)
					goto retry;
			}
			break;

		case WRITE:
			while (b->state != BS_CLEAN && b->state != BS_DIRTY) {
				if (!can_block) {
					spin_unlock_irqrestore(&bm->lock, flags);
					return -EWOULDBLOCK;
				}

				b->write_lock_pending++;
				__wait_unlocked(b, &flags);
				b->write_lock_pending--;
				if (b->where != block)
					goto retry;
			}
			break;
		}

	} else if (!can_block) {
		r = -EWOULDBLOCK;
		goto out;

	} else {
		spin_unlock_irqrestore(&bm->lock, flags);
		r = recycle_block(bm, block, need_read, v, &b);
		spin_lock_irqsave(&bm->lock, flags);
	}

	if (!r) {
		switch (how) {
		case READ:
			b->read_lock_count++;

			if (b->state == BS_DIRTY)
				__transition(b, BS_READ_LOCKED_DIRTY);
			else if (b->state == BS_CLEAN)
				__transition(b, BS_READ_LOCKED);
			break;

		case WRITE:
			__transition(b, BS_WRITE_LOCKED);
			break;
		}

		*result = b;
	}

out:
	spin_unlock_irqrestore(&bm->lock, flags);

	return r;
}
int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
		    struct dm_block_validator *v,
		    struct dm_block **result)
{
	return lock_internal(bm, b, READ, 1, 1, v, result);
}
EXPORT_SYMBOL_GPL(dm_bm_read_lock);
int dm_bm_write_lock(struct dm_block_manager *bm,
		     dm_block_t b, struct dm_block_validator *v,
		     struct dm_block **result)
{
	return lock_internal(bm, b, WRITE, 1, 1, v, result);
}
EXPORT_SYMBOL_GPL(dm_bm_write_lock);
int dm_bm_read_try_lock(struct dm_block_manager *bm,
			dm_block_t b, struct dm_block_validator *v,
			struct dm_block **result)
{
	return lock_internal(bm, b, READ, 1, 0, v, result);
}
int dm_bm_write_lock_zero(struct dm_block_manager *bm,
			  dm_block_t b, struct dm_block_validator *v,
			  struct dm_block **result)
{
	int r = lock_internal(bm, b, WRITE, 0, 1, v, result);

	if (!r)
		memset((*result)->data, 0, bm->block_size);

	return r;
}
int dm_bm_unlock(struct dm_block *b)
{
	int r = 0;
	unsigned long flags;

	spin_lock_irqsave(&b->bm->lock, flags);
	switch (b->state) {
	case BS_WRITE_LOCKED:
		__transition(b, BS_DIRTY);
		wake_up(&b->io_q);
		break;

	case BS_READ_LOCKED:
		if (!--b->read_lock_count) {
			__transition(b, BS_CLEAN);
			wake_up(&b->io_q);
		}
		break;

	case BS_READ_LOCKED_DIRTY:
		if (!--b->read_lock_count) {
			__transition(b, BS_DIRTY);
			wake_up(&b->io_q);
		}
		break;

	default:
		DMERR("block = %llu not locked",
		      (unsigned long long)b->where);
		r = -EINVAL;
		break;
	}
	spin_unlock_irqrestore(&b->bm->lock, flags);

	return r;
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);
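/*
 * Locking usage sketch: every successful dm_bm_read_lock() or
 * dm_bm_write_lock() must eventually be paired with dm_bm_unlock().  The
 * validator and do_something() below are illustrative; passing a NULL
 * validator skips the content checks.
 *
 *	struct dm_block *blk;
 *	int r;
 *
 *	r = dm_bm_read_lock(bm, where, &my_validator, &blk);
 *	if (r)
 *		return r;
 *	do_something(dm_block_data(blk));
 *	dm_bm_unlock(blk);
 */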
static int __wait_flush(struct dm_block_manager *bm)
{
	int r = 0;
	unsigned long flags;

	spin_lock_irqsave(&bm->lock, flags);
	__wait_all_writes(bm, &flags);

	if (!list_empty(&bm->error_list)) {
		r = -EIO;
		__clear_errors(bm);
	}
	spin_unlock_irqrestore(&bm->lock, flags);

	return r;
}
int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
			   struct dm_block *superblock)
{
	int r;
	unsigned long flags;

	write_all_dirty(bm);
	r = __wait_flush(bm);
	if (r)
		return r;

	spin_lock_irqsave(&bm->lock, flags);
	superblock->io_flags = REQ_FUA | REQ_FLUSH;
	spin_unlock_irqrestore(&bm->lock, flags);

	dm_bm_unlock(superblock);
	write_all_dirty(bm);

	return __wait_flush(bm);
}
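/*
 * Commit-style usage sketch (the superblock location, validator and
 * update_superblock() below are illustrative): the caller write-locks the
 * superblock, updates it, then hands the lock back through
 * dm_bm_flush_and_unlock() so the superblock is written with
 * REQ_FLUSH | REQ_FUA only after the other dirty blocks have gone out:
 *
 *	r = dm_bm_write_lock(bm, SUPERBLOCK_LOCATION, &sb_validator, &sblock);
 *	if (r)
 *		return r;
 *	update_superblock(dm_block_data(sblock));
 *	r = dm_bm_flush_and_unlock(bm, sblock);
 */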
int dm_bm_rebind_block_device(struct dm_block_manager *bm,
			      struct block_device *bdev)
{
	unsigned long flags;
	dm_block_t nr_blocks = i_size_read(bdev->bd_inode);

	do_div(nr_blocks, bm->block_size);

	spin_lock_irqsave(&bm->lock, flags);
	if (nr_blocks < bm->nr_blocks) {
		spin_unlock_irqrestore(&bm->lock, flags);
		return -EINVAL;
	}

	bm->bdev = bdev;
	bm->nr_blocks = nr_blocks;

	/*
	 * Wait for any in-flight io that may be using the old bdev
	 */
	__wait_all_io(bm, &flags);
	spin_unlock_irqrestore(&bm->lock, flags);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_rebind_block_device);