/*
 * Copyright (C) 2011 Red Hat UK. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * 8)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)
/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mappings).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing.  I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */
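/*
 * A rough illustration of that extra break (example values, not from the
 * original comment): if a block is referenced by an origin plus two
 * snapshots, each device's first write copies the block.  Because the
 * timestamp check keeps reporting the block as shared even once a device is
 * its sole remaining user, that last writer can still perform a copy it no
 * longer strictly needs - the one extra break counted above.
 */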
/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away.  We put them in prison
 * where they can't cause any mischief.  Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell.  When the cell is
 * subsequently unlocked the bios become available.
 */
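/*
 * A minimal usage sketch (illustrative only, pieced together from the code
 * below, not part of the driver): the caller builds a key for the block,
 * tries to detain its bio, and only does the expensive work if it was the
 * first to claim the cell.
 *
 *	struct cell *cell;
 *	struct cell_key key;
 *
 *	build_data_key(tc->td, block, &key);
 *	if (bio_detain(tc->pool->prison, &key, bio, &cell))
 *		return;		// someone else already holds this block
 *	// ... quiesce/copy/zero, then cell_release() or cell_error() ...
 */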
	struct hlist_node list;
	struct bio_prison *prison;

	struct bio_list bios;

	mempool_t *cell_pool;

	struct hlist_head *cells;

static uint32_t calc_nr_buckets(unsigned nr_cells)
	nr_cells = min(nr_cells, 8192u);

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
							sizeof(struct cell));
	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

static void prison_destroy(struct bio_prison *prison)
	mempool_destroy(prison->cell_pool);

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);

static struct cell *__search_bucket(struct hlist_head *bucket,
				    struct cell_key *key)
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (!memcmp(&cell->key, key, sizeof(cell->key)))

/*
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns the number of entries in the cell prior to the new addition
 */
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
		      struct bio *inmate, struct cell **ref)
	uint32_t hash = hash_key(prison, key);
	struct cell *uninitialized_var(cell), *cell2 = NULL;

	BUG_ON(hash > prison->nr_buckets);

	spin_lock_irqsave(&prison->lock, flags);
	cell = __search_bucket(prison->cells + hash, key);

	/*
	 * Allocate a new cell
	 */
	spin_unlock_irqrestore(&prison->lock, flags);
	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
	spin_lock_irqsave(&prison->lock, flags);

	/*
	 * We've been unlocked, so we have to double check that
	 * nobody else has inserted this cell in the meantime.
	 */
	cell = __search_bucket(prison->cells + hash, key);

	cell->prison = prison;
	memcpy(&cell->key, key, sizeof(cell->key));
	bio_list_init(&cell->bios);
	hlist_add_head(&cell->list, prison->cells + hash);

	bio_list_add(&cell->bios, inmate);
	spin_unlock_irqrestore(&prison->lock, flags);

	mempool_free(cell2, prison->cell_pool);

static int bio_detain_if_occupied(struct bio_prison *prison, struct cell_key *key,
				  struct bio *inmate, struct cell **ref)
	uint32_t hash = hash_key(prison, key);
	struct cell *uninitialized_var(cell);

	BUG_ON(hash > prison->nr_buckets);

	spin_lock_irqsave(&prison->lock, flags);
	cell = __search_bucket(prison->cells + hash, key);

	spin_unlock_irqrestore(&prison->lock, flags);

	bio_list_add(&cell->bios, inmate);
	spin_unlock_irqrestore(&prison->lock, flags);

/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct cell *cell, struct bio_list *inmates)
	struct bio_prison *prison = cell->prison;

	hlist_del(&cell->list);

	bio_list_merge(inmates, &cell->bios);

	mempool_free(cell, prison->cell_pool);

static void cell_release(struct cell *cell, struct bio_list *bios)
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, bios);
	spin_unlock_irqrestore(&prison->lock, flags);

static void cell_error(struct cell *cell)
	struct bio_prison *prison = cell->prison;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, &bios);
	spin_unlock_irqrestore(&prison->lock, flags);

	while ((bio = bio_list_pop(&bios)))

/*----------------------------------------------------------------*/

/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed.  Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */
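/*
 * A rough sketch of how the set is used (illustrative, pieced together from
 * the code below): a read to a shared block joins the current entry via
 * ds_inc() before being remapped, the write that breaks sharing parks its
 * new_mapping with ds_add_work(), and the read's endio calls ds_dec(), which
 * sweeps and hands back any work items whose entries have drained.
 *
 *	h->entry = ds_inc(&pool->ds);		// shared read issued
 *	ds_add_work(&pool->ds, &m->list);	// writer waits for readers
 *	ds_dec(h->entry, &mappings);		// read completes, maybe sweep
 */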
struct deferred_entry {
	struct deferred_set *ds;
	struct list_head work_items;

struct deferred_set {
	unsigned current_entry;
	struct deferred_entry entries[DEFERRED_SET_SIZE];

static void ds_init(struct deferred_set *ds)
	spin_lock_init(&ds->lock);
	ds->current_entry = 0;

	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
		ds->entries[i].ds = ds;
		ds->entries[i].count = 0;
		INIT_LIST_HEAD(&ds->entries[i].work_items);

static struct deferred_entry *ds_inc(struct deferred_set *ds)
	struct deferred_entry *entry;

	spin_lock_irqsave(&ds->lock, flags);
	entry = ds->entries + ds->current_entry;
	spin_unlock_irqrestore(&ds->lock, flags);

static unsigned ds_next(unsigned index)
	return (index + 1) % DEFERRED_SET_SIZE;

static void __sweep(struct deferred_set *ds, struct list_head *head)
	while ((ds->sweeper != ds->current_entry) &&
	       !ds->entries[ds->sweeper].count) {
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
		ds->sweeper = ds_next(ds->sweeper);

	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);

static void ds_dec(struct deferred_entry *entry, struct list_head *head)
	spin_lock_irqsave(&entry->ds->lock, flags);
	BUG_ON(!entry->count);
	__sweep(entry->ds, head);
	spin_unlock_irqrestore(&entry->ds->lock, flags);

/*
 * Returns 1 if deferred or 0 if no pending items to delay job.
 */
static int ds_add_work(struct deferred_set *ds, struct list_head *work)
	spin_lock_irqsave(&ds->lock, flags);
	if ((ds->sweeper == ds->current_entry) &&
	    !ds->entries[ds->current_entry].count)

	list_add(work, &ds->entries[ds->current_entry].work_items);
	next_entry = ds_next(ds->current_entry);
	if (!ds->entries[next_entry].count)
		ds->current_entry = next_entry;

	spin_unlock_irqrestore(&ds->lock, flags);

/*----------------------------------------------------------------*/

static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct cell_key *key)
	key->dev = dm_thin_dev_id(td);

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct cell_key *key)
	key->dev = dm_thin_dev_id(td);

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct dm_pool_metadata *pmd;

	uint32_t sectors_per_block;
	unsigned block_shift;
	dm_block_t offset_mask;
	dm_block_t low_water_mark;
	unsigned zero_new_blocks:1;

	struct bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *producer_wq;
	struct workqueue_struct *consumer_wq;
	struct work_struct producer;
	struct work_struct consumer;

	struct bio_list deferred_bios;
	struct list_head prepared_mappings;

	int low_water_triggered;	/* A dm event has been sent */
	struct bio_list retry_list;

	struct deferred_set ds;	/* FIXME: move to thin_c */

	mempool_t *mapping_pool;
	mempool_t *endio_hook_pool;

/*
 * Target context for a pool.
 */
	struct dm_target *ti;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	sector_t low_water_mark;
	unsigned zero_new_blocks:1;

/*
 * Target context for a thin.
 */
	struct dm_dev *pool_dev;
	struct dm_thin_device *td;

/* FIXME: Can cells and new_mappings be combined? */

	bio_end_io_t *saved_bi_end_io;
	struct deferred_entry *entry;

	struct list_head list;

	dm_block_t virt_block;
	dm_block_t data_block;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying.  Instead this bio is hooked.  The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	bio_end_io_t *saved_bi_end_io;

/*----------------------------------------------------------------*/

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
	*save = bio->bi_end_io;

/*----------------------------------------------------------------*/

/*
 * A global list that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
	spin_lock_init(&dm_thin_pool_table.lock);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);

static void pool_table_insert(struct pool *pool)
	spin_lock(&dm_thin_pool_table.lock);
	list_add(&pool->list, &dm_thin_pool_table.pools);
	spin_unlock(&dm_thin_pool_table.lock);

static void pool_table_remove(struct pool *pool)
	spin_lock(&dm_thin_pool_table.lock);
	list_del(&pool->list);
	spin_unlock(&dm_thin_pool_table.lock);

static struct pool *pool_table_lookup(struct mapped_device *md)
	struct pool *pool = NULL, *tmp;

	spin_lock(&dm_thin_pool_table.lock);
	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list)
		if (tmp->pool_md == md) {
	spin_unlock(&dm_thin_pool_table.lock);

/*----------------------------------------------------------------*/

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.  wake_producer() being the most notable exception (which is also used
 * by thin-pool to continue deferred IO processing after pool resume).
 */
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
	return bio->bi_sector >> tc->pool->block_shift;

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
	struct pool *pool = tc->pool;

	bio->bi_bdev = tc->pool_dev->bdev;
	bio->bi_sector = (block << pool->block_shift) +
		(bio->bi_sector & pool->offset_mask);
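/*
 * Worked example of the block arithmetic above (numbers chosen purely for
 * illustration): with a 64KB block size, sectors_per_block = 128, so
 * block_shift = 7 and offset_mask = 127.  A bio at bi_sector 1000 lands in
 * virtual block 1000 >> 7 = 7, and remapping it to data block 42 gives
 * bi_sector = (42 << 7) + (1000 & 127) = 5376 + 104 = 5480.
 */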
static void remap_and_issue(struct thin_c *tc, struct bio *bio,
	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		int r = dm_pool_commit_metadata(tc->pool->pmd);
			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",

	remap(tc, bio, block);
	generic_make_request(bio);

static void wake_producer(struct pool *pool)
	queue_work(pool->producer_wq, &pool->producer);

static void __maybe_add_mapping(struct new_mapping *m)
	struct pool *pool = m->tc->pool;

	if (list_empty(&m->list) && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		queue_work(pool->consumer_wq, &pool->consumer);

static void copy_complete(int read_err, unsigned long write_err, void *context)
	struct new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);

static void overwrite_endio(struct bio *bio, int err)
	struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
	struct pool *pool = m->tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);

static void shared_read_endio(struct bio *bio, int err)
	struct list_head mappings;
	struct new_mapping *m, *tmp;
	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct pool *pool = h->tc->pool;

	bio->bi_end_io = h->saved_bi_end_io;

	INIT_LIST_HEAD(&mappings);
	ds_dec(h->entry, &mappings);

	spin_lock_irqsave(&pool->lock, flags);
	list_for_each_entry_safe(m, tmp, &mappings, list) {
		INIT_LIST_HEAD(&m->list);
		__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);

	mempool_free(h, pool->endio_hook_pool);

static int io_covers_block(struct pool *pool, struct bio *bio)
	return ((bio->bi_sector & pool->offset_mask) == 0) &&
		(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
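/*
 * Continuing the 64KB example above: a bio "covers" a block only when it
 * starts on a 128-sector boundary (bi_sector & 127 == 0) and carries exactly
 * 128 << SECTOR_SHIFT = 65536 bytes, i.e. a whole pool block and nothing less.
 */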
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_origin, dm_block_t data_dest,
			  struct cell *cell, struct bio *bio)
	struct pool *pool = tc->pool;
	struct new_mapping *m = mempool_alloc(pool->mapping_pool, GFP_NOIO);

	INIT_LIST_HEAD(&m->list);
	m->virt_block = virt_block;
	m->data_block = data_dest;

	ds_add_work(&pool->ds, &m->list);

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately.  Otherwise we use kcopyd to clone the data first.
	 */
	if (io_covers_block(pool, bio)) {
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		dm_get_mapinfo(bio)->ptr = m;
		remap_and_issue(tc, bio, data_dest);

		struct dm_io_region from, to;

		from.bdev = tc->pool_dev->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_copy() failed");

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct cell *cell,
	struct pool *pool = tc->pool;
	struct new_mapping *m = mempool_alloc(pool->mapping_pool, GFP_NOIO);

	INIT_LIST_HEAD(&m->list);
	m->virt_block = virt_block;
	m->data_block = data_block;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->zero_new_blocks || io_covers_block(pool, bio)) {
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		dm_get_mapinfo(bio)->ptr = m;
		remap_and_issue(tc, bio, data_block);

		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_zero() failed");

static void cell_remap_and_issue(struct thin_c *tc, struct cell *cell,
				 dm_block_t data_block)
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		remap_and_issue(tc, bio, data_block);

static void cell_remap_and_issue_except(struct thin_c *tc, struct cell *cell,
					dm_block_t data_block,
					struct bio *exception)
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		if (bio != exception)
			remap_and_issue(tc, bio, data_block);

static void retry_later(struct bio *bio)
	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
	struct pool *pool = tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->retry_list, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
	dm_block_t free_blocks;
	struct pool *pool = tc->pool;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);

	if (free_blocks <= pool->low_water_mark && !pool->low_water_triggered) {
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = 1;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);

	r = dm_pool_alloc_data_block(pool->pmd, result);

static void process_discard(struct thin_c *tc, struct bio *bio)
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
		if (lookup_result.shared)
			/*
			 * We just ignore shared discards for now, these
			 * are hard, and I want to get deferred
			 * deallocation working first.
			 */
			r = dm_thin_remove_block(tc->td, block);
				DMERR("dm_thin_remove_block() failed");
			remap_and_issue(tc, bio, lookup_result.block);

		/*
		 * Either this isn't provisioned, or preparation for
		 * provisioning may be pending (we could find out by
		 * calling bio_detain_if_occupied).  But even in this case
		 * it's easier to just forget the discard.
		 */
		DMERR("dm_thin_find_block() failed, error = %d", r);

static void no_space(struct cell *cell)
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
			  struct cell_key *key,
			  struct dm_thin_lookup_result *lookup_result)
	dm_block_t data_block;

	bio_detain(tc->pool->prison, key, bio, &cell);

	r = alloc_data_block(tc, &data_block);
		schedule_copy(tc, block, lookup_result->block,
			      data_block, cell, bio);
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
			       struct dm_thin_lookup_result *lookup_result)
	struct pool *pool = tc->pool;

	/*
	 * If cell is already occupied, then sharing is already
	 * in the process of being broken so we have nothing
	 * further to do here.
	 */
	build_data_key(tc->td, lookup_result->block, &key);
	if (bio_detain_if_occupied(pool->prison, &key, bio, &cell))

	if (bio_data_dir(bio) == WRITE)
		break_sharing(tc, bio, block, &key, lookup_result);

		struct endio_hook *h;
		h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);

		h->entry = ds_inc(&pool->ds);
		save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
		dm_get_mapinfo(bio)->ptr = h;
		remap_and_issue(tc, bio, lookup_result->block);

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block)
	dm_block_t data_block;

	/*
	 * If cell is already occupied, then the block is already
	 * being provisioned so we have nothing further to do here.
	 */
	build_virtual_key(tc->td, block, &key);
	if (bio_detain(tc->pool->prison, &key, bio, &cell))

	r = alloc_data_block(tc, &data_block);
		schedule_zero(tc, block, data_block, cell, bio);
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);

static void process_bio(struct thin_c *tc, struct bio *bio)
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
		if (lookup_result.shared)
			process_shared_bio(tc, bio, block, &lookup_result);
			remap_and_issue(tc, bio, lookup_result.block);

		/*
		 * When reading, we return zeroes regardless of the
		 * zero_new_blocks setting.
		 */
		if (bio_data_dir(bio) == READ) {
		provision_block(tc, bio, block);
		DMERR("dm_thin_find_block() failed, error = %d", r);

static void process_deferred_bios(struct pool *pool)
	unsigned long flags;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_bios);
	bio_list_init(&pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	while ((bio = bio_list_pop(&bios))) {
		struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
		if (bio->bi_rw & REQ_DISCARD)
			process_discard(tc, bio);
			process_bio(tc, bio);

static void process_prepared_mapping(struct new_mapping *m)
	struct thin_c *tc = m->tc;

		cell_error(m->cell);

		bio->bi_end_io = m->saved_bi_end_io;

	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
		DMERR("dm_thin_insert_block() failed");
		cell_error(m->cell);

		cell_remap_and_issue_except(tc, m->cell, m->data_block, bio);
		cell_remap_and_issue(tc, m->cell, m->data_block);

	mempool_free(m, tc->pool->mapping_pool);

static void process_prepared_mappings(struct pool *pool)
	unsigned long flags;
	struct list_head maps;
	struct new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(&pool->prepared_mappings, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		process_prepared_mapping(m);

static void do_producer(struct work_struct *ws)
	struct pool *pool = container_of(ws, struct pool, producer);

	process_deferred_bios(pool);

static void do_consumer(struct work_struct *ws)
	struct pool *pool = container_of(ws, struct pool, consumer);

	process_prepared_mappings(pool);

static void defer_bio(struct thin_c *tc, struct bio *bio)
	unsigned long flags;
	struct pool *pool = tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_producer(pool);

/*
 * Non-blocking function designed to be called from the target's map
 * function.
 */
static int bio_map(struct dm_target *ti, struct bio *bio,
		   union map_info *map_context)
	struct thin_c *tc = ti->private;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_device *td = tc->td;
	struct pool *pool = tc->pool;
	struct dm_thin_lookup_result result;

	/*
	 * FIXME(hch): In theory higher level code should prevent this
	 * from happening, not sure why we ever get here.
	 */
	if ((bio->bi_rw & REQ_DISCARD) &&
	    bio->bi_size < (pool->sectors_per_block << SECTOR_SHIFT)) {
		DMERR("discard IO smaller than pool block size (%llu)",
		      (unsigned long long)pool->sectors_per_block << SECTOR_SHIFT);
		return DM_MAPIO_SUBMITTED;

	/*
	 * Save the thin context for easy access from the deferred bio later.
	 */
	map_context->ptr = tc;

	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
		return DM_MAPIO_SUBMITTED;

	r = dm_thin_find_block(td, block, 0, &result);

		/*
		 * Note that we defer readahead too.
		 */
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap.  You want to do this anyway to
			 * ensure a consistent application view
			 *
			 * More distant ancestors are irrelevant, the
			 * shared flag will be set in their case.
			 */
			r = DM_MAPIO_SUBMITTED;
			remap(tc, bio, result.block);
			r = DM_MAPIO_REMAPPED;

		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 *
		 * When reading, we return zeroes regardless of the
		 * zero_new_blocks setting.
		 */
		if (bio_data_dir(bio) == READ) {
			r = DM_MAPIO_SUBMITTED;
		r = DM_MAPIO_SUBMITTED;

static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
	unsigned long flags;
	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);

	spin_lock_irqsave(&pt->pool->lock, flags);
	r = !bio_list_empty(&pt->pool->retry_list);
	spin_unlock_irqrestore(&pt->pool->lock, flags);

	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
	r = bdi_congested(&q->backing_dev_info, bdi_bits);

static void __requeue_bios(struct pool *pool)
	bio_list_merge(&pool->deferred_bios, &pool->retry_list);
	bio_list_init(&pool->retry_list);

/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
/* FIXME: add locking */
static int bind_control_target(struct pool *pool, struct dm_target *ti)
	struct pool_c *pt = ti->private;

	pool->low_water_mark = dm_sector_div_up(pt->low_water_mark,
						pool->sectors_per_block);
	pool->zero_new_blocks = pt->zero_new_blocks;
	dm_pool_rebind_metadata_device(pool->pmd, pt->metadata_dev->bdev);

static void unbind_control_target(struct pool *pool, struct dm_target *ti)

/*----------------------------------------------------------------
 *--------------------------------------------------------------*/
static void pool_destroy(struct pool *pool)
	if (dm_pool_metadata_close(pool->pmd) < 0)
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	prison_destroy(pool->prison);
	dm_kcopyd_client_destroy(pool->copier);

	if (pool->producer_wq)
		destroy_workqueue(pool->producer_wq);

	if (pool->consumer_wq)
		destroy_workqueue(pool->consumer_wq);

	mempool_destroy(pool->mapping_pool);
	mempool_destroy(pool->endio_hook_pool);

static struct pool *pool_create(struct block_device *metadata_dev,
				unsigned long block_size, char **error)
	struct dm_pool_metadata *pmd;

	pmd = dm_pool_metadata_open(metadata_dev, block_size);
		*error = "Error creating metadata object";
		return (struct pool *)pmd;

	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
		*error = "Error allocating memory for pool";
		err_p = ERR_PTR(-ENOMEM);

	pool->sectors_per_block = block_size;
	pool->block_shift = ffs(block_size) - 1;
	pool->offset_mask = block_size - 1;
	pool->low_water_mark = 0;
	pool->zero_new_blocks = 1;
	pool->prison = prison_create(PRISON_CELLS);
	if (!pool->prison) {
		*error = "Error creating pool's bio prison";
		err_p = ERR_PTR(-ENOMEM);

	pool->copier = dm_kcopyd_client_create();
	if (IS_ERR(pool->copier)) {
		r = PTR_ERR(pool->copier);
		*error = "Error creating pool's kcopyd client";
		goto bad_kcopyd_client;

	/*
	 * Create singlethreaded workqueues that will service all devices
	 * that use this metadata.
	 */
	pool->producer_wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX "-producer",
	if (!pool->producer_wq) {
		*error = "Error creating pool's producer workqueue";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_producer_wq;

	pool->consumer_wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX "-consumer",
	if (!pool->consumer_wq) {
		*error = "Error creating pool's consumer workqueue";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_consumer_wq;

	INIT_WORK(&pool->producer, do_producer);
	INIT_WORK(&pool->consumer, do_consumer);
	spin_lock_init(&pool->lock);
	bio_list_init(&pool->deferred_bios);
	INIT_LIST_HEAD(&pool->prepared_mappings);
	pool->low_water_triggered = 0;
	bio_list_init(&pool->retry_list);

	pool->mapping_pool =
		mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
	if (!pool->mapping_pool) {
		*error = "Error creating pool's mapping mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_mapping_pool;

	pool->endio_hook_pool =
		mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
	if (!pool->endio_hook_pool) {
		*error = "Error creating pool's endio_hook mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_endio_hook_pool;

	atomic_set(&pool->ref_count, 1);

bad_endio_hook_pool:
	mempool_destroy(pool->mapping_pool);
	destroy_workqueue(pool->consumer_wq);
	destroy_workqueue(pool->producer_wq);
	dm_kcopyd_client_destroy(pool->copier);
	prison_destroy(pool->prison);

	if (dm_pool_metadata_close(pmd))
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

static void pool_inc(struct pool *pool)
	atomic_inc(&pool->ref_count);

static void pool_dec(struct pool *pool)
	if (atomic_dec_and_test(&pool->ref_count))

static struct pool *pool_find(struct mapped_device *pool_md,
			      struct block_device *metadata_dev,
			      unsigned long block_size,
	pool = pool_table_lookup(pool_md);

	pool = pool_create(metadata_dev, block_size, error);

/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
static void pool_dtr(struct dm_target *ti)
	struct pool_c *pt = ti->private;

	dm_put_device(ti, pt->metadata_dev);
	dm_put_device(ti, pt->data_dev);
	unbind_control_target(pt->pool, ti);

struct pool_features {
	unsigned zero_new_blocks:1;

static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
			       struct dm_target *ti)
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 1, "Invalid number of pool feature arguments"},

	/*
	 * No feature arguments supplied.
	 */

	r = dm_read_arg_group(_args, as, &argc, &ti->error);

	while (argc && !r) {
		arg_name = dm_shift_arg(as);

		if (!strcasecmp(arg_name, "skip_block_zeroing")) {
			pf->zero_new_blocks = 0;

		ti->error = "Unrecognised pool feature requested";

/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (sectors)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 */
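/*
 * For illustration only, a table line following the syntax above might look
 * like (device paths and numbers invented for the example):
 *
 *	0 1048576 thin-pool /dev/sdb /dev/sdc 128 16384 1 skip_block_zeroing
 *
 * i.e. 64KB data blocks (128 sectors), a low water mark of 16384 sectors,
 * and zeroing of newly-provisioned blocks disabled.
 */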
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
	struct pool_features pf;
	struct dm_arg_set as;
	struct dm_dev *data_dev;
	unsigned long block_size;
	dm_block_t low_water;
	struct dm_dev *metadata_dev;
	sector_t metadata_dev_size;

		ti->error = "Invalid argument count";

	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
		ti->error = "Error opening metadata block device";

	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
	if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
		ti->error = "Metadata device is too large";

	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
		ti->error = "Error getting data device";

	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    !is_power_of_2(block_size)) {
		ti->error = "Invalid block size";

	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water) ||
		ti->error = "Invalid low water mark";

	/*
	 * Set default pool features.
	 */
	memset(&pf, 0, sizeof(pf));
	pf.zero_new_blocks = 1;

	dm_consume_args(&as, 4);
	r = parse_pool_features(&as, &pf, ti);

	pool = pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
			 block_size, &ti->error);

	pt = kmalloc(sizeof(*pt), GFP_KERNEL);

	pt->metadata_dev = metadata_dev;
	pt->data_dev = data_dev;
	pt->low_water_mark = low_water;
	pt->zero_new_blocks = pf.zero_new_blocks;
	ti->num_flush_requests = 1;
	ti->num_discard_requests = 1;

	pt->callbacks.congested_fn = pool_is_congested;
	dm_table_add_target_callbacks(ti->table, &pt->callbacks);

	dm_put_device(ti, data_dev);
	dm_put_device(ti, metadata_dev);

static int pool_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	bio->bi_bdev = pt->data_dev->bdev;
	r = DM_MAPIO_REMAPPED;
	spin_unlock_irqrestore(&pool->lock, flags);

/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
static int pool_preresume(struct dm_target *ti)
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	dm_block_t data_size, sb_data_size;
	unsigned long flags;

	/*
	 * Take control of the pool object.
	 */
	r = bind_control_target(pool, ti);

	data_size = ti->len >> pool->block_shift;
	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
		DMERR("failed to retrieve data device size");

	if (data_size < sb_data_size) {
		DMERR("pool target too small, is %llu blocks (expected %llu)",
		      data_size, sb_data_size);
	} else if (data_size > sb_data_size) {
		r = dm_pool_resize_data_dev(pool->pmd, data_size);
			DMERR("failed to resize data device");

		r = dm_pool_commit_metadata(pool->pmd);
			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",

	spin_lock_irqsave(&pool->lock, flags);
	pool->low_water_triggered = 0;
	__requeue_bios(pool);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_producer(pool);

	/*
	 * The pool object is only present if the pool is active.
	 */
	pool->pool_md = dm_table_get_md(ti->table);
	pool_table_insert(pool);

static void pool_postsuspend(struct dm_target *ti)
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	flush_workqueue(pool->producer_wq);
	flush_workqueue(pool->consumer_wq);

	r = dm_pool_commit_metadata(pool->pmd);
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/

	pool_table_remove(pool);
	pool->pool_md = NULL;

static int check_arg_count(unsigned argc, unsigned args_required)
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	    *dev_id <= MAX_DEV_ID)

	DMWARN("Message received with invalid device id: %s", arg);

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
	r = check_arg_count(argc, 2);

	r = read_dev_id(argv[1], &dev_id, 1);

	r = dm_pool_create_thin(pool->pmd, dev_id);
		DMWARN("Creation of new thinly-provisioned device with id %s failed.",

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
	dm_thin_id origin_dev_id;

	r = check_arg_count(argc, 3);

	r = read_dev_id(argv[1], &dev_id, 1);

	r = read_dev_id(argv[2], &origin_dev_id, 1);

	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
		DMWARN("Creation of new snapshot %s of device %s failed.",

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
	r = check_arg_count(argc, 2);

	r = read_dev_id(argv[1], &dev_id, 1);

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
		DMWARN("Deletion of thin device %s failed.", argv[1]);

static int process_trim_mesg(unsigned argc, char **argv, struct pool *pool)
	r = check_arg_count(argc, 3);

	r = read_dev_id(argv[1], &dev_id, 1);

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_size)) {
		DMWARN("trim device %s: Invalid new size: %s sectors.",

	r = dm_pool_trim_thin_device(pool->pmd, dev_id,
				     dm_sector_div_up(new_size, pool->sectors_per_block));
		DMWARN("Attempt to trim thin device %s failed.", argv[1]);

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
	dm_thin_id old_id, new_id;

	r = check_arg_count(argc, 3);

	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);

	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
		DMWARN("Failed to change transaction id from %s to %s.",

/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   trim		<dev_id> <new_size_in_sectors>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 */
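/*
 * For example (device name and ids invented for illustration), a new thin
 * volume and a snapshot of it could be created from userspace with:
 *
 *	dmsetup message my_pool 0 "create_thin 0"
 *	dmsetup message my_pool 0 "create_snap 1 0"
 *
 * followed by loading a thin target table that references dev_id 0 or 1.
 */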
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "trim"))
		r = process_trim_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	r = dm_pool_commit_metadata(pool->pmd);
		DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",

/*
 * <transaction id> <free metadata space in sectors>
 * <free data space in sectors> <held metadata root>
 */
static int pool_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t held_root;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	case STATUSTYPE_INFO:
		r = dm_pool_get_metadata_transaction_id(pool->pmd,

		r = dm_pool_get_free_metadata_block_count(pool->pmd,
							  &nr_free_blocks_metadata);

		r = dm_pool_get_free_block_count(pool->pmd,
						 &nr_free_blocks_data);

		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);

		DMEMIT("%llu %llu %llu ", (unsigned long long)transaction_id,
		       (unsigned long long)nr_free_blocks_metadata * pool->sectors_per_block,
		       (unsigned long long)nr_free_blocks_data * pool->sectors_per_block);

			DMEMIT("%llu", held_root);

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s %lu %llu ",
		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
		       (unsigned long)pool->sectors_per_block,
		       (unsigned long long)pt->low_water_mark);

		DMEMIT("%u ", !pool->zero_new_blocks);

		if (!pool->zero_new_blocks)
			DMEMIT("skip_block_zeroing ");

static int pool_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
	struct pool_c *pt = ti->private;

	return fn(ti, pt->data_dev, 0, ti->len, data);

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
		      struct bio_vec *biovec, int max_size)
	struct pool_c *pt = ti->private;
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	if (!q->merge_bvec_fn)

	bvm->bi_bdev = pt->data_dev->bdev;

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);

static struct target_type pool_target = {
	.name = "thin-pool",
	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE,
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.postsuspend = pool_postsuspend,
	.preresume = pool_preresume,
	.message = pool_message,
	.status = pool_status,
	.merge = pool_merge,
	.iterate_devices = pool_iterate_devices,
	.io_hints = pool_io_hints,

/*----------------------------------------------------------------*/

static void thin_dtr(struct dm_target *ti)
	struct thin_c *tc = ti->private;

	dm_pool_close_thin_device(tc->td);
	dm_put_device(ti, tc->pool_dev);

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id>
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 */
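/*
 * For illustration, a thin table line following the syntax above (numbers
 * and names invented) might be:
 *
 *	0 2097152 thin /dev/mapper/my_pool 0
 *
 * which presents a 1GB thin volume backed by dev_id 0 of my_pool.
 */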
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
	struct dm_dev *pool_dev;
	struct mapped_device *pool_md;

		ti->error = "Invalid argument count";

	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
		ti->error = "Out of memory";

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
		ti->error = "Error opening pool device";
	tc->pool_dev = pool_dev;

	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
		ti->error = "Invalid device id";

	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
		ti->error = "Couldn't get pool mapped device";

	tc->pool = pool_table_lookup(pool_md);
		ti->error = "Couldn't find pool object";
		goto bad_pool_lookup;

	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
		ti->error = "Couldn't open thin internal device";

	ti->split_io = tc->pool->sectors_per_block;
	ti->num_flush_requests = 1;
	ti->num_discard_requests = 1;
	/*
	 * Allow discards to be issued to the thin device even
	 * if the pool's data device doesn't support them.
	 */
	ti->discards_supported = 1;

	dm_put_device(ti, tc->pool_dev);

static int thin_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
	bio->bi_sector -= ti->begin;

	return bio_map(ti, bio, map_context);

/*
 * <nr mapped sectors> <highest mapped sector>
 */
static int thin_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	case STATUSTYPE_INFO:
		r = dm_thin_get_mapped_count(tc->td, &mapped);

		r = dm_thin_get_highest_mapped_block(tc->td, &highest);

		DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			DMEMIT("%llu", ((highest + 1) *
					tc->pool->sectors_per_block) - 1);

	case STATUSTYPE_TABLE:
		       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
		       (unsigned long) tc->dev_id);

static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
	struct thin_c *tc = ti->private;

	return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block, data);

static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
	struct thin_c *tc = ti->private;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);

	/*
	 * Only allow discard requests aligned to our block size, and make
	 * sure that we never get sent larger discard requests either.
	 */
	limits->max_discard_sectors = tc->pool->sectors_per_block;
	limits->discard_granularity = tc->pool->sectors_per_block << SECTOR_SHIFT;

static struct target_type thin_target = {
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.status = thin_status,
	.iterate_devices = thin_iterate_devices,
	.io_hints = thin_io_hints,

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
	r = dm_register_target(&thin_target);

	r = dm_register_target(&pool_target);
		dm_unregister_target(&thin_target);

static void dm_thin_exit(void)
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);

module_init(dm_thin_init);
module_exit(dm_thin_exit);

MODULE_DESCRIPTION(DM_NAME " device-mapper thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");

/*----------------------------------------------------------------*/