/*
 * Copyright (C) 2011 Red Hat UK. All rights reserved.
 *
 * This file is released under the GPL.
 */
7 #include "dm-thin-metadata.h"
9 #include <linux/device-mapper.h>
10 #include <linux/dm-io.h>
11 #include <linux/dm-kcopyd.h>
12 #include <linux/list.h>
13 #include <linux/init.h>
14 #include <linux/module.h>
15 #include <linux/slab.h>
#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * 8)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)
/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. This obviously
 * includes all devices that share this block. (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mappings). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */
/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away. We put them in prison
 * where they can't cause any mischief. Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell. When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;
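
/*
 * A rough usage sketch (mirroring break_sharing() and provision_block()
 * further down; allocation and error handling elided):
 *
 *	struct cell *cell;
 *	struct cell_key key;
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(pool->prison, &key, bio, &cell))
 *		return;		(another bio already holds the cell)
 *	... allocate and prepare the new mapping ...
 *	cell_remap_and_issue(tc, cell, data_block);   or cell_error(cell)
 */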
struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	unsigned count;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};
static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}
/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
149 static struct bio_prison *prison_create(unsigned nr_cells)
151 unsigned i;
152 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
153 size_t len = sizeof(struct bio_prison) +
154 (sizeof(struct hlist_head) * nr_buckets);
155 struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
157 if (!prison)
158 return NULL;
160 spin_lock_init(&prison->lock);
161 prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
162 sizeof(struct cell));
163 prison->nr_buckets = nr_buckets;
164 prison->hash_mask = nr_buckets - 1;
165 prison->cells = (struct hlist_head *) (prison + 1);
166 for (i = 0; i < nr_buckets; i++)
167 INIT_HLIST_HEAD(prison->cells + i);
169 return prison;
172 static void prison_destroy(struct bio_prison *prison)
174 mempool_destroy(prison->cell_pool);
175 kfree(prison);
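
/*
 * The hash only looks at the block number; the dev and virtual fields are
 * left out. BIG_PRIME simply scatters consecutive block numbers across the
 * buckets. The key comparison in __search_bucket() still uses the full
 * cell_key.
 */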
178 static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
180 const unsigned long BIG_PRIME = 4294967291UL;
181 uint64_t hash = key->block * BIG_PRIME;
183 return (uint32_t) (hash & prison->hash_mask);
186 static struct cell *__search_bucket(struct hlist_head *bucket,
187 struct cell_key *key)
189 struct cell *cell;
190 struct hlist_node *tmp;
192 hlist_for_each_entry(cell, tmp, bucket, list)
193 if (!memcmp(&cell->key, key, sizeof(cell->key)))
194 return cell;
196 return NULL;
/*
 * This may block if a new cell needs allocating. You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns the number of entries in the cell prior to the new addition
 * or < 0 on failure.
 */
206 static int bio_detain(struct bio_prison *prison, struct cell_key *key,
207 struct bio *inmate, struct cell **ref)
209 int r;
210 unsigned long flags;
211 uint32_t hash = hash_key(prison, key);
212 struct cell *uninitialized_var(cell), *cell2 = NULL;
214 BUG_ON(hash > prison->nr_buckets);
216 spin_lock_irqsave(&prison->lock, flags);
217 cell = __search_bucket(prison->cells + hash, key);
219 if (!cell) {
221 * Allocate a new cell
223 spin_unlock_irqrestore(&prison->lock, flags);
224 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
225 spin_lock_irqsave(&prison->lock, flags);
228 * We've been unlocked, so we have to double check that
229 * nobody else has inserted this cell in the meantime.
231 cell = __search_bucket(prison->cells + hash, key);
233 if (!cell) {
234 cell = cell2;
235 cell2 = NULL;
237 cell->prison = prison;
238 memcpy(&cell->key, key, sizeof(cell->key));
239 cell->count = 0;
240 bio_list_init(&cell->bios);
241 hlist_add_head(&cell->list, prison->cells + hash);
245 r = cell->count++;
246 bio_list_add(&cell->bios, inmate);
247 spin_unlock_irqrestore(&prison->lock, flags);
249 if (cell2)
250 mempool_free(cell2, prison->cell_pool);
252 *ref = cell;
254 return r;
257 static int bio_detain_if_occupied(struct bio_prison *prison, struct cell_key *key,
258 struct bio *inmate, struct cell **ref)
260 int r;
261 unsigned long flags;
262 uint32_t hash = hash_key(prison, key);
263 struct cell *uninitialized_var(cell);
265 BUG_ON(hash > prison->nr_buckets);
267 spin_lock_irqsave(&prison->lock, flags);
268 cell = __search_bucket(prison->cells + hash, key);
270 if (!cell) {
271 spin_unlock_irqrestore(&prison->lock, flags);
272 return 0;
275 r = cell->count++;
276 bio_list_add(&cell->bios, inmate);
277 spin_unlock_irqrestore(&prison->lock, flags);
279 *ref = cell;
281 return r;
/*
 * @inmates must have been initialised prior to this call.
 */
287 static void __cell_release(struct cell *cell, struct bio_list *inmates)
289 struct bio_prison *prison = cell->prison;
291 hlist_del(&cell->list);
293 if (inmates)
294 bio_list_merge(inmates, &cell->bios);
296 mempool_free(cell, prison->cell_pool);
299 static void cell_release(struct cell *cell, struct bio_list *bios)
301 unsigned long flags;
302 struct bio_prison *prison = cell->prison;
304 spin_lock_irqsave(&prison->lock, flags);
305 __cell_release(cell, bios);
306 spin_unlock_irqrestore(&prison->lock, flags);
309 static void cell_error(struct cell *cell)
311 struct bio_prison *prison = cell->prison;
312 struct bio_list bios;
313 struct bio *bio;
314 unsigned long flags;
316 bio_list_init(&bios);
318 spin_lock_irqsave(&prison->lock, flags);
319 __cell_release(cell, &bios);
320 spin_unlock_irqrestore(&prison->lock, flags);
322 while ((bio = bio_list_pop(&bios)))
323 bio_io_error(bio);
326 /*----------------------------------------------------------------*/
/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed. Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */
struct deferred_set;
struct deferred_entry {
	struct deferred_set *ds;
	unsigned count;
	struct list_head work_items;
};

struct deferred_set {
	spinlock_t lock;
	unsigned current_entry;
	unsigned sweeper;
	struct deferred_entry entries[DEFERRED_SET_SIZE];
};
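
/*
 * Informal summary of the machinery below: ds_inc() takes a reference on
 * the current entry, ds_add_work() queues work behind whatever entries are
 * still referenced, and ds_dec() drops a reference and sweeps any entries
 * that have drained, handing their work items back to the caller.
 */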
349 static void ds_init(struct deferred_set *ds)
351 int i;
353 spin_lock_init(&ds->lock);
354 ds->current_entry = 0;
355 ds->sweeper = 0;
356 for (i = 0; i < DEFERRED_SET_SIZE; i++) {
357 ds->entries[i].ds = ds;
358 ds->entries[i].count = 0;
359 INIT_LIST_HEAD(&ds->entries[i].work_items);
363 static struct deferred_entry *ds_inc(struct deferred_set *ds)
365 unsigned long flags;
366 struct deferred_entry *entry;
368 spin_lock_irqsave(&ds->lock, flags);
369 entry = ds->entries + ds->current_entry;
370 entry->count++;
371 spin_unlock_irqrestore(&ds->lock, flags);
373 return entry;
376 static unsigned ds_next(unsigned index)
378 return (index + 1) % DEFERRED_SET_SIZE;
381 static void __sweep(struct deferred_set *ds, struct list_head *head)
383 while ((ds->sweeper != ds->current_entry) &&
384 !ds->entries[ds->sweeper].count) {
385 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
386 ds->sweeper = ds_next(ds->sweeper);
389 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
390 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
393 static void ds_dec(struct deferred_entry *entry, struct list_head *head)
395 unsigned long flags;
397 spin_lock_irqsave(&entry->ds->lock, flags);
398 BUG_ON(!entry->count);
399 --entry->count;
400 __sweep(entry->ds, head);
401 spin_unlock_irqrestore(&entry->ds->lock, flags);

/*
 * Returns 1 if the job was deferred, or 0 if there are no pending items
 * delaying it.
 */
407 static int ds_add_work(struct deferred_set *ds, struct list_head *work)
409 int r = 1;
410 unsigned long flags;
411 unsigned next_entry;
413 spin_lock_irqsave(&ds->lock, flags);
414 if ((ds->sweeper == ds->current_entry) &&
415 !ds->entries[ds->current_entry].count)
416 r = 0;
417 else {
418 list_add(work, &ds->entries[ds->current_entry].work_items);
419 next_entry = ds_next(ds->current_entry);
420 if (!ds->entries[next_entry].count)
421 ds->current_entry = next_entry;
423 spin_unlock_irqrestore(&ds->lock, flags);
425 return r;
428 /*----------------------------------------------------------------*/

/*
 * Key building.
 */
433 static void build_data_key(struct dm_thin_device *td,
434 dm_block_t b, struct cell_key *key)
436 key->virtual = 0;
437 key->dev = dm_thin_dev_id(td);
438 key->block = b;
441 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
442 struct cell_key *key)
444 key->virtual = 1;
445 key->dev = dm_thin_dev_id(td);
446 key->block = b;
449 /*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
456 struct pool {
457 struct list_head list;
458 struct dm_target *ti; /* Only set if a pool target is bound */
460 struct mapped_device *pool_md;
461 struct dm_pool_metadata *pmd;
463 uint32_t sectors_per_block;
464 unsigned block_shift;
465 dm_block_t offset_mask;
466 dm_block_t low_water_mark;
467 unsigned zero_new_blocks:1;
469 struct bio_prison *prison;
470 struct dm_kcopyd_client *copier;
472 struct workqueue_struct *producer_wq;
473 struct workqueue_struct *consumer_wq;
474 struct work_struct producer;
475 struct work_struct consumer;
477 spinlock_t lock;
478 struct bio_list deferred_bios;
479 struct list_head prepared_mappings;
481 int low_water_triggered; /* A dm event has been sent */
482 struct bio_list retry_list;
484 struct deferred_set ds; /* FIXME: move to thin_c */
486 mempool_t *mapping_pool;
487 mempool_t *endio_hook_pool;
489 atomic_t ref_count;
};

/*
 * Target context for a pool.
 */
495 struct pool_c {
496 struct dm_target *ti;
497 struct pool *pool;
498 struct dm_dev *data_dev;
499 struct dm_dev *metadata_dev;
500 struct dm_target_callbacks callbacks;
502 sector_t low_water_mark;
503 unsigned zero_new_blocks:1;
};

/*
 * Target context for a thin.
 */
509 struct thin_c {
510 struct dm_dev *pool_dev;
511 dm_thin_id dev_id;
513 struct pool *pool;
514 struct dm_thin_device *td;
517 /* FIXME: Can cells and new_mappings be combined? */
519 struct endio_hook {
520 struct thin_c *tc;
521 bio_end_io_t *saved_bi_end_io;
522 struct deferred_entry *entry;
525 struct new_mapping {
526 struct list_head list;
528 int prepared;
530 struct thin_c *tc;
531 dm_block_t virt_block;
532 dm_block_t data_block;
533 struct cell *cell;
534 int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
542 struct bio *bio;
543 bio_end_io_t *saved_bi_end_io;
546 /*----------------------------------------------------------------*/
548 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
549 bio_end_io_t *fn)
551 *save = bio->bi_end_io;
552 bio->bi_end_io = fn;
555 /*----------------------------------------------------------------*/

/*
 * A global list that uses a struct mapped_device as a key.
 */
560 static struct dm_thin_pool_table {
561 spinlock_t lock;
562 struct list_head pools;
563 } dm_thin_pool_table;
565 static void pool_table_init(void)
567 spin_lock_init(&dm_thin_pool_table.lock);
569 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
572 static void pool_table_insert(struct pool *pool)
574 spin_lock(&dm_thin_pool_table.lock);
575 list_add(&pool->list, &dm_thin_pool_table.pools);
576 spin_unlock(&dm_thin_pool_table.lock);
579 static void pool_table_remove(struct pool *pool)
581 spin_lock(&dm_thin_pool_table.lock);
582 list_del(&pool->list);
583 spin_unlock(&dm_thin_pool_table.lock);
586 static struct pool *pool_table_lookup(struct mapped_device *md)
588 struct pool *pool = NULL, *tmp;
590 spin_lock(&dm_thin_pool_table.lock);
591 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list)
592 if (tmp->pool_md == md) {
593 pool = tmp;
594 break;
596 spin_unlock(&dm_thin_pool_table.lock);
598 return pool;
601 /*----------------------------------------------------------------*/

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the
 * thin-pool target. wake_producer() is the most notable exception (it is
 * also used by thin-pool to continue deferred IO processing after pool
 * resume).
 */
611 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
613 return bio->bi_sector >> tc->pool->block_shift;
616 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
618 struct pool *pool = tc->pool;
620 bio->bi_bdev = tc->pool_dev->bdev;
621 bio->bi_sector = (block << pool->block_shift) +
622 (bio->bi_sector & pool->offset_mask);
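
/*
 * Note that remap_and_issue() below commits the pool metadata before
 * issuing a FLUSH/FUA bio: the flush is supposed to make earlier writes
 * durable, and that includes the mappings describing them, which may
 * still only exist in memory.
 */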
625 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
626 dm_block_t block)
628 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
629 int r = dm_pool_commit_metadata(tc->pool->pmd);
630 if (r) {
631 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
632 __func__, r);
633 bio_io_error(bio);
634 return;
638 remap(tc, bio, block);
639 generic_make_request(bio);
642 static void wake_producer(struct pool *pool)
644 queue_work(pool->producer_wq, &pool->producer);
647 static void __maybe_add_mapping(struct new_mapping *m)
649 struct pool *pool = m->tc->pool;
651 if (list_empty(&m->list) && m->prepared) {
652 list_add(&m->list, &pool->prepared_mappings);
653 queue_work(pool->consumer_wq, &pool->consumer);
657 static void copy_complete(int read_err, unsigned long write_err, void *context)
659 unsigned long flags;
660 struct new_mapping *m = context;
661 struct pool *pool = m->tc->pool;
663 m->err = read_err || write_err ? -EIO : 0;
665 spin_lock_irqsave(&pool->lock, flags);
666 m->prepared = 1;
667 __maybe_add_mapping(m);
668 spin_unlock_irqrestore(&pool->lock, flags);
671 static void overwrite_endio(struct bio *bio, int err)
673 unsigned long flags;
674 struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
675 struct pool *pool = m->tc->pool;
677 m->err = err;
679 spin_lock_irqsave(&pool->lock, flags);
680 m->prepared = 1;
681 __maybe_add_mapping(m);
682 spin_unlock_irqrestore(&pool->lock, flags);
685 static void shared_read_endio(struct bio *bio, int err)
687 struct list_head mappings;
688 struct new_mapping *m, *tmp;
689 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
690 unsigned long flags;
691 struct pool *pool = h->tc->pool;
693 bio->bi_end_io = h->saved_bi_end_io;
694 bio_endio(bio, err);
696 INIT_LIST_HEAD(&mappings);
697 ds_dec(h->entry, &mappings);
699 spin_lock_irqsave(&pool->lock, flags);
700 list_for_each_entry_safe(m, tmp, &mappings, list) {
701 list_del(&m->list);
702 INIT_LIST_HEAD(&m->list);
703 __maybe_add_mapping(m);
705 spin_unlock_irqrestore(&pool->lock, flags);
707 mempool_free(h, pool->endio_hook_pool);
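
/*
 * Returns non-zero when the bio starts on a block boundary and spans
 * exactly one pool block, i.e. the whole block will be overwritten and
 * any copy or zero step can be skipped.
 */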
710 static int io_covers_block(struct pool *pool, struct bio *bio)
712 return ((bio->bi_sector & pool->offset_mask) == 0) &&
713 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
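
/*
 * schedule_copy() prepares a new_mapping for virt_block -> data_dest and
 * registers it with the deferred set so it isn't inserted while reads to
 * the shared block are still in flight. A bio that covers the whole block
 * is hooked and written straight to the new location; otherwise kcopyd
 * copies the old block first and the held bios are released once the
 * mapping has been inserted.
 */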
716 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
717 dm_block_t data_origin, dm_block_t data_dest,
718 struct cell *cell, struct bio *bio)
720 int r;
721 struct pool *pool = tc->pool;
722 struct new_mapping *m = mempool_alloc(pool->mapping_pool, GFP_NOIO);
724 INIT_LIST_HEAD(&m->list);
725 m->prepared = 0;
726 m->tc = tc;
727 m->virt_block = virt_block;
728 m->data_block = data_dest;
729 m->cell = cell;
730 m->err = 0;
731 m->bio = NULL;
733 ds_add_work(&pool->ds, &m->list);

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
741 if (io_covers_block(pool, bio)) {
742 m->bio = bio;
743 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
744 dm_get_mapinfo(bio)->ptr = m;
745 remap_and_issue(tc, bio, data_dest);
746 } else {
747 struct dm_io_region from, to;
749 from.bdev = tc->pool_dev->bdev;
750 from.sector = data_origin * pool->sectors_per_block;
751 from.count = pool->sectors_per_block;
753 to.bdev = tc->pool_dev->bdev;
754 to.sector = data_dest * pool->sectors_per_block;
755 to.count = pool->sectors_per_block;
757 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
758 0, copy_complete, m);
759 if (r < 0) {
760 mempool_free(m, pool->mapping_pool);
761 DMERR("dm_kcopyd_copy() failed");
762 cell_error(cell);
767 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
768 dm_block_t data_block, struct cell *cell,
769 struct bio *bio)
771 struct pool *pool = tc->pool;
772 struct new_mapping *m = mempool_alloc(pool->mapping_pool, GFP_NOIO);
774 INIT_LIST_HEAD(&m->list);
775 m->prepared = 0;
776 m->tc = tc;
777 m->virt_block = virt_block;
778 m->data_block = data_block;
779 m->cell = cell;
780 m->err = 0;
781 m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
788 if (!pool->zero_new_blocks || io_covers_block(pool, bio)) {
789 m->bio = bio;
790 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
791 dm_get_mapinfo(bio)->ptr = m;
792 remap_and_issue(tc, bio, data_block);
793 } else {
794 int r;
795 struct dm_io_region to;
797 to.bdev = tc->pool_dev->bdev;
798 to.sector = data_block * pool->sectors_per_block;
799 to.count = pool->sectors_per_block;
801 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
802 if (r < 0) {
803 mempool_free(m, pool->mapping_pool);
804 DMERR("dm_kcopyd_zero() failed");
805 cell_error(cell);
810 static void cell_remap_and_issue(struct thin_c *tc, struct cell *cell,
811 dm_block_t data_block)
813 struct bio_list bios;
814 struct bio *bio;
816 bio_list_init(&bios);
817 cell_release(cell, &bios);
819 while ((bio = bio_list_pop(&bios)))
820 remap_and_issue(tc, bio, data_block);
823 static void cell_remap_and_issue_except(struct thin_c *tc, struct cell *cell,
824 dm_block_t data_block,
825 struct bio *exception)
827 struct bio_list bios;
828 struct bio *bio;
830 bio_list_init(&bios);
831 cell_release(cell, &bios);
833 while ((bio = bio_list_pop(&bios)))
834 if (bio != exception)
835 remap_and_issue(tc, bio, data_block);
838 static void retry_later(struct bio *bio)
840 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
841 struct pool *pool = tc->pool;
842 unsigned long flags;
844 spin_lock_irqsave(&pool->lock, flags);
845 bio_list_add(&pool->retry_list, bio);
846 spin_unlock_irqrestore(&pool->lock, flags);
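
/*
 * Allocates a new data block for the thin device. When free space drops
 * to the pool's low water mark a dm event is raised (once, until the pool
 * is resumed) so that userspace can extend the data device.
 */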
849 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
851 int r;
852 dm_block_t free_blocks;
853 unsigned long flags;
854 struct pool *pool = tc->pool;
856 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
857 if (r)
858 return r;
860 if (free_blocks <= pool->low_water_mark && !pool->low_water_triggered) {
861 spin_lock_irqsave(&pool->lock, flags);
862 pool->low_water_triggered = 1;
863 spin_unlock_irqrestore(&pool->lock, flags);
864 dm_table_event(pool->ti->table);
867 r = dm_pool_alloc_data_block(pool->pmd, result);
868 if (r)
869 return r;
871 return 0;
874 static void process_discard(struct thin_c *tc, struct bio *bio)
876 int r;
877 dm_block_t block = get_bio_block(tc, bio);
878 struct dm_thin_lookup_result lookup_result;
880 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
881 switch (r) {
882 case 0:
		if (lookup_result.shared)
			/*
			 * We just ignore shared discards for now, these
			 * are hard, and I want to get deferred
			 * deallocation working first.
			 */
			bio_endio(bio, 0);

		else {
892 r = dm_thin_remove_block(tc->td, block);
893 if (r) {
894 DMERR("dm_thin_remove_block() failed");
895 bio_io_error(bio);
896 } else
897 remap_and_issue(tc, bio, lookup_result.block);
899 break;
	case -ENODATA:
		/*
		 * Either this isn't provisioned, or preparation for
		 * provisioning may be pending (we could find out by
		 * calling bio_detain_if_occupied). But even in this case
		 * it's easier to just forget the discard.
		 */
		bio_endio(bio, 0);
909 break;
911 default:
912 DMERR("dm_thin_find_block() failed, error = %d", r);
913 bio_io_error(bio);
914 break;
918 static void no_space(struct cell *cell)
920 struct bio *bio;
921 struct bio_list bios;
923 bio_list_init(&bios);
924 cell_release(cell, &bios);
926 while ((bio = bio_list_pop(&bios)))
927 retry_later(bio);
930 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
931 struct cell_key *key,
932 struct dm_thin_lookup_result *lookup_result)
934 int r;
935 dm_block_t data_block;
936 struct cell *cell;
938 bio_detain(tc->pool->prison, key, bio, &cell);
940 r = alloc_data_block(tc, &data_block);
941 switch (r) {
942 case 0:
943 schedule_copy(tc, block, lookup_result->block,
944 data_block, cell, bio);
945 break;
947 case -ENOSPC:
948 no_space(cell);
949 break;
951 default:
952 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
953 cell_error(cell);
954 break;
958 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
959 dm_block_t block,
960 struct dm_thin_lookup_result *lookup_result)
962 struct cell *cell;
963 struct cell_key key;
964 struct pool *pool = tc->pool;

	/*
	 * If the cell is already occupied, then sharing is already in the
	 * process of being broken, so we have nothing further to do here.
	 */
971 build_data_key(tc->td, lookup_result->block, &key);
972 if (bio_detain_if_occupied(pool->prison, &key, bio, &cell))
973 return;
975 if (bio_data_dir(bio) == WRITE)
976 break_sharing(tc, bio, block, &key, lookup_result);
977 else {
978 struct endio_hook *h;
979 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
981 h->tc = tc;
982 h->entry = ds_inc(&pool->ds);
983 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
984 dm_get_mapinfo(bio)->ptr = h;
985 remap_and_issue(tc, bio, lookup_result->block);
989 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block)
991 int r;
992 dm_block_t data_block;
993 struct cell *cell;
994 struct cell_key key;

	/*
	 * If the cell is already occupied, then the block is already being
	 * provisioned, so we have nothing further to do here.
	 */
1000 build_virtual_key(tc->td, block, &key);
1001 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1002 return;
1004 r = alloc_data_block(tc, &data_block);
1005 switch (r) {
1006 case 0:
1007 schedule_zero(tc, block, data_block, cell, bio);
1008 break;
1010 case -ENOSPC:
1011 no_space(cell);
1012 break;
1014 default:
1015 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1016 cell_error(cell);
1017 break;
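
/*
 * process_bio() handles a deferred non-discard bio: a mapped, unshared
 * block is simply remapped; a shared block goes through the break-sharing
 * path; an unmapped block is zero-filled for reads or provisioned for
 * writes.
 */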
1021 static void process_bio(struct thin_c *tc, struct bio *bio)
1023 int r;
1024 dm_block_t block = get_bio_block(tc, bio);
1025 struct dm_thin_lookup_result lookup_result;
1027 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1028 switch (r) {
1029 case 0:
1030 if (lookup_result.shared)
1031 process_shared_bio(tc, bio, block, &lookup_result);
1032 else
1033 remap_and_issue(tc, bio, lookup_result.block);
1034 break;
1036 case -ENODATA:
		/*
		 * When reading, we return zeroes regardless of the
		 * zero_new_blocks setting.
		 */
1041 if (bio_data_dir(bio) == READ) {
1042 zero_fill_bio(bio);
1043 bio_endio(bio, 0);
1044 } else
1045 provision_block(tc, bio, block);
1046 break;
1048 default:
1049 DMERR("dm_thin_find_block() failed, error = %d", r);
1050 bio_io_error(bio);
1051 break;
1055 static void process_deferred_bios(struct pool *pool)
1057 unsigned long flags;
1058 struct bio *bio;
1059 struct bio_list bios;
1061 bio_list_init(&bios);
1063 spin_lock_irqsave(&pool->lock, flags);
1064 bio_list_merge(&bios, &pool->deferred_bios);
1065 bio_list_init(&pool->deferred_bios);
1066 spin_unlock_irqrestore(&pool->lock, flags);
1068 while ((bio = bio_list_pop(&bios))) {
1069 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1071 if (bio->bi_rw & REQ_DISCARD)
1072 process_discard(tc, bio);
1073 else
1074 process_bio(tc, bio);
1078 static void process_prepared_mapping(struct new_mapping *m)
1080 struct thin_c *tc = m->tc;
1081 struct bio *bio;
1082 int r;
1084 if (m->err) {
1085 cell_error(m->cell);
1086 return;
1089 bio = m->bio;
1090 if (bio)
1091 bio->bi_end_io = m->saved_bi_end_io;
1093 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
1094 if (r) {
1095 DMERR("dm_thin_insert_block() failed");
1096 cell_error(m->cell);
1097 return;
1100 if (bio) {
1101 cell_remap_and_issue_except(tc, m->cell, m->data_block, bio);
1102 bio_endio(bio, 0);
1103 } else
1104 cell_remap_and_issue(tc, m->cell, m->data_block);
1106 list_del(&m->list);
1107 mempool_free(m, tc->pool->mapping_pool);
1110 static void process_prepared_mappings(struct pool *pool)
1112 unsigned long flags;
1113 struct list_head maps;
1114 struct new_mapping *m, *tmp;
1116 INIT_LIST_HEAD(&maps);
1117 spin_lock_irqsave(&pool->lock, flags);
1118 list_splice_init(&pool->prepared_mappings, &maps);
1119 spin_unlock_irqrestore(&pool->lock, flags);
1121 list_for_each_entry_safe(m, tmp, &maps, list)
1122 process_prepared_mapping(m);
1125 static void do_producer(struct work_struct *ws)
1127 struct pool *pool = container_of(ws, struct pool, producer);
1129 process_deferred_bios(pool);
1132 static void do_consumer(struct work_struct *ws)
1134 struct pool *pool = container_of(ws, struct pool, consumer);
1136 process_prepared_mappings(pool);
1139 static void defer_bio(struct thin_c *tc, struct bio *bio)
1141 unsigned long flags;
1142 struct pool *pool = tc->pool;
1144 spin_lock_irqsave(&pool->lock, flags);
1145 bio_list_add(&pool->deferred_bios, bio);
1146 spin_unlock_irqrestore(&pool->lock, flags);
1148 wake_producer(pool);
/*
 * Non-blocking function designed to be called from the target's map
 * function.
 */
1155 static int bio_map(struct dm_target *ti, struct bio *bio,
1156 union map_info *map_context)
1158 int r;
1159 struct thin_c *tc = ti->private;
1160 dm_block_t block = get_bio_block(tc, bio);
1161 struct dm_thin_device *td = tc->td;
1162 struct pool *pool = tc->pool;
1163 struct dm_thin_lookup_result result;

	/*
	 * FIXME(hch): In theory higher level code should prevent this
	 * from happening, not sure why we ever get here.
	 */
1169 if ((bio->bi_rw & REQ_DISCARD) &&
1170 bio->bi_size < (pool->sectors_per_block << SECTOR_SHIFT)) {
1171 DMERR("discard IO smaller than pool block size (%llu)",
1172 (unsigned long long)pool->sectors_per_block << SECTOR_SHIFT);
1173 bio_endio(bio, 0);
1174 return DM_MAPIO_SUBMITTED;

	/*
	 * Save the thin context for easy access from the deferred bio later.
	 */
1180 map_context->ptr = tc;
1182 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1183 defer_bio(tc, bio);
1184 return DM_MAPIO_SUBMITTED;
1187 r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
1192 switch (r) {
1193 case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap. You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant; the
			 * shared flag will be set in their case.
			 */
1209 defer_bio(tc, bio);
1210 r = DM_MAPIO_SUBMITTED;
1211 } else {
1212 remap(tc, bio, result.block);
1213 r = DM_MAPIO_REMAPPED;
1215 break;
	case -ENODATA:
		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 *
		 * When reading, we return zeroes regardless of the
		 * zero_new_blocks setting.
		 */
1225 if (bio_data_dir(bio) == READ) {
1226 zero_fill_bio(bio);
1227 bio_endio(bio, 0);
1228 } else
1229 defer_bio(tc, bio);
1230 r = DM_MAPIO_SUBMITTED;
1231 break;
1233 case -EWOULDBLOCK:
1234 defer_bio(tc, bio);
1235 r = DM_MAPIO_SUBMITTED;
1236 break;
1239 return r;
1242 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1244 int r;
1245 unsigned long flags;
1246 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1248 spin_lock_irqsave(&pt->pool->lock, flags);
1249 r = !bio_list_empty(&pt->pool->retry_list);
1250 spin_unlock_irqrestore(&pt->pool->lock, flags);
1252 if (!r) {
1253 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1254 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1257 return r;
1260 static void __requeue_bios(struct pool *pool)
1262 bio_list_merge(&pool->deferred_bios, &pool->retry_list);
1263 bio_list_init(&pool->retry_list);
1266 /*----------------------------------------------------------------
1267 * Binding of control targets to a pool object
1268 *--------------------------------------------------------------*/
1269 /* FIXME: add locking */
1270 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1272 struct pool_c *pt = ti->private;
1274 pool->ti = ti;
1275 pool->low_water_mark = dm_sector_div_up(pt->low_water_mark,
1276 pool->sectors_per_block);
1277 pool->zero_new_blocks = pt->zero_new_blocks;
1278 dm_pool_rebind_metadata_device(pool->pmd, pt->metadata_dev->bdev);
1280 return 0;
1283 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1285 if (pool->ti == ti)
1286 pool->ti = NULL;
1289 /*----------------------------------------------------------------
1290 * Pool creation
1291 *--------------------------------------------------------------*/
1292 static void pool_destroy(struct pool *pool)
1294 if (dm_pool_metadata_close(pool->pmd) < 0)
1295 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1297 prison_destroy(pool->prison);
1298 dm_kcopyd_client_destroy(pool->copier);
1300 if (pool->producer_wq)
1301 destroy_workqueue(pool->producer_wq);
1303 if (pool->consumer_wq)
1304 destroy_workqueue(pool->consumer_wq);
1306 mempool_destroy(pool->mapping_pool);
1307 mempool_destroy(pool->endio_hook_pool);
1308 kfree(pool);
1311 static struct pool *pool_create(struct block_device *metadata_dev,
1312 unsigned long block_size, char **error)
1314 int r;
1315 void *err_p;
1316 struct pool *pool;
1317 struct dm_pool_metadata *pmd;
1319 pmd = dm_pool_metadata_open(metadata_dev, block_size);
1320 if (IS_ERR(pmd)) {
1321 *error = "Error creating metadata object";
1322 return (struct pool *)pmd;
1325 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1326 if (!pool) {
1327 *error = "Error allocating memory for pool";
1328 err_p = ERR_PTR(-ENOMEM);
1329 goto bad_pool;
1332 pool->pmd = pmd;
1333 pool->sectors_per_block = block_size;
1334 pool->block_shift = ffs(block_size) - 1;
1335 pool->offset_mask = block_size - 1;
1336 pool->low_water_mark = 0;
1337 pool->zero_new_blocks = 1;
1338 pool->prison = prison_create(PRISON_CELLS);
1339 if (!pool->prison) {
1340 *error = "Error creating pool's bio prison";
1341 err_p = ERR_PTR(-ENOMEM);
1342 goto bad_prison;
1345 pool->copier = dm_kcopyd_client_create();
1346 if (IS_ERR(pool->copier)) {
1347 r = PTR_ERR(pool->copier);
1348 *error = "Error creating pool's kcopyd client";
1349 err_p = ERR_PTR(r);
1350 goto bad_kcopyd_client;

	/*
	 * Create singlethreaded workqueues that will service all devices
	 * that use this metadata.
	 */
1357 pool->producer_wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX "-producer",
1358 WQ_MEM_RECLAIM);
1359 if (!pool->producer_wq) {
1360 *error = "Error creating pool's producer workqueue";
1361 err_p = ERR_PTR(-ENOMEM);
1362 goto bad_producer_wq;
1365 pool->consumer_wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX "-consumer",
1366 WQ_MEM_RECLAIM);
1367 if (!pool->consumer_wq) {
1368 *error = "Error creating pool's consumer workqueue";
1369 err_p = ERR_PTR(-ENOMEM);
1370 goto bad_consumer_wq;
1373 INIT_WORK(&pool->producer, do_producer);
1374 INIT_WORK(&pool->consumer, do_consumer);
1375 spin_lock_init(&pool->lock);
1376 bio_list_init(&pool->deferred_bios);
1377 INIT_LIST_HEAD(&pool->prepared_mappings);
1378 pool->low_water_triggered = 0;
1379 bio_list_init(&pool->retry_list);
1380 ds_init(&pool->ds);
1382 pool->mapping_pool =
1383 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
1384 if (!pool->mapping_pool) {
1385 *error = "Error creating pool's mapping mempool";
1386 err_p = ERR_PTR(-ENOMEM);
1387 goto bad_mapping_pool;
1390 pool->endio_hook_pool =
1391 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
1392 if (!pool->endio_hook_pool) {
1393 *error = "Error creating pool's endio_hook mempool";
1394 err_p = ERR_PTR(-ENOMEM);
1395 goto bad_endio_hook_pool;
1397 atomic_set(&pool->ref_count, 1);
1399 return pool;
1401 bad_endio_hook_pool:
1402 mempool_destroy(pool->mapping_pool);
1403 bad_mapping_pool:
1404 destroy_workqueue(pool->consumer_wq);
1405 bad_consumer_wq:
1406 destroy_workqueue(pool->producer_wq);
1407 bad_producer_wq:
1408 dm_kcopyd_client_destroy(pool->copier);
1409 bad_kcopyd_client:
1410 prison_destroy(pool->prison);
1411 bad_prison:
1412 kfree(pool);
1413 bad_pool:
1414 if (dm_pool_metadata_close(pmd))
1415 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1417 return err_p;
1420 static void pool_inc(struct pool *pool)
1422 atomic_inc(&pool->ref_count);
1425 static void pool_dec(struct pool *pool)
1427 if (atomic_dec_and_test(&pool->ref_count))
1428 pool_destroy(pool);
1431 static struct pool *pool_find(struct mapped_device *pool_md,
1432 struct block_device *metadata_dev,
1433 unsigned long block_size,
1434 char **error)
1436 struct pool *pool;
1438 pool = pool_table_lookup(pool_md);
1439 if (pool)
1440 pool_inc(pool);
1441 else
1442 pool = pool_create(metadata_dev, block_size, error);
1444 return pool;
1447 /*----------------------------------------------------------------
1448 * Pool target methods
1449 *--------------------------------------------------------------*/
1450 static void pool_dtr(struct dm_target *ti)
1452 struct pool_c *pt = ti->private;
1454 dm_put_device(ti, pt->metadata_dev);
1455 dm_put_device(ti, pt->data_dev);
1456 unbind_control_target(pt->pool, ti);
1457 pool_dec(pt->pool);
1458 kfree(pt);
1461 struct pool_features {
1462 unsigned zero_new_blocks:1;
1465 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1466 struct dm_target *ti)
1468 int r;
1469 unsigned argc;
1470 const char *arg_name;
1472 static struct dm_arg _args[] = {
1473 {0, 1, "Invalid number of pool feature arguments"},
	};

	/*
	 * No feature arguments supplied.
	 */
1479 if (!as->argc)
1480 return 0;
1482 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1483 if (r)
1484 return -EINVAL;
1486 while (argc && !r) {
1487 arg_name = dm_shift_arg(as);
1488 argc--;
1490 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1491 pf->zero_new_blocks = 0;
1492 continue;
1495 ti->error = "Unrecognised pool feature requested";
1496 r = -EINVAL;
1499 return r;
/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (sectors)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 */
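
/*
 * For example (illustrative values only), a 64KiB block size with a low
 * water mark of 16MiB might be loaded as:
 *
 *   dmsetup create pool \
 *	--table "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768"
 */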
1511 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1513 int r;
1514 struct pool_c *pt;
1515 struct pool *pool;
1516 struct pool_features pf;
1517 struct dm_arg_set as;
1518 struct dm_dev *data_dev;
1519 unsigned long block_size;
1520 dm_block_t low_water;
1521 struct dm_dev *metadata_dev;
1522 sector_t metadata_dev_size;
1524 if (argc < 4) {
1525 ti->error = "Invalid argument count";
1526 return -EINVAL;
1528 as.argc = argc;
1529 as.argv = argv;
1531 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1532 if (r) {
1533 ti->error = "Error opening metadata block device";
1534 return r;
1537 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1538 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
1539 ti->error = "Metadata device is too large";
1540 r = -EINVAL;
1541 goto out_metadata;
1544 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1545 if (r) {
1546 ti->error = "Error getting data device";
1547 goto out_metadata;
1550 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1551 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1552 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1553 !is_power_of_2(block_size)) {
1554 ti->error = "Invalid block size";
1555 r = -EINVAL;
1556 goto out;
1559 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water) ||
1560 !low_water) {
1561 ti->error = "Invalid low water mark";
1562 r = -EINVAL;
1563 goto out;

	/*
	 * Set default pool features.
	 */
1569 memset(&pf, 0, sizeof(pf));
1570 pf.zero_new_blocks = 1;
1572 dm_consume_args(&as, 4);
1573 r = parse_pool_features(&as, &pf, ti);
1574 if (r)
1575 goto out;
1577 pool = pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1578 block_size, &ti->error);
1579 if (IS_ERR(pool)) {
1580 r = PTR_ERR(pool);
1581 goto out;
1584 pt = kmalloc(sizeof(*pt), GFP_KERNEL);
1585 if (!pt) {
1586 pool_destroy(pool);
1587 r = -ENOMEM;
1588 goto out;
1590 pt->pool = pool;
1591 pt->ti = ti;
1592 pt->metadata_dev = metadata_dev;
1593 pt->data_dev = data_dev;
1594 pt->low_water_mark = low_water;
1595 pt->zero_new_blocks = pf.zero_new_blocks;
1596 ti->num_flush_requests = 1;
1597 ti->num_discard_requests = 1;
1598 ti->private = pt;
1600 pt->callbacks.congested_fn = pool_is_congested;
1601 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1603 return 0;
1605 out:
1606 dm_put_device(ti, data_dev);
1607 out_metadata:
1608 dm_put_device(ti, metadata_dev);
1610 return r;
1613 static int pool_map(struct dm_target *ti, struct bio *bio,
1614 union map_info *map_context)
1616 int r;
1617 struct pool_c *pt = ti->private;
1618 struct pool *pool = pt->pool;
1619 unsigned long flags;
1621 spin_lock_irqsave(&pool->lock, flags);
1622 bio->bi_bdev = pt->data_dev->bdev;
1623 r = DM_MAPIO_REMAPPED;
1624 spin_unlock_irqrestore(&pool->lock, flags);
1626 return r;
/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
1640 static int pool_preresume(struct dm_target *ti)
1642 int r;
1643 struct pool_c *pt = ti->private;
1644 struct pool *pool = pt->pool;
1645 dm_block_t data_size, sb_data_size;
1646 unsigned long flags;

	/*
	 * Take control of the pool object.
	 */
1651 r = bind_control_target(pool, ti);
1652 if (r)
1653 return r;
1655 data_size = ti->len >> pool->block_shift;
1656 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
1657 if (r) {
1658 DMERR("failed to retrieve data device size");
1659 return r;
1662 if (data_size < sb_data_size) {
1663 DMERR("pool target too small, is %llu blocks (expected %llu)",
1664 data_size, sb_data_size);
1665 return -EINVAL;
1667 } else if (data_size > sb_data_size) {
1668 r = dm_pool_resize_data_dev(pool->pmd, data_size);
1669 if (r) {
1670 DMERR("failed to resize data device");
1671 return r;
1674 r = dm_pool_commit_metadata(pool->pmd);
1675 if (r) {
1676 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1677 __func__, r);
1678 return r;
1682 spin_lock_irqsave(&pool->lock, flags);
1683 pool->low_water_triggered = 0;
1684 __requeue_bios(pool);
1685 spin_unlock_irqrestore(&pool->lock, flags);
1687 wake_producer(pool);

	/*
	 * The pool object is only present if the pool is active.
	 */
1692 pool->pool_md = dm_table_get_md(ti->table);
1693 pool_table_insert(pool);
1695 return 0;
1698 static void pool_postsuspend(struct dm_target *ti)
1700 int r;
1701 struct pool_c *pt = ti->private;
1702 struct pool *pool = pt->pool;
1704 flush_workqueue(pool->producer_wq);
1705 flush_workqueue(pool->consumer_wq);
1707 r = dm_pool_commit_metadata(pool->pmd);
1708 if (r < 0) {
1709 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1710 __func__, r);
1711 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
1714 pool_table_remove(pool);
1715 pool->pool_md = NULL;
1718 static int check_arg_count(unsigned argc, unsigned args_required)
1720 if (argc != args_required) {
1721 DMWARN("Message received with %u arguments instead of %u.",
1722 argc, args_required);
1723 return -EINVAL;
1726 return 0;
1729 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
1731 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
1732 *dev_id <= MAX_DEV_ID)
1733 return 0;
1735 if (warning)
1736 DMWARN("Message received with invalid device id: %s", arg);
1738 return -EINVAL;
1741 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
1743 dm_thin_id dev_id;
1744 int r;
1746 r = check_arg_count(argc, 2);
1747 if (r)
1748 return r;
1750 r = read_dev_id(argv[1], &dev_id, 1);
1751 if (r)
1752 return r;
1754 r = dm_pool_create_thin(pool->pmd, dev_id);
1755 if (r) {
1756 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
1757 argv[1]);
1758 return r;
1761 return 0;
1764 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
1766 dm_thin_id dev_id;
1767 dm_thin_id origin_dev_id;
1768 int r;
1770 r = check_arg_count(argc, 3);
1771 if (r)
1772 return r;
1774 r = read_dev_id(argv[1], &dev_id, 1);
1775 if (r)
1776 return r;
1778 r = read_dev_id(argv[2], &origin_dev_id, 1);
1779 if (r)
1780 return r;
1782 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
1783 if (r) {
1784 DMWARN("Creation of new snapshot %s of device %s failed.",
1785 argv[1], argv[2]);
1786 return r;
1789 return 0;
1792 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
1794 dm_thin_id dev_id;
1795 int r;
1797 r = check_arg_count(argc, 2);
1798 if (r)
1799 return r;
1801 r = read_dev_id(argv[1], &dev_id, 1);
1802 if (r)
1803 return r;
1805 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
1806 if (r)
1807 DMWARN("Deletion of thin device %s failed.", argv[1]);
1809 return r;
1812 static int process_trim_mesg(unsigned argc, char **argv, struct pool *pool)
1814 dm_thin_id dev_id;
1815 sector_t new_size;
1816 int r;
1818 r = check_arg_count(argc, 3);
1819 if (r)
1820 return r;
1822 r = read_dev_id(argv[1], &dev_id, 1);
1823 if (r)
1824 return r;
1826 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_size)) {
1827 DMWARN("trim device %s: Invalid new size: %s sectors.",
1828 argv[1], argv[2]);
1829 return -EINVAL;
1832 r = dm_pool_trim_thin_device(pool->pmd, dev_id,
1833 dm_sector_div_up(new_size, pool->sectors_per_block));
1834 if (r)
1835 DMWARN("Attempt to trim thin device %s failed.", argv[1]);
1837 return r;
1840 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
1842 dm_thin_id old_id, new_id;
1843 int r;
1845 r = check_arg_count(argc, 3);
1846 if (r)
1847 return r;
1849 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
1850 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
1851 return -EINVAL;
1854 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
1855 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
1856 return -EINVAL;
1859 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
1860 if (r) {
1861 DMWARN("Failed to change transaction id from %s to %s.",
1862 argv[1], argv[2]);
1863 return r;
1866 return 0;
/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   trim		<dev_id> <new_size_in_sectors>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 */
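
/*
 * For example (illustrative only):
 *
 *   dmsetup message /dev/mapper/pool 0 create_thin 0
 *   dmsetup message /dev/mapper/pool 0 create_snap 1 0
 */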
1877 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
1879 int r = -EINVAL;
1880 struct pool_c *pt = ti->private;
1881 struct pool *pool = pt->pool;
1883 if (!strcasecmp(argv[0], "create_thin"))
1884 r = process_create_thin_mesg(argc, argv, pool);
1886 else if (!strcasecmp(argv[0], "create_snap"))
1887 r = process_create_snap_mesg(argc, argv, pool);
1889 else if (!strcasecmp(argv[0], "delete"))
1890 r = process_delete_mesg(argc, argv, pool);
1892 else if (!strcasecmp(argv[0], "trim"))
1893 r = process_trim_mesg(argc, argv, pool);
1895 else if (!strcasecmp(argv[0], "set_transaction_id"))
1896 r = process_set_transaction_id_mesg(argc, argv, pool);
1898 else
1899 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
1901 if (!r) {
1902 r = dm_pool_commit_metadata(pool->pmd);
1903 if (r)
1904 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
1905 argv[0], r);
1908 return r;
/*
 * Status line is:
 *    <transaction id> <free metadata space in sectors>
 *    <free data space in sectors> <held metadata root>
 */
1916 static int pool_status(struct dm_target *ti, status_type_t type,
1917 char *result, unsigned maxlen)
1919 int r;
1920 unsigned sz = 0;
1921 uint64_t transaction_id;
1922 dm_block_t nr_free_blocks_data;
1923 dm_block_t nr_free_blocks_metadata;
1924 dm_block_t held_root;
1925 char buf[BDEVNAME_SIZE];
1926 char buf2[BDEVNAME_SIZE];
1927 struct pool_c *pt = ti->private;
1928 struct pool *pool = pt->pool;
1930 switch (type) {
1931 case STATUSTYPE_INFO:
1932 r = dm_pool_get_metadata_transaction_id(pool->pmd,
1933 &transaction_id);
1934 if (r)
1935 return r;
1937 r = dm_pool_get_free_metadata_block_count(pool->pmd,
1938 &nr_free_blocks_metadata);
1939 if (r)
1940 return r;
1942 r = dm_pool_get_free_block_count(pool->pmd,
1943 &nr_free_blocks_data);
1944 if (r)
1945 return r;
1947 r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
1948 if (r)
1949 return r;
1951 DMEMIT("%llu %llu %llu ", (unsigned long long)transaction_id,
1952 (unsigned long long)nr_free_blocks_metadata * pool->sectors_per_block,
1953 (unsigned long long)nr_free_blocks_data * pool->sectors_per_block);
1955 if (held_root)
1956 DMEMIT("%llu", held_root);
1957 else
1958 DMEMIT("-");
1960 break;
1962 case STATUSTYPE_TABLE:
1963 DMEMIT("%s %s %lu %llu ",
1964 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
1965 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
1966 (unsigned long)pool->sectors_per_block,
1967 (unsigned long long)pt->low_water_mark);
1969 DMEMIT("%u ", !pool->zero_new_blocks);
1971 if (!pool->zero_new_blocks)
1972 DMEMIT("skip_block_zeroing ");
1973 break;
1976 return 0;
1979 static int pool_iterate_devices(struct dm_target *ti,
1980 iterate_devices_callout_fn fn, void *data)
1982 struct pool_c *pt = ti->private;
1984 return fn(ti, pt->data_dev, 0, ti->len, data);
1987 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
1988 struct bio_vec *biovec, int max_size)
1990 struct pool_c *pt = ti->private;
1991 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1993 if (!q->merge_bvec_fn)
1994 return max_size;
1996 bvm->bi_bdev = pt->data_dev->bdev;
1998 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2001 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2003 struct pool_c *pt = ti->private;
2004 struct pool *pool = pt->pool;
2006 blk_limits_io_min(limits, 0);
2007 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2010 static struct target_type pool_target = {
2011 .name = "thin-pool",
2012 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE,
2013 .version = {1, 0, 0},
2014 .module = THIS_MODULE,
2015 .ctr = pool_ctr,
2016 .dtr = pool_dtr,
2017 .map = pool_map,
2018 .postsuspend = pool_postsuspend,
2019 .preresume = pool_preresume,
2020 .message = pool_message,
2021 .status = pool_status,
2022 .merge = pool_merge,
2023 .iterate_devices = pool_iterate_devices,
2024 .io_hints = pool_io_hints,
2027 /*----------------------------------------------------------------*/
2029 static void thin_dtr(struct dm_target *ti)
2031 struct thin_c *tc = ti->private;
2033 pool_dec(tc->pool);
2034 dm_pool_close_thin_device(tc->td);
2035 dm_put_device(ti, tc->pool_dev);
2036 kfree(tc);
/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id>
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 */
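
/*
 * For example (illustrative only), activating thin device 0 as a 1GiB
 * volume:
 *
 *   dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
 */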
2047 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2049 int r;
2050 struct thin_c *tc;
2051 struct dm_dev *pool_dev;
2052 struct mapped_device *pool_md;
2054 if (argc != 2) {
2055 ti->error = "Invalid argument count";
2056 return -EINVAL;
2059 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2060 if (!tc) {
2061 ti->error = "Out of memory";
2062 return -ENOMEM;
2065 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2066 if (r) {
2067 ti->error = "Error opening pool device";
2068 goto bad_pool_dev;
2070 tc->pool_dev = pool_dev;
2072 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2073 ti->error = "Invalid device id";
2074 r = -EINVAL;
2075 goto bad_common;
2078 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2079 if (!pool_md) {
2080 ti->error = "Couldn't get pool mapped device";
2081 r = -EINVAL;
2082 goto bad_common;
2085 tc->pool = pool_table_lookup(pool_md);
2086 if (!tc->pool) {
2087 ti->error = "Couldn't find pool object";
2088 r = -EINVAL;
2089 goto bad_pool_lookup;
2091 pool_inc(tc->pool);
2092 dm_put(pool_md);
2094 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2095 if (r) {
2096 ti->error = "Couldn't open thin internal device";
2097 goto bad_thin_open;
2100 ti->split_io = tc->pool->sectors_per_block;
2101 ti->num_flush_requests = 1;
2102 ti->num_discard_requests = 1;
	/*
	 * Allow discards to be issued to the thin device even
	 * if the pool's data device doesn't support them.
	 */
	ti->discards_supported = 1;
2109 return 0;
2111 bad_thin_open:
2112 pool_dec(tc->pool);
2113 bad_pool_lookup:
2114 dm_put(pool_md);
2115 bad_common:
2116 dm_put_device(ti, tc->pool_dev);
2117 bad_pool_dev:
2118 kfree(tc);
2120 return r;
2123 static int thin_map(struct dm_target *ti, struct bio *bio,
2124 union map_info *map_context)
2126 bio->bi_sector -= ti->begin;
2128 return bio_map(ti, bio, map_context);
/*
 * <nr mapped sectors> <highest mapped sector>
 */
2134 static int thin_status(struct dm_target *ti, status_type_t type,
2135 char *result, unsigned maxlen)
2137 int r;
2138 ssize_t sz = 0;
2139 dm_block_t mapped, highest;
2140 char buf[BDEVNAME_SIZE];
2141 struct thin_c *tc = ti->private;
2143 if (!tc->td)
2144 DMEMIT("-");
2145 else {
2146 switch (type) {
2147 case STATUSTYPE_INFO:
2148 r = dm_thin_get_mapped_count(tc->td, &mapped);
2149 if (r)
2150 return r;
2152 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2153 if (r < 0)
2154 return r;
2156 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2157 if (r)
2158 DMEMIT("%llu", ((highest + 1) *
2159 tc->pool->sectors_per_block) - 1);
2160 else
2161 DMEMIT("-");
2162 break;
2164 case STATUSTYPE_TABLE:
2165 DMEMIT("%s %lu",
2166 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2167 (unsigned long) tc->dev_id);
2168 break;
2172 return 0;
2175 static int thin_iterate_devices(struct dm_target *ti,
2176 iterate_devices_callout_fn fn, void *data)
2178 struct thin_c *tc = ti->private;
2180 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block, data);
2183 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2185 struct thin_c *tc = ti->private;
2187 blk_limits_io_min(limits, 0);
2188 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);

	/*
	 * Only allow discard requests aligned to our block size, and make
	 * sure that we never get sent larger discard requests either.
	 */
	limits->max_discard_sectors = tc->pool->sectors_per_block;
	limits->discard_granularity = tc->pool->sectors_per_block << SECTOR_SHIFT;
2198 static struct target_type thin_target = {
2199 .name = "thin",
2200 .version = {1, 0, 0},
2201 .module = THIS_MODULE,
2202 .ctr = thin_ctr,
2203 .dtr = thin_dtr,
2204 .map = thin_map,
2205 .status = thin_status,
2206 .iterate_devices = thin_iterate_devices,
2207 .io_hints = thin_io_hints,
2210 /*----------------------------------------------------------------*/
2212 static int __init dm_thin_init(void)
2214 int r;
2216 pool_table_init();
2218 r = dm_register_target(&thin_target);
2219 if (r)
2220 return r;
2222 r = dm_register_target(&pool_target);
2223 if (r)
2224 dm_unregister_target(&thin_target);
2226 return r;
2229 static void dm_thin_exit(void)
2231 dm_unregister_target(&thin_target);
2232 dm_unregister_target(&pool_target);
2235 module_init(dm_thin_init);
2236 module_exit(dm_thin_exit);
MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2239 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2240 MODULE_LICENSE("GPL");
2242 /*----------------------------------------------------------------*/