/*
 * Copyright (C) 2011 Red Hat UK. All rights reserved.
 *
 * This file is released under the GPL.
 */
7 #include "dm-thin-metadata.h"
9 #include <linux/device-mapper.h>
10 #include <linux/dm-io.h>
11 #include <linux/dm-kcopyd.h>
12 #include <linux/list.h>
13 #include <linux/init.h>
14 #include <linux/module.h>
15 #include <linux/slab.h>
#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * 8)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)
/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. This obviously
 * includes all devices that share this block. (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mappings). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */
/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away. We put them in prison
 * where they can't cause any mischief. Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell. When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;
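
/*
 * A rough usage sketch (mirroring break_sharing() and provision_block()
 * further down; allocation and error handling elided):
 *
 *	struct cell *cell;
 *	struct cell_key key;
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(pool->prison, &key, bio, &cell))
 *		return;		(another bio already holds the cell)
 *	... allocate and prepare the new mapping ...
 *	cell_remap_and_issue(tc, cell, data_block);   or cell_error(cell)
 */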
struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	unsigned count;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};
static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}
/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
149 static struct bio_prison *prison_create(unsigned nr_cells)
151 unsigned i;
152 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
153 size_t len = sizeof(struct bio_prison) +
154 (sizeof(struct hlist_head) * nr_buckets);
155 struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
157 if (!prison)
158 return NULL;
160 spin_lock_init(&prison->lock);
161 prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
162 sizeof(struct cell));
163 prison->nr_buckets = nr_buckets;
164 prison->hash_mask = nr_buckets - 1;
165 prison->cells = (struct hlist_head *) (prison + 1);
166 for (i = 0; i < nr_buckets; i++)
167 INIT_HLIST_HEAD(prison->cells + i);
169 return prison;
172 static void prison_destroy(struct bio_prison *prison)
174 mempool_destroy(prison->cell_pool);
175 kfree(prison);
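
/*
 * The hash only looks at the block number; the dev and virtual fields are
 * left out. BIG_PRIME simply scatters consecutive block numbers across the
 * buckets. The key comparison in __search_bucket() still uses the full
 * cell_key.
 */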
178 static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
180 const unsigned long BIG_PRIME = 4294967291UL;
181 uint64_t hash = key->block * BIG_PRIME;
183 return (uint32_t) (hash & prison->hash_mask);
186 static struct cell *__search_bucket(struct hlist_head *bucket,
187 struct cell_key *key)
189 struct cell *cell;
190 struct hlist_node *tmp;
192 hlist_for_each_entry(cell, tmp, bucket, list)
193 if (!memcmp(&cell->key, key, sizeof(cell->key)))
194 return cell;
196 return NULL;
/*
 * This may block if a new cell needs allocating. You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns the number of entries in the cell prior to the new addition
 * or < 0 on failure.
 */
206 static int bio_detain(struct bio_prison *prison, struct cell_key *key,
207 struct bio *inmate, struct cell **ref)
209 int r;
210 unsigned long flags;
211 uint32_t hash = hash_key(prison, key);
212 struct cell *uninitialized_var(cell), *cell2 = NULL;
214 BUG_ON(hash > prison->nr_buckets);
216 spin_lock_irqsave(&prison->lock, flags);
217 cell = __search_bucket(prison->cells + hash, key);
219 if (!cell) {
221 * Allocate a new cell
223 spin_unlock_irqrestore(&prison->lock, flags);
224 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
225 spin_lock_irqsave(&prison->lock, flags);
228 * We've been unlocked, so we have to double check that
229 * nobody else has inserted this cell in the meantime.
231 cell = __search_bucket(prison->cells + hash, key);
233 if (!cell) {
234 cell = cell2;
235 cell2 = NULL;
237 cell->prison = prison;
238 memcpy(&cell->key, key, sizeof(cell->key));
239 cell->count = 0;
240 bio_list_init(&cell->bios);
241 hlist_add_head(&cell->list, prison->cells + hash);
245 r = cell->count++;
246 bio_list_add(&cell->bios, inmate);
247 spin_unlock_irqrestore(&prison->lock, flags);
249 if (cell2)
250 mempool_free(cell2, prison->cell_pool);
252 *ref = cell;
254 return r;
257 static int bio_detain_if_occupied(struct bio_prison *prison, struct cell_key *key,
258 struct bio *inmate, struct cell **ref)
260 int r;
261 unsigned long flags;
262 uint32_t hash = hash_key(prison, key);
263 struct cell *uninitialized_var(cell);
265 BUG_ON(hash > prison->nr_buckets);
267 spin_lock_irqsave(&prison->lock, flags);
268 cell = __search_bucket(prison->cells + hash, key);
270 if (!cell) {
271 spin_unlock_irqrestore(&prison->lock, flags);
272 return 0;
275 r = cell->count++;
276 bio_list_add(&cell->bios, inmate);
277 spin_unlock_irqrestore(&prison->lock, flags);
279 *ref = cell;
281 return r;
/*
 * @inmates must have been initialised prior to this call.
 */
287 static void __cell_release(struct cell *cell, struct bio_list *inmates)
289 struct bio_prison *prison = cell->prison;
291 hlist_del(&cell->list);
293 if (inmates)
294 bio_list_merge(inmates, &cell->bios);
296 mempool_free(cell, prison->cell_pool);
299 static void cell_release(struct cell *cell, struct bio_list *bios)
301 unsigned long flags;
302 struct bio_prison *prison = cell->prison;
304 spin_lock_irqsave(&prison->lock, flags);
305 __cell_release(cell, bios);
306 spin_unlock_irqrestore(&prison->lock, flags);
309 static void cell_error(struct cell *cell)
311 struct bio_prison *prison = cell->prison;
312 struct bio_list bios;
313 struct bio *bio;
314 unsigned long flags;
316 bio_list_init(&bios);
318 spin_lock_irqsave(&prison->lock, flags);
319 __cell_release(cell, &bios);
320 spin_unlock_irqrestore(&prison->lock, flags);
322 while ((bio = bio_list_pop(&bios)))
323 bio_io_error(bio);
326 /*----------------------------------------------------------------*/
/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed. Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */
struct deferred_set;
struct deferred_entry {
	struct deferred_set *ds;
	unsigned count;
	struct list_head work_items;
};

struct deferred_set {
	spinlock_t lock;
	unsigned current_entry;
	unsigned sweeper;
	struct deferred_entry entries[DEFERRED_SET_SIZE];
};
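
/*
 * Informal summary of the machinery below: ds_inc() takes a reference on
 * the current entry, ds_add_work() queues work behind whatever entries are
 * still referenced, and ds_dec() drops a reference and sweeps any entries
 * that have drained, handing their work items back to the caller.
 */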
349 static void ds_init(struct deferred_set *ds)
351 int i;
353 spin_lock_init(&ds->lock);
354 ds->current_entry = 0;
355 ds->sweeper = 0;
356 for (i = 0; i < DEFERRED_SET_SIZE; i++) {
357 ds->entries[i].ds = ds;
358 ds->entries[i].count = 0;
359 INIT_LIST_HEAD(&ds->entries[i].work_items);
363 static struct deferred_entry *ds_inc(struct deferred_set *ds)
365 unsigned long flags;
366 struct deferred_entry *entry;
368 spin_lock_irqsave(&ds->lock, flags);
369 entry = ds->entries + ds->current_entry;
370 entry->count++;
371 spin_unlock_irqrestore(&ds->lock, flags);
373 return entry;
376 static unsigned ds_next(unsigned index)
378 return (index + 1) % DEFERRED_SET_SIZE;
381 static void __sweep(struct deferred_set *ds, struct list_head *head)
383 while ((ds->sweeper != ds->current_entry) &&
384 !ds->entries[ds->sweeper].count) {
385 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
386 ds->sweeper = ds_next(ds->sweeper);
389 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
390 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
393 static void ds_dec(struct deferred_entry *entry, struct list_head *head)
395 unsigned long flags;
397 spin_lock_irqsave(&entry->ds->lock, flags);
398 BUG_ON(!entry->count);
399 --entry->count;
400 __sweep(entry->ds, head);
401 spin_unlock_irqrestore(&entry->ds->lock, flags);

/*
 * Returns 1 if the job was deferred, or 0 if there are no pending items
 * delaying it.
 */
407 static int ds_add_work(struct deferred_set *ds, struct list_head *work)
409 int r = 1;
410 unsigned long flags;
411 unsigned next_entry;
413 spin_lock_irqsave(&ds->lock, flags);
414 if ((ds->sweeper == ds->current_entry) &&
415 !ds->entries[ds->current_entry].count)
416 r = 0;
417 else {
418 list_add(work, &ds->entries[ds->current_entry].work_items);
419 next_entry = ds_next(ds->current_entry);
420 if (!ds->entries[next_entry].count)
421 ds->current_entry = next_entry;
423 spin_unlock_irqrestore(&ds->lock, flags);
425 return r;
428 /*----------------------------------------------------------------*/

/*
 * Key building.
 */
433 static void build_data_key(struct dm_thin_device *td,
434 dm_block_t b, struct cell_key *key)
436 key->virtual = 0;
437 key->dev = dm_thin_dev_id(td);
438 key->block = b;
441 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
442 struct cell_key *key)
444 key->virtual = 1;
445 key->dev = dm_thin_dev_id(td);
446 key->block = b;
449 /*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
456 struct pool {
457 struct list_head list;
458 struct dm_target *ti; /* Only set if a pool target is bound */
460 struct mapped_device *pool_md;
461 struct dm_pool_metadata *pmd;
463 uint32_t sectors_per_block;
464 unsigned block_shift;
465 dm_block_t offset_mask;
466 dm_block_t low_water_mark;
467 unsigned zero_new_blocks:1;
469 struct bio_prison *prison;
470 struct dm_kcopyd_client *copier;
472 struct workqueue_struct *producer_wq;
473 struct workqueue_struct *consumer_wq;
474 struct work_struct producer;
475 struct work_struct consumer;
477 spinlock_t lock;
478 struct bio_list deferred_bios;
479 struct list_head prepared_mappings;
481 int low_water_triggered; /* A dm event has been sent */
482 struct bio_list retry_list;
484 struct deferred_set ds; /* FIXME: move to thin_c */
486 mempool_t *mapping_pool;
487 mempool_t *endio_hook_pool;
489 atomic_t ref_count;
};

/*
 * Target context for a pool.
 */
495 struct pool_c {
496 struct dm_target *ti;
497 struct pool *pool;
498 struct dm_dev *data_dev;
499 struct dm_dev *metadata_dev;
500 struct dm_target_callbacks callbacks;
502 sector_t low_water_mark;
503 unsigned zero_new_blocks:1;
};

/*
 * Target context for a thin.
 */
509 struct thin_c {
510 struct dm_dev *pool_dev;
511 dm_thin_id dev_id;
513 struct pool *pool;
514 struct dm_thin_device *td;
517 /* FIXME: Can cells and new_mappings be combined? */
519 struct endio_hook {
520 struct thin_c *tc;
521 bio_end_io_t *saved_bi_end_io;
522 struct deferred_entry *entry;
525 struct new_mapping {
526 struct list_head list;
528 int prepared;
530 struct thin_c *tc;
531 dm_block_t virt_block;
532 dm_block_t data_block;
533 struct cell *cell;
534 int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
542 struct bio *bio;
543 bio_end_io_t *saved_bi_end_io;
546 /*----------------------------------------------------------------*/
548 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
549 bio_end_io_t *fn)
551 *save = bio->bi_end_io;
552 bio->bi_end_io = fn;
555 /*----------------------------------------------------------------*/

/*
 * A global list that uses a struct mapped_device as a key.
 */
560 static struct dm_thin_pool_table {
561 spinlock_t lock;
562 struct list_head pools;
563 } dm_thin_pool_table;
565 static void pool_table_init(void)
567 spin_lock_init(&dm_thin_pool_table.lock);
569 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
572 static void pool_table_insert(struct pool *pool)
574 spin_lock(&dm_thin_pool_table.lock);
575 list_add(&pool->list, &dm_thin_pool_table.pools);
576 spin_unlock(&dm_thin_pool_table.lock);
579 static void pool_table_remove(struct pool *pool)
581 spin_lock(&dm_thin_pool_table.lock);
582 list_del(&pool->list);
583 spin_unlock(&dm_thin_pool_table.lock);
586 static struct pool *pool_table_lookup(struct mapped_device *md)
588 struct pool *pool = NULL, *tmp;
590 spin_lock(&dm_thin_pool_table.lock);
591 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list)
592 if (tmp->pool_md == md) {
593 pool = tmp;
594 break;
596 spin_unlock(&dm_thin_pool_table.lock);
598 return pool;
601 /*----------------------------------------------------------------*/

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the
 * thin-pool target. wake_producer() is the most notable exception (it is
 * also used by thin-pool to continue deferred IO processing after pool
 * resume).
 */
611 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
613 return bio->bi_sector >> tc->pool->block_shift;
616 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
618 struct pool *pool = tc->pool;
620 bio->bi_bdev = tc->pool_dev->bdev;
621 bio->bi_sector = (block << pool->block_shift) +
622 (bio->bi_sector & pool->offset_mask);
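
/*
 * Note that remap_and_issue() below commits the pool metadata before
 * issuing a FLUSH/FUA bio: the flush is supposed to make earlier writes
 * durable, and that includes the mappings describing them, which may
 * still only exist in memory.
 */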
625 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
626 dm_block_t block)
628 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
629 int r = dm_pool_commit_metadata(tc->pool->pmd);
630 if (r) {
631 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
632 __func__, r);
633 bio_io_error(bio);
634 return;
638 remap(tc, bio, block);
639 generic_make_request(bio);
642 static void wake_producer(struct pool *pool)
644 queue_work(pool->producer_wq, &pool->producer);
647 static void __maybe_add_mapping(struct new_mapping *m)
649 struct pool *pool = m->tc->pool;
651 if (list_empty(&m->list) && m->prepared) {
652 list_add(&m->list, &pool->prepared_mappings);
653 queue_work(pool->consumer_wq, &pool->consumer);
657 static void copy_complete(int read_err, unsigned long write_err, void *context)
659 unsigned long flags;
660 struct new_mapping *m = context;
661 struct pool *pool = m->tc->pool;
663 m->err = read_err || write_err ? -EIO : 0;
665 spin_lock_irqsave(&pool->lock, flags);
666 m->prepared = 1;
667 __maybe_add_mapping(m);
668 spin_unlock_irqrestore(&pool->lock, flags);
671 static void overwrite_endio(struct bio *bio, int err)
673 unsigned long flags;
674 struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
675 struct pool *pool = m->tc->pool;
677 m->err = err;
679 spin_lock_irqsave(&pool->lock, flags);
680 m->prepared = 1;
681 __maybe_add_mapping(m);
682 spin_unlock_irqrestore(&pool->lock, flags);
685 static void shared_read_endio(struct bio *bio, int err)
687 struct list_head mappings;
688 struct new_mapping *m, *tmp;
689 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
690 unsigned long flags;
691 struct pool *pool = h->tc->pool;
693 bio->bi_end_io = h->saved_bi_end_io;
694 bio_endio(bio, err);
696 INIT_LIST_HEAD(&mappings);
697 ds_dec(h->entry, &mappings);
699 spin_lock_irqsave(&pool->lock, flags);
700 list_for_each_entry_safe(m, tmp, &mappings, list) {
701 list_del(&m->list);
702 INIT_LIST_HEAD(&m->list);
703 __maybe_add_mapping(m);
705 spin_unlock_irqrestore(&pool->lock, flags);
707 mempool_free(h, pool->endio_hook_pool);
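
/*
 * Returns non-zero when the bio starts on a block boundary and spans
 * exactly one pool block, i.e. the whole block will be overwritten and
 * any copy or zero step can be skipped.
 */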
710 static int io_covers_block(struct pool *pool, struct bio *bio)
712 return ((bio->bi_sector & pool->offset_mask) == 0) &&
713 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
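
/*
 * schedule_copy() prepares a new_mapping for virt_block -> data_dest and
 * registers it with the deferred set so it isn't inserted while reads to
 * the shared block are still in flight. A bio that covers the whole block
 * is hooked and written straight to the new location; otherwise kcopyd
 * copies the old block first and the held bios are released once the
 * mapping has been inserted.
 */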
716 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
717 dm_block_t data_origin, dm_block_t data_dest,
718 struct cell *cell, struct bio *bio)
720 int r;
721 struct pool *pool = tc->pool;
722 struct new_mapping *m = mempool_alloc(pool->mapping_pool, GFP_NOIO);
724 INIT_LIST_HEAD(&m->list);
725 m->prepared = 0;
726 m->tc = tc;
727 m->virt_block = virt_block;
728 m->data_block = data_dest;
729 m->cell = cell;
730 m->err = 0;
731 m->bio = NULL;
733 ds_add_work(&pool->ds, &m->list);

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
741 if (io_covers_block(pool, bio)) {
742 m->bio = bio;
743 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
744 dm_get_mapinfo(bio)->ptr = m;
745 remap_and_issue(tc, bio, data_dest);
746 } else {
747 struct dm_io_region from, to;
749 from.bdev = tc->pool_dev->bdev;
750 from.sector = data_origin * pool->sectors_per_block;
751 from.count = pool->sectors_per_block;
753 to.bdev = tc->pool_dev->bdev;
754 to.sector = data_dest * pool->sectors_per_block;
755 to.count = pool->sectors_per_block;
757 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
758 0, copy_complete, m);
759 if (r < 0) {
760 mempool_free(m, pool->mapping_pool);
761 DMERR("dm_kcopyd_copy() failed");
762 cell_error(cell);
767 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
768 dm_block_t data_block, struct cell *cell,
769 struct bio *bio)
771 struct pool *pool = tc->pool;
772 struct new_mapping *m = mempool_alloc(pool->mapping_pool, GFP_NOIO);
774 INIT_LIST_HEAD(&m->list);
775 m->prepared = 0;
776 m->tc = tc;
777 m->virt_block = virt_block;
778 m->data_block = data_block;
779 m->cell = cell;
780 m->err = 0;
781 m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
788 if (!pool->zero_new_blocks || io_covers_block(pool, bio)) {
789 m->bio = bio;
790 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
791 dm_get_mapinfo(bio)->ptr = m;
792 remap_and_issue(tc, bio, data_block);
793 } else {
794 int r;
795 struct dm_io_region to;
797 to.bdev = tc->pool_dev->bdev;
798 to.sector = data_block * pool->sectors_per_block;
799 to.count = pool->sectors_per_block;
801 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
802 if (r < 0) {
803 mempool_free(m, pool->mapping_pool);
804 DMERR("dm_kcopyd_zero() failed");
805 cell_error(cell);
810 static void cell_remap_and_issue(struct thin_c *tc, struct cell *cell,
811 dm_block_t data_block)
813 struct bio_list bios;
814 struct bio *bio;
816 bio_list_init(&bios);
817 cell_release(cell, &bios);
819 while ((bio = bio_list_pop(&bios)))
820 remap_and_issue(tc, bio, data_block);
823 static void cell_remap_and_issue_except(struct thin_c *tc, struct cell *cell,
824 dm_block_t data_block,
825 struct bio *exception)
827 struct bio_list bios;
828 struct bio *bio;
830 bio_list_init(&bios);
831 cell_release(cell, &bios);
833 while ((bio = bio_list_pop(&bios)))
834 if (bio != exception)
835 remap_and_issue(tc, bio, data_block);
838 static void retry_later(struct bio *bio)
840 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
841 struct pool *pool = tc->pool;
842 unsigned long flags;
844 spin_lock_irqsave(&pool->lock, flags);
845 bio_list_add(&pool->retry_list, bio);
846 spin_unlock_irqrestore(&pool->lock, flags);
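
/*
 * Allocates a new data block for the thin device. When free space drops
 * to the pool's low water mark a dm event is raised (once, until the pool
 * is resumed) so that userspace can extend the data device.
 */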
849 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
851 int r;
852 dm_block_t free_blocks;
853 unsigned long flags;
854 struct pool *pool = tc->pool;
856 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
857 if (r)
858 return r;
860 if (free_blocks <= pool->low_water_mark && !pool->low_water_triggered) {
861 spin_lock_irqsave(&pool->lock, flags);
862 pool->low_water_triggered = 1;
863 spin_unlock_irqrestore(&pool->lock, flags);
864 dm_table_event(pool->ti->table);
867 r = dm_pool_alloc_data_block(pool->pmd, result);
868 if (r)
869 return r;
871 return 0;
874 static void process_discard(struct thin_c *tc, struct bio *bio)
876 int r;
877 dm_block_t block = get_bio_block(tc, bio);
878 struct dm_thin_lookup_result lookup_result;
880 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
881 switch (r) {
882 case 0:
		if (lookup_result.shared)
			/*
			 * We just ignore shared discards for now, these
			 * are hard, and I want to get deferred
			 * deallocation working first.
			 */
			bio_endio(bio, 0);

		else {
892 r = dm_thin_remove_block(tc->td, block);
893 if (r) {
894 DMERR("dm_thin_remove_block() failed");
895 bio_io_error(bio);
896 } else
897 remap_and_issue(tc, bio, lookup_result.block);
899 break;
	case -ENODATA:
		/*
		 * Either this isn't provisioned, or preparation for
		 * provisioning may be pending (we could find out by
		 * calling bio_detain_if_occupied). But even in this case
		 * it's easier to just forget the discard.
		 */
		bio_endio(bio, 0);
909 break;
911 default:
912 DMERR("dm_thin_find_block() failed, error = %d", r);
913 bio_io_error(bio);
914 break;
918 static void no_space(struct cell *cell)
920 struct bio *bio;
921 struct bio_list bios;
923 bio_list_init(&bios);
924 cell_release(cell, &bios);
926 while ((bio = bio_list_pop(&bios)))
927 retry_later(bio);
930 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
931 struct cell_key *key,
932 struct dm_thin_lookup_result *lookup_result)
934 int r;
935 dm_block_t data_block;
936 struct cell *cell;
938 bio_detain(tc->pool->prison, key, bio, &cell);
940 r = alloc_data_block(tc, &data_block);
941 switch (r) {
942 case 0:
943 schedule_copy(tc, block, lookup_result->block,
944 data_block, cell, bio);
945 break;
947 case -ENOSPC:
948 no_space(cell);
949 break;
951 default:
952 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
953 cell_error(cell);
954 break;
958 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
959 dm_block_t block,
960 struct dm_thin_lookup_result *lookup_result)
962 struct cell *cell;
963 struct cell_key key;
964 struct pool *pool = tc->pool;

	/*
	 * If the cell is already occupied, then sharing is already in the
	 * process of being broken, so we have nothing further to do here.
	 */
971 build_data_key(tc->td, lookup_result->block, &key);
972 if (bio_detain_if_occupied(pool->prison, &key, bio, &cell))
973 return;
975 if (bio_data_dir(bio) == WRITE)
976 break_sharing(tc, bio, block, &key, lookup_result);
977 else {
978 struct endio_hook *h;
979 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
981 h->tc = tc;
982 h->entry = ds_inc(&pool->ds);
983 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
984 dm_get_mapinfo(bio)->ptr = h;
985 remap_and_issue(tc, bio, lookup_result->block);
989 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block)
991 int r;
992 dm_block_t data_block;
993 struct cell *cell;
994 struct cell_key key;

	/*
	 * If the cell is already occupied, then the block is already being
	 * provisioned, so we have nothing further to do here.
	 */
1000 build_virtual_key(tc->td, block, &key);
1001 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1002 return;
1004 r = alloc_data_block(tc, &data_block);
1005 switch (r) {
1006 case 0:
1007 schedule_zero(tc, block, data_block, cell, bio);
1008 break;
1010 case -ENOSPC:
1011 no_space(cell);
1012 break;
1014 default:
1015 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1016 cell_error(cell);
1017 break;
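
/*
 * process_bio() handles a deferred non-discard bio: a mapped, unshared
 * block is simply remapped; a shared block goes through the break-sharing
 * path; an unmapped block is zero-filled for reads or provisioned for
 * writes.
 */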
1021 static void process_bio(struct thin_c *tc, struct bio *bio)
1023 int r;
1024 dm_block_t block = get_bio_block(tc, bio);
1025 struct dm_thin_lookup_result lookup_result;
1027 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1028 switch (r) {
1029 case 0:
1030 if (lookup_result.shared)
1031 process_shared_bio(tc, bio, block, &lookup_result);
1032 else
1033 remap_and_issue(tc, bio, lookup_result.block);
1034 break;
1036 case -ENODATA:
		/*
		 * When reading, we return zeroes regardless of the
		 * zero_new_blocks setting.
		 */
1041 if (bio_data_dir(bio) == READ) {
1042 zero_fill_bio(bio);
1043 bio_endio(bio, 0);
1044 } else
1045 provision_block(tc, bio, block);
1046 break;
1048 default:
1049 DMERR("dm_thin_find_block() failed, error = %d", r);
1050 bio_io_error(bio);
1051 break;
1055 static void process_deferred_bios(struct pool *pool)
1057 unsigned long flags;
1058 struct bio *bio;
1059 struct bio_list bios;
1061 bio_list_init(&bios);
1063 spin_lock_irqsave(&pool->lock, flags);
1064 bio_list_merge(&bios, &pool->deferred_bios);
1065 bio_list_init(&pool->deferred_bios);
1066 spin_unlock_irqrestore(&pool->lock, flags);
1068 while ((bio = bio_list_pop(&bios))) {
1069 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1071 if (bio->bi_rw & REQ_DISCARD)
1072 process_discard(tc, bio);
1073 else
1074 process_bio(tc, bio);
1078 static void process_prepared_mapping(struct new_mapping *m)
1080 struct thin_c *tc = m->tc;
1081 struct bio *bio;
1082 int r;
1084 if (m->err) {
1085 cell_error(m->cell);
1086 return;
1089 bio = m->bio;
1090 if (bio)
1091 bio->bi_end_io = m->saved_bi_end_io;
1093 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
1094 if (r) {
1095 DMERR("dm_thin_insert_block() failed");
1096 cell_error(m->cell);
1097 return;
1100 if (bio) {
1101 cell_remap_and_issue_except(tc, m->cell, m->data_block, bio);
1102 bio_endio(bio, 0);
1103 } else
1104 cell_remap_and_issue(tc, m->cell, m->data_block);
1106 list_del(&m->list);
1107 mempool_free(m, tc->pool->mapping_pool);
1110 static void process_prepared_mappings(struct pool *pool)
1112 unsigned long flags;
1113 struct list_head maps;
1114 struct new_mapping *m, *tmp;
1116 INIT_LIST_HEAD(&maps);
1117 spin_lock_irqsave(&pool->lock, flags);
1118 list_splice_init(&pool->prepared_mappings, &maps);
1119 spin_unlock_irqrestore(&pool->lock, flags);
1121 list_for_each_entry_safe(m, tmp, &maps, list)
1122 process_prepared_mapping(m);
1125 static void do_producer(struct work_struct *ws)
1127 struct pool *pool = container_of(ws, struct pool, producer);
1129 process_deferred_bios(pool);
1132 static void do_consumer(struct work_struct *ws)
1134 struct pool *pool = container_of(ws, struct pool, consumer);
1136 process_prepared_mappings(pool);
1139 static void defer_bio(struct thin_c *tc, struct bio *bio)
1141 unsigned long flags;
1142 struct pool *pool = tc->pool;
1144 spin_lock_irqsave(&pool->lock, flags);
1145 bio_list_add(&pool->deferred_bios, bio);
1146 spin_unlock_irqrestore(&pool->lock, flags);
1148 wake_producer(pool);
/*
 * Non-blocking function designed to be called from the target's map
 * function.
 */
1155 static int bio_map(struct dm_target *ti, struct bio *bio,
1156 union map_info *map_context)
1158 int r;
1159 struct thin_c *tc = ti->private;
1160 dm_block_t block = get_bio_block(tc, bio);
1161 struct dm_thin_device *td = tc->td;
1162 struct pool *pool = tc->pool;
1163 struct dm_thin_lookup_result result;

	/*
	 * FIXME(hch): In theory higher level code should prevent this
	 * from happening, not sure why we ever get here.
	 */
1169 if ((bio->bi_rw & REQ_DISCARD) &&
1170 bio->bi_size < (pool->sectors_per_block << SECTOR_SHIFT)) {
1171 DMERR("discard IO smaller than pool block size (%llu)",
1172 (unsigned long long)pool->sectors_per_block << SECTOR_SHIFT);
1173 bio_endio(bio, 0);
1174 return DM_MAPIO_SUBMITTED;

	/*
	 * Save the thin context for easy access from the deferred bio later.
	 */
1180 map_context->ptr = tc;
1182 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1183 defer_bio(tc, bio);
1184 return DM_MAPIO_SUBMITTED;
1187 r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
1192 switch (r) {
1193 case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap. You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant; the
			 * shared flag will be set in their case.
			 */
1209 defer_bio(tc, bio);
1210 r = DM_MAPIO_SUBMITTED;
1211 } else {
1212 remap(tc, bio, result.block);
1213 r = DM_MAPIO_REMAPPED;
1215 break;
	case -ENODATA:
		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 *
		 * When reading, we return zeroes regardless of the
		 * zero_new_blocks setting.
		 */
1225 if (bio_data_dir(bio) == READ) {
1226 zero_fill_bio(bio);
1227 bio_endio(bio, 0);
1228 } else
1229 defer_bio(tc, bio);
1230 r = DM_MAPIO_SUBMITTED;
1231 break;
1233 case -EWOULDBLOCK:
1234 defer_bio(tc, bio);
1235 r = DM_MAPIO_SUBMITTED;
1236 break;
1239 return r;
1242 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1244 int r;
1245 unsigned long flags;
1246 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1248 spin_lock_irqsave(&pt->pool->lock, flags);
1249 r = !bio_list_empty(&pt->pool->retry_list);
1250 spin_unlock_irqrestore(&pt->pool->lock, flags);
1252 if (!r) {
1253 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1254 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1257 return r;
1260 static void __requeue_bios(struct pool *pool)
1262 bio_list_merge(&pool->deferred_bios, &pool->retry_list);
1263 bio_list_init(&pool->retry_list);
1266 /*----------------------------------------------------------------
1267 * Binding of control targets to a pool object
1268 *--------------------------------------------------------------*/
1269 /* FIXME: add locking */
1270 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1272 struct pool_c *pt = ti->private;
1274 pool->ti = ti;
1275 pool->low_water_mark = dm_sector_div_up(pt->low_water_mark,
1276 pool->sectors_per_block);
1277 pool->zero_new_blocks = pt->zero_new_blocks;
1278 dm_pool_rebind_metadata_device(pool->pmd, pt->metadata_dev->bdev);
1280 return 0;
1283 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1285 if (pool->ti == ti)
1286 pool->ti = NULL;
1289 /*----------------------------------------------------------------
1290 * Pool creation
1291 *--------------------------------------------------------------*/
1292 static void pool_destroy(struct pool *pool)
1294 if (dm_pool_metadata_close(pool->pmd) < 0)
1295 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1297 prison_destroy(pool->prison);
1298 dm_kcopyd_client_destroy(pool->copier);
1300 if (pool->producer_wq)
1301 destroy_workqueue(pool->producer_wq);
1303 if (pool->consumer_wq)
1304 destroy_workqueue(pool->consumer_wq);
1306 mempool_destroy(pool->mapping_pool);
1307 mempool_destroy(pool->endio_hook_pool);
1308 kfree(pool);
1311 static struct pool *pool_create(struct block_device *metadata_dev,
1312 unsigned long block_size, char **error)
1314 int r;
1315 void *err_p;
1316 struct pool *pool;
1317 struct dm_pool_metadata *pmd;
1319 pmd = dm_pool_metadata_open(metadata_dev, block_size);
1320 if (IS_ERR(pmd)) {
1321 *error = "Error creating metadata object";
1322 return (struct pool *)pmd;
1325 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1326 if (!pool) {
1327 *error = "Error allocating memory for pool";
1328 err_p = ERR_PTR(-ENOMEM);
1329 goto bad_pool;
1332 pool->pmd = pmd;
1333 pool->sectors_per_block = block_size;
1334 pool->block_shift = ffs(block_size) - 1;
1335 pool->offset_mask = block_size - 1;
1336 pool->low_water_mark = 0;
1337 pool->zero_new_blocks = 1;
1338 pool->prison = prison_create(PRISON_CELLS);
1339 if (!pool->prison) {
1340 *error = "Error creating pool's bio prison";
1341 err_p = ERR_PTR(-ENOMEM);
1342 goto bad_prison;
1345 pool->copier = dm_kcopyd_client_create();
1346 if (IS_ERR(pool->copier)) {
1347 r = PTR_ERR(pool->copier);
1348 *error = "Error creating pool's kcopyd client";
1349 err_p = ERR_PTR(r);
1350 goto bad_kcopyd_client;

	/*
	 * Create singlethreaded workqueues that will service all devices
	 * that use this metadata.
	 */
1357 pool->producer_wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX "-producer",
1358 WQ_MEM_RECLAIM);
1359 if (!pool->producer_wq) {
1360 *error = "Error creating pool's producer workqueue";
1361 err_p = ERR_PTR(-ENOMEM);
1362 goto bad_producer_wq;
1365 pool->consumer_wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX "-consumer",
1366 WQ_MEM_RECLAIM);
1367 if (!pool->consumer_wq) {
1368 *error = "Error creating pool's consumer workqueue";
1369 err_p = ERR_PTR(-ENOMEM);
1370 goto bad_consumer_wq;
1373 INIT_WORK(&pool->producer, do_producer);
1374 INIT_WORK(&pool->consumer, do_consumer);
1375 spin_lock_init(&pool->lock);
1376 bio_list_init(&pool->deferred_bios);
1377 INIT_LIST_HEAD(&pool->prepared_mappings);
1378 pool->low_water_triggered = 0;
1379 bio_list_init(&pool->retry_list);
1380 ds_init(&pool->ds);
1382 pool->mapping_pool =
1383 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
1384 if (!pool->mapping_pool) {
1385 *error = "Error creating pool's mapping mempool";
1386 err_p = ERR_PTR(-ENOMEM);
1387 goto bad_mapping_pool;
1390 pool->endio_hook_pool =
1391 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
1392 if (!pool->endio_hook_pool) {
1393 *error = "Error creating pool's endio_hook mempool";
1394 err_p = ERR_PTR(-ENOMEM);
1395 goto bad_endio_hook_pool;
1397 atomic_set(&pool->ref_count, 1);
1399 return pool;
1401 bad_endio_hook_pool:
1402 mempool_destroy(pool->mapping_pool);
1403 bad_mapping_pool:
1404 destroy_workqueue(pool->consumer_wq);
1405 bad_consumer_wq:
1406 destroy_workqueue(pool->producer_wq);
1407 bad_producer_wq:
1408 dm_kcopyd_client_destroy(pool->copier);
1409 bad_kcopyd_client:
1410 prison_destroy(pool->prison);
1411 bad_prison:
1412 kfree(pool);
1413 bad_pool:
1414 if (dm_pool_metadata_close(pmd))
1415 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1417 return err_p;
1420 static void pool_inc(struct pool *pool)
1422 atomic_inc(&pool->ref_count);
1425 static void pool_dec(struct pool *pool)
1427 if (atomic_dec_and_test(&pool->ref_count))
1428 pool_destroy(pool);
1431 static struct pool *pool_find(struct mapped_device *pool_md,
1432 struct block_device *metadata_dev,
1433 unsigned long block_size,
1434 char **error)
1436 struct pool *pool;
1438 pool = pool_table_lookup(pool_md);
1439 if (pool)
1440 pool_inc(pool);
1441 else
1442 pool = pool_create(metadata_dev, block_size, error);
1444 return pool;
1447 /*----------------------------------------------------------------
1448 * Pool target methods
1449 *--------------------------------------------------------------*/
1450 static void pool_dtr(struct dm_target *ti)
1452 struct pool_c *pt = ti->private;
1454 dm_put_device(ti, pt->metadata_dev);
1455 dm_put_device(ti, pt->data_dev);
1456 unbind_control_target(pt->pool, ti);
1457 pool_dec(pt->pool);
1458 kfree(pt);
1461 struct pool_features {
1462 unsigned zero_new_blocks:1;
1465 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1466 struct dm_target *ti)
1468 int r;
1469 unsigned argc;
1470 const char *arg_name;
1472 static struct dm_arg _args[] = {
1473 {0, 1, "Invalid number of pool feature arguments"},
	};

	/*
	 * No feature arguments supplied.
	 */
1479 if (!as->argc)
1480 return 0;
1482 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1483 if (r)
1484 return -EINVAL;
1486 while (argc && !r) {
1487 arg_name = dm_shift_arg(as);
1488 argc--;
1490 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1491 pf->zero_new_blocks = 0;
1492 continue;
1495 ti->error = "Unrecognised pool feature requested";
1496 r = -EINVAL;
1499 return r;
/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (sectors)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 */
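
/*
 * For example (illustrative values only), a 64KiB block size with a low
 * water mark of 16MiB might be loaded as:
 *
 *   dmsetup create pool \
 *	--table "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768"
 */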
1511 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1513 int r;
1514 struct pool_c *pt;
1515 struct pool *pool;
1516 struct pool_features pf;
1517 struct dm_arg_set as;
1518 struct dm_dev *data_dev;
1519 unsigned long block_size;
1520 dm_block_t low_water;
1521 struct dm_dev *metadata_dev;
1522 sector_t metadata_dev_size;
1524 if (argc < 4) {
1525 ti->error = "Invalid argument count";
1526 return -EINVAL;
1528 as.argc = argc;
1529 as.argv = argv;
1531 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1532 if (r) {
1533 ti->error = "Error opening metadata block device";
1534 return r;
1537 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1538 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
1539 ti->error = "Metadata device is too large";
1540 r = -EINVAL;
1541 goto out_metadata;
1544 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1545 if (r) {
1546 ti->error = "Error getting data device";
1547 goto out_metadata;
1550 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1551 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1552 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1553 !is_power_of_2(block_size)) {
1554 ti->error = "Invalid block size";
1555 r = -EINVAL;
1556 goto out;
1559 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water) ||
1560 !low_water) {
1561 ti->error = "Invalid low water mark";
1562 r = -EINVAL;
1563 goto out;

	/*
	 * Set default pool features.
	 */
1569 memset(&pf, 0, sizeof(pf));
1570 pf.zero_new_blocks = 1;
1572 dm_consume_args(&as, 4);
1573 r = parse_pool_features(&as, &pf, ti);
1574 if (r)
1575 goto out;
1577 pool = pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1578 block_size, &ti->error);
1579 if (IS_ERR(pool)) {
1580 r = PTR_ERR(pool);
1581 goto out;
1584 pt = kmalloc(sizeof(*pt), GFP_KERNEL);
1585 if (!pt) {
1586 pool_destroy(pool);
1587 r = -ENOMEM;
1588 goto out;
1590 pt->pool = pool;
1591 pt->ti = ti;
1592 pt->metadata_dev = metadata_dev;
1593 pt->data_dev = data_dev;
1594 pt->low_water_mark = low_water;
1595 pt->zero_new_blocks = pf.zero_new_blocks;
1596 ti->num_flush_requests = 1;
1597 ti->num_discard_requests = 1;
1598 ti->private = pt;
1600 pt->callbacks.congested_fn = pool_is_congested;
1601 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1603 return 0;
1605 out:
1606 dm_put_device(ti, data_dev);
1607 out_metadata:
1608 dm_put_device(ti, metadata_dev);
1610 return r;
1613 static int pool_map(struct dm_target *ti, struct bio *bio,
1614 union map_info *map_context)
1616 int r;
1617 struct pool_c *pt = ti->private;
1618 struct pool *pool = pt->pool;
1619 unsigned long flags;
1621 spin_lock_irqsave(&pool->lock, flags);
1622 bio->bi_bdev = pt->data_dev->bdev;
1623 r = DM_MAPIO_REMAPPED;
1624 spin_unlock_irqrestore(&pool->lock, flags);
1626 return r;
/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
1640 static int pool_preresume(struct dm_target *ti)
1642 int r;
1643 struct pool_c *pt = ti->private;
1644 struct pool *pool = pt->pool;
1645 dm_block_t data_size, sb_data_size;
1646 unsigned long flags;

	/*
	 * Take control of the pool object.
	 */
1651 r = bind_control_target(pool, ti);
1652 if (r)
1653 return r;
1655 data_size = ti->len >> pool->block_shift;
1656 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
1657 if (r) {
1658 DMERR("failed to retrieve data device size");
1659 return r;
1662 if (data_size < sb_data_size) {
1663 DMERR("pool target too small, is %llu blocks (expected %llu)",
1664 data_size, sb_data_size);
1665 return -EINVAL;
1667 } else if (data_size > sb_data_size) {
1668 r = dm_pool_resize_data_dev(pool->pmd, data_size);
1669 if (r) {
1670 DMERR("failed to resize data device");
1671 return r;
1674 r = dm_pool_commit_metadata(pool->pmd);
1675 if (r) {
1676 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1677 __func__, r);
1678 return r;
1682 spin_lock_irqsave(&pool->lock, flags);
1683 pool->low_water_triggered = 0;
1684 __requeue_bios(pool);
1685 spin_unlock_irqrestore(&pool->lock, flags);
1687 wake_producer(pool);

	/*
	 * The pool object is only present if the pool is active.
	 */
1692 pool->pool_md = dm_table_get_md(ti->table);
1693 pool_table_insert(pool);
1695 return 0;
1698 static void pool_postsuspend(struct dm_target *ti)
1700 int r;
1701 struct pool_c *pt = ti->private;
1702 struct pool *pool = pt->pool;
1704 flush_workqueue(pool->producer_wq);
1705 flush_workqueue(pool->consumer_wq);
1707 r = dm_pool_commit_metadata(pool->pmd);
1708 if (r < 0) {
1709 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1710 __func__, r);
1711 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
1714 pool_table_remove(pool);
1715 pool->pool_md = NULL;
1718 static int check_arg_count(unsigned argc, unsigned args_required)
1720 if (argc != args_required) {
1721 DMWARN("Message received with %u arguments instead of %u.",
1722 argc, args_required);
1723 return -EINVAL;
1726 return 0;
1729 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
1731 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
1732 *dev_id <= MAX_DEV_ID)
1733 return 0;
1735 if (warning)
1736 DMWARN("Message received with invalid device id: %s", arg);
1738 return -EINVAL;
1741 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
1743 dm_thin_id dev_id;
1744 int r;
1746 r = check_arg_count(argc, 2);
1747 if (r)
1748 return r;
1750 r = read_dev_id(argv[1], &dev_id, 1);
1751 if (r)
1752 return r;
1754 r = dm_pool_create_thin(pool->pmd, dev_id);
1755 if (r) {
1756 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
1757 argv[1]);
1758 return r;
1761 return 0;
1764 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
1766 dm_thin_id dev_id;
1767 dm_thin_id origin_dev_id;
1768 int r;
1770 r = check_arg_count(argc, 3);
1771 if (r)
1772 return r;
1774 r = read_dev_id(argv[1], &dev_id, 1);
1775 if (r)
1776 return r;
1778 r = read_dev_id(argv[2], &origin_dev_id, 1);
1779 if (r)
1780 return r;
1782 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
1783 if (r) {
1784 DMWARN("Creation of new snapshot %s of device %s failed.",
1785 argv[1], argv[2]);
1786 return r;
1789 return 0;
1792 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
1794 dm_thin_id dev_id;
1795 int r;
1797 r = check_arg_count(argc, 2);
1798 if (r)
1799 return r;
1801 r = read_dev_id(argv[1], &dev_id, 1);
1802 if (r)
1803 return r;
1805 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
1806 if (r)
1807 DMWARN("Deletion of thin device %s failed.", argv[1]);
1809 return r;
1812 static int process_trim_mesg(unsigned argc, char **argv, struct pool *pool)
1814 dm_thin_id dev_id;
1815 sector_t new_size;
1816 int r;
1818 r = check_arg_count(argc, 3);
1819 if (r)
1820 return r;
1822 r = read_dev_id(argv[1], &dev_id, 1);
1823 if (r)
1824 return r;
1826 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_size)) {
1827 DMWARN("trim device %s: Invalid new size: %s sectors.",
1828 argv[1], argv[2]);
1829 return -EINVAL;
1832 r = dm_pool_trim_thin_device(pool->pmd, dev_id,
1833 dm_sector_div_up(new_size, pool->sectors_per_block));
1834 if (r)
1835 DMWARN("Attempt to trim thin device %s failed.", argv[1]);
1837 return r;
1840 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
1842 dm_thin_id old_id, new_id;
1843 int r;
1845 r = check_arg_count(argc, 3);
1846 if (r)
1847 return r;
1849 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
1850 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
1851 return -EINVAL;
1854 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
1855 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
1856 return -EINVAL;
1859 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
1860 if (r) {
1861 DMWARN("Failed to change transaction id from %s to %s.",
1862 argv[1], argv[2]);
1863 return r;
1866 return 0;
/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   trim		<dev_id> <new_size_in_sectors>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 */
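
/*
 * For example (illustrative only):
 *
 *   dmsetup message /dev/mapper/pool 0 create_thin 0
 *   dmsetup message /dev/mapper/pool 0 create_snap 1 0
 */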
1877 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
1879 int r = -EINVAL;
1880 struct pool_c *pt = ti->private;
1881 struct pool *pool = pt->pool;
1883 if (!strcasecmp(argv[0], "create_thin"))
1884 r = process_create_thin_mesg(argc, argv, pool);
1886 else if (!strcasecmp(argv[0], "create_snap"))
1887 r = process_create_snap_mesg(argc, argv, pool);
1889 else if (!strcasecmp(argv[0], "delete"))
1890 r = process_delete_mesg(argc, argv, pool);
1892 else if (!strcasecmp(argv[0], "trim"))
1893 r = process_trim_mesg(argc, argv, pool);
1895 else if (!strcasecmp(argv[0], "set_transaction_id"))
1896 r = process_set_transaction_id_mesg(argc, argv, pool);
1898 else
1899 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
1901 if (!r) {
1902 r = dm_pool_commit_metadata(pool->pmd);
1903 if (r)
1904 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
1905 argv[0], r);
1908 return r;
/*
 * Status line is:
 *    <transaction id> <free metadata space in sectors>
 *    <free data space in sectors> <held metadata root>
 */
1916 static int pool_status(struct dm_target *ti, status_type_t type,
1917 char *result, unsigned maxlen)
1919 int r;
1920 unsigned sz = 0;
1921 uint64_t transaction_id;
1922 dm_block_t nr_free_blocks_data;
1923 dm_block_t nr_free_blocks_metadata;
1924 dm_block_t held_root;
1925 char buf[BDEVNAME_SIZE];
1926 char buf2[BDEVNAME_SIZE];
1927 struct pool_c *pt = ti->private;
1928 struct pool *pool = pt->pool;
1930 switch (type) {
1931 case STATUSTYPE_INFO:
1932 r = dm_pool_get_metadata_transaction_id(pool->pmd,
1933 &transaction_id);
1934 if (r)
1935 return r;
1937 r = dm_pool_get_free_metadata_block_count(pool->pmd,
1938 &nr_free_blocks_metadata);
1939 if (r)
1940 return r;
1942 r = dm_pool_get_free_block_count(pool->pmd,
1943 &nr_free_blocks_data);
1944 if (r)
1945 return r;
1947 r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
1948 if (r)
1949 return r;
1951 DMEMIT("%llu %llu %llu ", (unsigned long long)transaction_id,
1952 (unsigned long long)nr_free_blocks_metadata * pool->sectors_per_block,
1953 (unsigned long long)nr_free_blocks_data * pool->sectors_per_block);
1955 if (held_root)
1956 DMEMIT("%llu", held_root);
1957 else
1958 DMEMIT("-");
1960 break;
1962 case STATUSTYPE_TABLE:
1963 DMEMIT("%s %s %lu %llu ",
1964 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
1965 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
1966 (unsigned long)pool->sectors_per_block,
1967 (unsigned long long)pt->low_water_mark);
1969 DMEMIT("%u ", !pool->zero_new_blocks);
1971 if (!pool->zero_new_blocks)
1972 DMEMIT("skip_block_zeroing ");
1973 break;
1976 return 0;
1979 static int pool_iterate_devices(struct dm_target *ti,
1980 iterate_devices_callout_fn fn, void *data)
1982 struct pool_c *pt = ti->private;
1984 return fn(ti, pt->data_dev, 0, ti->len, data);
1987 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
1988 struct bio_vec *biovec, int max_size)
1990 struct pool_c *pt = ti->private;
1991 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1993 if (!q->merge_bvec_fn)
1994 return max_size;
1996 bvm->bi_bdev = pt->data_dev->bdev;
1998 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2001 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2003 struct pool_c *pt = ti->private;
2004 struct pool *pool = pt->pool;
2006 blk_limits_io_min(limits, 0);
2007 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2010 static struct target_type pool_target = {
2011 .name = "thin-pool",
2012 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE,
2013 .version = {1, 0, 0},
2014 .module = THIS_MODULE,
2015 .ctr = pool_ctr,
2016 .dtr = pool_dtr,
2017 .map = pool_map,
2018 .postsuspend = pool_postsuspend,
2019 .preresume = pool_preresume,
2020 .message = pool_message,
2021 .status = pool_status,
2022 .merge = pool_merge,
2023 .iterate_devices = pool_iterate_devices,
2024 .io_hints = pool_io_hints,
2027 /*----------------------------------------------------------------*/
2029 static void thin_dtr(struct dm_target *ti)
2031 struct thin_c *tc = ti->private;
2033 pool_dec(tc->pool);
2034 dm_pool_close_thin_device(tc->td);
2035 dm_put_device(ti, tc->pool_dev);
2036 kfree(tc);
/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id>
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 */
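
/*
 * For example (illustrative only), activating thin device 0 as a 1GiB
 * volume:
 *
 *   dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
 */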
2047 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2049 int r;
2050 struct thin_c *tc;
2051 struct dm_dev *pool_dev;
2052 struct mapped_device *pool_md;
2054 if (argc != 2) {
2055 ti->error = "Invalid argument count";
2056 return -EINVAL;
2059 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2060 if (!tc) {
2061 ti->error = "Out of memory";
2062 return -ENOMEM;
2065 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2066 if (r) {
2067 ti->error = "Error opening pool device";
2068 goto bad_pool_dev;
2070 tc->pool_dev = pool_dev;
2072 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2073 ti->error = "Invalid device id";
2074 r = -EINVAL;
2075 goto bad_common;
2078 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2079 if (!pool_md) {
2080 ti->error = "Couldn't get pool mapped device";
2081 r = -EINVAL;
2082 goto bad_common;
2085 tc->pool = pool_table_lookup(pool_md);
2086 if (!tc->pool) {
2087 ti->error = "Couldn't find pool object";
2088 r = -EINVAL;
2089 goto bad_pool_lookup;
2091 pool_inc(tc->pool);
2092 dm_put(pool_md);
2094 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2095 if (r) {
2096 ti->error = "Couldn't open thin internal device";
2097 goto bad_thin_open;
2100 ti->split_io = tc->pool->sectors_per_block;
2101 ti->num_flush_requests = 1;
2102 ti->num_discard_requests = 1;
	/*
	 * Allow discards to be issued to the thin device even
	 * if the pool's data device doesn't support them.
	 */
	ti->discards_supported = 1;
2109 return 0;
2111 bad_thin_open:
2112 pool_dec(tc->pool);
2113 bad_pool_lookup:
2114 dm_put(pool_md);
2115 bad_common:
2116 dm_put_device(ti, tc->pool_dev);
2117 bad_pool_dev:
2118 kfree(tc);
2120 return r;
2123 static int thin_map(struct dm_target *ti, struct bio *bio,
2124 union map_info *map_context)
2126 bio->bi_sector -= ti->begin;
2128 return bio_map(ti, bio, map_context);
/*
 * <nr mapped sectors> <highest mapped sector>
 */
2134 static int thin_status(struct dm_target *ti, status_type_t type,
2135 char *result, unsigned maxlen)
2137 int r;
2138 ssize_t sz = 0;
2139 dm_block_t mapped, highest;
2140 char buf[BDEVNAME_SIZE];
2141 struct thin_c *tc = ti->private;
2143 if (!tc->td)
2144 DMEMIT("-");
2145 else {
2146 switch (type) {
2147 case STATUSTYPE_INFO:
2148 r = dm_thin_get_mapped_count(tc->td, &mapped);
2149 if (r)
2150 return r;
2152 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2153 if (r < 0)
2154 return r;
2156 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2157 if (r)
2158 DMEMIT("%llu", ((highest + 1) *
2159 tc->pool->sectors_per_block) - 1);
2160 else
2161 DMEMIT("-");
2162 break;
2164 case STATUSTYPE_TABLE:
2165 DMEMIT("%s %lu",
2166 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2167 (unsigned long) tc->dev_id);
2168 break;
2172 return 0;
2175 static int thin_iterate_devices(struct dm_target *ti,
2176 iterate_devices_callout_fn fn, void *data)
2178 struct thin_c *tc = ti->private;
2180 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block, data);
2183 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2185 struct thin_c *tc = ti->private;
2187 blk_limits_io_min(limits, 0);
2188 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);

	/*
	 * Only allow discard requests aligned to our block size, and make
	 * sure that we never get sent larger discard requests either.
	 */
	limits->max_discard_sectors = tc->pool->sectors_per_block;
	limits->discard_granularity = tc->pool->sectors_per_block << SECTOR_SHIFT;
2198 static struct target_type thin_target = {
2199 .name = "thin",
2200 .version = {1, 0, 0},
2201 .module = THIS_MODULE,
2202 .ctr = thin_ctr,
2203 .dtr = thin_dtr,
2204 .map = thin_map,
2205 .status = thin_status,
2206 .iterate_devices = thin_iterate_devices,
2207 .io_hints = thin_io_hints,
2210 /*----------------------------------------------------------------*/
2212 static int __init dm_thin_init(void)
2214 int r;
2216 pool_table_init();
2218 r = dm_register_target(&thin_target);
2219 if (r)
2220 return r;
2222 r = dm_register_target(&pool_target);
2223 if (r)
2224 dm_unregister_target(&thin_target);
2226 return r;
2229 static void dm_thin_exit(void)
2231 dm_unregister_target(&thin_target);
2232 dm_unregister_target(&pool_target);
2235 module_init(dm_thin_init);
2236 module_exit(dm_thin_exit);
MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2239 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2240 MODULE_LICENSE("GPL");
2242 /*----------------------------------------------------------------*/