drivers/md/dm-thin.c

   1 /*
   2  * Copyright (C) 2011 Red Hat UK.  All rights reserved.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include "dm-thin-metadata.h"
   8
   9 #include <linux/device-mapper.h>
  10 #include <linux/dm-io.h>
  11 #include <linux/dm-kcopyd.h>
  12 #include <linux/list.h>
  13 #include <linux/init.h>
  14 #include <linux/module.h>
  15 #include <linux/slab.h>
  16
  17 #define DM_MSG_PREFIX   "thin"
  18
  19 /*
  20  * Tunable constants
  21  */
  22 #define ENDIO_HOOK_POOL_SIZE 10240
  23 #define DEFERRED_SET_SIZE 64
  24 #define MAPPING_POOL_SIZE 1024
  25 #define PRISON_CELLS 1024
  26
  27 /*
  28  * The block size of the device holding pool data must be
  29  * between 64KB and 1GB.
  30  */
  31 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
  32 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
  33
  34 #define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * 8)
  35
  36 /*
  37  * Device id is restricted to 24 bits.
  38  */
  39 #define MAX_DEV_ID ((1 << 24) - 1)
  40
  41 /*
  42  * How do we handle breaking sharing of data blocks?
  43  * =================================================
  44  *
  45  * We use a standard copy-on-write btree to store the mappings for the
  46  * devices (note I'm talking about copy-on-write of the metadata here, not
  47  * the data).  When you take an internal snapshot you clone the root node
  48  * of the origin btree.  After this there is no concept of an origin or a
  49  * snapshot.  They are just two device trees that happen to point to the
  50  * same data blocks.
  51  *
  52  * When we get a write in we decide if it's to a shared data block using
  53  * some timestamp magic.  If it is, we have to break sharing.
  54  *
  55  * Let's say we write to a shared block in what was the origin.  The
  56  * steps are:
  57  *
  58  * i) plug io further to this physical block. (see bio_prison code).
  59  *
  60  * ii) quiesce any read io to that shared data block.  Obviously
  61  * including all devices that share this block.  (see deferred_set code)
  62  *
  63  * iii) copy the data block to a newly allocated block.  This step can be
  64  * missed out if the io covers the block. (schedule_copy).
  65  *
  66  * iv) insert the new mapping into the origin's btree
  67  * (process_prepared_mappings).  This act of inserting breaks some
  68  * sharing of btree nodes between the two devices.  Breaking sharing only
  69  * affects the btree of that specific device.  Btrees for the other
  70  * devices that share the block never change.  The btree for the origin
  71  * device as it was after the last commit is untouched, ie. we're using
  72  * persistent data structures in the functional programming sense.
  73  *
  74  * v) unplug io to this physical block, including the io that triggered
  75  * the breaking of sharing.
  76  *
  77  * Steps (ii) and (iii) occur in parallel.
  78  *
  79  * The metadata _doesn't_ need to be committed before the io continues.  We
  80  * get away with this because the io is always written to a _new_ block.
  81  * If there's a crash, then:
  82  *
  83  * - The origin mapping will point to the old origin block (the shared
  84  * one).  This will contain the data as it was before the io that triggered
  85  * the breaking of sharing came in.
  86  *
  87  * - The snap mapping still points to the old block.  As it would after
  88  * the commit.
  89  *
  90  * The downside of this scheme is the timestamp magic isn't perfect, and
  91  * will continue to think that data block in the snapshot device is shared
  92  * even after the write to the origin has broken sharing.  I suspect data
  93  * blocks will typically be shared by many different devices, so we're
  94  * breaking sharing n + 1 times, rather than n, where n is the number of
  95  * devices that reference this data block.  At the moment I think the
  96  * benefits far, far outweigh the disadvantages.
  97  */
  98
  99 /*----------------------------------------------------------------*/
 100
 101 /*
 102  * Sometimes we can't deal with a bio straight away.  We put them in prison
 103  * where they can't cause any mischief.  Bios are put in a cell identified
 104  * by a key, multiple bios can be in the same cell.  When the cell is
 105  * subsequently unlocked the bios become available.
 106  */
 107 struct bio_prison;
 108
 109 struct cell_key {  /* Identifies one cell within a bio_prison. */
 110         int virtual;       /* 1 when filled by build_virtual_key(), 0 by build_data_key() */
 111         dm_thin_id dev;    /* id of the thin device, as returned by dm_thin_dev_id() */
 112         dm_block_t block;  /* block number; the only field hash_key() feeds into the hash */
 113 };
 114
 115 struct cell {      /* One prison cell: every bio currently detained under a single key. */
 116         struct hlist_node list;    /* link in one bucket of prison->cells (added in bio_detain()) */
 117         struct bio_prison *prison; /* owning prison; needed to find the lock and mempool on release */
 118         struct cell_key key;       /* copy of the key the cell was created for */
 119         unsigned count;            /* bumped once per bio_detain() call on this cell */
 120         struct bio_list bios;      /* the detained bios, in arrival order (bio_list_add() appends) */
 121 };
 122
 123 struct bio_prison {        /* Hash table of cells, allocated together with its bucket array. */
 124         spinlock_t lock;           /* held around every bucket search, insert and cell release */
 125         mempool_t *cell_pool;      /* mempool that struct cell objects are taken from and returned to */
 126
 127         unsigned nr_buckets;       /* number of entries in cells[], chosen by calc_nr_buckets() */
 128         unsigned hash_mask;        /* nr_buckets - 1; hash_key() masks with this */
 129         struct hlist_head *cells;  /* bucket array; prison_create() points it just past this struct */
 130 };
 131
 132 static uint32_t calc_nr_buckets(unsigned nr_cells)
 133 {
 134         uint32_t n = 128;
 135
 136         nr_cells /= 4;
 137         nr_cells = min(nr_cells, 8192u);
 138
 139         while (n < nr_cells)
 140                 n <<= 1;
 141
 142         return n;
 143 }
 144
 145 /*
 146  * @nr_cells should be the number of cells you want in use _concurrently_.
 147  * Don't confuse it with the number of distinct keys.
 148  */
 149 static struct bio_prison *prison_create(unsigned nr_cells)
 150 {
 151         unsigned i;
 152         uint32_t nr_buckets = calc_nr_buckets(nr_cells);
 153         size_t len = sizeof(struct bio_prison) +
 154                 (sizeof(struct hlist_head) * nr_buckets);
 155         struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
 156
 157         if (!prison)
 158                 return NULL;
 159
 160         spin_lock_init(&prison->lock);
 161         prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
 162                                                         sizeof(struct cell));
 163         if (!prison->cell_pool) {
 164                 kfree(prison);
 165                 return NULL;
 166         }
 167
 168         prison->nr_buckets = nr_buckets;
 169         prison->hash_mask = nr_buckets - 1;
 170         prison->cells = (struct hlist_head *) (prison + 1);
 171         for (i = 0; i < nr_buckets; i++)
 172                 INIT_HLIST_HEAD(prison->cells + i);
 173
 174         return prison;
 175 }
 176
 177 static void prison_destroy(struct bio_prison *prison)
 178 {
 179         mempool_destroy(prison->cell_pool);
 180         kfree(prison);
 181 }
 182
 183 static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
 184 {
 185         const unsigned long BIG_PRIME = 4294967291UL;
 186         uint64_t hash = key->block * BIG_PRIME;
 187
 188         return (uint32_t) (hash & prison->hash_mask);
 189 }
 190
 191 static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
 192 {
 193                return (lhs->virtual == rhs->virtual) &&
 194                        (lhs->dev == rhs->dev) &&
 195                        (lhs->block == rhs->block);
 196 }
 197
 198 static struct cell *__search_bucket(struct hlist_head *bucket,
 199                                     struct cell_key *key)
 200 {
 201         struct cell *cell;
 202         struct hlist_node *tmp;
 203
 204         hlist_for_each_entry(cell, tmp, bucket, list)
 205                 if (keys_equal(&cell->key, key))
 206                         return cell;
 207
 208         return NULL;
 209 }
 210
 211 /*
 212  * This may block if a new cell needs allocating.  You must ensure that
 213  * cells will be unlocked even if the calling thread is blocked.
 214  *
 215  * Returns the number of entries in the cell prior to the new addition
 216  * or < 0 on failure.
 217  */
 218 static int bio_detain(struct bio_prison *prison, struct cell_key *key,
 219                       struct bio *inmate, struct cell **ref)
 220 {
 221         int r;
 222         unsigned long flags;
 223         uint32_t hash = hash_key(prison, key);
 224         struct cell *uninitialized_var(cell), *cell2 = NULL;
 225
 226         BUG_ON(hash > prison->nr_buckets);
 227
 228         spin_lock_irqsave(&prison->lock, flags);
 229         cell = __search_bucket(prison->cells + hash, key);
 230
 231         if (!cell) {
 232                 /*
 233                  * Allocate a new cell
 234                  */
 235                 spin_unlock_irqrestore(&prison->lock, flags);
 236                 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
 237                 spin_lock_irqsave(&prison->lock, flags);
 238
 239                 /*
 240                  * We've been unlocked, so we have to double check that
 241                  * nobody else has inserted this cell in the meantime.
 242                  */
 243                 cell = __search_bucket(prison->cells + hash, key);
 244
 245                 if (!cell) {
 246                         cell = cell2;
 247                         cell2 = NULL;
 248
 249                         cell->prison = prison;
 250                         memcpy(&cell->key, key, sizeof(cell->key));
 251                         cell->count = 0;
 252                         bio_list_init(&cell->bios);
 253                         hlist_add_head(&cell->list, prison->cells + hash);
 254                 }
 255         }
 256
 257         r = cell->count++;
 258         bio_list_add(&cell->bios, inmate);
 259         spin_unlock_irqrestore(&prison->lock, flags);
 260
 261         if (cell2)
 262                 mempool_free(cell2, prison->cell_pool);
 263
 264         *ref = cell;
 265
 266         return r;
 267 }
 268
 269 /*
 270  * @inmates must have been initialised prior to this call
 271  */
 272 static void __cell_release(struct cell *cell, struct bio_list *inmates)
 273 {
 274         struct bio_prison *prison = cell->prison;
 275
 276         hlist_del(&cell->list);
 277
 278         if (inmates)
 279                 bio_list_merge(inmates, &cell->bios);
 280
 281         mempool_free(cell, prison->cell_pool);
 282 }
 283
 284 static void cell_release(struct cell *cell, struct bio_list *bios)
 285 {
 286         unsigned long flags;
 287         struct bio_prison *prison = cell->prison;
 288
 289         spin_lock_irqsave(&prison->lock, flags);
 290         __cell_release(cell, bios);
 291         spin_unlock_irqrestore(&prison->lock, flags);
 292 }
 293
 294 /*
 295  * There are a couple of places where we put a bio into a cell briefly
 296  * before taking it out again.  In these situations we know that no other
 297  * bio may be in the cell.  This function releases the cell, and also does
 298  * a sanity check.
 299  */
 300 static void cell_release_singleton(struct cell *cell, struct bio *bio)
 301 {
 302         struct bio_prison *prison = cell->prison;
 303         struct bio_list bios;
 304         struct bio *b;
 305         unsigned long flags;
 306
 307         bio_list_init(&bios);
 308
 309         spin_lock_irqsave(&prison->lock, flags);
 310         __cell_release(cell, &bios);
 311         spin_unlock_irqrestore(&prison->lock, flags);
 312
 313         b = bio_list_pop(&bios);
 314         BUG_ON(b != bio);
 315         BUG_ON(!bio_list_empty(&bios));
 316 }
 317
 318 static void cell_error(struct cell *cell)
 319 {
 320         struct bio_prison *prison = cell->prison;
 321         struct bio_list bios;
 322         struct bio *bio;
 323         unsigned long flags;
 324
 325         bio_list_init(&bios);
 326
 327         spin_lock_irqsave(&prison->lock, flags);
 328         __cell_release(cell, &bios);
 329         spin_unlock_irqrestore(&prison->lock, flags);
 330
 331         while ((bio = bio_list_pop(&bios)))
 332                 bio_io_error(bio);
 333 }
 334
 335 /*----------------------------------------------------------------*/
 336
 337 /*
 338  * We use the deferred set to keep track of pending reads to shared blocks.
 339  * We do this to ensure the new mapping caused by a write isn't performed
 340  * until these prior reads have completed.  Otherwise the insertion of the
 341  * new mapping could free the old block that the read bios are mapped to.
 342  */
 343
 344 struct deferred_set;
 345 struct deferred_entry {    /* One slot of a deferred_set. */
 346         struct deferred_set *ds;       /* owning set, filled in by ds_init() */
 347         unsigned count;                /* raised by ds_inc(), lowered by ds_dec() */
 348         struct list_head work_items;   /* work queued by ds_add_work(); handed back by __sweep() once count is 0 */
 349 };
 350
 351 struct deferred_set {      /* Fixed ring of DEFERRED_SET_SIZE entries. */
 352         spinlock_t lock;               /* taken by ds_inc(), ds_dec() and ds_add_work() */
 353         unsigned current_entry;        /* index of the entry ds_inc() counts against and ds_add_work() queues on */
 354         unsigned sweeper;              /* index __sweep() resumes from; advances towards current_entry */
 355         struct deferred_entry entries[DEFERRED_SET_SIZE];
 356 };
 357
 358 static void ds_init(struct deferred_set *ds)
 359 {
 360         int i;
 361
 362         spin_lock_init(&ds->lock);
 363         ds->current_entry = 0;
 364         ds->sweeper = 0;
 365         for (i = 0; i < DEFERRED_SET_SIZE; i++) {
 366                 ds->entries[i].ds = ds;
 367                 ds->entries[i].count = 0;
 368                 INIT_LIST_HEAD(&ds->entries[i].work_items);
 369         }
 370 }
 371
 372 static struct deferred_entry *ds_inc(struct deferred_set *ds)
 373 {
 374         unsigned long flags;
 375         struct deferred_entry *entry;
 376
 377         spin_lock_irqsave(&ds->lock, flags);
 378         entry = ds->entries + ds->current_entry;
 379         entry->count++;
 380         spin_unlock_irqrestore(&ds->lock, flags);
 381
 382         return entry;
 383 }
 384
 385 static unsigned ds_next(unsigned index)
 386 {
 387         return (index + 1) % DEFERRED_SET_SIZE;
 388 }
 389
 390 static void __sweep(struct deferred_set *ds, struct list_head *head)
 391 {
 392         while ((ds->sweeper != ds->current_entry) &&
 393                !ds->entries[ds->sweeper].count) {
 394                 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
 395                 ds->sweeper = ds_next(ds->sweeper);
 396         }
 397
 398         if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
 399                 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
 400 }
 401
 402 static void ds_dec(struct deferred_entry *entry, struct list_head *head)
 403 {
 404         unsigned long flags;
 405
 406         spin_lock_irqsave(&entry->ds->lock, flags);
 407         BUG_ON(!entry->count);
 408         --entry->count;
 409         __sweep(entry->ds, head);
 410         spin_unlock_irqrestore(&entry->ds->lock, flags);
 411 }
 412
 413 /*
 414  * Returns 1 if deferred or 0 if no pending items to delay job.
 415  */
 416 static int ds_add_work(struct deferred_set *ds, struct list_head *work)
 417 {
 418         int r = 1;
 419         unsigned long flags;
 420         unsigned next_entry;
 421
 422         spin_lock_irqsave(&ds->lock, flags);
 423         if ((ds->sweeper == ds->current_entry) &&
 424             !ds->entries[ds->current_entry].count)
 425                 r = 0;
 426         else {
 427                 list_add(work, &ds->entries[ds->current_entry].work_items);
 428                 next_entry = ds_next(ds->current_entry);
 429                 if (!ds->entries[next_entry].count)
 430                         ds->current_entry = next_entry;
 431         }
 432         spin_unlock_irqrestore(&ds->lock, flags);
 433
 434         return r;
 435 }
 436
 437 /*----------------------------------------------------------------*/
 438
 439 /*
 440  * Key building.
 441  */
 442 static void build_data_key(struct dm_thin_device *td,
 443                            dm_block_t b, struct cell_key *key)
 444 {
 445         key->virtual = 0;
 446         key->dev = dm_thin_dev_id(td);
 447         key->block = b;
 448 }
 449
 450 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 451                               struct cell_key *key)
 452 {
 453         key->virtual = 1;
 454         key->dev = dm_thin_dev_id(td);
 455         key->block = b;
 456 }
 457
 458 /*----------------------------------------------------------------*/
 459
 460 struct new_mapping;
 461
 462 /*
 463  * A pool device ties together a metadata device and a data device.  It
 464  * also provides the interface for creating and destroying internal
 465  * devices.
 466  */
 467 struct pool {
 468         struct list_head list;  /* link in dm_thin_pool_table.pools */
 469         struct dm_target *ti;   /* Only set if a pool target is bound */
 470
 471         struct mapped_device *pool_md;  /* key compared by pool_table_lookup() */
 472         struct dm_pool_metadata *pmd;   /* handle passed to the dm_pool_*() metadata calls */
 473
 474         uint32_t sectors_per_block;     /* multiplied by a block number to get a start sector */
 475         unsigned block_shift;           /* shift between sector and block numbers (get_bio_block(), remap()) */
 476         dm_block_t offset_mask;         /* masks a sector down to its offset inside a block */
 477         dm_block_t low_water_mark;      /* alloc_data_block() raises an event at or below this many free blocks */
 478         unsigned zero_new_blocks:1;     /* when clear, schedule_zero() skips zeroing altogether */
 479
 480         struct bio_prison *prison;
 481         struct dm_kcopyd_client *copier;        /* used for dm_kcopyd_copy() and dm_kcopyd_zero() */
 482
 483         struct workqueue_struct *wq;    /* wake_worker() queues ->worker here */
 484         struct work_struct worker;
 485
 486         spinlock_t lock;        /* taken around the three lists below and when setting low_water_triggered */
 487         struct bio_list deferred_bios;          /* bios released from cells by cell_defer*() */
 488         struct list_head prepared_mappings;     /* filled by __maybe_add_mapping(), drained by process_prepared_mappings() */
 489
 490         int low_water_triggered;        /* A dm event has been sent */
 491         struct bio_list retry_on_resume_list;   /* bios parked by retry_when_resumed() */
 492
 493         struct deferred_set ds; /* FIXME: move to thin_c */
 494
 495         struct new_mapping *next_mapping;       /* filled by ensure_next_mapping(), taken by get_next_mapping() */
 496
 497         mempool_t *mapping_pool;        /* backs struct new_mapping allocations */
 498         mempool_t *endio_hook_pool;     /* struct endio_hook objects are freed back here */
 499
 500         atomic_t ref_count;
 501 };
 502
 503 /*
 504  * Target context for a pool.
 505  */
 506 struct pool_c {    /* NOTE(review): no field here is used in this part of the file; notes below are read off names and types only */
 507         struct dm_target *ti;
 508         struct pool *pool;             /* presumably the shared pool object this target is bound to - confirm */
 509         struct dm_dev *data_dev;
 510         struct dm_dev *metadata_dev;
 511         struct dm_target_callbacks callbacks;
 512
 513         sector_t low_water_mark;       /* sector_t here, dm_block_t in struct pool - presumably converted when copied over; confirm */
 514         unsigned zero_new_blocks:1;    /* same name as the flag in struct pool - presumably its source; confirm */
 515 };
 516
 517 /*
 518  * Target context for a thin.
 519  */
 520 struct thin_c {
 521         struct dm_dev *pool_dev;       /* remap() redirects bios to pool_dev->bdev */
 522         dm_thin_id dev_id;
 523
 524         struct pool *pool;             /* pool whose locks, lists and mempools the thin IO path uses */
 525         struct dm_thin_device *td;     /* metadata handle passed to dm_thin_insert_block() */
 526 };
 527
 528 /*----------------------------------------------------------------*/
 529
 530 /*
 531  * A global list that uses a struct mapped_device as a key.
 532  */
 533 static struct dm_thin_pool_table {
 534         spinlock_t lock;               /* held while the pools list is modified or walked */
 535         struct list_head pools;        /* every registered struct pool, linked through pool->list */
 536 } dm_thin_pool_table;
 537
 538 static void pool_table_init(void)
 539 {
 540         spin_lock_init(&dm_thin_pool_table.lock);
 541
 542         INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 543 }
 544
 545 static void pool_table_insert(struct pool *pool)
 546 {
 547         spin_lock(&dm_thin_pool_table.lock);
 548         list_add(&pool->list, &dm_thin_pool_table.pools);
 549         spin_unlock(&dm_thin_pool_table.lock);
 550 }
 551
 552 static void pool_table_remove(struct pool *pool)
 553 {
 554         spin_lock(&dm_thin_pool_table.lock);
 555         list_del(&pool->list);
 556         spin_unlock(&dm_thin_pool_table.lock);
 557 }
 558
 559 static struct pool *pool_table_lookup(struct mapped_device *md)
 560 {
 561         struct pool *pool = NULL, *tmp;
 562
 563         spin_lock(&dm_thin_pool_table.lock);
 564         list_for_each_entry(tmp, &dm_thin_pool_table.pools, list)
 565                 if (tmp->pool_md == md) {
 566                         pool = tmp;
 567                         break;
 568                 }
 569         spin_unlock(&dm_thin_pool_table.lock);
 570
 571         return pool;
 572 }
 573
 574 /*----------------------------------------------------------------*/
 575
 576 /*
 577  * This section of code contains the logic for processing a thin device's IO.
 578  * Much of the code depends on pool object resources (lists, workqueues, etc)
 579  * but most is exclusively called from the thin target rather than the thin-pool
 580  * target.
 581  */
 582
 583 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 584 {
 585         return bio->bi_sector >> tc->pool->block_shift;
 586 }
 587
 588 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 589 {
 590         struct pool *pool = tc->pool;
 591
 592         bio->bi_bdev = tc->pool_dev->bdev;
 593         bio->bi_sector = (block << pool->block_shift) +
 594                 (bio->bi_sector & pool->offset_mask);
 595 }
 596
 597 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 598                             dm_block_t block)
 599 {
 600         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
 601                 int r = dm_pool_commit_metadata(tc->pool->pmd);
 602                 if (r) {
 603                         DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
 604                               __func__, r);
 605                         bio_io_error(bio);
 606                         return;
 607                 }
 608         }
 609
 610         remap(tc, bio, block);
 611         generic_make_request(bio);
 612 }
 613
 614 /*
 615  * wake_worker() is used by thin_defer_bio and pool_preresume to continue
 616  * deferred IO processing after pool resume.
 617  */
 618 static void wake_worker(struct pool *pool)
 619 {
 620         queue_work(pool->wq, &pool->worker);
 621 }
 622
 623 /*----------------------------------------------------------------*/
 624
 625 /*
 626  * Bio endio functions.
 627  */
 628
 629 struct endio_hook {        /* State that shared_read_endio() reads back through dm_get_mapinfo(bio)->ptr. */
 630         struct thin_c *tc;                     /* used to reach the pool (lock, endio_hook_pool) */
 631         bio_end_io_t *saved_bi_end_io;         /* put back into bio->bi_end_io before the bio is completed */
 632         struct deferred_entry *entry;          /* deferred-set entry handed to ds_dec() on completion */
 633 };
 634
 635 struct new_mapping {       /* A virt_block -> data_block mapping being prepared for insertion. */
 636         struct list_head list;         /* on a deferred-set work list or pool->prepared_mappings; empty when on neither */
 637
 638         int prepared;                  /* set to 1 by copy_complete() / overwrite_endio() */
 639
 640         struct thin_c *tc;
 641         dm_block_t virt_block;         /* passed to dm_thin_insert_block() together with data_block */
 642         dm_block_t data_block;
 643         struct cell *cell;             /* cell holding the bios that wait for this mapping */
 644         int err;                       /* non-zero makes process_prepared_mapping() fail the whole cell */
 645
 646         /*
 647          * If the bio covers the whole area of a block then we can avoid
 648          * zeroing or copying.  Instead this bio is hooked.  The bio will
 649          * still be in the cell, so care has to be taken to avoid issuing
 650          * the bio twice.
 651          */
 652         struct bio *bio;               /* NULL unless the overwrite shortcut described above is in use */
 653         bio_end_io_t *saved_bi_end_io; /* the hooked bio's original end_io, restored in process_prepared_mapping() */
 654 };
 655
 656 static void __maybe_add_mapping(struct new_mapping *m)
 657 {
 658         struct pool *pool = m->tc->pool;
 659
 660         if (list_empty(&m->list) && m->prepared) {
 661                 list_add(&m->list, &pool->prepared_mappings);
 662                 wake_worker(pool);
 663         }
 664 }
 665
 666 static void copy_complete(int read_err, unsigned long write_err, void *context)
 667 {
 668         unsigned long flags;
 669         struct new_mapping *m = context;
 670         struct pool *pool = m->tc->pool;
 671
 672         m->err = read_err || write_err ? -EIO : 0;
 673
 674         spin_lock_irqsave(&pool->lock, flags);
 675         m->prepared = 1;
 676         __maybe_add_mapping(m);
 677         spin_unlock_irqrestore(&pool->lock, flags);
 678 }
 679
 680 static void overwrite_endio(struct bio *bio, int err)
 681 {
 682         unsigned long flags;
 683         struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
 684         struct pool *pool = m->tc->pool;
 685
 686         m->err = err;
 687
 688         spin_lock_irqsave(&pool->lock, flags);
 689         m->prepared = 1;
 690         __maybe_add_mapping(m);
 691         spin_unlock_irqrestore(&pool->lock, flags);
 692 }
 693
 694 static void shared_read_endio(struct bio *bio, int err)
 695 {
 696         struct list_head mappings;
 697         struct new_mapping *m, *tmp;
 698         struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
 699         unsigned long flags;
 700         struct pool *pool = h->tc->pool;
 701
 702         bio->bi_end_io = h->saved_bi_end_io;
 703         bio_endio(bio, err);
 704
 705         INIT_LIST_HEAD(&mappings);
 706         ds_dec(h->entry, &mappings);
 707
 708         spin_lock_irqsave(&pool->lock, flags);
 709         list_for_each_entry_safe(m, tmp, &mappings, list) {
 710                 list_del(&m->list);
 711                 INIT_LIST_HEAD(&m->list);
 712                 __maybe_add_mapping(m);
 713         }
 714         spin_unlock_irqrestore(&pool->lock, flags);
 715
 716         mempool_free(h, pool->endio_hook_pool);
 717 }
 718
 719 /*----------------------------------------------------------------*/
 720
 721 /*
 722  * Workqueue.
 723  */
 724
 725 /*
 726  * Prepared mapping jobs.
 727  */
 728
 729 /*
 730  * This sends the bios in the cell back to the deferred_bios list.
 731  */
 732 static void cell_defer(struct thin_c *tc, struct cell *cell,
 733                        dm_block_t data_block)
 734 {
 735         struct pool *pool = tc->pool;
 736         unsigned long flags;
 737
 738         spin_lock_irqsave(&pool->lock, flags);
 739         cell_release(cell, &pool->deferred_bios);
 740         spin_unlock_irqrestore(&pool->lock, flags);
 741
 742         wake_worker(pool);
 743 }
 744
 745 /*
 746  * As above, but ignoring @exception (a write bio that covers
 747  * the block) because it has already been processed.
 748  */
 749 static void cell_defer_except(struct thin_c *tc, struct cell *cell,
 750                               struct bio *exception)
 751 {
 752         struct bio_list bios;
 753         struct bio *bio;
 754         struct pool *pool = tc->pool;
 755         unsigned long flags;
 756
 757         bio_list_init(&bios);
 758         cell_release(cell, &bios);
 759
 760         spin_lock_irqsave(&pool->lock, flags);
 761         while ((bio = bio_list_pop(&bios)))
 762                 if (bio != exception)
 763                         bio_list_add(&pool->deferred_bios, bio);
 764         spin_unlock_irqrestore(&pool->lock, flags);
 765
 766         wake_worker(pool);
 767 }
 768
 769 static void process_prepared_mapping(struct new_mapping *m)
 770 {
 771         struct thin_c *tc = m->tc;
 772         struct bio *bio;
 773         int r;
 774
 775         bio = m->bio;
 776         if (bio)
 777                 bio->bi_end_io = m->saved_bi_end_io;
 778
 779         if (m->err) {
 780                 cell_error(m->cell);
 781                 return;
 782         }
 783
 784         /*
 785          * Commit the prepared block into the mapping btree.
 786          * Any I/O for this block arriving after this point will get
 787          * remapped to it directly.
 788          */
 789         r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 790         if (r) {
 791                 DMERR("dm_thin_insert_block() failed");
 792                 cell_error(m->cell);
 793                 return;
 794         }
 795
 796         /*
 797          * Release any bios held while the block was being provisioned.
 798          * If we are processing a write bio that completely covers the block,
 799          * we already processed it so can ignore it now when processing
 800          * the bios in the cell.
 801          */
 802         if (bio) {
 803                 cell_defer_except(tc, m->cell, bio);
 804                 bio_endio(bio, 0);
 805         } else
 806                 cell_defer(tc, m->cell, m->data_block);
 807
 808         mempool_free(m, tc->pool->mapping_pool);
 809 }
 810
 811 static void process_prepared_mappings(struct pool *pool)
 812 {
 813         unsigned long flags;
 814         struct list_head maps;
 815         struct new_mapping *m;
 816
 817         INIT_LIST_HEAD(&maps);
 818         spin_lock_irqsave(&pool->lock, flags);
 819         list_splice_init(&pool->prepared_mappings, &maps);
 820         spin_unlock_irqrestore(&pool->lock, flags);
 821
 822         list_for_each_entry(m, &maps, list)
 823                 process_prepared_mapping(m);
 824 }
 825
 826 /*
 827  * Deferred bio jobs.
 828  */
 829 static int io_overwrites_block(struct pool *pool, struct bio *bio)
 830 {
 831         return ((bio_data_dir(bio) == WRITE) &&
 832                 (bio->bi_sector & pool->offset_mask) == 0) &&
 833                 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
 834 }
 835
 836 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
 837                                bio_end_io_t *fn)
 838 {
 839         *save = bio->bi_end_io;
 840         bio->bi_end_io = fn;
 841 }
 842
 843 static int ensure_next_mapping(struct pool *pool)
 844 {
 845         if (pool->next_mapping)
 846                 return 0;
 847
 848         pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
 849
 850         return pool->next_mapping ? 0 : -ENOMEM;
 851 }
 852
 853 static struct new_mapping *get_next_mapping(struct pool *pool)
 854 {
 855         struct new_mapping *r = pool->next_mapping;
 856
 857         BUG_ON(!pool->next_mapping);
 858
 859         pool->next_mapping = NULL;
 860
 861         return r;
 862 }
 863
 864 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 865                           dm_block_t data_origin, dm_block_t data_dest,
 866                           struct cell *cell, struct bio *bio)
 867 {
 868         int r;
 869         struct pool *pool = tc->pool;
 870         struct new_mapping *m = get_next_mapping(pool);
 871
 872         INIT_LIST_HEAD(&m->list);
 873         m->prepared = 0;
 874         m->tc = tc;
 875         m->virt_block = virt_block;
 876         m->data_block = data_dest;
 877         m->cell = cell;
 878         m->err = 0;
 879         m->bio = NULL;
 880
 881         ds_add_work(&pool->ds, &m->list);
 882
 883         /*
 884          * IO to pool_dev remaps to the pool target's data_dev.
 885          *
 886          * If the whole block of data is being overwritten, we can issue the
 887          * bio immediately. Otherwise we use kcopyd to clone the data first.
 888          */
 889         if (io_overwrites_block(pool, bio)) {
 890                 m->bio = bio;
 891                 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 892                 dm_get_mapinfo(bio)->ptr = m;
 893                 remap_and_issue(tc, bio, data_dest);
 894         } else {
 895                 struct dm_io_region from, to;
 896
 897                 from.bdev = tc->pool_dev->bdev;
 898                 from.sector = data_origin * pool->sectors_per_block;
 899                 from.count = pool->sectors_per_block;
 900
 901                 to.bdev = tc->pool_dev->bdev;
 902                 to.sector = data_dest * pool->sectors_per_block;
 903                 to.count = pool->sectors_per_block;
 904
 905                 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
 906                                    0, copy_complete, m);
 907                 if (r < 0) {
 908                         mempool_free(m, pool->mapping_pool);
 909                         DMERR("dm_kcopyd_copy() failed");
 910                         cell_error(cell);
 911                 }
 912         }
 913 }
 914
 915 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 916                           dm_block_t data_block, struct cell *cell,
 917                           struct bio *bio)
 918 {
 919         struct pool *pool = tc->pool;
 920         struct new_mapping *m = get_next_mapping(pool);
 921
 922         INIT_LIST_HEAD(&m->list);
 923         m->prepared = 0;
 924         m->tc = tc;
 925         m->virt_block = virt_block;
 926         m->data_block = data_block;
 927         m->cell = cell;
 928         m->err = 0;
 929         m->bio = NULL;
 930
 931         /*
 932          * If the whole block of data is being overwritten or we are not
 933          * zeroing pre-existing data, we can issue the bio immediately.
 934          * Otherwise we use kcopyd to zero the data first.
 935          */
 936         if (!pool->zero_new_blocks)
 937                 process_prepared_mapping(m);
 938         else if (io_overwrites_block(pool, bio)) {
 939                 m->bio = bio;
 940                 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 941                 dm_get_mapinfo(bio)->ptr = m;
 942                 remap_and_issue(tc, bio, data_block);
 943         } else {
 944                 int r;
 945                 struct dm_io_region to;
 946
 947                 to.bdev = tc->pool_dev->bdev;
 948                 to.sector = data_block * pool->sectors_per_block;
 949                 to.count = pool->sectors_per_block;
 950
 951                 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
 952                 if (r < 0) {
 953                         mempool_free(m, pool->mapping_pool);
 954                         DMERR("dm_kcopyd_zero() failed");
 955                         cell_error(cell);
 956                 }
 957         }
 958 }
 959
 960 /*
 961  * If we have run out of space, queue bios until the device is
 962  * resumed, presumably after having been reloaded with more space.
 963  */
 964 static void retry_when_resumed(struct bio *bio)
 965 {
 966         struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
 967         struct pool *pool = tc->pool;
 968         unsigned long flags;
 969
 970         spin_lock_irqsave(&pool->lock, flags);
 971         bio_list_add(&pool->retry_on_resume_list, bio);
 972         spin_unlock_irqrestore(&pool->lock, flags);
 973 }
 974
 975 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 976 {
 977         int r;
 978         dm_block_t free_blocks;
 979         unsigned long flags;
 980         struct pool *pool = tc->pool;
 981
 982         r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 983         if (r)
 984                 return r;
 985
 986         if (free_blocks <= pool->low_water_mark && !pool->low_water_triggered) {
 987                 spin_lock_irqsave(&pool->lock, flags);
 988                 pool->low_water_triggered = 1;
 989                 spin_unlock_irqrestore(&pool->lock, flags);
 990                 dm_table_event(pool->ti->table);
 991         }
 992
 993         r = dm_pool_alloc_data_block(pool->pmd, result);
 994         if (r)
 995                 return r;
 996
 997         return 0;
 998 }
 999
1000 static void no_space(struct cell *cell)
1001 {
1002         struct bio *bio;
1003         struct bio_list bios;
1004
1005         bio_list_init(&bios);
1006         cell_release(cell, &bios);
1007
1008         while ((bio = bio_list_pop(&bios)))
1009                 retry_when_resumed(bio);
1010 }
1011
1012 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1013                           struct cell_key *key,
1014                           struct dm_thin_lookup_result *lookup_result,
1015                           struct cell *cell)
1016 {
1017         int r;
1018         dm_block_t data_block;
1019
1020         r = alloc_data_block(tc, &data_block);
1021         switch (r) {
1022         case 0:
1023                 schedule_copy(tc, block, lookup_result->block,
1024                               data_block, cell, bio);
1025                 break;
1026
1027         case -ENOSPC:
1028                 no_space(cell);
1029                 break;
1030
1031         default:
1032                 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1033                 cell_error(cell);
1034                 break;
1035         }
1036 }
1037
1038 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1039                                dm_block_t block,
1040                                struct dm_thin_lookup_result *lookup_result)
1041 {
1042         struct cell *cell;
1043         struct cell_key key;
1044         struct pool *pool = tc->pool;
1045
1046         /*
1047          * If cell is already occupied, then sharing is already in the process
1048          * of being broken so we have nothing further to do here.
1049          */
1050         build_data_key(tc->td, lookup_result->block, &key);
1051         if (bio_detain(pool->prison, &key, bio, &cell))
1052                 return;
1053
1054         if (bio_data_dir(bio) == WRITE)
1055                 break_sharing(tc, bio, block, &key, lookup_result, cell);
1056         else {
1057                 struct endio_hook *h;
1058                 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1059
1060                 h->tc = tc;
1061                 h->entry = ds_inc(&pool->ds);
1062                 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1063                 dm_get_mapinfo(bio)->ptr = h;
1064
1065                 cell_release_singleton(cell, bio);
1066                 remap_and_issue(tc, bio, lookup_result->block);
1067         }
1068 }
1069
1070 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1071                             struct cell *cell)
1072 {
1073         int r;
1074         dm_block_t data_block;
1075
1076         /*
1077          * Remap empty bios (flushes) immediately, without provisioning.
1078          */
1079         if (!bio->bi_size) {
1080                 cell_release_singleton(cell, bio);
1081                 remap_and_issue(tc, bio, 0);
1082                 return;
1083         }
1084
1085         /*
1086          * Fill read bios with zeroes and complete them immediately.
1087          */
1088         if (bio_data_dir(bio) == READ) {
1089                 zero_fill_bio(bio);
1090                 cell_release_singleton(cell, bio);
1091                 bio_endio(bio, 0);
1092                 return;
1093         }
1094
1095         r = alloc_data_block(tc, &data_block);
1096         switch (r) {
1097         case 0:
1098                 schedule_zero(tc, block, data_block, cell, bio);
1099                 break;
1100
1101         case -ENOSPC:
1102                 no_space(cell);
1103                 break;
1104
1105         default:
1106                 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1107                 cell_error(cell);
1108                 break;
1109         }
1110 }
1111
1112 static void process_bio(struct thin_c *tc, struct bio *bio)
1113 {
1114         int r;
1115         dm_block_t block = get_bio_block(tc, bio);
1116         struct cell *cell;
1117         struct cell_key key;
1118         struct dm_thin_lookup_result lookup_result;
1119
1120         /*
1121          * If cell is already occupied, then the block is already
1122          * being provisioned so we have nothing further to do here.
1123          */
1124         build_virtual_key(tc->td, block, &key);
1125         if (bio_detain(tc->pool->prison, &key, bio, &cell))
1126                 return;
1127
1128         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1129         switch (r) {
1130         case 0:
1131                 /*
1132                  * We can release this cell now.  This thread is the only
1133                  * one that puts bios into a cell, and we know there were
1134                  * no preceding bios.
1135                  */
1136                 cell_release_singleton(cell, bio);
1137
1138                 if (lookup_result.shared)
1139                         process_shared_bio(tc, bio, block, &lookup_result);
1140                 else
1141                         remap_and_issue(tc, bio, lookup_result.block);
1142                 break;
1143
1144         case -ENODATA:
1145                 provision_block(tc, bio, block, cell);
1146                 break;
1147
1148         default:
1149                 DMERR("dm_thin_find_block() failed, error = %d", r);
1150                 bio_io_error(bio);
1151                 break;
1152         }
1153 }
1154
1155 static void process_deferred_bios(struct pool *pool)
1156 {
1157         unsigned long flags;
1158         struct bio *bio;
1159         struct bio_list bios;
1160
1161         bio_list_init(&bios);
1162
1163         spin_lock_irqsave(&pool->lock, flags);
1164         bio_list_merge(&bios, &pool->deferred_bios);
1165         bio_list_init(&pool->deferred_bios);
1166         spin_unlock_irqrestore(&pool->lock, flags);
1167
1168         while ((bio = bio_list_pop(&bios))) {
1169                 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1170
1171                 /*
1172                  * If we've got no free new_mapping structs, and processing this bio
1173                  * might require one, we pause until there are some prepared mappings to
1174                  * process.
1175                  */
1176                 if (ensure_next_mapping(pool)) {
1177                         spin_lock_irqsave(&pool->lock, flags);
1178                         bio_list_merge(&pool->deferred_bios, &bios);
1179                         spin_unlock_irqrestore(&pool->lock, flags);
1180
1181                         return;
1182                 }
1183
1184                 process_bio(tc, bio);
1185         }
1186 }
1187
1188 static void do_worker(struct work_struct *ws)
1189 {
1190         struct pool *pool = container_of(ws, struct pool, worker);
1191
1192         process_prepared_mappings(pool);
1193         process_deferred_bios(pool);
1194 }
1195
1196 /*----------------------------------------------------------------*/
1197
1198 /*
1199  * Mapping functions.
1200  */
1201
1202 /*
1203  * Called only while mapping a thin bio to hand it over to the workqueue.
1204  */
1205 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1206 {
1207         unsigned long flags;
1208         struct pool *pool = tc->pool;
1209
1210         spin_lock_irqsave(&pool->lock, flags);
1211         bio_list_add(&pool->deferred_bios, bio);
1212         spin_unlock_irqrestore(&pool->lock, flags);
1213
1214         wake_worker(pool);
1215 }
1216
1217 /*
1218  * Non-blocking function designed to be called from the target's map
1219  * function.
1220  */
1221 static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1222                         union map_info *map_context)
1223 {
1224         int r;
1225         struct thin_c *tc = ti->private;
1226         dm_block_t block = get_bio_block(tc, bio);
1227         struct dm_thin_device *td = tc->td;
1228         struct dm_thin_lookup_result result;
1229
1230         /*
1231          * Save the thin context for easy access from the deferred bio later.
1232          */
1233         map_context->ptr = tc;
1234
1235         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1236                 thin_defer_bio(tc, bio);
1237                 return DM_MAPIO_SUBMITTED;
1238         }
1239
1240         r = dm_thin_find_block(td, block, 0, &result);
1241
1242         /*
1243          * Note that we defer readahead too.
1244          */
1245         switch (r) {
1246         case 0:
1247                 if (unlikely(result.shared)) {
1248                         /*
1249                          * We have a race condition here between the
1250                          * result.shared value returned by the lookup and
1251                          * snapshot creation, which may cause new
1252                          * sharing.
1253                          *
1254                          * To avoid this always quiesce the origin before
1255                          * taking the snap.  You want to do this anyway to
1256                          * ensure a consistent application view
1257                          * (i.e. lockfs).
1258                          *
1259                          * More distant ancestors are irrelevant: the
1260                          * shared flag will be set in their case.
1261                          */
1262                         thin_defer_bio(tc, bio);
1263                         r = DM_MAPIO_SUBMITTED;
1264                 } else {
1265                         remap(tc, bio, result.block);
1266                         r = DM_MAPIO_REMAPPED;
1267                 }
1268                 break;
1269
1270         case -ENODATA:
1271                 /*
1272                  * In future, the failed dm_thin_find_block above could
1273                  * provide the hint to load the metadata into cache.
1274                  */
1275         case -EWOULDBLOCK:
1276                 thin_defer_bio(tc, bio);
1277                 r = DM_MAPIO_SUBMITTED;
1278                 break;
1279         }
1280
1281         return r;
1282 }
1283
1284 static int pool_map(struct dm_target *ti, struct bio *bio,
1285                     union map_info *map_context)
1286 {
1287         int r;
1288         struct pool_c *pt = ti->private;
1289         struct pool *pool = pt->pool;
1290         unsigned long flags;
1291
1292         /*
1293          * As this is a singleton target, ti->begin is always zero.
1294          */
1295         spin_lock_irqsave(&pool->lock, flags);
1296         bio->bi_bdev = pt->data_dev->bdev;
1297         r = DM_MAPIO_REMAPPED;
1298         spin_unlock_irqrestore(&pool->lock, flags);
1299
1300         return r;
1301 }
1302
1303 /*----------------------------------------------------------------
1304  * Binding of control targets to a pool object
1305  *--------------------------------------------------------------*/
1306 /* FIXME: add locking */
1307 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1308 {
1309         struct pool_c *pt = ti->private;
1310
1311         pool->ti = ti;
1312         pool->low_water_mark = dm_sector_div_up(pt->low_water_mark,
1313                                                 pool->sectors_per_block);
1314         pool->zero_new_blocks = pt->zero_new_blocks;
1315         dm_pool_rebind_metadata_device(pool->pmd, pt->metadata_dev->bdev);
1316
1317         return 0;
1318 }
1319
1320 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1321 {
1322         if (pool->ti == ti)
1323                 pool->ti = NULL;
1324 }
1325
1326 /*----------------------------------------------------------------
1327  * Pool creation
1328  *--------------------------------------------------------------*/
1329 static void pool_destroy(struct pool *pool)
1330 {
1331         if (dm_pool_metadata_close(pool->pmd) < 0)
1332                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1333
1334         prison_destroy(pool->prison);
1335         dm_kcopyd_client_destroy(pool->copier);
1336
1337         if (pool->wq)
1338                 destroy_workqueue(pool->wq);
1339
1340         if (pool->next_mapping)
1341                 mempool_free(pool->next_mapping, pool->mapping_pool);
1342
1343         mempool_destroy(pool->mapping_pool);
1344         mempool_destroy(pool->endio_hook_pool);
1345         kfree(pool);
1346 }
1347
1348 static struct pool *pool_create(struct block_device *metadata_dev,
1349                                 unsigned long block_size, char **error)
1350 {
1351         int r;
1352         void *err_p;
1353         struct pool *pool;
1354         struct dm_pool_metadata *pmd;
1355
1356         pmd = dm_pool_metadata_open(metadata_dev, block_size);
1357         if (IS_ERR(pmd)) {
1358                 *error = "Error creating metadata object";
1359                 return (struct pool *)pmd;
1360         }
1361
1362         pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1363         if (!pool) {
1364                 *error = "Error allocating memory for pool";
1365                 err_p = ERR_PTR(-ENOMEM);
1366                 goto bad_pool;
1367         }
1368
1369         pool->pmd = pmd;
1370         pool->sectors_per_block = block_size;
1371         pool->block_shift = ffs(block_size) - 1;
1372         pool->offset_mask = block_size - 1;
1373         pool->low_water_mark = 0;
1374         pool->zero_new_blocks = 1;
1375         pool->prison = prison_create(PRISON_CELLS);
1376         if (!pool->prison) {
1377                 *error = "Error creating pool's bio prison";
1378                 err_p = ERR_PTR(-ENOMEM);
1379                 goto bad_prison;
1380         }
1381
1382         pool->copier = dm_kcopyd_client_create();
1383         if (IS_ERR(pool->copier)) {
1384                 r = PTR_ERR(pool->copier);
1385                 *error = "Error creating pool's kcopyd client";
1386                 err_p = ERR_PTR(r);
1387                 goto bad_kcopyd_client;
1388         }
1389
1390         /*
1391          * Create singlethreaded workqueue that will service all devices
1392          * that use this metadata.
1393          */
1394         pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1395         if (!pool->wq) {
1396                 *error = "Error creating pool's workqueue";
1397                 err_p = ERR_PTR(-ENOMEM);
1398                 goto bad_wq;
1399         }
1400
1401         INIT_WORK(&pool->worker, do_worker);
1402         spin_lock_init(&pool->lock);
1403         bio_list_init(&pool->deferred_bios);
1404         INIT_LIST_HEAD(&pool->prepared_mappings);
1405         pool->low_water_triggered = 0;
1406         bio_list_init(&pool->retry_on_resume_list);
1407         ds_init(&pool->ds);
1408
1409         pool->next_mapping = NULL;
1410         pool->mapping_pool =
1411                 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
1412         if (!pool->mapping_pool) {
1413                 *error = "Error creating pool's mapping mempool";
1414                 err_p = ERR_PTR(-ENOMEM);
1415                 goto bad_mapping_pool;
1416         }
1417
1418         pool->endio_hook_pool =
1419                 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
1420         if (!pool->endio_hook_pool) {
1421                 *error = "Error creating pool's endio_hook mempool";
1422                 err_p = ERR_PTR(-ENOMEM);
1423                 goto bad_endio_hook_pool;
1424         }
1425         atomic_set(&pool->ref_count, 1);
1426
1427         return pool;
1428
1429 bad_endio_hook_pool:
1430         mempool_destroy(pool->mapping_pool);
1431 bad_mapping_pool:
1432         destroy_workqueue(pool->wq);
1433 bad_wq:
1434         dm_kcopyd_client_destroy(pool->copier);
1435 bad_kcopyd_client:
1436         prison_destroy(pool->prison);
1437 bad_prison:
1438         kfree(pool);
1439 bad_pool:
1440         if (dm_pool_metadata_close(pmd))
1441                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1442
1443         return err_p;
1444 }
1445
1446 static void pool_inc(struct pool *pool)
1447 {
1448         atomic_inc(&pool->ref_count);
1449 }
1450
1451 static void pool_dec(struct pool *pool)
1452 {
1453         if (atomic_dec_and_test(&pool->ref_count))
1454                 pool_destroy(pool);
1455 }
1456
/*
 * Return the pool registered for pool_md with an extra reference taken,
 * or create a new pool if none is registered.  The result of
 * pool_create() is passed through unchanged, so it may be an ERR_PTR().
 */
static struct pool *pool_find(struct mapped_device *pool_md,
			      struct block_device *metadata_dev,
			      unsigned long block_size,
			      char **error)
{
	struct pool *existing = pool_table_lookup(pool_md);

	if (!existing)
		return pool_create(metadata_dev, block_size, error);

	pool_inc(existing);
	return existing;
}
1472
1473 /*----------------------------------------------------------------
1474  * Pool target methods
1475  *--------------------------------------------------------------*/
/* Optional features selected by the pool target's feature arguments. */
struct pool_features {
	unsigned int zero_new_blocks : 1;	/* cleared by "skip_block_zeroing" */
};
1479
1480 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1481                                struct dm_target *ti)
1482 {
1483         int r;
1484         unsigned argc;
1485         const char *arg_name;
1486
1487         static struct dm_arg _args[] = {
1488                 {0, 1, "Invalid number of pool feature arguments"},
1489         };
1490
1491         /*
1492          * No feature arguments supplied.
1493          */
1494         if (!as->argc)
1495                 return 0;
1496
1497         r = dm_read_arg_group(_args, as, &argc, &ti->error);
1498         if (r)
1499                 return -EINVAL;
1500
1501         while (argc && !r) {
1502                 arg_name = dm_shift_arg(as);
1503                 argc--;
1504
1505                 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1506                         pf->zero_new_blocks = 0;
1507                         continue;
1508                 }
1509
1510                 ti->error = "Unrecognised pool feature requested";
1511                 r = -EINVAL;
1512         }
1513
1514         return r;
1515 }
1516
1517 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1518 {
1519         int r;
1520         unsigned long flags;
1521         struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1522
1523         spin_lock_irqsave(&pt->pool->lock, flags);
1524         r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1525         spin_unlock_irqrestore(&pt->pool->lock, flags);
1526
1527         if (!r) {
1528                 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1529                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1530         }
1531
1532         return r;
1533 }
1534
1535 /*
1536  * thin-pool <metadata dev> <data dev>
1537  *           <data block size (sectors)>
1538  *           <low water mark (sectors)>
1539  *           [<#feature args> [<arg>]*]
1540  *
1541  * Optional feature arguments are:
1542  *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1543  */
1544 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1545 {
1546         int r;
1547         struct pool_c *pt;
1548         struct pool *pool;
1549         struct pool_features pf;
1550         struct dm_arg_set as;
1551         struct dm_dev *data_dev;
1552         unsigned long block_size;
1553         dm_block_t low_water;
1554         struct dm_dev *metadata_dev;
1555         sector_t metadata_dev_size;
1556
1557         if (argc < 4) {
1558                 ti->error = "Invalid argument count";
1559                 return -EINVAL;
1560         }
1561         as.argc = argc;
1562         as.argv = argv;
1563
1564         r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1565         if (r) {
1566                 ti->error = "Error opening metadata block device";
1567                 return r;
1568         }
1569
1570         metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1571         if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
1572                 ti->error = "Metadata device is too large";
1573                 r = -EINVAL;
1574                 goto out_metadata;
1575         }
1576
1577         r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1578         if (r) {
1579                 ti->error = "Error getting data device";
1580                 goto out_metadata;
1581         }
1582
1583         if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1584             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1585             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1586             !is_power_of_2(block_size)) {
1587                 ti->error = "Invalid block size";
1588                 r = -EINVAL;
1589                 goto out;
1590         }
1591
1592         if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water) ||
1593             !low_water) {
1594                 ti->error = "Invalid low water mark";
1595                 r = -EINVAL;
1596                 goto out;
1597         }
1598
1599         /*
1600          * Set default pool features.
1601          */
1602         memset(&pf, 0, sizeof(pf));
1603         pf.zero_new_blocks = 1;
1604
1605         dm_consume_args(&as, 4);
1606         r = parse_pool_features(&as, &pf, ti);
1607         if (r)
1608                 goto out;
1609
1610         pool = pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1611                          block_size, &ti->error);
1612         if (IS_ERR(pool)) {
1613                 r = PTR_ERR(pool);
1614                 goto out;
1615         }
1616
1617         pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1618         if (!pt) {
1619                 pool_destroy(pool);
1620                 r = -ENOMEM;
1621                 goto out;
1622         }
1623         pt->pool = pool;
1624         pt->ti = ti;
1625         pt->metadata_dev = metadata_dev;
1626         pt->data_dev = data_dev;
1627         pt->low_water_mark = low_water;
1628         pt->zero_new_blocks = pf.zero_new_blocks;
1629         ti->num_flush_requests = 1;
1630         ti->num_discard_requests = 0;
1631         ti->discards_supported = 0;
1632         ti->private = pt;
1633
1634         pt->callbacks.congested_fn = pool_is_congested;
1635         dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1636
1637         return 0;
1638
1639 out:
1640         dm_put_device(ti, data_dev);
1641 out_metadata:
1642         dm_put_device(ti, metadata_dev);
1643
1644         return r;
1645 }
1646
1647 static void pool_dtr(struct dm_target *ti)
1648 {
1649         struct pool_c *pt = ti->private;
1650
1651         unbind_control_target(pt->pool, ti);
1652         pool_dec(pt->pool);
1653
1654         dm_put_device(ti, pt->metadata_dev);
1655         dm_put_device(ti, pt->data_dev);
1656
1657         kfree(pt);
1658 }
1659
1660 static void __requeue_bios(struct pool *pool)
1661 {
1662         bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1663         bio_list_init(&pool->retry_on_resume_list);
1664 }
1665
1666 /*
1667  * Retrieves the number of blocks of the data device from
1668  * the superblock and compares it to the actual device size,
1669  * thus resizing the data device in case it has grown.
1670  *
1671  * This both copes with opening preallocated data devices in the ctr
1672  * being followed by a resume
1673  * -and-
1674  * calling the resume method individually after userspace has
1675  * grown the data device in reaction to a table event.
1676  */
1677 static int pool_preresume(struct dm_target *ti)
1678 {
1679         int r;
1680         struct pool_c *pt = ti->private;
1681         struct pool *pool = pt->pool;
1682         dm_block_t data_size, sb_data_size;
1683         unsigned long flags;
1684
1685         /*
1686          * Take control of the pool object.
1687          */
1688         r = bind_control_target(pool, ti);
1689         if (r)
1690                 return r;
1691
1692         data_size = ti->len >> pool->block_shift;
1693         r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
1694         if (r) {
1695                 DMERR("failed to retrieve data device size");
1696                 return r;
1697         }
1698
1699         if (data_size < sb_data_size) {
1700                 DMERR("pool target too small, is %llu blocks (expected %llu)",
1701                       data_size, sb_data_size);
1702                 return -EINVAL;
1703
1704         } else if (data_size > sb_data_size) {
1705                 r = dm_pool_resize_data_dev(pool->pmd, data_size);
1706                 if (r) {
1707                         DMERR("failed to resize data device");
1708                         return r;
1709                 }
1710
1711                 r = dm_pool_commit_metadata(pool->pmd);
1712                 if (r) {
1713                         DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1714                               __func__, r);
1715                         return r;
1716                 }
1717         }
1718
1719         spin_lock_irqsave(&pool->lock, flags);
1720         pool->low_water_triggered = 0;
1721         __requeue_bios(pool);
1722         spin_unlock_irqrestore(&pool->lock, flags);
1723
1724         wake_worker(pool);
1725
1726         /*
1727          * The pool object is only present if the pool is active.
1728          */
1729         pool->pool_md = dm_table_get_md(ti->table);
1730         pool_table_insert(pool);
1731
1732         return 0;
1733 }
1734
1735 static void pool_postsuspend(struct dm_target *ti)
1736 {
1737         int r;
1738         struct pool_c *pt = ti->private;
1739         struct pool *pool = pt->pool;
1740
1741         flush_workqueue(pool->wq);
1742
1743         r = dm_pool_commit_metadata(pool->pmd);
1744         if (r) {
1745                 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1746                       __func__, r);
1747                 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
1748         }
1749
1750         pool_table_remove(pool);
1751         pool->pool_md = NULL;
1752 }
1753
1754 static int check_arg_count(unsigned argc, unsigned args_required)
1755 {
1756         if (argc != args_required) {
1757                 DMWARN("Message received with %u arguments instead of %u.",
1758                        argc, args_required);
1759                 return -EINVAL;
1760         }
1761
1762         return 0;
1763 }
1764
1765 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
1766 {
1767         if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
1768             *dev_id <= MAX_DEV_ID)
1769                 return 0;
1770
1771         if (warning)
1772                 DMWARN("Message received with invalid device id: %s", arg);
1773
1774         return -EINVAL;
1775 }
1776
1777 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
1778 {
1779         dm_thin_id dev_id;
1780         int r;
1781
1782         r = check_arg_count(argc, 2);
1783         if (r)
1784                 return r;
1785
1786         r = read_dev_id(argv[1], &dev_id, 1);
1787         if (r)
1788                 return r;
1789
1790         r = dm_pool_create_thin(pool->pmd, dev_id);
1791         if (r) {
1792                 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
1793                        argv[1]);
1794                 return r;
1795         }
1796
1797         return 0;
1798 }
1799
1800 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
1801 {
1802         dm_thin_id dev_id;
1803         dm_thin_id origin_dev_id;
1804         int r;
1805
1806         r = check_arg_count(argc, 3);
1807         if (r)
1808                 return r;
1809
1810         r = read_dev_id(argv[1], &dev_id, 1);
1811         if (r)
1812                 return r;
1813
1814         r = read_dev_id(argv[2], &origin_dev_id, 1);
1815         if (r)
1816                 return r;
1817
1818         r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
1819         if (r) {
1820                 DMWARN("Creation of new snapshot %s of device %s failed.",
1821                        argv[1], argv[2]);
1822                 return r;
1823         }
1824
1825         return 0;
1826 }
1827
1828 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
1829 {
1830         dm_thin_id dev_id;
1831         int r;
1832
1833         r = check_arg_count(argc, 2);
1834         if (r)
1835                 return r;
1836
1837         r = read_dev_id(argv[1], &dev_id, 1);
1838         if (r)
1839                 return r;
1840
1841         r = dm_pool_delete_thin_device(pool->pmd, dev_id);
1842         if (r)
1843                 DMWARN("Deletion of thin device %s failed.", argv[1]);
1844
1845         return r;
1846 }
1847
1848 static int process_trim_mesg(unsigned argc, char **argv, struct pool *pool)
1849 {
1850         dm_thin_id dev_id;
1851         sector_t new_size;
1852         int r;
1853
1854         r = check_arg_count(argc, 3);
1855         if (r)
1856                 return r;
1857
1858         r = read_dev_id(argv[1], &dev_id, 1);
1859         if (r)
1860                 return r;
1861
1862         if (kstrtoull(argv[2], 10, (unsigned long long *)&new_size)) {
1863                 DMWARN("trim device %s: Invalid new size: %s sectors.",
1864                        argv[1], argv[2]);
1865                 return -EINVAL;
1866         }
1867
1868         r = dm_pool_trim_thin_device(pool->pmd, dev_id,
1869                         dm_sector_div_up(new_size, pool->sectors_per_block));
1870         if (r)
1871                 DMWARN("Attempt to trim thin device %s failed.", argv[1]);
1872
1873         return r;
1874 }
1875
1876 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
1877 {
1878         dm_thin_id old_id, new_id;
1879         int r;
1880
1881         r = check_arg_count(argc, 3);
1882         if (r)
1883                 return r;
1884
1885         if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
1886                 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
1887                 return -EINVAL;
1888         }
1889
1890         if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
1891                 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
1892                 return -EINVAL;
1893         }
1894
1895         r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
1896         if (r) {
1897                 DMWARN("Failed to change transaction id from %s to %s.",
1898                        argv[1], argv[2]);
1899                 return r;
1900         }
1901
1902         return 0;
1903 }
1904
1905 /*
1906  * Messages supported:
1907  *   create_thin        <dev_id>
1908  *   create_snap        <dev_id> <origin_id>
1909  *   delete             <dev_id>
1910  *   trim               <dev_id> <new_size_in_sectors>
1911  *   set_transaction_id <current_trans_id> <new_trans_id>
1912  */
1913 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
1914 {
1915         int r = -EINVAL;
1916         struct pool_c *pt = ti->private;
1917         struct pool *pool = pt->pool;
1918
1919         if (!strcasecmp(argv[0], "create_thin"))
1920                 r = process_create_thin_mesg(argc, argv, pool);
1921
1922         else if (!strcasecmp(argv[0], "create_snap"))
1923                 r = process_create_snap_mesg(argc, argv, pool);
1924
1925         else if (!strcasecmp(argv[0], "delete"))
1926                 r = process_delete_mesg(argc, argv, pool);
1927
1928         else if (!strcasecmp(argv[0], "trim"))
1929                 r = process_trim_mesg(argc, argv, pool);
1930
1931         else if (!strcasecmp(argv[0], "set_transaction_id"))
1932                 r = process_set_transaction_id_mesg(argc, argv, pool);
1933
1934         else
1935                 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
1936
1937         if (!r) {
1938                 r = dm_pool_commit_metadata(pool->pmd);
1939                 if (r)
1940                         DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
1941                               argv[0], r);
1942         }
1943
1944         return r;
1945 }
1946
1947 /*
1948  * Status line is:
1949  *    <transaction id> <used metadata sectors>/<total metadata sectors>
1950  *    <used data sectors>/<total data sectors> <held metadata root>
1951  */
1952 static int pool_status(struct dm_target *ti, status_type_t type,
1953                        char *result, unsigned maxlen)
1954 {
1955         int r;
1956         unsigned sz = 0;
1957         uint64_t transaction_id;
1958         dm_block_t nr_free_blocks_data;
1959         dm_block_t nr_free_blocks_metadata;
1960         dm_block_t nr_blocks_data;
1961         dm_block_t nr_blocks_metadata;
1962         dm_block_t held_root;
1963         char buf[BDEVNAME_SIZE];
1964         char buf2[BDEVNAME_SIZE];
1965         struct pool_c *pt = ti->private;
1966         struct pool *pool = pt->pool;
1967
1968         switch (type) {
1969         case STATUSTYPE_INFO:
1970                 r = dm_pool_get_metadata_transaction_id(pool->pmd,
1971                                                         &transaction_id);
1972                 if (r)
1973                         return r;
1974
1975                 r = dm_pool_get_free_metadata_block_count(pool->pmd,
1976                                                           &nr_free_blocks_metadata);
1977                 if (r)
1978                         return r;
1979
1980                 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
1981                 if (r)
1982                         return r;
1983
1984                 r = dm_pool_get_free_block_count(pool->pmd,
1985                                                  &nr_free_blocks_data);
1986                 if (r)
1987                         return r;
1988
1989                 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
1990                 if (r)
1991                         return r;
1992
1993                 r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
1994                 if (r)
1995                         return r;
1996
1997                 DMEMIT("%llu %llu/%llu %llu/%llu", (unsigned long long)transaction_id,
1998                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata) *
1999                                            pool->sectors_per_block,
2000                        (unsigned long long)nr_blocks_metadata * pool->sectors_per_block,
2001                        (unsigned long long)(nr_blocks_data - nr_free_blocks_data) *
2002                                            pool->sectors_per_block,
2003                        (unsigned long long)nr_blocks_data * pool->sectors_per_block);
2004
2005                 if (held_root)
2006                         DMEMIT("%llu", held_root);
2007                 else
2008                         DMEMIT("-");
2009
2010                 break;
2011
2012         case STATUSTYPE_TABLE:
2013                 DMEMIT("%s %s %lu %llu ",
2014                        format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2015                        format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2016                        (unsigned long)pool->sectors_per_block,
2017                        (unsigned long long)pt->low_water_mark);
2018
2019                 DMEMIT("%u ", !pool->zero_new_blocks);
2020
2021                 if (!pool->zero_new_blocks)
2022                         DMEMIT("skip_block_zeroing ");
2023                 break;
2024         }
2025
2026         return 0;
2027 }
2028
2029 static int pool_iterate_devices(struct dm_target *ti,
2030                                 iterate_devices_callout_fn fn, void *data)
2031 {
2032         struct pool_c *pt = ti->private;
2033
2034         return fn(ti, pt->data_dev, 0, ti->len, data);
2035 }
2036
2037 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2038                       struct bio_vec *biovec, int max_size)
2039 {
2040         struct pool_c *pt = ti->private;
2041         struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2042
2043         if (!q->merge_bvec_fn)
2044                 return max_size;
2045
2046         bvm->bi_bdev = pt->data_dev->bdev;
2047
2048         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2049 }
2050
2051 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2052 {
2053         struct pool_c *pt = ti->private;
2054         struct pool *pool = pt->pool;
2055
2056         blk_limits_io_min(limits, 0);
2057         blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2058 }
2059
2060 static struct target_type pool_target = {
2061         .name = "thin-pool",
2062         .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE,
2063         .version = {1, 0, 0},
2064         .module = THIS_MODULE,
2065         .ctr = pool_ctr,
2066         .dtr = pool_dtr,
2067         .map = pool_map,
2068         .postsuspend = pool_postsuspend,
2069         .preresume = pool_preresume,
2070         .message = pool_message,
2071         .status = pool_status,
2072         .merge = pool_merge,
2073         .iterate_devices = pool_iterate_devices,
2074         .io_hints = pool_io_hints,
2075 };
2076
2077 /*----------------------------------------------------------------*/
2078
2079 static void thin_dtr(struct dm_target *ti)
2080 {
2081         struct thin_c *tc = ti->private;
2082
2083         pool_dec(tc->pool);
2084         dm_pool_close_thin_device(tc->td);
2085         dm_put_device(ti, tc->pool_dev);
2086         kfree(tc);
2087 }
2088
2089 /*
2090  * Thin target parameters:
2091  *
2092  * <pool_dev> <dev_id>
2093  *
2094  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2095  * dev_id: the internal device identifier
2096  */
2097 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2098 {
2099         int r;
2100         struct thin_c *tc;
2101         struct dm_dev *pool_dev;
2102         struct mapped_device *pool_md;
2103
2104         if (argc != 2) {
2105                 ti->error = "Invalid argument count";
2106                 return -EINVAL;
2107         }
2108
2109         tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2110         if (!tc) {
2111                 ti->error = "Out of memory";
2112                 return -ENOMEM;
2113         }
2114
2115         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2116         if (r) {
2117                 ti->error = "Error opening pool device";
2118                 goto bad_pool_dev;
2119         }
2120         tc->pool_dev = pool_dev;
2121
2122         if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2123                 ti->error = "Invalid device id";
2124                 r = -EINVAL;
2125                 goto bad_common;
2126         }
2127
2128         pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2129         if (!pool_md) {
2130                 ti->error = "Couldn't get pool mapped device";
2131                 r = -EINVAL;
2132                 goto bad_common;
2133         }
2134
2135         tc->pool = pool_table_lookup(pool_md);
2136         if (!tc->pool) {
2137                 ti->error = "Couldn't find pool object";
2138                 r = -EINVAL;
2139                 goto bad_pool_lookup;
2140         }
2141         pool_inc(tc->pool);
2142
2143         r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2144         if (r) {
2145                 ti->error = "Couldn't open thin internal device";
2146                 goto bad_thin_open;
2147         }
2148
2149         ti->split_io = tc->pool->sectors_per_block;
2150         ti->num_flush_requests = 1;
2151         ti->num_discard_requests = 0;
2152         ti->discards_supported = 0;
2153
2154         dm_put(pool_md);
2155
2156         return 0;
2157
2158 bad_thin_open:
2159         pool_dec(tc->pool);
2160 bad_pool_lookup:
2161         dm_put(pool_md);
2162 bad_common:
2163         dm_put_device(ti, tc->pool_dev);
2164 bad_pool_dev:
2165         kfree(tc);
2166
2167         return r;
2168 }
2169
2170 static int thin_map(struct dm_target *ti, struct bio *bio,
2171                     union map_info *map_context)
2172 {
2173         bio->bi_sector -= ti->begin;
2174
2175         return thin_bio_map(ti, bio, map_context);
2176 }
2177
2178 /*
2179  * <nr mapped sectors> <highest mapped sector>
2180  */
2181 static int thin_status(struct dm_target *ti, status_type_t type,
2182                        char *result, unsigned maxlen)
2183 {
2184         int r;
2185         ssize_t sz = 0;
2186         dm_block_t mapped, highest;
2187         char buf[BDEVNAME_SIZE];
2188         struct thin_c *tc = ti->private;
2189
2190         if (!tc->td)
2191                 DMEMIT("-");
2192         else {
2193                 switch (type) {
2194                 case STATUSTYPE_INFO:
2195                         r = dm_thin_get_mapped_count(tc->td, &mapped);
2196                         if (r)
2197                                 return r;
2198
2199                         r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2200                         if (r < 0)
2201                                 return r;
2202
2203                         DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2204                         if (r)
2205                                 DMEMIT("%llu", ((highest + 1) *
2206                                                 tc->pool->sectors_per_block) - 1);
2207                         else
2208                                 DMEMIT("-");
2209                         break;
2210
2211                 case STATUSTYPE_TABLE:
2212                         DMEMIT("%s %lu",
2213                                format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2214                                (unsigned long) tc->dev_id);
2215                         break;
2216                 }
2217         }
2218
2219         return 0;
2220 }
2221
2222 static int thin_iterate_devices(struct dm_target *ti,
2223                                 iterate_devices_callout_fn fn, void *data)
2224 {
2225         struct thin_c *tc = ti->private;
2226
2227         return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block, data);
2228 }
2229
2230 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2231 {
2232         struct thin_c *tc = ti->private;
2233
2234         blk_limits_io_min(limits, 0);
2235         blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
2236 }
2237
2238 static struct target_type thin_target = {
2239         .name = "thin",
2240         .version = {1, 0, 0},
2241         .module = THIS_MODULE,
2242         .ctr = thin_ctr,
2243         .dtr = thin_dtr,
2244         .map = thin_map,
2245         .status = thin_status,
2246         .iterate_devices = thin_iterate_devices,
2247         .io_hints = thin_io_hints,
2248 };
2249
2250 /*----------------------------------------------------------------*/
2251
2252 static int __init dm_thin_init(void)
2253 {
2254         int r;
2255
2256         pool_table_init();
2257
2258         r = dm_register_target(&thin_target);
2259         if (r)
2260                 return r;
2261
2262         r = dm_register_target(&pool_target);
2263         if (r)
2264                 dm_unregister_target(&thin_target);
2265
2266         return r;
2267 }
2268
2269 static void dm_thin_exit(void)
2270 {
2271         dm_unregister_target(&thin_target);
2272         dm_unregister_target(&pool_target);
2273 }
2274
2275 module_init(dm_thin_init);
2276 module_exit(dm_thin_exit);
2277
2278 MODULE_DESCRIPTION(DM_NAME "device-mapper thin provisioning target");
2279 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2280 MODULE_LICENSE("GPL");