drivers/md/dm-thin.c

   1 /*
   2  * Copyright (C) 2011-2012 Red Hat UK.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include "dm-thin-metadata.h"
   8 #include "dm-bio-prison.h"
   9 #include "dm.h"
  10
  11 #include <linux/device-mapper.h>
  12 #include <linux/dm-io.h>
  13 #include <linux/dm-kcopyd.h>
  14 #include <linux/list.h>
  15 #include <linux/rculist.h>
  16 #include <linux/init.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/rbtree.h>
  20
/* Prefix picked up by the device-mapper logging macros (e.g. DMERR_LIMIT). */
#define DM_MSG_PREFIX   "thin"

/*
 * Tunable constants
 *
 * NOTE(review): the consumers of the three pool/prison sizes are outside
 * this excerpt; presumably they size the per-bio hook pool, the mapping
 * mempool and the bio prison respectively -- confirm at the point of use.
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
/* Expressed in jiffies: HZ jiffies is one second. */
#define COMMIT_PERIOD HZ
/* Default value (in seconds) for the no_space_timeout_secs variable below. */
#define NO_SPACE_TIMEOUT_SECS 60

static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
                "A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.  Both limits are expressed in sectors
 * (bytes shifted right by SECTOR_SHIFT).
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits, so the largest valid id is 2^24 - 1.
 */
#define MAX_DEV_ID ((1 << 24) - 1)
  48
  49 /*
  50  * How do we handle breaking sharing of data blocks?
  51  * =================================================
  52  *
  53  * We use a standard copy-on-write btree to store the mappings for the
  54  * devices (note I'm talking about copy-on-write of the metadata here, not
  55  * the data).  When you take an internal snapshot you clone the root node
  56  * of the origin btree.  After this there is no concept of an origin or a
  57  * snapshot.  They are just two device trees that happen to point to the
  58  * same data blocks.
  59  *
  60  * When we get a write in we decide if it's to a shared data block using
  61  * some timestamp magic.  If it is, we have to break sharing.
  62  *
  63  * Let's say we write to a shared block in what was the origin.  The
  64  * steps are:
  65  *
  66  * i) plug io further to this physical block. (see bio_prison code).
  67  *
  68  * ii) quiesce any read io to that shared data block.  Obviously
  69  * including all devices that share this block.  (see dm_deferred_set code)
  70  *
  71  * iii) copy the data block to a newly allocated block.  This step can be
  72  * missed out if the io covers the block. (schedule_copy).
  73  *
  74  * iv) insert the new mapping into the origin's btree
  75  * (process_prepared_mapping).  This act of inserting breaks some
  76  * sharing of btree nodes between the two devices.  Breaking sharing only
  77  * affects the btree of that specific device.  Btrees for the other
  78  * devices that share the block never change.  The btree for the origin
  79  * device as it was after the last commit is untouched, ie. we're using
  80  * persistent data structures in the functional programming sense.
  81  *
  82  * v) unplug io to this physical block, including the io that triggered
  83  * the breaking of sharing.
  84  *
  85  * Steps (ii) and (iii) occur in parallel.
  86  *
  87  * The metadata _doesn't_ need to be committed before the io continues.  We
  88  * get away with this because the io is always written to a _new_ block.
  89  * If there's a crash, then:
  90  *
  91  * - The origin mapping will point to the old origin block (the shared
  92  * one).  This will contain the data as it was before the io that triggered
  93  * the breaking of sharing came in.
  94  *
  95  * - The snap mapping still points to the old block.  As it would after
  96  * the commit.
  97  *
  98  * The downside of this scheme is the timestamp magic isn't perfect, and
  99  * will continue to think that data block in the snapshot device is shared
 100  * even after the write to the origin has broken sharing.  I suspect data
 101  * blocks will typically be shared by many different devices, so we're
 102  * breaking sharing n + 1 times, rather than n, where n is the number of
 103  * devices that reference this data block.  At the moment I think the
 104  * benefits far, far outweigh the disadvantages.
 105  */
 106
 107 /*----------------------------------------------------------------*/
 108
 109 /*
 110  * Key building.
 111  */
 112 static void build_data_key(struct dm_thin_device *td,
 113                            dm_block_t b, struct dm_cell_key *key)
 114 {
 115         key->virtual = 0;
 116         key->dev = dm_thin_dev_id(td);
 117         key->block = b;
 118 }
 119
 120 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 121                               struct dm_cell_key *key)
 122 {
 123         key->virtual = 1;
 124         key->dev = dm_thin_dev_id(td);
 125         key->block = b;
 126 }
 127
 128 /*----------------------------------------------------------------*/
 129
 130 /*
 131  * A pool device ties together a metadata device and a data device.  It
 132  * also provides the interface for creating and destroying internal
 133  * devices.
 134  */
 135 struct dm_thin_new_mapping;
 136
/*
 * The pool runs in 4 modes.  The enumerators are listed from least to
 * most degraded so that two modes can be compared numerically.
 */
enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
        PM_READ_ONLY,           /* metadata may not be changed */
        PM_FAIL,                /* all I/O fails */
};
 146
/*
 * Feature set of a pool: its current mode plus a handful of boolean
 * options packed into bitfields.
 */
struct pool_features {
        enum pool_mode mode;

        /* Consulted when provisioning a block to decide whether to zero it. */
        bool zero_new_blocks:1;
        /* NOTE(review): the flags below are not read in this excerpt; meanings inferred from names. */
        bool discard_enabled:1;
        bool discard_passdown:1;
        bool error_if_no_space:1;
};
 155
/* Forward declaration: the bio handler type below takes a struct thin_c pointer. */
struct thin_c;
/* Handler for a bio aimed at a thin device (type of pool->process_bio and pool->process_discard). */
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
/* Handler for a prepared mapping (type of pool->process_prepared_mapping / _discard). */
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
 159
/*
 * State shared by a thin-pool and every thin device that lives in it.
 */
struct pool {
        struct list_head list;  /* link in dm_thin_pool_table.pools */
        struct dm_target *ti;   /* Only set if a pool target is bound */

        struct mapped_device *pool_md;  /* lookup key for __pool_table_lookup() */
        struct block_device *md_dev;    /* lookup key for __pool_table_lookup_metadata_dev() */
        struct dm_pool_metadata *pmd;

        dm_block_t low_water_blocks;
        uint32_t sectors_per_block;
        int sectors_per_block_shift;    /* negative when the block size is not a power of two */

        struct pool_features pf;
        bool low_water_triggered:1;     /* A dm event has been sent */

        struct dm_bio_prison *prison;   /* cells used to detain bios (see bio_detain()) */
        struct dm_kcopyd_client *copier; /* kcopyd client used for block copies and zeroing */

        struct workqueue_struct *wq;    /* wake_worker() queues 'worker' here */
        struct work_struct worker;
        struct delayed_work waker;
        struct delayed_work no_space_timeout;

        unsigned long last_commit_jiffies;
        unsigned ref_count;

        /* Taken around updates of deferred_flush_bios and the prepared_* lists. */
        spinlock_t lock;
        struct bio_list deferred_flush_bios;    /* bios that trigger a commit, batched by issue() */
        struct list_head prepared_mappings;     /* mappings whose preparation has completed */
        struct list_head prepared_discards;
        struct list_head active_thins;          /* traversed under rcu_read_lock() */

        struct dm_deferred_set *shared_read_ds; /* used to quiesce reads before a copy */
        struct dm_deferred_set *all_io_ds;      /* every non-discard bio takes an entry here */

        /* Preallocated mapping: filled by ensure_next_mapping(), consumed by get_next_mapping(). */
        struct dm_thin_new_mapping *next_mapping;
        mempool_t *mapping_pool;

        process_bio_fn process_bio;
        process_bio_fn process_discard;

        process_mapping_fn process_prepared_mapping;
        process_mapping_fn process_prepared_discard;
};
 204
 205 static enum pool_mode get_pool_mode(struct pool *pool);
 206 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 207
 208 /*
 209  * Target context for a pool.
 210  */
struct pool_c {
        struct dm_target *ti;
        struct pool *pool;              /* the pool object this target context refers to */
        struct dm_dev *data_dev;
        struct dm_dev *metadata_dev;
        struct dm_target_callbacks callbacks;

        /* NOTE(review): presumably copied into pool->low_water_blocks elsewhere -- confirm. */
        dm_block_t low_water_blocks;
        struct pool_features requested_pf; /* Features requested during table load */
        struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};
 222
 223 /*
 224  * Target context for a thin.
 225  */
struct thin_c {
        struct list_head list;          /* link in pool->active_thins (walked with RCU list helpers) */
        struct dm_dev *pool_dev;        /* remap() redirects bios to this device's bdev */
        struct dm_dev *origin_dev;      /* remap_to_origin() redirects bios to this device's bdev */
        sector_t origin_size;
        dm_thin_id dev_id;

        struct pool *pool;
        struct dm_thin_device *td;      /* metadata handle for this thin device */
        bool requeue_mode:1;
        /* Taken around updates of deferred_bio_list and retry_on_resume_list. */
        spinlock_t lock;
        struct bio_list deferred_bio_list;
        struct bio_list retry_on_resume_list;
        struct rb_root sort_bio_list; /* sorted list of deferred bios */

        /*
         * Ensures the thin is not destroyed until the worker has finished
         * iterating the active_thins list.
         */
        atomic_t refcount;
        struct completion can_destroy;
};
 248
 249 /*----------------------------------------------------------------*/
 250
 251 /*
 252  * wake_worker() is used when new work is queued and when pool_resume is
 253  * ready to continue deferred IO processing.
 254  */
 255 static void wake_worker(struct pool *pool)
 256 {
 257         queue_work(pool->wq, &pool->worker);
 258 }
 259
 260 /*----------------------------------------------------------------*/
 261
 262 static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
 263                       struct dm_bio_prison_cell **cell_result)
 264 {
 265         int r;
 266         struct dm_bio_prison_cell *cell_prealloc;
 267
 268         /*
 269          * Allocate a cell from the prison's mempool.
 270          * This might block but it can't fail.
 271          */
 272         cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
 273
 274         r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
 275         if (r)
 276                 /*
 277                  * We reused an old cell; we can get rid of
 278                  * the new one.
 279                  */
 280                 dm_bio_prison_free_cell(pool->prison, cell_prealloc);
 281
 282         return r;
 283 }
 284
 285 static void cell_release(struct pool *pool,
 286                          struct dm_bio_prison_cell *cell,
 287                          struct bio_list *bios)
 288 {
 289         dm_cell_release(pool->prison, cell, bios);
 290         dm_bio_prison_free_cell(pool->prison, cell);
 291 }
 292
 293 static void cell_release_no_holder(struct pool *pool,
 294                                    struct dm_bio_prison_cell *cell,
 295                                    struct bio_list *bios)
 296 {
 297         dm_cell_release_no_holder(pool->prison, cell, bios);
 298         dm_bio_prison_free_cell(pool->prison, cell);
 299 }
 300
 301 static void cell_defer_no_holder_no_free(struct thin_c *tc,
 302                                          struct dm_bio_prison_cell *cell)
 303 {
 304         struct pool *pool = tc->pool;
 305         unsigned long flags;
 306
 307         spin_lock_irqsave(&tc->lock, flags);
 308         dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
 309         spin_unlock_irqrestore(&tc->lock, flags);
 310
 311         wake_worker(pool);
 312 }
 313
 314 static void cell_error_with_code(struct pool *pool,
 315                                  struct dm_bio_prison_cell *cell, int error_code)
 316 {
 317         dm_cell_error(pool->prison, cell, error_code);
 318         dm_bio_prison_free_cell(pool->prison, cell);
 319 }
 320
 321 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
 322 {
 323         cell_error_with_code(pool, cell, -EIO);
 324 }
 325
 326 /*----------------------------------------------------------------*/
 327
 328 /*
 329  * A global list of pools that uses a struct mapped_device as a key.
 330  */
static struct dm_thin_pool_table {
        /* Must be held by callers of the __pool_table_*() helpers (they BUG_ON otherwise). */
        struct mutex mutex;
        /* List of struct pool, linked through pool->list. */
        struct list_head pools;
} dm_thin_pool_table;
 335
 336 static void pool_table_init(void)
 337 {
 338         mutex_init(&dm_thin_pool_table.mutex);
 339         INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 340 }
 341
 342 static void __pool_table_insert(struct pool *pool)
 343 {
 344         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 345         list_add(&pool->list, &dm_thin_pool_table.pools);
 346 }
 347
 348 static void __pool_table_remove(struct pool *pool)
 349 {
 350         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 351         list_del(&pool->list);
 352 }
 353
 354 static struct pool *__pool_table_lookup(struct mapped_device *md)
 355 {
 356         struct pool *pool = NULL, *tmp;
 357
 358         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 359
 360         list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 361                 if (tmp->pool_md == md) {
 362                         pool = tmp;
 363                         break;
 364                 }
 365         }
 366
 367         return pool;
 368 }
 369
 370 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
 371 {
 372         struct pool *pool = NULL, *tmp;
 373
 374         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 375
 376         list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 377                 if (tmp->md_dev == md_dev) {
 378                         pool = tmp;
 379                         break;
 380                 }
 381         }
 382
 383         return pool;
 384 }
 385
 386 /*----------------------------------------------------------------*/
 387
/*
 * Per-bio data, obtained with dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)).
 */
struct dm_thin_endio_hook {
        struct thin_c *tc;
        struct dm_deferred_entry *shared_read_entry;
        /* Entry taken from pool->all_io_ds by inc_all_io_entry(). */
        struct dm_deferred_entry *all_io_entry;
        /* Set when the bio overwrites a whole block; read back in overwrite_endio(). */
        struct dm_thin_new_mapping *overwrite_mapping;
        /* NOTE(review): presumably links the bio into thin_c->sort_bio_list -- confirm. */
        struct rb_node rb_node;
};
 395
 396 static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 397 {
 398         struct bio *bio;
 399         struct bio_list bios;
 400         unsigned long flags;
 401
 402         bio_list_init(&bios);
 403
 404         spin_lock_irqsave(&tc->lock, flags);
 405         bio_list_merge(&bios, master);
 406         bio_list_init(master);
 407         spin_unlock_irqrestore(&tc->lock, flags);
 408
 409         while ((bio = bio_list_pop(&bios)))
 410                 bio_endio(bio, DM_ENDIO_REQUEUE);
 411 }
 412
 413 static void requeue_io(struct thin_c *tc)
 414 {
 415         requeue_bio_list(tc, &tc->deferred_bio_list);
 416         requeue_bio_list(tc, &tc->retry_on_resume_list);
 417 }
 418
 419 static void error_thin_retry_list(struct thin_c *tc)
 420 {
 421         struct bio *bio;
 422         unsigned long flags;
 423         struct bio_list bios;
 424
 425         bio_list_init(&bios);
 426
 427         spin_lock_irqsave(&tc->lock, flags);
 428         bio_list_merge(&bios, &tc->retry_on_resume_list);
 429         bio_list_init(&tc->retry_on_resume_list);
 430         spin_unlock_irqrestore(&tc->lock, flags);
 431
 432         while ((bio = bio_list_pop(&bios)))
 433                 bio_io_error(bio);
 434 }
 435
 436 static void error_retry_list(struct pool *pool)
 437 {
 438         struct thin_c *tc;
 439
 440         rcu_read_lock();
 441         list_for_each_entry_rcu(tc, &pool->active_thins, list)
 442                 error_thin_retry_list(tc);
 443         rcu_read_unlock();
 444 }
 445
 446 /*
 447  * This section of code contains the logic for processing a thin device's IO.
 448  * Much of the code depends on pool object resources (lists, workqueues, etc)
 449  * but most is exclusively called from the thin target rather than the thin-pool
 450  * target.
 451  */
 452
 453 static bool block_size_is_power_of_two(struct pool *pool)
 454 {
 455         return pool->sectors_per_block_shift >= 0;
 456 }
 457
 458 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 459 {
 460         struct pool *pool = tc->pool;
 461         sector_t block_nr = bio->bi_iter.bi_sector;
 462
 463         if (block_size_is_power_of_two(pool))
 464                 block_nr >>= pool->sectors_per_block_shift;
 465         else
 466                 (void) sector_div(block_nr, pool->sectors_per_block);
 467
 468         return block_nr;
 469 }
 470
 471 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 472 {
 473         struct pool *pool = tc->pool;
 474         sector_t bi_sector = bio->bi_iter.bi_sector;
 475
 476         bio->bi_bdev = tc->pool_dev->bdev;
 477         if (block_size_is_power_of_two(pool))
 478                 bio->bi_iter.bi_sector =
 479                         (block << pool->sectors_per_block_shift) |
 480                         (bi_sector & (pool->sectors_per_block - 1));
 481         else
 482                 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
 483                                  sector_div(bi_sector, pool->sectors_per_block);
 484 }
 485
 486 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 487 {
 488         bio->bi_bdev = tc->origin_dev->bdev;
 489 }
 490
 491 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 492 {
 493         return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
 494                 dm_thin_changed_this_transaction(tc->td);
 495 }
 496
 497 static void inc_all_io_entry(struct pool *pool, struct bio *bio)
 498 {
 499         struct dm_thin_endio_hook *h;
 500
 501         if (bio->bi_rw & REQ_DISCARD)
 502                 return;
 503
 504         h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 505         h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
 506 }
 507
 508 static void issue(struct thin_c *tc, struct bio *bio)
 509 {
 510         struct pool *pool = tc->pool;
 511         unsigned long flags;
 512
 513         if (!bio_triggers_commit(tc, bio)) {
 514                 generic_make_request(bio);
 515                 return;
 516         }
 517
 518         /*
 519          * Complete bio with an error if earlier I/O caused changes to
 520          * the metadata that can't be committed e.g, due to I/O errors
 521          * on the metadata device.
 522          */
 523         if (dm_thin_aborted_changes(tc->td)) {
 524                 bio_io_error(bio);
 525                 return;
 526         }
 527
 528         /*
 529          * Batch together any bios that trigger commits and then issue a
 530          * single commit for them in process_deferred_bios().
 531          */
 532         spin_lock_irqsave(&pool->lock, flags);
 533         bio_list_add(&pool->deferred_flush_bios, bio);
 534         spin_unlock_irqrestore(&pool->lock, flags);
 535 }
 536
 537 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
 538 {
 539         remap_to_origin(tc, bio);
 540         issue(tc, bio);
 541 }
 542
 543 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 544                             dm_block_t block)
 545 {
 546         remap(tc, bio, block);
 547         issue(tc, bio);
 548 }
 549
 550 /*----------------------------------------------------------------*/
 551
 552 /*
 553  * Bio endio functions.
 554  */
/*
 * Describes a mapping (or discard) that is being prepared before the
 * worker processes it.
 */
struct dm_thin_new_mapping {
        /* Link used for the deferred set work list and pool->prepared_mappings. */
        struct list_head list;

        /* Both flags are consulted by process_prepared_discard_passdown(). */
        bool pass_discard:1;
        bool definitely_not_shared:1;

        /*
         * Track quiescing, copying and zeroing preparation actions.  When this
         * counter hits zero the block is prepared and can be inserted into the
         * btree.
         */
        atomic_t prepare_actions;

        /* Result of the preparation: set by copy_complete() / overwrite_endio(). */
        int err;
        struct thin_c *tc;
        /* The pair handed to dm_thin_insert_block() once preparation succeeds. */
        dm_block_t virt_block;
        dm_block_t data_block;
        /* cell2 is only touched by the discard processing paths in this excerpt. */
        struct dm_bio_prison_cell *cell, *cell2;

        /*
         * If the bio covers the whole area of a block then we can avoid
         * zeroing or copying.  Instead this bio is hooked.  The bio will
         * still be in the cell, so care has to be taken to avoid issuing
         * the bio twice.
         */
        struct bio *bio;
        /* The bio's original bi_end_io, restored before the mapping is processed. */
        bio_end_io_t *saved_bi_end_io;
};
 583
 584 static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
 585 {
 586         struct pool *pool = m->tc->pool;
 587
 588         if (atomic_dec_and_test(&m->prepare_actions)) {
 589                 list_add_tail(&m->list, &pool->prepared_mappings);
 590                 wake_worker(pool);
 591         }
 592 }
 593
 594 static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
 595 {
 596         unsigned long flags;
 597         struct pool *pool = m->tc->pool;
 598
 599         spin_lock_irqsave(&pool->lock, flags);
 600         __complete_mapping_preparation(m);
 601         spin_unlock_irqrestore(&pool->lock, flags);
 602 }
 603
 604 static void copy_complete(int read_err, unsigned long write_err, void *context)
 605 {
 606         struct dm_thin_new_mapping *m = context;
 607
 608         m->err = read_err || write_err ? -EIO : 0;
 609         complete_mapping_preparation(m);
 610 }
 611
 612 static void overwrite_endio(struct bio *bio, int err)
 613 {
 614         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 615         struct dm_thin_new_mapping *m = h->overwrite_mapping;
 616
 617         m->err = err;
 618         complete_mapping_preparation(m);
 619 }
 620
 621 /*----------------------------------------------------------------*/
 622
 623 /*
 624  * Workqueue.
 625  */
 626
 627 /*
 628  * Prepared mapping jobs.
 629  */
 630
 631 /*
 632  * This sends the bios in the cell back to the deferred_bios list.
 633  */
 634 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 635 {
 636         struct pool *pool = tc->pool;
 637         unsigned long flags;
 638
 639         spin_lock_irqsave(&tc->lock, flags);
 640         cell_release(pool, cell, &tc->deferred_bio_list);
 641         spin_unlock_irqrestore(&tc->lock, flags);
 642
 643         wake_worker(pool);
 644 }
 645
 646 /*
 647  * Same as cell_defer above, except it omits the original holder of the cell.
 648  */
 649 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 650 {
 651         struct pool *pool = tc->pool;
 652         unsigned long flags;
 653
 654         spin_lock_irqsave(&tc->lock, flags);
 655         cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
 656         spin_unlock_irqrestore(&tc->lock, flags);
 657
 658         wake_worker(pool);
 659 }
 660
 661 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 662 {
 663         if (m->bio) {
 664                 m->bio->bi_end_io = m->saved_bi_end_io;
 665                 atomic_inc(&m->bio->bi_remaining);
 666         }
 667         cell_error(m->tc->pool, m->cell);
 668         list_del(&m->list);
 669         mempool_free(m, m->tc->pool->mapping_pool);
 670 }
 671
 672 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 673 {
 674         struct thin_c *tc = m->tc;
 675         struct pool *pool = tc->pool;
 676         struct bio *bio;
 677         int r;
 678
 679         bio = m->bio;
 680         if (bio) {
 681                 bio->bi_end_io = m->saved_bi_end_io;
 682                 atomic_inc(&bio->bi_remaining);
 683         }
 684
 685         if (m->err) {
 686                 cell_error(pool, m->cell);
 687                 goto out;
 688         }
 689
 690         /*
 691          * Commit the prepared block into the mapping btree.
 692          * Any I/O for this block arriving after this point will get
 693          * remapped to it directly.
 694          */
 695         r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 696         if (r) {
 697                 metadata_operation_failed(pool, "dm_thin_insert_block", r);
 698                 cell_error(pool, m->cell);
 699                 goto out;
 700         }
 701
 702         /*
 703          * Release any bios held while the block was being provisioned.
 704          * If we are processing a write bio that completely covers the block,
 705          * we already processed it so can ignore it now when processing
 706          * the bios in the cell.
 707          */
 708         if (bio) {
 709                 cell_defer_no_holder(tc, m->cell);
 710                 bio_endio(bio, 0);
 711         } else
 712                 cell_defer(tc, m->cell);
 713
 714 out:
 715         list_del(&m->list);
 716         mempool_free(m, pool->mapping_pool);
 717 }
 718
 719 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 720 {
 721         struct thin_c *tc = m->tc;
 722
 723         bio_io_error(m->bio);
 724         cell_defer_no_holder(tc, m->cell);
 725         cell_defer_no_holder(tc, m->cell2);
 726         mempool_free(m, tc->pool->mapping_pool);
 727 }
 728
 729 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 730 {
 731         struct thin_c *tc = m->tc;
 732
 733         inc_all_io_entry(tc->pool, m->bio);
 734         cell_defer_no_holder(tc, m->cell);
 735         cell_defer_no_holder(tc, m->cell2);
 736
 737         if (m->pass_discard)
 738                 if (m->definitely_not_shared)
 739                         remap_and_issue(tc, m->bio, m->data_block);
 740                 else {
 741                         bool used = false;
 742                         if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
 743                                 bio_endio(m->bio, 0);
 744                         else
 745                                 remap_and_issue(tc, m->bio, m->data_block);
 746                 }
 747         else
 748                 bio_endio(m->bio, 0);
 749
 750         mempool_free(m, tc->pool->mapping_pool);
 751 }
 752
 753 static void process_prepared_discard(struct dm_thin_new_mapping *m)
 754 {
 755         int r;
 756         struct thin_c *tc = m->tc;
 757
 758         r = dm_thin_remove_block(tc->td, m->virt_block);
 759         if (r)
 760                 DMERR_LIMIT("dm_thin_remove_block() failed");
 761
 762         process_prepared_discard_passdown(m);
 763 }
 764
 765 static void process_prepared(struct pool *pool, struct list_head *head,
 766                              process_mapping_fn *fn)
 767 {
 768         unsigned long flags;
 769         struct list_head maps;
 770         struct dm_thin_new_mapping *m, *tmp;
 771
 772         INIT_LIST_HEAD(&maps);
 773         spin_lock_irqsave(&pool->lock, flags);
 774         list_splice_init(head, &maps);
 775         spin_unlock_irqrestore(&pool->lock, flags);
 776
 777         list_for_each_entry_safe(m, tmp, &maps, list)
 778                 (*fn)(m);
 779 }
 780
 781 /*
 782  * Deferred bio jobs.
 783  */
 784 static int io_overlaps_block(struct pool *pool, struct bio *bio)
 785 {
 786         return bio->bi_iter.bi_size ==
 787                 (pool->sectors_per_block << SECTOR_SHIFT);
 788 }
 789
 790 static int io_overwrites_block(struct pool *pool, struct bio *bio)
 791 {
 792         return (bio_data_dir(bio) == WRITE) &&
 793                 io_overlaps_block(pool, bio);
 794 }
 795
 796 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
 797                                bio_end_io_t *fn)
 798 {
 799         *save = bio->bi_end_io;
 800         bio->bi_end_io = fn;
 801 }
 802
 803 static int ensure_next_mapping(struct pool *pool)
 804 {
 805         if (pool->next_mapping)
 806                 return 0;
 807
 808         pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
 809
 810         return pool->next_mapping ? 0 : -ENOMEM;
 811 }
 812
 813 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
 814 {
 815         struct dm_thin_new_mapping *m = pool->next_mapping;
 816
 817         BUG_ON(!pool->next_mapping);
 818
 819         memset(m, 0, sizeof(struct dm_thin_new_mapping));
 820         INIT_LIST_HEAD(&m->list);
 821         m->bio = NULL;
 822
 823         pool->next_mapping = NULL;
 824
 825         return m;
 826 }
 827
 828 static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
 829                     sector_t begin, sector_t end)
 830 {
 831         int r;
 832         struct dm_io_region to;
 833
 834         to.bdev = tc->pool_dev->bdev;
 835         to.sector = begin;
 836         to.count = end - begin;
 837
 838         r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
 839         if (r < 0) {
 840                 DMERR_LIMIT("dm_kcopyd_zero() failed");
 841                 copy_complete(1, 1, m);
 842         }
 843 }
 844
 845 /*
 846  * A partial copy also needs to zero the uncopied region.
 847  */
/*
 * Prepare a new mapping virt_block -> data_dest whose contents come from
 * block data_origin of @origin.
 *
 * @cell: prison cell recorded in the mapping for later release.
 * @bio:  the bio that triggered the copy; if it overwrites the whole
 *        block it is issued directly instead of copying.
 * @len:  number of sectors to copy; when shorter than a block and
 *        zero_new_blocks is set, the tail of the block is zeroed.
 *
 * Consumes pool->next_mapping (see get_next_mapping()).
 */
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                          struct dm_dev *origin, dm_block_t data_origin,
                          dm_block_t data_dest,
                          struct dm_bio_prison_cell *cell, struct bio *bio,
                          sector_t len)
{
        int r;
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);

        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_dest;
        m->cell = cell;

        /*
         * quiesce action + copy action + an extra reference held for the
         * duration of this function (we may need to inc later for a
         * partial zero).
         */
        atomic_set(&m->prepare_actions, 3);

        /* A zero return means there is nothing to wait for: drop the quiesce action now. */
        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
                complete_mapping_preparation(m); /* already quiesced */

        /*
         * IO to pool_dev remaps to the pool target's data_dev.
         *
         * If the whole block of data is being overwritten, we can issue the
         * bio immediately. Otherwise we use kcopyd to clone the data first.
         */
        if (io_overwrites_block(pool, bio)) {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

                /* Hook the bio so overwrite_endio() drops the copy action on completion. */
                h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
                inc_all_io_entry(pool, bio);
                remap_and_issue(tc, bio, data_dest);
        } else {
                struct dm_io_region from, to;

                from.bdev = origin->bdev;
                from.sector = data_origin * pool->sectors_per_block;
                from.count = len;

                to.bdev = tc->pool_dev->bdev;
                to.sector = data_dest * pool->sectors_per_block;
                to.count = len;

                r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
                                   0, copy_complete, m);
                if (r < 0) {
                        DMERR_LIMIT("dm_kcopyd_copy() failed");
                        /* Record the failure and drop the copy action ourselves. */
                        copy_complete(1, 1, m);

                        /*
                         * We allow the zero to be issued, to simplify the
                         * error path.  Otherwise we'd need to start
                         * worrying about decrementing the prepare_actions
                         * counter.
                         */
                }

                /*
                 * Do we need to zero a tail region?
                 */
                if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
                        /* One more action: ll_zero() completes through copy_complete(). */
                        atomic_inc(&m->prepare_actions);
                        ll_zero(tc, m,
                                data_dest * pool->sectors_per_block + len,
                                (data_dest + 1) * pool->sectors_per_block);
                }
        }

        complete_mapping_preparation(m); /* drop our ref */
}
 925
 926 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
 927                                    dm_block_t data_origin, dm_block_t data_dest,
 928                                    struct dm_bio_prison_cell *cell, struct bio *bio)
 929 {
 930         schedule_copy(tc, virt_block, tc->pool_dev,
 931                       data_origin, data_dest, cell, bio,
 932                       tc->pool->sectors_per_block);
 933 }
 934
 935 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 936                           dm_block_t data_block, struct dm_bio_prison_cell *cell,
 937                           struct bio *bio)
 938 {
 939         struct pool *pool = tc->pool;
 940         struct dm_thin_new_mapping *m = get_next_mapping(pool);
 941
 942         atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
 943         m->tc = tc;
 944         m->virt_block = virt_block;
 945         m->data_block = data_block;
 946         m->cell = cell;
 947
 948         /*
 949          * If the whole block of data is being overwritten or we are not
 950          * zeroing pre-existing data, we can issue the bio immediately.
 951          * Otherwise we use kcopyd to zero the data first.
 952          */
 953         if (!pool->pf.zero_new_blocks)
 954                 process_prepared_mapping(m);
 955
 956         else if (io_overwrites_block(pool, bio)) {
 957                 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 958
 959                 h->overwrite_mapping = m;
 960                 m->bio = bio;
 961                 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 962                 inc_all_io_entry(pool, bio);
 963                 remap_and_issue(tc, bio, data_block);
 964
 965         } else
 966                 ll_zero(tc, m,
 967                         data_block * pool->sectors_per_block,
 968                         (data_block + 1) * pool->sectors_per_block);
 969 }
 970
 971 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
 972                                    dm_block_t data_dest,
 973                                    struct dm_bio_prison_cell *cell, struct bio *bio)
 974 {
 975         struct pool *pool = tc->pool;
 976         sector_t virt_block_begin = virt_block * pool->sectors_per_block;
 977         sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
 978
 979         if (virt_block_end <= tc->origin_size)
 980                 schedule_copy(tc, virt_block, tc->origin_dev,
 981                               virt_block, data_dest, cell, bio,
 982                               pool->sectors_per_block);
 983
 984         else if (virt_block_begin < tc->origin_size)
 985                 schedule_copy(tc, virt_block, tc->origin_dev,
 986                               virt_block, data_dest, cell, bio,
 987                               tc->origin_size - virt_block_begin);
 988
 989         else
 990                 schedule_zero(tc, virt_block, data_dest, cell, bio);
 991 }
 992
 993 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
 994
 995 static void check_for_space(struct pool *pool)
 996 {
 997         int r;
 998         dm_block_t nr_free;
 999
1000         if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1001                 return;
1002
1003         r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
1004         if (r)
1005                 return;
1006
1007         if (nr_free)
1008                 set_pool_mode(pool, PM_WRITE);
1009 }
1010
1011 /*
1012  * A non-zero return indicates read_only or fail_io mode.
1013  * Many callers don't care about the return value.
1014  */
1015 static int commit(struct pool *pool)
1016 {
1017         int r;
1018
1019         if (get_pool_mode(pool) >= PM_READ_ONLY)
1020                 return -EINVAL;
1021
1022         r = dm_pool_commit_metadata(pool->pmd);
1023         if (r)
1024                 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1025         else
1026                 check_for_space(pool);
1027
1028         return r;
1029 }
1030
1031 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1032 {
1033         unsigned long flags;
1034
1035         if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1036                 DMWARN("%s: reached low water mark for data device: sending event.",
1037                        dm_device_name(pool->pool_md));
1038                 spin_lock_irqsave(&pool->lock, flags);
1039                 pool->low_water_triggered = true;
1040                 spin_unlock_irqrestore(&pool->lock, flags);
1041                 dm_table_event(pool->ti->table);
1042         }
1043 }
1044
1045 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1046 {
1047         int r;
1048         dm_block_t free_blocks;
1049         struct pool *pool = tc->pool;
1050
1051         if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1052                 return -EINVAL;
1053
1054         r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1055         if (r) {
1056                 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1057                 return r;
1058         }
1059
1060         check_low_water_mark(pool, free_blocks);
1061
1062         if (!free_blocks) {
1063                 /*
1064                  * Try to commit to see if that will free up some
1065                  * more space.
1066                  */
1067                 r = commit(pool);
1068                 if (r)
1069                         return r;
1070
1071                 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1072                 if (r) {
1073                         metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1074                         return r;
1075                 }
1076
1077                 if (!free_blocks) {
1078                         set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1079                         return -ENOSPC;
1080                 }
1081         }
1082
1083         r = dm_pool_alloc_data_block(pool->pmd, result);
1084         if (r) {
1085                 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1086                 return r;
1087         }
1088
1089         return 0;
1090 }
1091
1092 /*
1093  * If we have run out of space, queue bios until the device is
1094  * resumed, presumably after having been reloaded with more space.
1095  */
1096 static void retry_on_resume(struct bio *bio)
1097 {
1098         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1099         struct thin_c *tc = h->tc;
1100         unsigned long flags;
1101
1102         spin_lock_irqsave(&tc->lock, flags);
1103         bio_list_add(&tc->retry_on_resume_list, bio);
1104         spin_unlock_irqrestore(&tc->lock, flags);
1105 }
1106
1107 static int should_error_unserviceable_bio(struct pool *pool)
1108 {
1109         enum pool_mode m = get_pool_mode(pool);
1110
1111         switch (m) {
1112         case PM_WRITE:
1113                 /* Shouldn't get here */
1114                 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1115                 return -EIO;
1116
1117         case PM_OUT_OF_DATA_SPACE:
1118                 return pool->pf.error_if_no_space ? -ENOSPC : 0;
1119
1120         case PM_READ_ONLY:
1121         case PM_FAIL:
1122                 return -EIO;
1123         default:
1124                 /* Shouldn't get here */
1125                 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1126                 return -EIO;
1127         }
1128 }
1129
/*
 * Dispose of a single bio that cannot be serviced: either complete it
 * with the error chosen by should_error_unserviceable_bio(), or queue
 * it for retry on resume when that function returns 0.
 */
static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
        int error = should_error_unserviceable_bio(pool);

        if (!error) {
                retry_on_resume(bio);
                return;
        }

        bio_endio(bio, error);
}
1139
1140 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1141 {
1142         struct bio *bio;
1143         struct bio_list bios;
1144         int error;
1145
1146         error = should_error_unserviceable_bio(pool);
1147         if (error) {
1148                 cell_error_with_code(pool, cell, error);
1149                 return;
1150         }
1151
1152         bio_list_init(&bios);
1153         cell_release(pool, cell, &bios);
1154
1155         error = should_error_unserviceable_bio(pool);
1156         if (error)
1157                 while ((bio = bio_list_pop(&bios)))
1158                         bio_endio(bio, error);
1159         else
1160                 while ((bio = bio_list_pop(&bios)))
1161                         retry_on_resume(bio);
1162 }
1163
1164 static void process_discard(struct thin_c *tc, struct bio *bio)
1165 {
1166         int r;
1167         unsigned long flags;
1168         struct pool *pool = tc->pool;
1169         struct dm_bio_prison_cell *cell, *cell2;
1170         struct dm_cell_key key, key2;
1171         dm_block_t block = get_bio_block(tc, bio);
1172         struct dm_thin_lookup_result lookup_result;
1173         struct dm_thin_new_mapping *m;
1174
1175         build_virtual_key(tc->td, block, &key);
1176         if (bio_detain(tc->pool, &key, bio, &cell))
1177                 return;
1178
1179         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1180         switch (r) {
1181         case 0:
1182                 /*
1183                  * Check nobody is fiddling with this pool block.  This can
1184                  * happen if someone's in the process of breaking sharing
1185                  * on this block.
1186                  */
1187                 build_data_key(tc->td, lookup_result.block, &key2);
1188                 if (bio_detain(tc->pool, &key2, bio, &cell2)) {
1189                         cell_defer_no_holder(tc, cell);
1190                         break;
1191                 }
1192
1193                 if (io_overlaps_block(pool, bio)) {
1194                         /*
1195                          * IO may still be going to the destination block.  We must
1196                          * quiesce before we can do the removal.
1197                          */
1198                         m = get_next_mapping(pool);
1199                         m->tc = tc;
1200                         m->pass_discard = pool->pf.discard_passdown;
1201                         m->definitely_not_shared = !lookup_result.shared;
1202                         m->virt_block = block;
1203                         m->data_block = lookup_result.block;
1204                         m->cell = cell;
1205                         m->cell2 = cell2;
1206                         m->bio = bio;
1207
1208                         if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
1209                                 spin_lock_irqsave(&pool->lock, flags);
1210                                 list_add_tail(&m->list, &pool->prepared_discards);
1211                                 spin_unlock_irqrestore(&pool->lock, flags);
1212                                 wake_worker(pool);
1213                         }
1214                 } else {
1215                         inc_all_io_entry(pool, bio);
1216                         cell_defer_no_holder(tc, cell);
1217                         cell_defer_no_holder(tc, cell2);
1218
1219                         /*
1220                          * The DM core makes sure that the discard doesn't span
1221                          * a block boundary.  So we submit the discard of a
1222                          * partial block appropriately.
1223                          */
1224                         if ((!lookup_result.shared) && pool->pf.discard_passdown)
1225                                 remap_and_issue(tc, bio, lookup_result.block);
1226                         else
1227                                 bio_endio(bio, 0);
1228                 }
1229                 break;
1230
1231         case -ENODATA:
1232                 /*
1233                  * It isn't provisioned, just forget it.
1234                  */
1235                 cell_defer_no_holder(tc, cell);
1236                 bio_endio(bio, 0);
1237                 break;
1238
1239         default:
1240                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1241                             __func__, r);
1242                 cell_defer_no_holder(tc, cell);
1243                 bio_io_error(bio);
1244                 break;
1245         }
1246 }
1247
1248 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1249                           struct dm_cell_key *key,
1250                           struct dm_thin_lookup_result *lookup_result,
1251                           struct dm_bio_prison_cell *cell)
1252 {
1253         int r;
1254         dm_block_t data_block;
1255         struct pool *pool = tc->pool;
1256
1257         r = alloc_data_block(tc, &data_block);
1258         switch (r) {
1259         case 0:
1260                 schedule_internal_copy(tc, block, lookup_result->block,
1261                                        data_block, cell, bio);
1262                 break;
1263
1264         case -ENOSPC:
1265                 retry_bios_on_resume(pool, cell);
1266                 break;
1267
1268         default:
1269                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1270                             __func__, r);
1271                 cell_error(pool, cell);
1272                 break;
1273         }
1274 }
1275
1276 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1277                                dm_block_t block,
1278                                struct dm_thin_lookup_result *lookup_result)
1279 {
1280         struct dm_bio_prison_cell *cell;
1281         struct pool *pool = tc->pool;
1282         struct dm_cell_key key;
1283
1284         /*
1285          * If cell is already occupied, then sharing is already in the process
1286          * of being broken so we have nothing further to do here.
1287          */
1288         build_data_key(tc->td, lookup_result->block, &key);
1289         if (bio_detain(pool, &key, bio, &cell))
1290                 return;
1291
1292         if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
1293                 break_sharing(tc, bio, block, &key, lookup_result, cell);
1294         else {
1295                 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1296
1297                 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1298                 inc_all_io_entry(pool, bio);
1299                 cell_defer_no_holder(tc, cell);
1300
1301                 remap_and_issue(tc, bio, lookup_result->block);
1302         }
1303 }
1304
1305 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1306                             struct dm_bio_prison_cell *cell)
1307 {
1308         int r;
1309         dm_block_t data_block;
1310         struct pool *pool = tc->pool;
1311
1312         /*
1313          * Remap empty bios (flushes) immediately, without provisioning.
1314          */
1315         if (!bio->bi_iter.bi_size) {
1316                 inc_all_io_entry(pool, bio);
1317                 cell_defer_no_holder(tc, cell);
1318
1319                 remap_and_issue(tc, bio, 0);
1320                 return;
1321         }
1322
1323         /*
1324          * Fill read bios with zeroes and complete them immediately.
1325          */
1326         if (bio_data_dir(bio) == READ) {
1327                 zero_fill_bio(bio);
1328                 cell_defer_no_holder(tc, cell);
1329                 bio_endio(bio, 0);
1330                 return;
1331         }
1332
1333         r = alloc_data_block(tc, &data_block);
1334         switch (r) {
1335         case 0:
1336                 if (tc->origin_dev)
1337                         schedule_external_copy(tc, block, data_block, cell, bio);
1338                 else
1339                         schedule_zero(tc, block, data_block, cell, bio);
1340                 break;
1341
1342         case -ENOSPC:
1343                 retry_bios_on_resume(pool, cell);
1344                 break;
1345
1346         default:
1347                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1348                             __func__, r);
1349                 cell_error(pool, cell);
1350                 break;
1351         }
1352 }
1353
1354 static void process_bio(struct thin_c *tc, struct bio *bio)
1355 {
1356         int r;
1357         struct pool *pool = tc->pool;
1358         dm_block_t block = get_bio_block(tc, bio);
1359         struct dm_bio_prison_cell *cell;
1360         struct dm_cell_key key;
1361         struct dm_thin_lookup_result lookup_result;
1362
1363         /*
1364          * If cell is already occupied, then the block is already
1365          * being provisioned so we have nothing further to do here.
1366          */
1367         build_virtual_key(tc->td, block, &key);
1368         if (bio_detain(pool, &key, bio, &cell))
1369                 return;
1370
1371         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1372         switch (r) {
1373         case 0:
1374                 if (lookup_result.shared) {
1375                         process_shared_bio(tc, bio, block, &lookup_result);
1376                         cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
1377                 } else {
1378                         inc_all_io_entry(pool, bio);
1379                         cell_defer_no_holder(tc, cell);
1380
1381                         remap_and_issue(tc, bio, lookup_result.block);
1382                 }
1383                 break;
1384
1385         case -ENODATA:
1386                 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1387                         inc_all_io_entry(pool, bio);
1388                         cell_defer_no_holder(tc, cell);
1389
1390                         if (bio_end_sector(bio) <= tc->origin_size)
1391                                 remap_to_origin_and_issue(tc, bio);
1392
1393                         else if (bio->bi_iter.bi_sector < tc->origin_size) {
1394                                 zero_fill_bio(bio);
1395                                 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1396                                 remap_to_origin_and_issue(tc, bio);
1397
1398                         } else {
1399                                 zero_fill_bio(bio);
1400                                 bio_endio(bio, 0);
1401                         }
1402                 } else
1403                         provision_block(tc, bio, block, cell);
1404                 break;
1405
1406         default:
1407                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1408                             __func__, r);
1409                 cell_defer_no_holder(tc, cell);
1410                 bio_io_error(bio);
1411                 break;
1412         }
1413 }
1414
1415 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1416 {
1417         int r;
1418         int rw = bio_data_dir(bio);
1419         dm_block_t block = get_bio_block(tc, bio);
1420         struct dm_thin_lookup_result lookup_result;
1421
1422         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1423         switch (r) {
1424         case 0:
1425                 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
1426                         handle_unserviceable_bio(tc->pool, bio);
1427                 else {
1428                         inc_all_io_entry(tc->pool, bio);
1429                         remap_and_issue(tc, bio, lookup_result.block);
1430                 }
1431                 break;
1432
1433         case -ENODATA:
1434                 if (rw != READ) {
1435                         handle_unserviceable_bio(tc->pool, bio);
1436                         break;
1437                 }
1438
1439                 if (tc->origin_dev) {
1440                         inc_all_io_entry(tc->pool, bio);
1441                         remap_to_origin_and_issue(tc, bio);
1442                         break;
1443                 }
1444
1445                 zero_fill_bio(bio);
1446                 bio_endio(bio, 0);
1447                 break;
1448
1449         default:
1450                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1451                             __func__, r);
1452                 bio_io_error(bio);
1453                 break;
1454         }
1455 }
1456
/*
 * Bio handler that completes every bio successfully without touching
 * the device.  @tc is unused.
 */
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
        bio_endio(bio, 0);
}
1461
/*
 * Bio handler that fails every bio via bio_io_error().  @tc is unused.
 */
static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
        bio_io_error(bio);
}
1466
1467 /*
1468  * FIXME: should we also commit due to size of transaction, measured in
1469  * metadata blocks?
1470  */
1471 static int need_commit_due_to_time(struct pool *pool)
1472 {
1473         return jiffies < pool->last_commit_jiffies ||
1474                jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1475 }
1476
/* Map an rb_node embedded in a dm_thin_endio_hook back to the hook. */
#define thin_pbd(n) rb_entry((n), struct dm_thin_endio_hook, rb_node)
/* Map a dm_thin_endio_hook (per-bio data) back to the bio that owns it. */
#define thin_bio(hook) dm_bio_from_per_bio_data((hook), sizeof(struct dm_thin_endio_hook))
1479
1480 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1481 {
1482         struct rb_node **rbp, *parent;
1483         struct dm_thin_endio_hook *pbd;
1484         sector_t bi_sector = bio->bi_iter.bi_sector;
1485
1486         rbp = &tc->sort_bio_list.rb_node;
1487         parent = NULL;
1488         while (*rbp) {
1489                 parent = *rbp;
1490                 pbd = thin_pbd(parent);
1491
1492                 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1493                         rbp = &(*rbp)->rb_left;
1494                 else
1495                         rbp = &(*rbp)->rb_right;
1496         }
1497
1498         pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1499         rb_link_node(&pbd->rb_node, parent, rbp);
1500         rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1501 }
1502
1503 static void __extract_sorted_bios(struct thin_c *tc)
1504 {
1505         struct rb_node *node;
1506         struct dm_thin_endio_hook *pbd;
1507         struct bio *bio;
1508
1509         for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1510                 pbd = thin_pbd(node);
1511                 bio = thin_bio(pbd);
1512
1513                 bio_list_add(&tc->deferred_bio_list, bio);
1514                 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1515         }
1516
1517         WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1518 }
1519
1520 static void __sort_thin_deferred_bios(struct thin_c *tc)
1521 {
1522         struct bio *bio;
1523         struct bio_list bios;
1524
1525         bio_list_init(&bios);
1526         bio_list_merge(&bios, &tc->deferred_bio_list);
1527         bio_list_init(&tc->deferred_bio_list);
1528
1529         /* Sort deferred_bio_list using rb-tree */
1530         while ((bio = bio_list_pop(&bios)))
1531                 __thin_bio_rb_add(tc, bio);
1532
1533         /*
1534          * Transfer the sorted bios in sort_bio_list back to
1535          * deferred_bio_list to allow lockless submission of
1536          * all bios.
1537          */
1538         __extract_sorted_bios(tc);
1539 }
1540
1541 static void process_thin_deferred_bios(struct thin_c *tc)
1542 {
1543         struct pool *pool = tc->pool;
1544         unsigned long flags;
1545         struct bio *bio;
1546         struct bio_list bios;
1547         struct blk_plug plug;
1548
1549         if (tc->requeue_mode) {
1550                 requeue_bio_list(tc, &tc->deferred_bio_list);
1551                 return;
1552         }
1553
1554         bio_list_init(&bios);
1555
1556         spin_lock_irqsave(&tc->lock, flags);
1557
1558         if (bio_list_empty(&tc->deferred_bio_list)) {
1559                 spin_unlock_irqrestore(&tc->lock, flags);
1560                 return;
1561         }
1562
1563         __sort_thin_deferred_bios(tc);
1564
1565         bio_list_merge(&bios, &tc->deferred_bio_list);
1566         bio_list_init(&tc->deferred_bio_list);
1567
1568         spin_unlock_irqrestore(&tc->lock, flags);
1569
1570         blk_start_plug(&plug);
1571         while ((bio = bio_list_pop(&bios))) {
1572                 /*
1573                  * If we've got no free new_mapping structs, and processing
1574                  * this bio might require one, we pause until there are some
1575                  * prepared mappings to process.
1576                  */
1577                 if (ensure_next_mapping(pool)) {
1578                         spin_lock_irqsave(&tc->lock, flags);
1579                         bio_list_add(&tc->deferred_bio_list, bio);
1580                         bio_list_merge(&tc->deferred_bio_list, &bios);
1581                         spin_unlock_irqrestore(&tc->lock, flags);
1582                         break;
1583                 }
1584
1585                 if (bio->bi_rw & REQ_DISCARD)
1586                         pool->process_discard(tc, bio);
1587                 else
1588                         pool->process_bio(tc, bio);
1589         }
1590         blk_finish_plug(&plug);
1591 }
1592
1593 static void thin_get(struct thin_c *tc);
1594 static void thin_put(struct thin_c *tc);
1595
1596 /*
1597  * We can't hold rcu_read_lock() around code that can block.  So we
1598  * find a thin with the rcu lock held; bump a refcount; then drop
1599  * the lock.
1600  */
1601 static struct thin_c *get_first_thin(struct pool *pool)
1602 {
1603         struct thin_c *tc = NULL;
1604
1605         rcu_read_lock();
1606         if (!list_empty(&pool->active_thins)) {
1607                 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
1608                 thin_get(tc);
1609         }
1610         rcu_read_unlock();
1611
1612         return tc;
1613 }
1614
1615 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
1616 {
1617         struct thin_c *old_tc = tc;
1618
1619         rcu_read_lock();
1620         list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
1621                 thin_get(tc);
1622                 thin_put(old_tc);
1623                 rcu_read_unlock();
1624                 return tc;
1625         }
1626         thin_put(old_tc);
1627         rcu_read_unlock();
1628
1629         return NULL;
1630 }
1631
1632 static void process_deferred_bios(struct pool *pool)
1633 {
1634         unsigned long flags;
1635         struct bio *bio;
1636         struct bio_list bios;
1637         struct thin_c *tc;
1638
1639         tc = get_first_thin(pool);
1640         while (tc) {
1641                 process_thin_deferred_bios(tc);
1642                 tc = get_next_thin(pool, tc);
1643         }
1644
1645         /*
1646          * If there are any deferred flush bios, we must commit
1647          * the metadata before issuing them.
1648          */
1649         bio_list_init(&bios);
1650         spin_lock_irqsave(&pool->lock, flags);
1651         bio_list_merge(&bios, &pool->deferred_flush_bios);
1652         bio_list_init(&pool->deferred_flush_bios);
1653         spin_unlock_irqrestore(&pool->lock, flags);
1654
1655         if (bio_list_empty(&bios) &&
1656             !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1657                 return;
1658
1659         if (commit(pool)) {
1660                 while ((bio = bio_list_pop(&bios)))
1661                         bio_io_error(bio);
1662                 return;
1663         }
1664         pool->last_commit_jiffies = jiffies;
1665
1666         while ((bio = bio_list_pop(&bios)))
1667                 generic_make_request(bio);
1668 }
1669
1670 static void do_worker(struct work_struct *ws)
1671 {
1672         struct pool *pool = container_of(ws, struct pool, worker);
1673
1674         process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1675         process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1676         process_deferred_bios(pool);
1677 }
1678
1679 /*
1680  * We want to commit periodically so that not too much
1681  * unwritten data builds up.
1682  */
1683 static void do_waker(struct work_struct *ws)
1684 {
1685         struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1686         wake_worker(pool);
1687         queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1688 }
1689
1690 /*
1691  * We're holding onto IO to allow userland time to react.  After the
1692  * timeout either the pool will have been resized (and thus back in
1693  * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
1694  */
1695 static void do_no_space_timeout(struct work_struct *ws)
1696 {
1697         struct pool *pool = container_of(to_delayed_work(ws), struct pool,
1698                                          no_space_timeout);
1699
1700         if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
1701                 set_pool_mode(pool, PM_READ_ONLY);
1702 }
1703
1704 /*----------------------------------------------------------------*/
1705
1706 struct pool_work {
1707         struct work_struct worker;
1708         struct completion complete;
1709 };
1710
1711 static struct pool_work *to_pool_work(struct work_struct *ws)
1712 {
1713         return container_of(ws, struct pool_work, worker);
1714 }
1715
1716 static void pool_work_complete(struct pool_work *pw)
1717 {
1718         complete(&pw->complete);
1719 }
1720
1721 static void pool_work_wait(struct pool_work *pw, struct pool *pool,
1722                            void (*fn)(struct work_struct *))
1723 {
1724         INIT_WORK_ONSTACK(&pw->worker, fn);
1725         init_completion(&pw->complete);
1726         queue_work(pool->wq, &pw->worker);
1727         wait_for_completion(&pw->complete);
1728 }
1729
1730 /*----------------------------------------------------------------*/
1731
1732 struct noflush_work {
1733         struct pool_work pw;
1734         struct thin_c *tc;
1735 };
1736
1737 static struct noflush_work *to_noflush(struct work_struct *ws)
1738 {
1739         return container_of(to_pool_work(ws), struct noflush_work, pw);
1740 }
1741
1742 static void do_noflush_start(struct work_struct *ws)
1743 {
1744         struct noflush_work *w = to_noflush(ws);
1745         w->tc->requeue_mode = true;
1746         requeue_io(w->tc);
1747         pool_work_complete(&w->pw);
1748 }
1749
1750 static void do_noflush_stop(struct work_struct *ws)
1751 {
1752         struct noflush_work *w = to_noflush(ws);
1753         w->tc->requeue_mode = false;
1754         pool_work_complete(&w->pw);
1755 }
1756
1757 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1758 {
1759         struct noflush_work w;
1760
1761         w.tc = tc;
1762         pool_work_wait(&w.pw, tc->pool, fn);
1763 }
1764
1765 /*----------------------------------------------------------------*/
1766
1767 static enum pool_mode get_pool_mode(struct pool *pool)
1768 {
1769         return pool->pf.mode;
1770 }
1771
1772 static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1773 {
1774         dm_table_event(pool->ti->table);
1775         DMINFO("%s: switching pool to %s mode",
1776                dm_device_name(pool->pool_md), new_mode);
1777 }
1778
/*
 * Move the pool to @new_mode and install the process_* handlers that
 * belong to that mode.
 *
 * The requested mode is not always honoured:
 *  - PM_WRITE is refused while the metadata's needs_check flag is set;
 *  - a pool that is already in PM_FAIL stays in PM_FAIL.
 *
 * A mode change notification (table event + log line) is only sent
 * when the effective mode differs from the old one, but the handlers
 * are (re)installed on every call.  The effective mode is stored in
 * both pool->pf.mode and the bound pool_c's adjusted_pf.mode.
 */
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
        struct pool_c *pt = pool->ti->private;
        bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
        enum pool_mode old_mode = get_pool_mode(pool);
        /* Module parameter is in seconds; sampled once and converted to jiffies. */
        unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;

        /*
         * Never allow the pool to transition to PM_WRITE mode if user
         * intervention is required to verify metadata and data consistency.
         */
        if (new_mode == PM_WRITE && needs_check) {
                DMERR("%s: unable to switch pool to write mode until repaired.",
                      dm_device_name(pool->pool_md));
                /*
                 * Stay in the old mode if it wasn't PM_WRITE; if it was,
                 * drop to read-only instead.
                 */
                if (old_mode != new_mode)
                        new_mode = old_mode;
                else
                        new_mode = PM_READ_ONLY;
        }
        /*
         * If we were in PM_FAIL mode, rollback of metadata failed.  We're
         * not going to recover without a thin_repair.  So we never let the
         * pool move out of the old mode.
         */
        if (old_mode == PM_FAIL)
                new_mode = old_mode;

        switch (new_mode) {
        case PM_FAIL:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "failure");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_fail;
                pool->process_discard = process_bio_fail;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
                pool->process_prepared_discard = process_prepared_discard_fail;

                error_retry_list(pool);
                break;

        case PM_READ_ONLY:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "read-only");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_bio_success;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
                pool->process_prepared_discard = process_prepared_discard_passdown;

                error_retry_list(pool);
                break;

        case PM_OUT_OF_DATA_SPACE:
                /*
                 * Ideally we'd never hit this state; the low water mark
                 * would trigger userland to extend the pool before we
                 * completely run out of data space.  However, many small
                 * IOs to unprovisioned space can consume data space at an
                 * alarming rate.  Adjust your low water mark if you're
                 * frequently seeing this mode.
                 */
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "out-of-data-space");
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_discard;
                pool->process_prepared_mapping = process_prepared_mapping;
                pool->process_prepared_discard = process_prepared_discard;

                /*
                 * Arm the no-space timer unless the pool errors IO
                 * immediately on no space, or the timeout is zero.
                 */
                if (!pool->pf.error_if_no_space && no_space_timeout)
                        queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
                break;

        case PM_WRITE:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "write");
                dm_pool_metadata_read_write(pool->pmd);
                pool->process_bio = process_bio;
                pool->process_discard = process_discard;
                pool->process_prepared_mapping = process_prepared_mapping;
                pool->process_prepared_discard = process_prepared_discard;
                break;
        }

        pool->pf.mode = new_mode;
        /*
         * The pool mode may have changed, sync it so bind_control_target()
         * doesn't cause an unexpected mode transition on resume.
         */
        pt->adjusted_pf.mode = new_mode;
}
1869
1870 static void abort_transaction(struct pool *pool)
1871 {
1872         const char *dev_name = dm_device_name(pool->pool_md);
1873
1874         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1875         if (dm_pool_abort_metadata(pool->pmd)) {
1876                 DMERR("%s: failed to abort metadata transaction", dev_name);
1877                 set_pool_mode(pool, PM_FAIL);
1878         }
1879
1880         if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1881                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1882                 set_pool_mode(pool, PM_FAIL);
1883         }
1884 }
1885
1886 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1887 {
1888         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1889                     dm_device_name(pool->pool_md), op, r);
1890
1891         abort_transaction(pool);
1892         set_pool_mode(pool, PM_READ_ONLY);
1893 }
1894
1895 /*----------------------------------------------------------------*/
1896
1897 /*
1898  * Mapping functions.
1899  */
1900
1901 /*
1902  * Called only while mapping a thin bio to hand it over to the workqueue.
1903  */
1904 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1905 {
1906         unsigned long flags;
1907         struct pool *pool = tc->pool;
1908
1909         spin_lock_irqsave(&tc->lock, flags);
1910         bio_list_add(&tc->deferred_bio_list, bio);
1911         spin_unlock_irqrestore(&tc->lock, flags);
1912
1913         wake_worker(pool);
1914 }
1915
1916 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1917 {
1918         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1919
1920         h->tc = tc;
1921         h->shared_read_entry = NULL;
1922         h->all_io_entry = NULL;
1923         h->overwrite_mapping = NULL;
1924 }
1925
/*
 * Non-blocking function called from the thin target's map function.
 *
 * Returns DM_MAPIO_REMAPPED when the bio was remapped to an existing,
 * unshared data block, and DM_MAPIO_SUBMITTED in every other case
 * (bio requeued, errored, deferred to the worker, or held in a prison
 * cell).
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio)
{
        int r;
        struct thin_c *tc = ti->private;
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_thin_device *td = tc->td;
        struct dm_thin_lookup_result result;
        /* Cells live on the stack, hence the *_no_free release calls below. */
        struct dm_bio_prison_cell cell1, cell2;
        struct dm_bio_prison_cell *cell_result;
        struct dm_cell_key key;

        thin_hook_bio(tc, bio);

        /* requeue_mode is toggled by do_noflush_start()/do_noflush_stop(). */
        if (tc->requeue_mode) {
                bio_endio(bio, DM_ENDIO_REQUEUE);
                return DM_MAPIO_SUBMITTED;
        }

        if (get_pool_mode(tc->pool) == PM_FAIL) {
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
        }

        /* Discards, flushes and FUA writes always go through the worker. */
        if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;
        }

        /*
         * We must hold the virtual cell before doing the lookup, otherwise
         * there's a race with discard.
         */
        build_virtual_key(tc->td, block, &key);
        /* NOTE(review): non-zero presumably means the bio joined an existing cell - confirm. */
        if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
                return DM_MAPIO_SUBMITTED;

        /* Third argument 0: NOTE(review) presumably "must not block" - confirm against dm-thin-metadata. */
        r = dm_thin_find_block(td, block, 0, &result);

        /*
         * Note that we defer readahead too.
         */
        switch (r) {
        case 0:
                if (unlikely(result.shared)) {
                        /*
                         * We have a race condition here between the
                         * result.shared value returned by the lookup and
                         * snapshot creation, which may cause new
                         * sharing.
                         *
                         * To avoid this always quiesce the origin before
                         * taking the snap.  You want to do this anyway to
                         * ensure a consistent application view
                         * (i.e. lockfs).
                         *
                         * More distant ancestors are irrelevant. The
                         * shared flag will be set in their case.
                         */
                        thin_defer_bio(tc, bio);
                        cell_defer_no_holder_no_free(tc, &cell1);
                        return DM_MAPIO_SUBMITTED;
                }

                build_data_key(tc->td, result.block, &key);
                if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
                        cell_defer_no_holder_no_free(tc, &cell1);
                        return DM_MAPIO_SUBMITTED;
                }

                /* Both cells are released before the bio is remapped. */
                inc_all_io_entry(tc->pool, bio);
                cell_defer_no_holder_no_free(tc, &cell2);
                cell_defer_no_holder_no_free(tc, &cell1);

                remap(tc, bio, result.block);
                return DM_MAPIO_REMAPPED;

        case -ENODATA:
        case -EWOULDBLOCK:
                /*
                 * In future, the failed dm_thin_find_block above could
                 * provide the hint to load the metadata into cache.
                 */
                thin_defer_bio(tc, bio);
                cell_defer_no_holder_no_free(tc, &cell1);
                return DM_MAPIO_SUBMITTED;

        default:
                /*
                 * Must always call bio_io_error on failure.
                 * dm_thin_find_block can fail with -EINVAL if the
                 * pool is switched to fail-io mode.
                 */
                bio_io_error(bio);
                cell_defer_no_holder_no_free(tc, &cell1);
                return DM_MAPIO_SUBMITTED;
        }
}
2026
2027 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2028 {
2029         struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
2030         struct request_queue *q;
2031
2032         if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
2033                 return 1;
2034
2035         q = bdev_get_queue(pt->data_dev->bdev);
2036         return bdi_congested(&q->backing_dev_info, bdi_bits);
2037 }
2038
2039 static void requeue_bios(struct pool *pool)
2040 {
2041         unsigned long flags;
2042         struct thin_c *tc;
2043
2044         rcu_read_lock();
2045         list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2046                 spin_lock_irqsave(&tc->lock, flags);
2047                 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2048                 bio_list_init(&tc->retry_on_resume_list);
2049                 spin_unlock_irqrestore(&tc->lock, flags);
2050         }
2051         rcu_read_unlock();
2052 }
2053
2054 /*----------------------------------------------------------------
2055  * Binding of control targets to a pool object
2056  *--------------------------------------------------------------*/
2057 static bool data_dev_supports_discard(struct pool_c *pt)
2058 {
2059         struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2060
2061         return q && blk_queue_discard(q);
2062 }
2063
2064 static bool is_factor(sector_t block_size, uint32_t n)
2065 {
2066         return !sector_div(block_size, n);
2067 }
2068
2069 /*
2070  * If discard_passdown was enabled verify that the data device
2071  * supports discards.  Disable discard_passdown if not.
2072  */
2073 static void disable_passdown_if_not_supported(struct pool_c *pt)
2074 {
2075         struct pool *pool = pt->pool;
2076         struct block_device *data_bdev = pt->data_dev->bdev;
2077         struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
2078         sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
2079         const char *reason = NULL;
2080         char buf[BDEVNAME_SIZE];
2081
2082         if (!pt->adjusted_pf.discard_passdown)
2083                 return;
2084
2085         if (!data_dev_supports_discard(pt))
2086                 reason = "discard unsupported";
2087
2088         else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2089                 reason = "max discard sectors smaller than a block";
2090
2091         else if (data_limits->discard_granularity > block_size)
2092                 reason = "discard granularity larger than a block";
2093
2094         else if (!is_factor(block_size, data_limits->discard_granularity))
2095                 reason = "discard granularity not a factor of block size";
2096
2097         if (reason) {
2098                 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
2099                 pt->adjusted_pf.discard_passdown = false;
2100         }
2101 }
2102
2103 static int bind_control_target(struct pool *pool, struct dm_target *ti)
2104 {
2105         struct pool_c *pt = ti->private;
2106
2107         /*
2108          * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2109          */
2110         enum pool_mode old_mode = get_pool_mode(pool);
2111         enum pool_mode new_mode = pt->adjusted_pf.mode;
2112
2113         /*
2114          * Don't change the pool's mode until set_pool_mode() below.
2115          * Otherwise the pool's process_* function pointers may
2116          * not match the desired pool mode.
2117          */
2118         pt->adjusted_pf.mode = old_mode;
2119
2120         pool->ti = ti;
2121         pool->pf = pt->adjusted_pf;
2122         pool->low_water_blocks = pt->low_water_blocks;
2123
2124         set_pool_mode(pool, new_mode);
2125
2126         return 0;
2127 }
2128
2129 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2130 {
2131         if (pool->ti == ti)
2132                 pool->ti = NULL;
2133 }
2134
2135 /*----------------------------------------------------------------
2136  * Pool creation
2137  *--------------------------------------------------------------*/
2138 /* Initialize pool features. */
2139 static void pool_features_init(struct pool_features *pf)
2140 {
2141         pf->mode = PM_WRITE;
2142         pf->zero_new_blocks = true;
2143         pf->discard_enabled = true;
2144         pf->discard_passdown = true;
2145         pf->error_if_no_space = false;
2146 }
2147
/*
 * Tear down a pool: remove it from the pool table, close its metadata
 * and release every resource set up by pool_create().  Called from
 * __pool_dec() when the reference count drops to zero.
 */
static void __pool_destroy(struct pool *pool)
{
        __pool_table_remove(pool);

        /* A close failure is only logged; teardown continues regardless. */
        if (dm_pool_metadata_close(pool->pmd) < 0)
                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

        dm_bio_prison_destroy(pool->prison);
        dm_kcopyd_client_destroy(pool->copier);

        if (pool->wq)
                destroy_workqueue(pool->wq);

        /* Return a still-held preallocated mapping before destroying its mempool. */
        if (pool->next_mapping)
                mempool_free(pool->next_mapping, pool->mapping_pool);
        mempool_destroy(pool->mapping_pool);
        dm_deferred_set_destroy(pool->shared_read_ds);
        dm_deferred_set_destroy(pool->all_io_ds);
        kfree(pool);
}
2168
/* Slab cache that backs each pool's mapping_pool mempool (see pool_create()). */
static struct kmem_cache *_new_mapping_cache;
2170
/*
 * Allocate and initialise a new pool for @pool_md backed by
 * @metadata_dev, and insert it into the pool table.
 *
 * @block_size: data block size in sectors
 * @read_only:  when non-zero the metadata device is opened without
 *              permission to format it
 * @error:      set to a static description on failure
 *
 * Returns the new pool with ref_count == 1, or an ERR_PTR() on
 * failure, in which case everything acquired so far has been released
 * again (including the metadata object).
 *
 * NOTE(review): the pool is allocated with kmalloc(), so members not
 * assigned below (e.g. pool->ti, which is only set later by
 * bind_control_target()) hold indeterminate values until then.
 */
static struct pool *pool_create(struct mapped_device *pool_md,
                                struct block_device *metadata_dev,
                                unsigned long block_size,
                                int read_only, char **error)
{
        int r;
        void *err_p;
        struct pool *pool;
        struct dm_pool_metadata *pmd;
        bool format_device = read_only ? false : true;

        pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
        if (IS_ERR(pmd)) {
                *error = "Error creating metadata object";
                return (struct pool *)pmd;
        }

        pool = kmalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool) {
                *error = "Error allocating memory for pool";
                err_p = ERR_PTR(-ENOMEM);
                goto bad_pool;
        }

        pool->pmd = pmd;
        pool->sectors_per_block = block_size;
        /* A shift of -1 marks a block size that is not a power of two. */
        if (block_size & (block_size - 1))
                pool->sectors_per_block_shift = -1;
        else
                pool->sectors_per_block_shift = __ffs(block_size);
        pool->low_water_blocks = 0;
        pool_features_init(&pool->pf);
        pool->prison = dm_bio_prison_create(PRISON_CELLS);
        if (!pool->prison) {
                *error = "Error creating pool's bio prison";
                err_p = ERR_PTR(-ENOMEM);
                goto bad_prison;
        }

        pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
        if (IS_ERR(pool->copier)) {
                r = PTR_ERR(pool->copier);
                *error = "Error creating pool's kcopyd client";
                err_p = ERR_PTR(r);
                goto bad_kcopyd_client;
        }

        /*
         * Create singlethreaded workqueue that will service all devices
         * that use this metadata.
         */
        pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
        if (!pool->wq) {
                *error = "Error creating pool's workqueue";
                err_p = ERR_PTR(-ENOMEM);
                goto bad_wq;
        }

        INIT_WORK(&pool->worker, do_worker);
        INIT_DELAYED_WORK(&pool->waker, do_waker);
        INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
        spin_lock_init(&pool->lock);
        bio_list_init(&pool->deferred_flush_bios);
        INIT_LIST_HEAD(&pool->prepared_mappings);
        INIT_LIST_HEAD(&pool->prepared_discards);
        INIT_LIST_HEAD(&pool->active_thins);
        pool->low_water_triggered = false;

        pool->shared_read_ds = dm_deferred_set_create();
        if (!pool->shared_read_ds) {
                *error = "Error creating pool's shared read deferred set";
                err_p = ERR_PTR(-ENOMEM);
                goto bad_shared_read_ds;
        }

        pool->all_io_ds = dm_deferred_set_create();
        if (!pool->all_io_ds) {
                *error = "Error creating pool's all io deferred set";
                err_p = ERR_PTR(-ENOMEM);
                goto bad_all_io_ds;
        }

        pool->next_mapping = NULL;
        pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
                                                      _new_mapping_cache);
        if (!pool->mapping_pool) {
                *error = "Error creating pool's mapping mempool";
                err_p = ERR_PTR(-ENOMEM);
                goto bad_mapping_pool;
        }

        /* The creator holds the initial reference. */
        pool->ref_count = 1;
        pool->last_commit_jiffies = jiffies;
        pool->pool_md = pool_md;
        pool->md_dev = metadata_dev;
        __pool_table_insert(pool);

        return pool;

        /* Error unwinding: each label releases what was acquired before its goto. */
bad_mapping_pool:
        dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
        dm_deferred_set_destroy(pool->shared_read_ds);
bad_shared_read_ds:
        destroy_workqueue(pool->wq);
bad_wq:
        dm_kcopyd_client_destroy(pool->copier);
bad_kcopyd_client:
        dm_bio_prison_destroy(pool->prison);
bad_prison:
        kfree(pool);
bad_pool:
        if (dm_pool_metadata_close(pmd))
                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

        return err_p;
}
2288
2289 static void __pool_inc(struct pool *pool)
2290 {
2291         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2292         pool->ref_count++;
2293 }
2294
2295 static void __pool_dec(struct pool *pool)
2296 {
2297         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2298         BUG_ON(!pool->ref_count);
2299         if (!--pool->ref_count)
2300                 __pool_destroy(pool);
2301 }
2302
2303 static struct pool *__pool_find(struct mapped_device *pool_md,
2304                                 struct block_device *metadata_dev,
2305                                 unsigned long block_size, int read_only,
2306                                 char **error, int *created)
2307 {
2308         struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2309
2310         if (pool) {
2311                 if (pool->pool_md != pool_md) {
2312                         *error = "metadata device already in use by a pool";
2313                         return ERR_PTR(-EBUSY);
2314                 }
2315                 __pool_inc(pool);
2316
2317         } else {
2318                 pool = __pool_table_lookup(pool_md);
2319                 if (pool) {
2320                         if (pool->md_dev != metadata_dev) {
2321                                 *error = "different pool cannot replace a pool";
2322                                 return ERR_PTR(-EINVAL);
2323                         }
2324                         __pool_inc(pool);
2325
2326                 } else {
2327                         pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
2328                         *created = 1;
2329                 }
2330         }
2331
2332         return pool;
2333 }
2334
2335 /*----------------------------------------------------------------
2336  * Pool target methods
2337  *--------------------------------------------------------------*/
2338 static void pool_dtr(struct dm_target *ti)
2339 {
2340         struct pool_c *pt = ti->private;
2341
2342         mutex_lock(&dm_thin_pool_table.mutex);
2343
2344         unbind_control_target(pt->pool, ti);
2345         __pool_dec(pt->pool);
2346         dm_put_device(ti, pt->metadata_dev);
2347         dm_put_device(ti, pt->data_dev);
2348         kfree(pt);
2349
2350         mutex_unlock(&dm_thin_pool_table.mutex);
2351 }
2352
2353 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2354                                struct dm_target *ti)
2355 {
2356         int r;
2357         unsigned argc;
2358         const char *arg_name;
2359
2360         static struct dm_arg _args[] = {
2361                 {0, 4, "Invalid number of pool feature arguments"},
2362         };
2363
2364         /*
2365          * No feature arguments supplied.
2366          */
2367         if (!as->argc)
2368                 return 0;
2369
2370         r = dm_read_arg_group(_args, as, &argc, &ti->error);
2371         if (r)
2372                 return -EINVAL;
2373
2374         while (argc && !r) {
2375                 arg_name = dm_shift_arg(as);
2376                 argc--;
2377
2378                 if (!strcasecmp(arg_name, "skip_block_zeroing"))
2379                         pf->zero_new_blocks = false;
2380
2381                 else if (!strcasecmp(arg_name, "ignore_discard"))
2382                         pf->discard_enabled = false;
2383
2384                 else if (!strcasecmp(arg_name, "no_discard_passdown"))
2385                         pf->discard_passdown = false;
2386
2387                 else if (!strcasecmp(arg_name, "read_only"))
2388                         pf->mode = PM_READ_ONLY;
2389
2390                 else if (!strcasecmp(arg_name, "error_if_no_space"))
2391                         pf->error_if_no_space = true;
2392
2393                 else {
2394                         ti->error = "Unrecognised pool feature requested";
2395                         r = -EINVAL;
2396                         break;
2397                 }
2398         }
2399
2400         return r;
2401 }
2402
2403 static void metadata_low_callback(void *context)
2404 {
2405         struct pool *pool = context;
2406
2407         DMWARN("%s: reached low water mark for metadata device: sending event.",
2408                dm_device_name(pool->pool_md));
2409
2410         dm_table_event(pool->ti->table);
2411 }
2412
2413 static sector_t get_dev_size(struct block_device *bdev)
2414 {
2415         return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2416 }
2417
2418 static void warn_if_metadata_device_too_big(struct block_device *bdev)
2419 {
2420         sector_t metadata_dev_size = get_dev_size(bdev);
2421         char buffer[BDEVNAME_SIZE];
2422
2423         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2424                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2425                        bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
2426 }
2427
2428 static sector_t get_metadata_dev_size(struct block_device *bdev)
2429 {
2430         sector_t metadata_dev_size = get_dev_size(bdev);
2431
2432         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2433                 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
2434
2435         return metadata_dev_size;
2436 }
2437
2438 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2439 {
2440         sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2441
2442         sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
2443
2444         return metadata_dev_size;
2445 }
2446
2447 /*
2448  * When a metadata threshold is crossed a dm event is triggered, and
2449  * userland should respond by growing the metadata device.  We could let
2450  * userland set the threshold, like we do with the data threshold, but I'm
2451  * not sure they know enough to do this well.
2452  */
2453 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2454 {
2455         /*
2456          * 4M is ample for all ops with the possible exception of thin
2457          * device deletion which is harmless if it fails (just retry the
2458          * delete after you've grown the device).
2459          */
2460         dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2461         return min((dm_block_t)1024ULL /* 4M */, quarter);
2462 }
2463
2464 /*
2465  * thin-pool <metadata dev> <data dev>
2466  *           <data block size (sectors)>
2467  *           <low water mark (blocks)>
2468  *           [<#feature args> [<arg>]*]
2469  *
2470  * Optional feature arguments are:
2471  *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
2472  *           ignore_discard: disable discard
2473  *           no_discard_passdown: don't pass discards down to the data device
2474  *           read_only: Don't allow any changes to be made to the pool metadata.
2475  *           error_if_no_space: error IOs, instead of queueing, if no space.
2476  */
2477 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2478 {
2479         int r, pool_created = 0;
2480         struct pool_c *pt;
2481         struct pool *pool;
2482         struct pool_features pf;
2483         struct dm_arg_set as;
2484         struct dm_dev *data_dev;
2485         unsigned long block_size;
2486         dm_block_t low_water_blocks;
2487         struct dm_dev *metadata_dev;
2488         fmode_t metadata_mode;
2489
2490         /*
2491          * FIXME Remove validation from scope of lock.
2492          */
2493         mutex_lock(&dm_thin_pool_table.mutex);
2494
2495         if (argc < 4) {
2496                 ti->error = "Invalid argument count";
2497                 r = -EINVAL;
2498                 goto out_unlock;
2499         }
2500
2501         as.argc = argc;
2502         as.argv = argv;
2503
2504         /*
2505          * Set default pool features.
2506          */
2507         pool_features_init(&pf);
2508
2509         dm_consume_args(&as, 4);
2510         r = parse_pool_features(&as, &pf, ti);
2511         if (r)
2512                 goto out_unlock;
2513
2514         metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2515         r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
2516         if (r) {
2517                 ti->error = "Error opening metadata block device";
2518                 goto out_unlock;
2519         }
2520         warn_if_metadata_device_too_big(metadata_dev->bdev);
2521
2522         r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2523         if (r) {
2524                 ti->error = "Error getting data device";
2525                 goto out_metadata;
2526         }
2527
2528         if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2529             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2530             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2531             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2532                 ti->error = "Invalid block size";
2533                 r = -EINVAL;
2534                 goto out;
2535         }
2536
2537         if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2538                 ti->error = "Invalid low water mark";
2539                 r = -EINVAL;
2540                 goto out;
2541         }
2542
2543         pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2544         if (!pt) {
2545                 r = -ENOMEM;
2546                 goto out;
2547         }
2548
2549         pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
2550                            block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
2551         if (IS_ERR(pool)) {
2552                 r = PTR_ERR(pool);
2553                 goto out_free_pt;
2554         }
2555
2556         /*
2557          * 'pool_created' reflects whether this is the first table load.
2558          * Top level discard support is not allowed to be changed after
2559          * initial load.  This would require a pool reload to trigger thin
2560          * device changes.
2561          */
2562         if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2563                 ti->error = "Discard support cannot be disabled once enabled";
2564                 r = -EINVAL;
2565                 goto out_flags_changed;
2566         }
2567
2568         pt->pool = pool;
2569         pt->ti = ti;
2570         pt->metadata_dev = metadata_dev;
2571         pt->data_dev = data_dev;
2572         pt->low_water_blocks = low_water_blocks;
2573         pt->adjusted_pf = pt->requested_pf = pf;
2574         ti->num_flush_bios = 1;
2575
2576         /*
2577          * Only need to enable discards if the pool should pass
2578          * them down to the data device.  The thin device's discard
2579          * processing will cause mappings to be removed from the btree.
2580          */
2581         ti->discard_zeroes_data_unsupported = true;
2582         if (pf.discard_enabled && pf.discard_passdown) {
2583                 ti->num_discard_bios = 1;
2584
2585                 /*
2586                  * Setting 'discards_supported' circumvents the normal
2587                  * stacking of discard limits (this keeps the pool and
2588                  * thin devices' discard limits consistent).
2589                  */
2590                 ti->discards_supported = true;
2591         }
2592         ti->private = pt;
2593
2594         r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2595                                                 calc_metadata_threshold(pt),
2596                                                 metadata_low_callback,
2597                                                 pool);
2598         if (r)
2599                 goto out_free_pt;
2600
2601         pt->callbacks.congested_fn = pool_is_congested;
2602         dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2603
2604         mutex_unlock(&dm_thin_pool_table.mutex);
2605
2606         return 0;
2607
2608 out_flags_changed:
2609         __pool_dec(pool);
2610 out_free_pt:
2611         kfree(pt);
2612 out:
2613         dm_put_device(ti, data_dev);
2614 out_metadata:
2615         dm_put_device(ti, metadata_dev);
2616 out_unlock:
2617         mutex_unlock(&dm_thin_pool_table.mutex);
2618
2619         return r;
2620 }
2621
2622 static int pool_map(struct dm_target *ti, struct bio *bio)
2623 {
2624         int r;
2625         struct pool_c *pt = ti->private;
2626         struct pool *pool = pt->pool;
2627         unsigned long flags;
2628
2629         /*
2630          * As this is a singleton target, ti->begin is always zero.
2631          */
2632         spin_lock_irqsave(&pool->lock, flags);
2633         bio->bi_bdev = pt->data_dev->bdev;
2634         r = DM_MAPIO_REMAPPED;
2635         spin_unlock_irqrestore(&pool->lock, flags);
2636
2637         return r;
2638 }
2639
2640 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2641 {
2642         int r;
2643         struct pool_c *pt = ti->private;
2644         struct pool *pool = pt->pool;
2645         sector_t data_size = ti->len;
2646         dm_block_t sb_data_size;
2647
2648         *need_commit = false;
2649
2650         (void) sector_div(data_size, pool->sectors_per_block);
2651
2652         r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2653         if (r) {
2654                 DMERR("%s: failed to retrieve data device size",
2655                       dm_device_name(pool->pool_md));
2656                 return r;
2657         }
2658
2659         if (data_size < sb_data_size) {
2660                 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2661                       dm_device_name(pool->pool_md),
2662                       (unsigned long long)data_size, sb_data_size);
2663                 return -EINVAL;
2664
2665         } else if (data_size > sb_data_size) {
2666                 if (dm_pool_metadata_needs_check(pool->pmd)) {
2667                         DMERR("%s: unable to grow the data device until repaired.",
2668                               dm_device_name(pool->pool_md));
2669                         return 0;
2670                 }
2671
2672                 if (sb_data_size)
2673                         DMINFO("%s: growing the data device from %llu to %llu blocks",
2674                                dm_device_name(pool->pool_md),
2675                                sb_data_size, (unsigned long long)data_size);
2676                 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2677                 if (r) {
2678                         metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
2679                         return r;
2680                 }
2681
2682                 *need_commit = true;
2683         }
2684
2685         return 0;
2686 }
2687
2688 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2689 {
2690         int r;
2691         struct pool_c *pt = ti->private;
2692         struct pool *pool = pt->pool;
2693         dm_block_t metadata_dev_size, sb_metadata_dev_size;
2694
2695         *need_commit = false;
2696
2697         metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
2698
2699         r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2700         if (r) {
2701                 DMERR("%s: failed to retrieve metadata device size",
2702                       dm_device_name(pool->pool_md));
2703                 return r;
2704         }
2705
2706         if (metadata_dev_size < sb_metadata_dev_size) {
2707                 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2708                       dm_device_name(pool->pool_md),
2709                       metadata_dev_size, sb_metadata_dev_size);
2710                 return -EINVAL;
2711
2712         } else if (metadata_dev_size > sb_metadata_dev_size) {
2713                 if (dm_pool_metadata_needs_check(pool->pmd)) {
2714                         DMERR("%s: unable to grow the metadata device until repaired.",
2715                               dm_device_name(pool->pool_md));
2716                         return 0;
2717                 }
2718
2719                 warn_if_metadata_device_too_big(pool->md_dev);
2720                 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2721                        dm_device_name(pool->pool_md),
2722                        sb_metadata_dev_size, metadata_dev_size);
2723                 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2724                 if (r) {
2725                         metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
2726                         return r;
2727                 }
2728
2729                 *need_commit = true;
2730         }
2731
2732         return 0;
2733 }
2734
2735 /*
2736  * Retrieves the number of blocks of the data device from
2737  * the superblock and compares it to the actual device size,
2738  * thus resizing the data device in case it has grown.
2739  *
2740  * This both copes with opening preallocated data devices in the ctr
2741  * being followed by a resume
2742  * -and-
2743  * calling the resume method individually after userspace has
2744  * grown the data device in reaction to a table event.
2745  */
2746 static int pool_preresume(struct dm_target *ti)
2747 {
2748         int r;
2749         bool need_commit1, need_commit2;
2750         struct pool_c *pt = ti->private;
2751         struct pool *pool = pt->pool;
2752
2753         /*
2754          * Take control of the pool object.
2755          */
2756         r = bind_control_target(pool, ti);
2757         if (r)
2758                 return r;
2759
2760         r = maybe_resize_data_dev(ti, &need_commit1);
2761         if (r)
2762                 return r;
2763
2764         r = maybe_resize_metadata_dev(ti, &need_commit2);
2765         if (r)
2766                 return r;
2767
2768         if (need_commit1 || need_commit2)
2769                 (void) commit(pool);
2770
2771         return 0;
2772 }
2773
2774 static void pool_resume(struct dm_target *ti)
2775 {
2776         struct pool_c *pt = ti->private;
2777         struct pool *pool = pt->pool;
2778         unsigned long flags;
2779
2780         spin_lock_irqsave(&pool->lock, flags);
2781         pool->low_water_triggered = false;
2782         spin_unlock_irqrestore(&pool->lock, flags);
2783         requeue_bios(pool);
2784
2785         do_waker(&pool->waker.work);
2786 }
2787
2788 static void pool_postsuspend(struct dm_target *ti)
2789 {
2790         struct pool_c *pt = ti->private;
2791         struct pool *pool = pt->pool;
2792
2793         cancel_delayed_work(&pool->waker);
2794         cancel_delayed_work(&pool->no_space_timeout);
2795         flush_workqueue(pool->wq);
2796         (void) commit(pool);
2797 }
2798
2799 static int check_arg_count(unsigned argc, unsigned args_required)
2800 {
2801         if (argc != args_required) {
2802                 DMWARN("Message received with %u arguments instead of %u.",
2803                        argc, args_required);
2804                 return -EINVAL;
2805         }
2806
2807         return 0;
2808 }
2809
2810 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2811 {
2812         if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2813             *dev_id <= MAX_DEV_ID)
2814                 return 0;
2815
2816         if (warning)
2817                 DMWARN("Message received with invalid device id: %s", arg);
2818
2819         return -EINVAL;
2820 }
2821
2822 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2823 {
2824         dm_thin_id dev_id;
2825         int r;
2826
2827         r = check_arg_count(argc, 2);
2828         if (r)
2829                 return r;
2830
2831         r = read_dev_id(argv[1], &dev_id, 1);
2832         if (r)
2833                 return r;
2834
2835         r = dm_pool_create_thin(pool->pmd, dev_id);
2836         if (r) {
2837                 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2838                        argv[1]);
2839                 return r;
2840         }
2841
2842         return 0;
2843 }
2844
2845 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2846 {
2847         dm_thin_id dev_id;
2848         dm_thin_id origin_dev_id;
2849         int r;
2850
2851         r = check_arg_count(argc, 3);
2852         if (r)
2853                 return r;
2854
2855         r = read_dev_id(argv[1], &dev_id, 1);
2856         if (r)
2857                 return r;
2858
2859         r = read_dev_id(argv[2], &origin_dev_id, 1);
2860         if (r)
2861                 return r;
2862
2863         r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2864         if (r) {
2865                 DMWARN("Creation of new snapshot %s of device %s failed.",
2866                        argv[1], argv[2]);
2867                 return r;
2868         }
2869
2870         return 0;
2871 }
2872
2873 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2874 {
2875         dm_thin_id dev_id;
2876         int r;
2877
2878         r = check_arg_count(argc, 2);
2879         if (r)
2880                 return r;
2881
2882         r = read_dev_id(argv[1], &dev_id, 1);
2883         if (r)
2884                 return r;
2885
2886         r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2887         if (r)
2888                 DMWARN("Deletion of thin device %s failed.", argv[1]);
2889
2890         return r;
2891 }
2892
2893 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2894 {
2895         dm_thin_id old_id, new_id;
2896         int r;
2897
2898         r = check_arg_count(argc, 3);
2899         if (r)
2900                 return r;
2901
2902         if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2903                 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2904                 return -EINVAL;
2905         }
2906
2907         if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2908                 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2909                 return -EINVAL;
2910         }
2911
2912         r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2913         if (r) {
2914                 DMWARN("Failed to change transaction id from %s to %s.",
2915                        argv[1], argv[2]);
2916                 return r;
2917         }
2918
2919         return 0;
2920 }
2921
2922 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2923 {
2924         int r;
2925
2926         r = check_arg_count(argc, 1);
2927         if (r)
2928                 return r;
2929
2930         (void) commit(pool);
2931
2932         r = dm_pool_reserve_metadata_snap(pool->pmd);
2933         if (r)
2934                 DMWARN("reserve_metadata_snap message failed.");
2935
2936         return r;
2937 }
2938
2939 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2940 {
2941         int r;
2942
2943         r = check_arg_count(argc, 1);
2944         if (r)
2945                 return r;
2946
2947         r = dm_pool_release_metadata_snap(pool->pmd);
2948         if (r)
2949                 DMWARN("release_metadata_snap message failed.");
2950
2951         return r;
2952 }
2953
2954 /*
2955  * Messages supported:
2956  *   create_thin        <dev_id>
2957  *   create_snap        <dev_id> <origin_id>
2958  *   delete             <dev_id>
2959  *   trim               <dev_id> <new_size_in_sectors>
2960  *   set_transaction_id <current_trans_id> <new_trans_id>
2961  *   reserve_metadata_snap
2962  *   release_metadata_snap
2963  */
2964 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2965 {
2966         int r = -EINVAL;
2967         struct pool_c *pt = ti->private;
2968         struct pool *pool = pt->pool;
2969
2970         if (get_pool_mode(pool) >= PM_READ_ONLY) {
2971                 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
2972                       dm_device_name(pool->pool_md));
2973                 return -EINVAL;
2974         }
2975
2976         if (!strcasecmp(argv[0], "create_thin"))
2977                 r = process_create_thin_mesg(argc, argv, pool);
2978
2979         else if (!strcasecmp(argv[0], "create_snap"))
2980                 r = process_create_snap_mesg(argc, argv, pool);
2981
2982         else if (!strcasecmp(argv[0], "delete"))
2983                 r = process_delete_mesg(argc, argv, pool);
2984
2985         else if (!strcasecmp(argv[0], "set_transaction_id"))
2986                 r = process_set_transaction_id_mesg(argc, argv, pool);
2987
2988         else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2989                 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2990
2991         else if (!strcasecmp(argv[0], "release_metadata_snap"))
2992                 r = process_release_metadata_snap_mesg(argc, argv, pool);
2993
2994         else
2995                 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2996
2997         if (!r)
2998                 (void) commit(pool);
2999
3000         return r;
3001 }
3002
3003 static void emit_flags(struct pool_features *pf, char *result,
3004                        unsigned sz, unsigned maxlen)
3005 {
3006         unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
3007                 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3008                 pf->error_if_no_space;
3009         DMEMIT("%u ", count);
3010
3011         if (!pf->zero_new_blocks)
3012                 DMEMIT("skip_block_zeroing ");
3013
3014         if (!pf->discard_enabled)
3015                 DMEMIT("ignore_discard ");
3016
3017         if (!pf->discard_passdown)
3018                 DMEMIT("no_discard_passdown ");
3019
3020         if (pf->mode == PM_READ_ONLY)
3021                 DMEMIT("read_only ");
3022
3023         if (pf->error_if_no_space)
3024                 DMEMIT("error_if_no_space ");
3025 }
3026
3027 /*
3028  * Status line is:
3029  *    <transaction id> <used metadata sectors>/<total metadata sectors>
3030  *    <used data sectors>/<total data sectors> <held metadata root>
3031  */
3032 static void pool_status(struct dm_target *ti, status_type_t type,
3033                         unsigned status_flags, char *result, unsigned maxlen)
3034 {
3035         int r;
3036         unsigned sz = 0;
3037         uint64_t transaction_id;
3038         dm_block_t nr_free_blocks_data;
3039         dm_block_t nr_free_blocks_metadata;
3040         dm_block_t nr_blocks_data;
3041         dm_block_t nr_blocks_metadata;
3042         dm_block_t held_root;
3043         char buf[BDEVNAME_SIZE];
3044         char buf2[BDEVNAME_SIZE];
3045         struct pool_c *pt = ti->private;
3046         struct pool *pool = pt->pool;
3047
3048         switch (type) {
3049         case STATUSTYPE_INFO:
3050                 if (get_pool_mode(pool) == PM_FAIL) {
3051                         DMEMIT("Fail");
3052                         break;
3053                 }
3054
3055                 /* Commit to ensure statistics aren't out-of-date */
3056                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3057                         (void) commit(pool);
3058
3059                 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3060                 if (r) {
3061                         DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3062                               dm_device_name(pool->pool_md), r);
3063                         goto err;
3064                 }
3065
3066                 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3067                 if (r) {
3068                         DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3069                               dm_device_name(pool->pool_md), r);
3070                         goto err;
3071                 }
3072
3073                 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
3074                 if (r) {
3075                         DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3076                               dm_device_name(pool->pool_md), r);
3077                         goto err;
3078                 }
3079
3080                 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3081                 if (r) {
3082                         DMERR("%s: dm_pool_get_free_block_count returned %d",
3083                               dm_device_name(pool->pool_md), r);
3084                         goto err;
3085                 }
3086
3087                 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
3088                 if (r) {
3089                         DMERR("%s: dm_pool_get_data_dev_size returned %d",
3090                               dm_device_name(pool->pool_md), r);
3091                         goto err;
3092                 }
3093
3094                 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
3095                 if (r) {
3096                         DMERR("%s: dm_pool_get_metadata_snap returned %d",
3097                               dm_device_name(pool->pool_md), r);
3098                         goto err;
3099                 }
3100
3101                 DMEMIT("%llu %llu/%llu %llu/%llu ",
3102                        (unsigned long long)transaction_id,
3103                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3104                        (unsigned long long)nr_blocks_metadata,
3105                        (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3106                        (unsigned long long)nr_blocks_data);
3107
3108                 if (held_root)
3109                         DMEMIT("%llu ", held_root);
3110                 else
3111                         DMEMIT("- ");
3112
3113                 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
3114                         DMEMIT("out_of_data_space ");
3115                 else if (pool->pf.mode == PM_READ_ONLY)
3116                         DMEMIT("ro ");
3117                 else
3118                         DMEMIT("rw ");
3119
3120                 if (!pool->pf.discard_enabled)
3121                         DMEMIT("ignore_discard ");
3122                 else if (pool->pf.discard_passdown)
3123                         DMEMIT("discard_passdown ");
3124                 else
3125                         DMEMIT("no_discard_passdown ");
3126
3127                 if (pool->pf.error_if_no_space)
3128                         DMEMIT("error_if_no_space ");
3129                 else
3130                         DMEMIT("queue_if_no_space ");
3131
3132                 break;
3133
3134         case STATUSTYPE_TABLE:
3135                 DMEMIT("%s %s %lu %llu ",
3136                        format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3137                        format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3138                        (unsigned long)pool->sectors_per_block,
3139                        (unsigned long long)pt->low_water_blocks);
3140                 emit_flags(&pt->requested_pf, result, sz, maxlen);
3141                 break;
3142         }
3143         return;
3144
3145 err:
3146         DMEMIT("Error");
3147 }
3148
3149 static int pool_iterate_devices(struct dm_target *ti,
3150                                 iterate_devices_callout_fn fn, void *data)
3151 {
3152         struct pool_c *pt = ti->private;
3153
3154         return fn(ti, pt->data_dev, 0, ti->len, data);
3155 }
3156
3157 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3158                       struct bio_vec *biovec, int max_size)
3159 {
3160         struct pool_c *pt = ti->private;
3161         struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
3162
3163         if (!q->merge_bvec_fn)
3164                 return max_size;
3165
3166         bvm->bi_bdev = pt->data_dev->bdev;
3167
3168         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3169 }
3170
3171 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
3172 {
3173         struct pool *pool = pt->pool;
3174         struct queue_limits *data_limits;
3175
3176         limits->max_discard_sectors = pool->sectors_per_block;
3177
3178         /*
3179          * discard_granularity is just a hint, and not enforced.
3180          */
3181         if (pt->adjusted_pf.discard_passdown) {
3182                 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
3183                 limits->discard_granularity = max(data_limits->discard_granularity,
3184                                                   pool->sectors_per_block << SECTOR_SHIFT);
3185         } else
3186                 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
3187 }
3188
3189 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3190 {
3191         struct pool_c *pt = ti->private;
3192         struct pool *pool = pt->pool;
3193         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3194
3195         /*
3196          * If the system-determined stacked limits are compatible with the
3197          * pool's blocksize (io_opt is a factor) do not override them.
3198          */
3199         if (io_opt_sectors < pool->sectors_per_block ||
3200             do_div(io_opt_sectors, pool->sectors_per_block)) {
3201                 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
3202                 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3203         }
3204
3205         /*
3206          * pt->adjusted_pf is a staging area for the actual features to use.
3207          * They get transferred to the live pool in bind_control_target()
3208          * called from pool_preresume().
3209          */
3210         if (!pt->adjusted_pf.discard_enabled) {
3211                 /*
3212                  * Must explicitly disallow stacking discard limits otherwise the
3213                  * block layer will stack them if pool's data device has support.
3214                  * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
3215                  * user to see that, so make sure to set all discard limits to 0.
3216                  */
3217                 limits->discard_granularity = 0;
3218                 return;
3219         }
3220
3221         disable_passdown_if_not_supported(pt);
3222
3223         set_discard_limits(pt, limits);
3224 }
3225
3226 static struct target_type pool_target = {
3227         .name = "thin-pool",
3228         .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3229                     DM_TARGET_IMMUTABLE,
3230         .version = {1, 13, 0},
3231         .module = THIS_MODULE,
3232         .ctr = pool_ctr,
3233         .dtr = pool_dtr,
3234         .map = pool_map,
3235         .postsuspend = pool_postsuspend,
3236         .preresume = pool_preresume,
3237         .resume = pool_resume,
3238         .message = pool_message,
3239         .status = pool_status,
3240         .merge = pool_merge,
3241         .iterate_devices = pool_iterate_devices,
3242         .io_hints = pool_io_hints,
3243 };
3244
3245 /*----------------------------------------------------------------
3246  * Thin target methods
3247  *--------------------------------------------------------------*/
3248 static void thin_get(struct thin_c *tc)
3249 {
3250         atomic_inc(&tc->refcount);
3251 }
3252
3253 static void thin_put(struct thin_c *tc)
3254 {
3255         if (atomic_dec_and_test(&tc->refcount))
3256                 complete(&tc->can_destroy);
3257 }
3258
3259 static void thin_dtr(struct dm_target *ti)
3260 {
3261         struct thin_c *tc = ti->private;
3262         unsigned long flags;
3263
3264         spin_lock_irqsave(&tc->pool->lock, flags);
3265         list_del_rcu(&tc->list);
3266         spin_unlock_irqrestore(&tc->pool->lock, flags);
3267         synchronize_rcu();
3268
3269         thin_put(tc);
3270         wait_for_completion(&tc->can_destroy);
3271
3272         mutex_lock(&dm_thin_pool_table.mutex);
3273
3274         __pool_dec(tc->pool);
3275         dm_pool_close_thin_device(tc->td);
3276         dm_put_device(ti, tc->pool_dev);
3277         if (tc->origin_dev)
3278                 dm_put_device(ti, tc->origin_dev);
3279         kfree(tc);
3280
3281         mutex_unlock(&dm_thin_pool_table.mutex);
3282 }
3283
3284 /*
3285  * Thin target parameters:
3286  *
3287  * <pool_dev> <dev_id> [origin_dev]
3288  *
3289  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
3290  * dev_id: the internal device identifier
3291  * origin_dev: a device external to the pool that should act as the origin
3292  *
3293  * If the pool device has discards disabled, they get disabled for the thin
3294  * device as well.
3295  */
3296 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3297 {
3298         int r;
3299         struct thin_c *tc;
3300         struct dm_dev *pool_dev, *origin_dev;
3301         struct mapped_device *pool_md;
3302         unsigned long flags;
3303
3304         mutex_lock(&dm_thin_pool_table.mutex);
3305
3306         if (argc != 2 && argc != 3) {
3307                 ti->error = "Invalid argument count";
3308                 r = -EINVAL;
3309                 goto out_unlock;
3310         }
3311
3312         tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3313         if (!tc) {
3314                 ti->error = "Out of memory";
3315                 r = -ENOMEM;
3316                 goto out_unlock;
3317         }
3318         spin_lock_init(&tc->lock);
3319         bio_list_init(&tc->deferred_bio_list);
3320         bio_list_init(&tc->retry_on_resume_list);
3321         tc->sort_bio_list = RB_ROOT;
3322
3323         if (argc == 3) {
3324                 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3325                 if (r) {
3326                         ti->error = "Error opening origin device";
3327                         goto bad_origin_dev;
3328                 }
3329                 tc->origin_dev = origin_dev;
3330         }
3331
3332         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3333         if (r) {
3334                 ti->error = "Error opening pool device";
3335                 goto bad_pool_dev;
3336         }
3337         tc->pool_dev = pool_dev;
3338
3339         if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3340                 ti->error = "Invalid device id";
3341                 r = -EINVAL;
3342                 goto bad_common;
3343         }
3344
3345         pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3346         if (!pool_md) {
3347                 ti->error = "Couldn't get pool mapped device";
3348                 r = -EINVAL;
3349                 goto bad_common;
3350         }
3351
3352         tc->pool = __pool_table_lookup(pool_md);
3353         if (!tc->pool) {
3354                 ti->error = "Couldn't find pool object";
3355                 r = -EINVAL;
3356                 goto bad_pool_lookup;
3357         }
3358         __pool_inc(tc->pool);
3359
3360         if (get_pool_mode(tc->pool) == PM_FAIL) {
3361                 ti->error = "Couldn't open thin device, Pool is in fail mode";
3362                 r = -EINVAL;
3363                 goto bad_thin_open;
3364         }
3365
3366         r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3367         if (r) {
3368                 ti->error = "Couldn't open thin internal device";
3369                 goto bad_thin_open;
3370         }
3371
3372         r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3373         if (r)
3374                 goto bad_target_max_io_len;
3375
3376         ti->num_flush_bios = 1;
3377         ti->flush_supported = true;
3378         ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
3379
3380         /* In case the pool supports discards, pass them on. */
3381         ti->discard_zeroes_data_unsupported = true;
3382         if (tc->pool->pf.discard_enabled) {
3383                 ti->discards_supported = true;
3384                 ti->num_discard_bios = 1;
3385                 /* Discard bios must be split on a block boundary */
3386                 ti->split_discard_bios = true;
3387         }
3388
3389         dm_put(pool_md);
3390
3391         mutex_unlock(&dm_thin_pool_table.mutex);
3392
3393         atomic_set(&tc->refcount, 1);
3394         init_completion(&tc->can_destroy);
3395
3396         spin_lock_irqsave(&tc->pool->lock, flags);
3397         list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3398         spin_unlock_irqrestore(&tc->pool->lock, flags);
3399         /*
3400          * This synchronize_rcu() call is needed here otherwise we risk a
3401          * wake_worker() call finding no bios to process (because the newly
3402          * added tc isn't yet visible).  So this reduces latency since we
3403          * aren't then dependent on the periodic commit to wake_worker().
3404          */
3405         synchronize_rcu();
3406
3407         return 0;
3408
3409 bad_target_max_io_len:
3410         dm_pool_close_thin_device(tc->td);
3411 bad_thin_open:
3412         __pool_dec(tc->pool);
3413 bad_pool_lookup:
3414         dm_put(pool_md);
3415 bad_common:
3416         dm_put_device(ti, tc->pool_dev);
3417 bad_pool_dev:
3418         if (tc->origin_dev)
3419                 dm_put_device(ti, tc->origin_dev);
3420 bad_origin_dev:
3421         kfree(tc);
3422 out_unlock:
3423         mutex_unlock(&dm_thin_pool_table.mutex);
3424
3425         return r;
3426 }
3427
3428 static int thin_map(struct dm_target *ti, struct bio *bio)
3429 {
3430         bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
3431
3432         return thin_bio_map(ti, bio);
3433 }
3434
3435 static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
3436 {
3437         unsigned long flags;
3438         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
3439         struct list_head work;
3440         struct dm_thin_new_mapping *m, *tmp;
3441         struct pool *pool = h->tc->pool;
3442
3443         if (h->shared_read_entry) {
3444                 INIT_LIST_HEAD(&work);
3445                 dm_deferred_entry_dec(h->shared_read_entry, &work);
3446
3447                 spin_lock_irqsave(&pool->lock, flags);
3448                 list_for_each_entry_safe(m, tmp, &work, list) {
3449                         list_del(&m->list);
3450                         __complete_mapping_preparation(m);
3451                 }
3452                 spin_unlock_irqrestore(&pool->lock, flags);
3453         }
3454
3455         if (h->all_io_entry) {
3456                 INIT_LIST_HEAD(&work);
3457                 dm_deferred_entry_dec(h->all_io_entry, &work);
3458                 if (!list_empty(&work)) {
3459                         spin_lock_irqsave(&pool->lock, flags);
3460                         list_for_each_entry_safe(m, tmp, &work, list)
3461                                 list_add_tail(&m->list, &pool->prepared_discards);
3462                         spin_unlock_irqrestore(&pool->lock, flags);
3463                         wake_worker(pool);
3464                 }
3465         }
3466
3467         return 0;
3468 }
3469
3470 static void thin_presuspend(struct dm_target *ti)
3471 {
3472         struct thin_c *tc = ti->private;
3473
3474         if (dm_noflush_suspending(ti))
3475                 noflush_work(tc, do_noflush_start);
3476 }
3477
3478 static void thin_postsuspend(struct dm_target *ti)
3479 {
3480         struct thin_c *tc = ti->private;
3481
3482         /*
3483          * The dm_noflush_suspending flag has been cleared by now, so
3484          * unfortunately we must always run this.
3485          */
3486         noflush_work(tc, do_noflush_stop);
3487 }
3488
3489 static int thin_preresume(struct dm_target *ti)
3490 {
3491         struct thin_c *tc = ti->private;
3492
3493         if (tc->origin_dev)
3494                 tc->origin_size = get_dev_size(tc->origin_dev->bdev);
3495
3496         return 0;
3497 }
3498
3499 /*
3500  * <nr mapped sectors> <highest mapped sector>
3501  */
3502 static void thin_status(struct dm_target *ti, status_type_t type,
3503                         unsigned status_flags, char *result, unsigned maxlen)
3504 {
3505         int r;
3506         ssize_t sz = 0;
3507         dm_block_t mapped, highest;
3508         char buf[BDEVNAME_SIZE];
3509         struct thin_c *tc = ti->private;
3510
3511         if (get_pool_mode(tc->pool) == PM_FAIL) {
3512                 DMEMIT("Fail");
3513                 return;
3514         }
3515
3516         if (!tc->td)
3517                 DMEMIT("-");
3518         else {
3519                 switch (type) {
3520                 case STATUSTYPE_INFO:
3521                         r = dm_thin_get_mapped_count(tc->td, &mapped);
3522                         if (r) {
3523                                 DMERR("dm_thin_get_mapped_count returned %d", r);
3524                                 goto err;
3525                         }
3526
3527                         r = dm_thin_get_highest_mapped_block(tc->td, &highest);
3528                         if (r < 0) {
3529                                 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
3530                                 goto err;
3531                         }
3532
3533                         DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3534                         if (r)
3535                                 DMEMIT("%llu", ((highest + 1) *
3536                                                 tc->pool->sectors_per_block) - 1);
3537                         else
3538                                 DMEMIT("-");
3539                         break;
3540
3541                 case STATUSTYPE_TABLE:
3542                         DMEMIT("%s %lu",
3543                                format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3544                                (unsigned long) tc->dev_id);
3545                         if (tc->origin_dev)
3546                                 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
3547                         break;
3548                 }
3549         }
3550
3551         return;
3552
3553 err:
3554         DMEMIT("Error");
3555 }
3556
3557 static int thin_iterate_devices(struct dm_target *ti,
3558                                 iterate_devices_callout_fn fn, void *data)
3559 {
3560         sector_t blocks;
3561         struct thin_c *tc = ti->private;
3562         struct pool *pool = tc->pool;
3563
3564         /*
3565          * We can't call dm_pool_get_data_dev_size() since that blocks.  So
3566          * we follow a more convoluted path through to the pool's target.
3567          */
3568         if (!pool->ti)
3569                 return 0;       /* nothing is bound */
3570
3571         blocks = pool->ti->len;
3572         (void) sector_div(blocks, pool->sectors_per_block);
3573         if (blocks)
3574                 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
3575
3576         return 0;
3577 }
3578
3579 static struct target_type thin_target = {
3580         .name = "thin",
3581         .version = {1, 13, 0},
3582         .module = THIS_MODULE,
3583         .ctr = thin_ctr,
3584         .dtr = thin_dtr,
3585         .map = thin_map,
3586         .end_io = thin_endio,
3587         .preresume = thin_preresume,
3588         .presuspend = thin_presuspend,
3589         .postsuspend = thin_postsuspend,
3590         .status = thin_status,
3591         .iterate_devices = thin_iterate_devices,
3592 };
3593
3594 /*----------------------------------------------------------------*/
3595
3596 static int __init dm_thin_init(void)
3597 {
3598         int r;
3599
3600         pool_table_init();
3601
3602         r = dm_register_target(&thin_target);
3603         if (r)
3604                 return r;
3605
3606         r = dm_register_target(&pool_target);
3607         if (r)
3608                 goto bad_pool_target;
3609
3610         r = -ENOMEM;
3611
3612         _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3613         if (!_new_mapping_cache)
3614                 goto bad_new_mapping_cache;
3615
3616         return 0;
3617
3618 bad_new_mapping_cache:
3619         dm_unregister_target(&pool_target);
3620 bad_pool_target:
3621         dm_unregister_target(&thin_target);
3622
3623         return r;
3624 }
3625
3626 static void dm_thin_exit(void)
3627 {
3628         dm_unregister_target(&thin_target);
3629         dm_unregister_target(&pool_target);
3630
3631         kmem_cache_destroy(_new_mapping_cache);
3632 }
3633
3634 module_init(dm_thin_init);
3635 module_exit(dm_thin_exit);
3636
3637 module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
3638 MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
3639
3640 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
3641 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3642 MODULE_LICENSE("GPL");