drivers/md/dm-cache-target.c
1 /*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
4 * This file is released under the GPL.
5 */
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/init.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/vmalloc.h>
20 #define DM_MSG_PREFIX "cache"
22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23 "A percentage of time allocated for copying to and/or from cache");
25 /*----------------------------------------------------------------*/
28 * Glossary:
30 * oblock: index of an origin block
31 * cblock: index of a cache block
32 * promotion: movement of a block from origin to cache
33 * demotion: movement of a block from cache to origin
34 * migration: movement of a block between the origin and cache device,
35 * either direction
38 /*----------------------------------------------------------------*/
40 static size_t bitset_size_in_bytes(unsigned nr_entries)
42 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
45 static unsigned long *alloc_bitset(unsigned nr_entries)
47 size_t s = bitset_size_in_bytes(nr_entries);
48 return vzalloc(s);
51 static void clear_bitset(void *bitset, unsigned nr_entries)
53 size_t s = bitset_size_in_bytes(nr_entries);
54 memset(bitset, 0, s);
57 static void free_bitset(unsigned long *bits)
59 vfree(bits);
62 /*----------------------------------------------------------------*/
65 * There are a couple of places where we let a bio run, but want to do some
66 * work before calling its endio function. We do this by temporarily
67 * changing the endio fn.
69 struct dm_hook_info {
70 bio_end_io_t *bi_end_io;
71 void *bi_private;
74 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
75 bio_end_io_t *bi_end_io, void *bi_private)
77 h->bi_end_io = bio->bi_end_io;
78 h->bi_private = bio->bi_private;
80 bio->bi_end_io = bi_end_io;
81 bio->bi_private = bi_private;
84 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
86 bio->bi_end_io = h->bi_end_io;
87 bio->bi_private = h->bi_private;
90 * Must bump bi_remaining to allow bio to complete with
91 * restored bi_end_io.
93 atomic_inc(&bio->bi_remaining);
96 /*----------------------------------------------------------------*/
98 #define PRISON_CELLS 1024
99 #define MIGRATION_POOL_SIZE 128
100 #define COMMIT_PERIOD HZ
101 #define MIGRATION_COUNT_WINDOW 10
104 * The block size of the device holding cache data must be
105 * between 32KB and 1GB.
107 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
108 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
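/*
 * With SECTOR_SHIFT == 9 these limits work out to 64 sectors (32KB) and
 * 2097152 sectors (1GB) respectively.
 */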
111 * FIXME: the cache is read/write for the time being.
113 enum cache_metadata_mode {
114 CM_WRITE, /* metadata may be changed */
115 CM_READ_ONLY, /* metadata may not be changed */
118 enum cache_io_mode {
120 * Data is written to cached blocks only. These blocks are marked
121 * dirty. If you lose the cache device you will lose data.
122 * Potential performance increase for both reads and writes.
124 CM_IO_WRITEBACK,
127 * Data is written to both cache and origin. Blocks are never
128 * dirty. Potential performance benefit for reads only.
130 CM_IO_WRITETHROUGH,
133 * A degraded mode useful for various cache coherency situations
134 * (eg, rolling back snapshots). Reads and writes always go to the
135 * origin. If a write goes to a cached oblock, then the cache
136 * block is invalidated.
138 CM_IO_PASSTHROUGH
141 struct cache_features {
142 enum cache_metadata_mode mode;
143 enum cache_io_mode io_mode;
146 struct cache_stats {
147 atomic_t read_hit;
148 atomic_t read_miss;
149 atomic_t write_hit;
150 atomic_t write_miss;
151 atomic_t demotion;
152 atomic_t promotion;
153 atomic_t copies_avoided;
154 atomic_t cache_cell_clash;
155 atomic_t commit_count;
156 atomic_t discard_count;
160 * Defines a range of cblocks: blocks begin to (end - 1) are in the range; end is
161 * the one-past-the-end value.
163 struct cblock_range {
164 dm_cblock_t begin;
165 dm_cblock_t end;
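/*
 * The range is half open, so a single cblock b would be expressed as
 * begin = b, end = b + 1.
 */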
168 struct invalidation_request {
169 struct list_head list;
170 struct cblock_range *cblocks;
172 atomic_t complete;
173 int err;
175 wait_queue_head_t result_wait;
178 struct cache {
179 struct dm_target *ti;
180 struct dm_target_callbacks callbacks;
182 struct dm_cache_metadata *cmd;
185 * Metadata is written to this device.
187 struct dm_dev *metadata_dev;
190 * The slower of the two data devices. Typically a spindle.
192 struct dm_dev *origin_dev;
195 * The faster of the two data devices. Typically an SSD.
197 struct dm_dev *cache_dev;
200 * Size of the origin device in _complete_ blocks and native sectors.
202 dm_oblock_t origin_blocks;
203 sector_t origin_sectors;
206 * Size of the cache device in blocks.
208 dm_cblock_t cache_size;
211 * Fields for converting from sectors to blocks.
213 uint32_t sectors_per_block;
214 int sectors_per_block_shift;
216 spinlock_t lock;
217 struct bio_list deferred_bios;
218 struct bio_list deferred_flush_bios;
219 struct bio_list deferred_writethrough_bios;
220 struct list_head quiesced_migrations;
221 struct list_head completed_migrations;
222 struct list_head need_commit_migrations;
223 sector_t migration_threshold;
224 wait_queue_head_t migration_wait;
225 atomic_t nr_allocated_migrations;
228 * The number of in flight migrations that are performing
229 * background io. eg, promotion, writeback.
231 atomic_t nr_io_migrations;
233 wait_queue_head_t quiescing_wait;
234 atomic_t quiescing;
235 atomic_t quiescing_ack;
238 * cache_size entries, dirty if set
240 atomic_t nr_dirty;
241 unsigned long *dirty_bitset;
244 * origin_blocks entries, discarded if set.
246 dm_oblock_t discard_nr_blocks;
247 unsigned long *discard_bitset;
250 * Rather than reconstructing the table line for the status we just
251 * save it and regurgitate.
253 unsigned nr_ctr_args;
254 const char **ctr_args;
256 struct dm_kcopyd_client *copier;
257 struct workqueue_struct *wq;
258 struct work_struct worker;
260 struct delayed_work waker;
261 unsigned long last_commit_jiffies;
263 struct dm_bio_prison *prison;
264 struct dm_deferred_set *all_io_ds;
266 mempool_t *migration_pool;
268 struct dm_cache_policy *policy;
269 unsigned policy_nr_args;
271 bool need_tick_bio:1;
272 bool sized:1;
273 bool invalidate:1;
274 bool commit_requested:1;
275 bool loaded_mappings:1;
276 bool loaded_discards:1;
279 * Cache features such as write-through.
281 struct cache_features features;
283 struct cache_stats stats;
286 * Invalidation fields.
288 spinlock_t invalidation_lock;
289 struct list_head invalidation_requests;
292 struct per_bio_data {
293 bool tick:1;
294 unsigned req_nr:2;
295 struct dm_deferred_entry *all_io_entry;
296 struct dm_hook_info hook_info;
299 * writethrough fields. These MUST remain at the end of this
300 * structure and the 'cache' member must be the first as it
301 * is used to determine the offset of the writethrough fields.
303 struct cache *cache;
304 dm_cblock_t cblock;
305 struct dm_bio_details bio_details;
308 struct dm_cache_migration {
309 struct list_head list;
310 struct cache *cache;
312 unsigned long start_jiffies;
313 dm_oblock_t old_oblock;
314 dm_oblock_t new_oblock;
315 dm_cblock_t cblock;
317 bool err:1;
318 bool writeback:1;
319 bool demote:1;
320 bool promote:1;
321 bool requeue_holder:1;
322 bool invalidate:1;
324 struct dm_bio_prison_cell *old_ocell;
325 struct dm_bio_prison_cell *new_ocell;
329 * Processing a bio in the worker thread may require these memory
330 * allocations. We prealloc to avoid deadlocks (the same worker thread
331 * frees them back to the mempool).
333 struct prealloc {
334 struct dm_cache_migration *mg;
335 struct dm_bio_prison_cell *cell1;
336 struct dm_bio_prison_cell *cell2;
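/*
 * Typical usage (see process_deferred_bios() below): the worker tops up a
 * struct prealloc with prealloc_data_structs() before handling each bio,
 * and hands anything unused back via prealloc_free_structs() afterwards.
 */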
339 static void wake_worker(struct cache *cache)
341 queue_work(cache->wq, &cache->worker);
344 /*----------------------------------------------------------------*/
346 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
348 /* FIXME: change to use a local slab. */
349 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
352 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
354 dm_bio_prison_free_cell(cache->prison, cell);
357 static struct dm_cache_migration *alloc_migration(struct cache *cache)
359 struct dm_cache_migration *mg;
361 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
362 if (mg) {
363 mg->cache = cache;
364 atomic_inc(&mg->cache->nr_allocated_migrations);
367 return mg;
370 static void free_migration(struct dm_cache_migration *mg)
372 if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
373 wake_up(&mg->cache->migration_wait);
375 mempool_free(mg, mg->cache->migration_pool);
378 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
380 if (!p->mg) {
381 p->mg = alloc_migration(cache);
382 if (!p->mg)
383 return -ENOMEM;
386 if (!p->cell1) {
387 p->cell1 = alloc_prison_cell(cache);
388 if (!p->cell1)
389 return -ENOMEM;
392 if (!p->cell2) {
393 p->cell2 = alloc_prison_cell(cache);
394 if (!p->cell2)
395 return -ENOMEM;
398 return 0;
401 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
403 if (p->cell2)
404 free_prison_cell(cache, p->cell2);
406 if (p->cell1)
407 free_prison_cell(cache, p->cell1);
409 if (p->mg)
410 free_migration(p->mg);
413 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
415 struct dm_cache_migration *mg = p->mg;
417 BUG_ON(!mg);
418 p->mg = NULL;
420 return mg;
424 * You must have a cell within the prealloc struct to return. If not, this
425 * function will BUG() rather than returning NULL.
427 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
429 struct dm_bio_prison_cell *r = NULL;
431 if (p->cell1) {
432 r = p->cell1;
433 p->cell1 = NULL;
435 } else if (p->cell2) {
436 r = p->cell2;
437 p->cell2 = NULL;
438 } else
439 BUG();
441 return r;
445 * You can't have more than two cells in a prealloc struct. BUG() will be
446 * called if you try and overfill.
448 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
450 if (!p->cell2)
451 p->cell2 = cell;
453 else if (!p->cell1)
454 p->cell1 = cell;
456 else
457 BUG();
460 /*----------------------------------------------------------------*/
462 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
464 key->virtual = 0;
465 key->dev = 0;
466 key->block = from_oblock(oblock);
470 * The caller hands in a preallocated cell, and a free function for it.
471 * The cell will be freed if there's an error, or if it wasn't used because
472 * a cell with that key already exists.
474 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
476 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
477 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
478 cell_free_fn free_fn, void *free_context,
479 struct dm_bio_prison_cell **cell_result)
481 int r;
482 struct dm_cell_key key;
484 build_key(oblock, &key);
485 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
486 if (r)
487 free_fn(free_context, cell_prealloc);
489 return r;
492 static int get_cell(struct cache *cache,
493 dm_oblock_t oblock,
494 struct prealloc *structs,
495 struct dm_bio_prison_cell **cell_result)
497 int r;
498 struct dm_cell_key key;
499 struct dm_bio_prison_cell *cell_prealloc;
501 cell_prealloc = prealloc_get_cell(structs);
503 build_key(oblock, &key);
504 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
505 if (r)
506 prealloc_put_cell(structs, cell_prealloc);
508 return r;
511 /*----------------------------------------------------------------*/
513 static bool is_dirty(struct cache *cache, dm_cblock_t b)
515 return test_bit(from_cblock(b), cache->dirty_bitset);
518 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
520 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
521 atomic_inc(&cache->nr_dirty);
522 policy_set_dirty(cache->policy, oblock);
526 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
528 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
529 policy_clear_dirty(cache->policy, oblock);
530 if (atomic_dec_return(&cache->nr_dirty) == 0)
531 dm_table_event(cache->ti->table);
535 /*----------------------------------------------------------------*/
537 static bool block_size_is_power_of_two(struct cache *cache)
539 return cache->sectors_per_block_shift >= 0;
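/*
 * sectors_per_block_shift is set to -1 in cache_create() when the block
 * size is not a power of two; otherwise it holds __ffs(block size).
 */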
542 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
543 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
544 __always_inline
545 #endif
546 static dm_block_t block_div(dm_block_t b, uint32_t n)
548 do_div(b, n);
550 return b;
553 static void set_discard(struct cache *cache, dm_oblock_t b)
555 unsigned long flags;
557 atomic_inc(&cache->stats.discard_count);
559 spin_lock_irqsave(&cache->lock, flags);
560 set_bit(from_oblock(b), cache->discard_bitset);
561 spin_unlock_irqrestore(&cache->lock, flags);
564 static void clear_discard(struct cache *cache, dm_oblock_t b)
566 unsigned long flags;
568 spin_lock_irqsave(&cache->lock, flags);
569 clear_bit(from_oblock(b), cache->discard_bitset);
570 spin_unlock_irqrestore(&cache->lock, flags);
573 static bool is_discarded(struct cache *cache, dm_oblock_t b)
575 int r;
576 unsigned long flags;
578 spin_lock_irqsave(&cache->lock, flags);
579 r = test_bit(from_oblock(b), cache->discard_bitset);
580 spin_unlock_irqrestore(&cache->lock, flags);
582 return r;
585 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
587 int r;
588 unsigned long flags;
590 spin_lock_irqsave(&cache->lock, flags);
591 r = test_bit(from_oblock(b), cache->discard_bitset);
592 spin_unlock_irqrestore(&cache->lock, flags);
594 return r;
597 /*----------------------------------------------------------------*/
599 static void load_stats(struct cache *cache)
601 struct dm_cache_statistics stats;
603 dm_cache_metadata_get_stats(cache->cmd, &stats);
604 atomic_set(&cache->stats.read_hit, stats.read_hits);
605 atomic_set(&cache->stats.read_miss, stats.read_misses);
606 atomic_set(&cache->stats.write_hit, stats.write_hits);
607 atomic_set(&cache->stats.write_miss, stats.write_misses);
610 static void save_stats(struct cache *cache)
612 struct dm_cache_statistics stats;
614 stats.read_hits = atomic_read(&cache->stats.read_hit);
615 stats.read_misses = atomic_read(&cache->stats.read_miss);
616 stats.write_hits = atomic_read(&cache->stats.write_hit);
617 stats.write_misses = atomic_read(&cache->stats.write_miss);
619 dm_cache_metadata_set_stats(cache->cmd, &stats);
622 /*----------------------------------------------------------------
623 * Per bio data
624 *--------------------------------------------------------------*/
627 * If using writeback, leave out struct per_bio_data's writethrough fields.
629 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
630 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
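/*
 * In other words, writeback mode only reserves per-bio data up to (but not
 * including) the 'cache' pointer, whereas writethrough mode reserves the
 * whole struct so writethrough_endio() can recover the cache and cblock.
 */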
632 static bool writethrough_mode(struct cache_features *f)
634 return f->io_mode == CM_IO_WRITETHROUGH;
637 static bool writeback_mode(struct cache_features *f)
639 return f->io_mode == CM_IO_WRITEBACK;
642 static bool passthrough_mode(struct cache_features *f)
644 return f->io_mode == CM_IO_PASSTHROUGH;
647 static size_t get_per_bio_data_size(struct cache *cache)
649 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
652 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
654 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
655 BUG_ON(!pb);
656 return pb;
659 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
661 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
663 pb->tick = false;
664 pb->req_nr = dm_bio_get_target_bio_nr(bio);
665 pb->all_io_entry = NULL;
667 return pb;
670 /*----------------------------------------------------------------
671 * Remapping
672 *--------------------------------------------------------------*/
673 static void remap_to_origin(struct cache *cache, struct bio *bio)
675 bio->bi_bdev = cache->origin_dev->bdev;
678 static void remap_to_cache(struct cache *cache, struct bio *bio,
679 dm_cblock_t cblock)
681 sector_t bi_sector = bio->bi_iter.bi_sector;
682 sector_t block = from_cblock(cblock);
684 bio->bi_bdev = cache->cache_dev->bdev;
685 if (!block_size_is_power_of_two(cache))
686 bio->bi_iter.bi_sector =
687 (block * cache->sectors_per_block) +
688 sector_div(bi_sector, cache->sectors_per_block);
689 else
690 bio->bi_iter.bi_sector =
691 (block << cache->sectors_per_block_shift) |
692 (bi_sector & (cache->sectors_per_block - 1));
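/*
 * Worked example (numbers are illustrative): with sectors_per_block = 128
 * and cblock = 5, a bio at bi_sector 1000 keeps its 104 sector offset
 * within the block (1000 % 128 == 104) and is remapped to cache sector
 * 5 * 128 + 104 = 744.  The power-of-two path computes the same value
 * with a shift and a mask.
 */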
695 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
697 unsigned long flags;
698 size_t pb_data_size = get_per_bio_data_size(cache);
699 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
701 spin_lock_irqsave(&cache->lock, flags);
702 if (cache->need_tick_bio &&
703 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
704 pb->tick = true;
705 cache->need_tick_bio = false;
707 spin_unlock_irqrestore(&cache->lock, flags);
710 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
711 dm_oblock_t oblock)
713 check_if_tick_bio_needed(cache, bio);
714 remap_to_origin(cache, bio);
715 if (bio_data_dir(bio) == WRITE)
716 clear_discard(cache, oblock);
719 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
720 dm_oblock_t oblock, dm_cblock_t cblock)
722 check_if_tick_bio_needed(cache, bio);
723 remap_to_cache(cache, bio, cblock);
724 if (bio_data_dir(bio) == WRITE) {
725 set_dirty(cache, oblock, cblock);
726 clear_discard(cache, oblock);
730 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
732 sector_t block_nr = bio->bi_iter.bi_sector;
734 if (!block_size_is_power_of_two(cache))
735 (void) sector_div(block_nr, cache->sectors_per_block);
736 else
737 block_nr >>= cache->sectors_per_block_shift;
739 return to_oblock(block_nr);
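/* E.g. with 128 sector blocks, a bio at origin sector 1000 is in oblock 7. */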
742 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
744 return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
747 static void issue(struct cache *cache, struct bio *bio)
749 unsigned long flags;
751 if (!bio_triggers_commit(cache, bio)) {
752 generic_make_request(bio);
753 return;
757 * Batch together any bios that trigger commits and then issue a
758 * single commit for them in do_worker().
760 spin_lock_irqsave(&cache->lock, flags);
761 cache->commit_requested = true;
762 bio_list_add(&cache->deferred_flush_bios, bio);
763 spin_unlock_irqrestore(&cache->lock, flags);
766 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
768 unsigned long flags;
770 spin_lock_irqsave(&cache->lock, flags);
771 bio_list_add(&cache->deferred_writethrough_bios, bio);
772 spin_unlock_irqrestore(&cache->lock, flags);
774 wake_worker(cache);
777 static void writethrough_endio(struct bio *bio, int err)
779 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
781 dm_unhook_bio(&pb->hook_info, bio);
783 if (err) {
784 bio_endio(bio, err);
785 return;
788 dm_bio_restore(&pb->bio_details, bio);
789 remap_to_cache(pb->cache, bio, pb->cblock);
792 * We can't issue this bio directly, since we're in interrupt
793 * context. So it gets put on a bio list for processing by the
794 * worker thread.
796 defer_writethrough_bio(pb->cache, bio);
800 * When running in writethrough mode we need to send writes to clean blocks
801 * to both the cache and origin devices. In future we'd like to clone the
802 * bio and send them in parallel, but for now we're doing them in
803 * series as this is easier.
805 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
806 dm_oblock_t oblock, dm_cblock_t cblock)
808 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
810 pb->cache = cache;
811 pb->cblock = cblock;
812 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
813 dm_bio_record(&pb->bio_details, bio);
815 remap_to_origin_clear_discard(pb->cache, bio, oblock);
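/*
 * The resulting flow for a writethrough write to a clean cached block: the
 * bio is hooked with writethrough_endio(), remapped to the origin and
 * issued by the caller; when the origin write completes (in interrupt
 * context) the bio is restored, remapped to the cache copy and deferred
 * to the worker thread for reissue.
 */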
818 /*----------------------------------------------------------------
819 * Migration processing
821 * Migration covers moving data from the origin device to the cache, or
822 * vice versa.
823 *--------------------------------------------------------------*/
824 static void inc_io_migrations(struct cache *cache)
826 atomic_inc(&cache->nr_io_migrations);
829 static void dec_io_migrations(struct cache *cache)
831 atomic_dec(&cache->nr_io_migrations);
834 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
835 bool holder)
837 (holder ? dm_cell_release : dm_cell_release_no_holder)
838 (cache->prison, cell, &cache->deferred_bios);
839 free_prison_cell(cache, cell);
842 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
843 bool holder)
845 unsigned long flags;
847 spin_lock_irqsave(&cache->lock, flags);
848 __cell_defer(cache, cell, holder);
849 spin_unlock_irqrestore(&cache->lock, flags);
851 wake_worker(cache);
854 static void free_io_migration(struct dm_cache_migration *mg)
856 dec_io_migrations(mg->cache);
857 free_migration(mg);
860 static void migration_failure(struct dm_cache_migration *mg)
862 struct cache *cache = mg->cache;
864 if (mg->writeback) {
865 DMWARN_LIMIT("writeback failed; couldn't copy block");
866 set_dirty(cache, mg->old_oblock, mg->cblock);
867 cell_defer(cache, mg->old_ocell, false);
869 } else if (mg->demote) {
870 DMWARN_LIMIT("demotion failed; couldn't copy block");
871 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
873 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
874 if (mg->promote)
875 cell_defer(cache, mg->new_ocell, true);
876 } else {
877 DMWARN_LIMIT("promotion failed; couldn't copy block");
878 policy_remove_mapping(cache->policy, mg->new_oblock);
879 cell_defer(cache, mg->new_ocell, true);
882 free_io_migration(mg);
885 static void migration_success_pre_commit(struct dm_cache_migration *mg)
887 unsigned long flags;
888 struct cache *cache = mg->cache;
890 if (mg->writeback) {
891 clear_dirty(cache, mg->old_oblock, mg->cblock);
892 cell_defer(cache, mg->old_ocell, false);
893 free_io_migration(mg);
894 return;
896 } else if (mg->demote) {
897 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
898 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
899 policy_force_mapping(cache->policy, mg->new_oblock,
900 mg->old_oblock);
901 if (mg->promote)
902 cell_defer(cache, mg->new_ocell, true);
903 free_io_migration(mg);
904 return;
906 } else {
907 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
908 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
909 policy_remove_mapping(cache->policy, mg->new_oblock);
910 free_io_migration(mg);
911 return;
915 spin_lock_irqsave(&cache->lock, flags);
916 list_add_tail(&mg->list, &cache->need_commit_migrations);
917 cache->commit_requested = true;
918 spin_unlock_irqrestore(&cache->lock, flags);
921 static void migration_success_post_commit(struct dm_cache_migration *mg)
923 unsigned long flags;
924 struct cache *cache = mg->cache;
926 if (mg->writeback) {
927 DMWARN("writeback unexpectedly triggered commit");
928 return;
930 } else if (mg->demote) {
931 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
933 if (mg->promote) {
934 mg->demote = false;
936 spin_lock_irqsave(&cache->lock, flags);
937 list_add_tail(&mg->list, &cache->quiesced_migrations);
938 spin_unlock_irqrestore(&cache->lock, flags);
940 } else {
941 if (mg->invalidate)
942 policy_remove_mapping(cache->policy, mg->old_oblock);
943 free_io_migration(mg);
946 } else {
947 if (mg->requeue_holder) {
948 clear_dirty(cache, mg->new_oblock, mg->cblock);
949 cell_defer(cache, mg->new_ocell, true);
950 } else {
952 * The block was promoted via an overwrite, so it's dirty.
954 set_dirty(cache, mg->new_oblock, mg->cblock);
955 bio_endio(mg->new_ocell->holder, 0);
956 cell_defer(cache, mg->new_ocell, false);
958 free_io_migration(mg);
962 static void copy_complete(int read_err, unsigned long write_err, void *context)
964 unsigned long flags;
965 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
966 struct cache *cache = mg->cache;
968 if (read_err || write_err)
969 mg->err = true;
971 spin_lock_irqsave(&cache->lock, flags);
972 list_add_tail(&mg->list, &cache->completed_migrations);
973 spin_unlock_irqrestore(&cache->lock, flags);
975 wake_worker(cache);
978 static void issue_copy_real(struct dm_cache_migration *mg)
980 int r;
981 struct dm_io_region o_region, c_region;
982 struct cache *cache = mg->cache;
983 sector_t cblock = from_cblock(mg->cblock);
985 o_region.bdev = cache->origin_dev->bdev;
986 o_region.count = cache->sectors_per_block;
988 c_region.bdev = cache->cache_dev->bdev;
989 c_region.sector = cblock * cache->sectors_per_block;
990 c_region.count = cache->sectors_per_block;
992 if (mg->writeback || mg->demote) {
993 /* demote */
994 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
995 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
996 } else {
997 /* promote */
998 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
999 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1002 if (r < 0) {
1003 DMERR_LIMIT("issuing migration failed");
1004 migration_failure(mg);
1008 static void overwrite_endio(struct bio *bio, int err)
1010 struct dm_cache_migration *mg = bio->bi_private;
1011 struct cache *cache = mg->cache;
1012 size_t pb_data_size = get_per_bio_data_size(cache);
1013 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1014 unsigned long flags;
1016 dm_unhook_bio(&pb->hook_info, bio);
1018 if (err)
1019 mg->err = true;
1021 mg->requeue_holder = false;
1023 spin_lock_irqsave(&cache->lock, flags);
1024 list_add_tail(&mg->list, &cache->completed_migrations);
1025 spin_unlock_irqrestore(&cache->lock, flags);
1027 wake_worker(cache);
1030 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1032 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1033 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1035 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1036 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1037 generic_make_request(bio);
1040 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1042 return (bio_data_dir(bio) == WRITE) &&
1043 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
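/*
 * Since dm_set_target_max_io_len() is set to sectors_per_block in
 * cache_create(), bios never span cache blocks, so a write of exactly one
 * block's worth of data overwrites the whole block.  issue_copy() uses
 * this (in writeback mode) to promote via issue_overwrite() rather than a
 * kcopyd copy.
 */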
1046 static void avoid_copy(struct dm_cache_migration *mg)
1048 atomic_inc(&mg->cache->stats.copies_avoided);
1049 migration_success_pre_commit(mg);
1052 static void issue_copy(struct dm_cache_migration *mg)
1054 bool avoid;
1055 struct cache *cache = mg->cache;
1057 if (mg->writeback || mg->demote)
1058 avoid = !is_dirty(cache, mg->cblock) ||
1059 is_discarded_oblock(cache, mg->old_oblock);
1060 else {
1061 struct bio *bio = mg->new_ocell->holder;
1063 avoid = is_discarded_oblock(cache, mg->new_oblock);
1065 if (writeback_mode(&cache->features) &&
1066 !avoid && bio_writes_complete_block(cache, bio)) {
1067 issue_overwrite(mg, bio);
1068 return;
1072 avoid ? avoid_copy(mg) : issue_copy_real(mg);
1075 static void complete_migration(struct dm_cache_migration *mg)
1077 if (mg->err)
1078 migration_failure(mg);
1079 else
1080 migration_success_pre_commit(mg);
1083 static void process_migrations(struct cache *cache, struct list_head *head,
1084 void (*fn)(struct dm_cache_migration *))
1086 unsigned long flags;
1087 struct list_head list;
1088 struct dm_cache_migration *mg, *tmp;
1090 INIT_LIST_HEAD(&list);
1091 spin_lock_irqsave(&cache->lock, flags);
1092 list_splice_init(head, &list);
1093 spin_unlock_irqrestore(&cache->lock, flags);
1095 list_for_each_entry_safe(mg, tmp, &list, list)
1096 fn(mg);
1099 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1101 list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1104 static void queue_quiesced_migration(struct dm_cache_migration *mg)
1106 unsigned long flags;
1107 struct cache *cache = mg->cache;
1109 spin_lock_irqsave(&cache->lock, flags);
1110 __queue_quiesced_migration(mg);
1111 spin_unlock_irqrestore(&cache->lock, flags);
1113 wake_worker(cache);
1116 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1118 unsigned long flags;
1119 struct dm_cache_migration *mg, *tmp;
1121 spin_lock_irqsave(&cache->lock, flags);
1122 list_for_each_entry_safe(mg, tmp, work, list)
1123 __queue_quiesced_migration(mg);
1124 spin_unlock_irqrestore(&cache->lock, flags);
1126 wake_worker(cache);
1129 static void check_for_quiesced_migrations(struct cache *cache,
1130 struct per_bio_data *pb)
1132 struct list_head work;
1134 if (!pb->all_io_entry)
1135 return;
1137 INIT_LIST_HEAD(&work);
1138 if (pb->all_io_entry)
1139 dm_deferred_entry_dec(pb->all_io_entry, &work);
1141 if (!list_empty(&work))
1142 queue_quiesced_migrations(cache, &work);
1145 static void quiesce_migration(struct dm_cache_migration *mg)
1147 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1148 queue_quiesced_migration(mg);
1151 static void promote(struct cache *cache, struct prealloc *structs,
1152 dm_oblock_t oblock, dm_cblock_t cblock,
1153 struct dm_bio_prison_cell *cell)
1155 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1157 mg->err = false;
1158 mg->writeback = false;
1159 mg->demote = false;
1160 mg->promote = true;
1161 mg->requeue_holder = true;
1162 mg->invalidate = false;
1163 mg->cache = cache;
1164 mg->new_oblock = oblock;
1165 mg->cblock = cblock;
1166 mg->old_ocell = NULL;
1167 mg->new_ocell = cell;
1168 mg->start_jiffies = jiffies;
1170 inc_io_migrations(cache);
1171 quiesce_migration(mg);
1174 static void writeback(struct cache *cache, struct prealloc *structs,
1175 dm_oblock_t oblock, dm_cblock_t cblock,
1176 struct dm_bio_prison_cell *cell)
1178 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1180 mg->err = false;
1181 mg->writeback = true;
1182 mg->demote = false;
1183 mg->promote = false;
1184 mg->requeue_holder = true;
1185 mg->invalidate = false;
1186 mg->cache = cache;
1187 mg->old_oblock = oblock;
1188 mg->cblock = cblock;
1189 mg->old_ocell = cell;
1190 mg->new_ocell = NULL;
1191 mg->start_jiffies = jiffies;
1193 inc_io_migrations(cache);
1194 quiesce_migration(mg);
1197 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1198 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1199 dm_cblock_t cblock,
1200 struct dm_bio_prison_cell *old_ocell,
1201 struct dm_bio_prison_cell *new_ocell)
1203 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1205 mg->err = false;
1206 mg->writeback = false;
1207 mg->demote = true;
1208 mg->promote = true;
1209 mg->requeue_holder = true;
1210 mg->invalidate = false;
1211 mg->cache = cache;
1212 mg->old_oblock = old_oblock;
1213 mg->new_oblock = new_oblock;
1214 mg->cblock = cblock;
1215 mg->old_ocell = old_ocell;
1216 mg->new_ocell = new_ocell;
1217 mg->start_jiffies = jiffies;
1219 inc_io_migrations(cache);
1220 quiesce_migration(mg);
1224 * Invalidate a cache entry. No writeback occurs; any changes in the cache
1225 * block are thrown away.
1227 static void invalidate(struct cache *cache, struct prealloc *structs,
1228 dm_oblock_t oblock, dm_cblock_t cblock,
1229 struct dm_bio_prison_cell *cell)
1231 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1233 mg->err = false;
1234 mg->writeback = false;
1235 mg->demote = true;
1236 mg->promote = false;
1237 mg->requeue_holder = true;
1238 mg->invalidate = true;
1239 mg->cache = cache;
1240 mg->old_oblock = oblock;
1241 mg->cblock = cblock;
1242 mg->old_ocell = cell;
1243 mg->new_ocell = NULL;
1244 mg->start_jiffies = jiffies;
1246 inc_io_migrations(cache);
1247 quiesce_migration(mg);
1250 /*----------------------------------------------------------------
1251 * bio processing
1252 *--------------------------------------------------------------*/
1253 static void defer_bio(struct cache *cache, struct bio *bio)
1255 unsigned long flags;
1257 spin_lock_irqsave(&cache->lock, flags);
1258 bio_list_add(&cache->deferred_bios, bio);
1259 spin_unlock_irqrestore(&cache->lock, flags);
1261 wake_worker(cache);
1264 static void process_flush_bio(struct cache *cache, struct bio *bio)
1266 size_t pb_data_size = get_per_bio_data_size(cache);
1267 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1269 BUG_ON(bio->bi_iter.bi_size);
1270 if (!pb->req_nr)
1271 remap_to_origin(cache, bio);
1272 else
1273 remap_to_cache(cache, bio, 0);
1275 issue(cache, bio);
1279 * People generally discard large parts of a device, eg, the whole device
1280 * when formatting. Splitting these large discards up into cache block
1281 sized ios and then quiescing (always necessary for discard) takes too
1282 * long.
1284 * We keep it simple, and allow any size of discard to come in, and just
1285 * mark off blocks on the discard bitset. No passdown occurs!
1287 * To implement passdown we need to change the bio_prison such that a cell
1288 * can have a key that spans many blocks.
1290 static void process_discard_bio(struct cache *cache, struct bio *bio)
1292 dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
1293 cache->sectors_per_block);
1294 dm_block_t end_block = bio_end_sector(bio);
1295 dm_block_t b;
1297 end_block = block_div(end_block, cache->sectors_per_block);
1299 for (b = start_block; b < end_block; b++)
1300 set_discard(cache, to_oblock(b));
1302 bio_endio(bio, 0);
1305 static bool spare_migration_bandwidth(struct cache *cache)
1307 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1308 cache->sectors_per_block;
1309 return current_volume < cache->migration_threshold;
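/*
 * E.g. with the default migration_threshold of 2048 sectors and 128 sector
 * cache blocks, a new background migration is allowed as long as the
 * prospective in-flight migration io stays below 2048 sectors, i.e. fewer
 * than 16 concurrent migrations.
 */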
1312 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1314 atomic_inc(bio_data_dir(bio) == READ ?
1315 &cache->stats.read_hit : &cache->stats.write_hit);
1318 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1320 atomic_inc(bio_data_dir(bio) == READ ?
1321 &cache->stats.read_miss : &cache->stats.write_miss);
1324 static void issue_cache_bio(struct cache *cache, struct bio *bio,
1325 struct per_bio_data *pb,
1326 dm_oblock_t oblock, dm_cblock_t cblock)
1328 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1329 remap_to_cache_dirty(cache, bio, oblock, cblock);
1330 issue(cache, bio);
1333 static void process_bio(struct cache *cache, struct prealloc *structs,
1334 struct bio *bio)
1336 int r;
1337 bool release_cell = true;
1338 dm_oblock_t block = get_bio_block(cache, bio);
1339 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1340 struct policy_result lookup_result;
1341 size_t pb_data_size = get_per_bio_data_size(cache);
1342 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1343 bool discarded_block = is_discarded_oblock(cache, block);
1344 bool passthrough = passthrough_mode(&cache->features);
1345 bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1348 * Check to see if that block is currently migrating.
1350 cell_prealloc = prealloc_get_cell(structs);
1351 r = bio_detain(cache, block, bio, cell_prealloc,
1352 (cell_free_fn) prealloc_put_cell,
1353 structs, &new_ocell);
1354 if (r > 0)
1355 return;
1357 r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1358 bio, &lookup_result);
1360 if (r == -EWOULDBLOCK)
1361 /* migration has been denied */
1362 lookup_result.op = POLICY_MISS;
1364 switch (lookup_result.op) {
1365 case POLICY_HIT:
1366 if (passthrough) {
1367 inc_miss_counter(cache, bio);
1370 * Passthrough always maps to the origin,
1371 * invalidating any cache blocks that are written
1372 * to.
1375 if (bio_data_dir(bio) == WRITE) {
1376 atomic_inc(&cache->stats.demotion);
1377 invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1378 release_cell = false;
1380 } else {
1381 /* FIXME: factor out issue_origin() */
1382 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1383 remap_to_origin_clear_discard(cache, bio, block);
1384 issue(cache, bio);
1386 } else {
1387 inc_hit_counter(cache, bio);
1389 if (bio_data_dir(bio) == WRITE &&
1390 writethrough_mode(&cache->features) &&
1391 !is_dirty(cache, lookup_result.cblock)) {
1392 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1393 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1394 issue(cache, bio);
1395 } else
1396 issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
1399 break;
1401 case POLICY_MISS:
1402 inc_miss_counter(cache, bio);
1403 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1404 remap_to_origin_clear_discard(cache, bio, block);
1405 issue(cache, bio);
1406 break;
1408 case POLICY_NEW:
1409 atomic_inc(&cache->stats.promotion);
1410 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1411 release_cell = false;
1412 break;
1414 case POLICY_REPLACE:
1415 cell_prealloc = prealloc_get_cell(structs);
1416 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1417 (cell_free_fn) prealloc_put_cell,
1418 structs, &old_ocell);
1419 if (r > 0) {
1421 * We have to be careful to avoid lock inversion of
1422 * the cells. So we back off, and wait for the
1423 * old_ocell to become free.
1425 policy_force_mapping(cache->policy, block,
1426 lookup_result.old_oblock);
1427 atomic_inc(&cache->stats.cache_cell_clash);
1428 break;
1430 atomic_inc(&cache->stats.demotion);
1431 atomic_inc(&cache->stats.promotion);
1433 demote_then_promote(cache, structs, lookup_result.old_oblock,
1434 block, lookup_result.cblock,
1435 old_ocell, new_ocell);
1436 release_cell = false;
1437 break;
1439 default:
1440 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1441 (unsigned) lookup_result.op);
1442 bio_io_error(bio);
1445 if (release_cell)
1446 cell_defer(cache, new_ocell, false);
1449 static int need_commit_due_to_time(struct cache *cache)
1451 return jiffies < cache->last_commit_jiffies ||
1452 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
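/*
 * i.e. commit roughly once a second (COMMIT_PERIOD is HZ); the first
 * comparison forces a commit if jiffies has wrapped since the last one.
 */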
1455 static int commit_if_needed(struct cache *cache)
1457 int r = 0;
1459 if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1460 dm_cache_changed_this_transaction(cache->cmd)) {
1461 atomic_inc(&cache->stats.commit_count);
1462 cache->commit_requested = false;
1463 r = dm_cache_commit(cache->cmd, false);
1464 cache->last_commit_jiffies = jiffies;
1467 return r;
1470 static void process_deferred_bios(struct cache *cache)
1472 unsigned long flags;
1473 struct bio_list bios;
1474 struct bio *bio;
1475 struct prealloc structs;
1477 memset(&structs, 0, sizeof(structs));
1478 bio_list_init(&bios);
1480 spin_lock_irqsave(&cache->lock, flags);
1481 bio_list_merge(&bios, &cache->deferred_bios);
1482 bio_list_init(&cache->deferred_bios);
1483 spin_unlock_irqrestore(&cache->lock, flags);
1485 while (!bio_list_empty(&bios)) {
1487 * If we've got no free migration structs, and processing
1488 * this bio might require one, we pause until there are some
1489 * prepared mappings to process.
1491 if (prealloc_data_structs(cache, &structs)) {
1492 spin_lock_irqsave(&cache->lock, flags);
1493 bio_list_merge(&cache->deferred_bios, &bios);
1494 spin_unlock_irqrestore(&cache->lock, flags);
1495 break;
1498 bio = bio_list_pop(&bios);
1500 if (bio->bi_rw & REQ_FLUSH)
1501 process_flush_bio(cache, bio);
1502 else if (bio->bi_rw & REQ_DISCARD)
1503 process_discard_bio(cache, bio);
1504 else
1505 process_bio(cache, &structs, bio);
1508 prealloc_free_structs(cache, &structs);
1511 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1513 unsigned long flags;
1514 struct bio_list bios;
1515 struct bio *bio;
1517 bio_list_init(&bios);
1519 spin_lock_irqsave(&cache->lock, flags);
1520 bio_list_merge(&bios, &cache->deferred_flush_bios);
1521 bio_list_init(&cache->deferred_flush_bios);
1522 spin_unlock_irqrestore(&cache->lock, flags);
1524 while ((bio = bio_list_pop(&bios)))
1525 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1528 static void process_deferred_writethrough_bios(struct cache *cache)
1530 unsigned long flags;
1531 struct bio_list bios;
1532 struct bio *bio;
1534 bio_list_init(&bios);
1536 spin_lock_irqsave(&cache->lock, flags);
1537 bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1538 bio_list_init(&cache->deferred_writethrough_bios);
1539 spin_unlock_irqrestore(&cache->lock, flags);
1541 while ((bio = bio_list_pop(&bios)))
1542 generic_make_request(bio);
1545 static void writeback_some_dirty_blocks(struct cache *cache)
1547 int r = 0;
1548 dm_oblock_t oblock;
1549 dm_cblock_t cblock;
1550 struct prealloc structs;
1551 struct dm_bio_prison_cell *old_ocell;
1553 memset(&structs, 0, sizeof(structs));
1555 while (spare_migration_bandwidth(cache)) {
1556 if (prealloc_data_structs(cache, &structs))
1557 break;
1559 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1560 if (r)
1561 break;
1563 r = get_cell(cache, oblock, &structs, &old_ocell);
1564 if (r) {
1565 policy_set_dirty(cache->policy, oblock);
1566 break;
1569 writeback(cache, &structs, oblock, cblock, old_ocell);
1572 prealloc_free_structs(cache, &structs);
1575 /*----------------------------------------------------------------
1576 * Invalidations.
1577 * Dropping something from the cache *without* writing back.
1578 *--------------------------------------------------------------*/
1580 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1582 int r = 0;
1583 uint64_t begin = from_cblock(req->cblocks->begin);
1584 uint64_t end = from_cblock(req->cblocks->end);
1586 while (begin != end) {
1587 r = policy_remove_cblock(cache->policy, to_cblock(begin));
1588 if (!r) {
1589 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1590 if (r)
1591 break;
1593 } else if (r == -ENODATA) {
1594 /* harmless, already unmapped */
1595 r = 0;
1597 } else {
1598 DMERR("policy_remove_cblock failed");
1599 break;
1602 begin++;
1605 cache->commit_requested = true;
1607 req->err = r;
1608 atomic_set(&req->complete, 1);
1610 wake_up(&req->result_wait);
1613 static void process_invalidation_requests(struct cache *cache)
1615 struct list_head list;
1616 struct invalidation_request *req, *tmp;
1618 INIT_LIST_HEAD(&list);
1619 spin_lock(&cache->invalidation_lock);
1620 list_splice_init(&cache->invalidation_requests, &list);
1621 spin_unlock(&cache->invalidation_lock);
1623 list_for_each_entry_safe (req, tmp, &list, list)
1624 process_invalidation_request(cache, req);
1627 /*----------------------------------------------------------------
1628 * Main worker loop
1629 *--------------------------------------------------------------*/
1630 static bool is_quiescing(struct cache *cache)
1632 return atomic_read(&cache->quiescing);
1635 static void ack_quiescing(struct cache *cache)
1637 if (is_quiescing(cache)) {
1638 atomic_inc(&cache->quiescing_ack);
1639 wake_up(&cache->quiescing_wait);
1643 static void wait_for_quiescing_ack(struct cache *cache)
1645 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1648 static void start_quiescing(struct cache *cache)
1650 atomic_inc(&cache->quiescing);
1651 wait_for_quiescing_ack(cache);
1654 static void stop_quiescing(struct cache *cache)
1656 atomic_set(&cache->quiescing, 0);
1657 atomic_set(&cache->quiescing_ack, 0);
1660 static void wait_for_migrations(struct cache *cache)
1662 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
1665 static void stop_worker(struct cache *cache)
1667 cancel_delayed_work(&cache->waker);
1668 flush_workqueue(cache->wq);
1671 static void requeue_deferred_io(struct cache *cache)
1673 struct bio *bio;
1674 struct bio_list bios;
1676 bio_list_init(&bios);
1677 bio_list_merge(&bios, &cache->deferred_bios);
1678 bio_list_init(&cache->deferred_bios);
1680 while ((bio = bio_list_pop(&bios)))
1681 bio_endio(bio, DM_ENDIO_REQUEUE);
1684 static int more_work(struct cache *cache)
1686 if (is_quiescing(cache))
1687 return !list_empty(&cache->quiesced_migrations) ||
1688 !list_empty(&cache->completed_migrations) ||
1689 !list_empty(&cache->need_commit_migrations);
1690 else
1691 return !bio_list_empty(&cache->deferred_bios) ||
1692 !bio_list_empty(&cache->deferred_flush_bios) ||
1693 !bio_list_empty(&cache->deferred_writethrough_bios) ||
1694 !list_empty(&cache->quiesced_migrations) ||
1695 !list_empty(&cache->completed_migrations) ||
1696 !list_empty(&cache->need_commit_migrations) ||
1697 cache->invalidate;
1700 static void do_worker(struct work_struct *ws)
1702 struct cache *cache = container_of(ws, struct cache, worker);
1704 do {
1705 if (!is_quiescing(cache)) {
1706 writeback_some_dirty_blocks(cache);
1707 process_deferred_writethrough_bios(cache);
1708 process_deferred_bios(cache);
1709 process_invalidation_requests(cache);
1712 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1713 process_migrations(cache, &cache->completed_migrations, complete_migration);
1715 if (commit_if_needed(cache)) {
1716 process_deferred_flush_bios(cache, false);
1719 * FIXME: rollback metadata or just go into a
1720 * failure mode and error everything
1722 } else {
1723 process_deferred_flush_bios(cache, true);
1724 process_migrations(cache, &cache->need_commit_migrations,
1725 migration_success_post_commit);
1728 ack_quiescing(cache);
1730 } while (more_work(cache));
1734 * We want to commit periodically so that not too much
1735 * unwritten metadata builds up.
1737 static void do_waker(struct work_struct *ws)
1739 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1740 policy_tick(cache->policy);
1741 wake_worker(cache);
1742 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1745 /*----------------------------------------------------------------*/
1747 static int is_congested(struct dm_dev *dev, int bdi_bits)
1749 struct request_queue *q = bdev_get_queue(dev->bdev);
1750 return bdi_congested(&q->backing_dev_info, bdi_bits);
1753 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1755 struct cache *cache = container_of(cb, struct cache, callbacks);
1757 return is_congested(cache->origin_dev, bdi_bits) ||
1758 is_congested(cache->cache_dev, bdi_bits);
1761 /*----------------------------------------------------------------
1762 * Target methods
1763 *--------------------------------------------------------------*/
1766 * This function gets called on the error paths of the constructor, so we
1767 * have to cope with a partially initialised struct.
1769 static void destroy(struct cache *cache)
1771 unsigned i;
1773 if (cache->migration_pool)
1774 mempool_destroy(cache->migration_pool);
1776 if (cache->all_io_ds)
1777 dm_deferred_set_destroy(cache->all_io_ds);
1779 if (cache->prison)
1780 dm_bio_prison_destroy(cache->prison);
1782 if (cache->wq)
1783 destroy_workqueue(cache->wq);
1785 if (cache->dirty_bitset)
1786 free_bitset(cache->dirty_bitset);
1788 if (cache->discard_bitset)
1789 free_bitset(cache->discard_bitset);
1791 if (cache->copier)
1792 dm_kcopyd_client_destroy(cache->copier);
1794 if (cache->cmd)
1795 dm_cache_metadata_close(cache->cmd);
1797 if (cache->metadata_dev)
1798 dm_put_device(cache->ti, cache->metadata_dev);
1800 if (cache->origin_dev)
1801 dm_put_device(cache->ti, cache->origin_dev);
1803 if (cache->cache_dev)
1804 dm_put_device(cache->ti, cache->cache_dev);
1806 if (cache->policy)
1807 dm_cache_policy_destroy(cache->policy);
1809 for (i = 0; i < cache->nr_ctr_args ; i++)
1810 kfree(cache->ctr_args[i]);
1811 kfree(cache->ctr_args);
1813 kfree(cache);
1816 static void cache_dtr(struct dm_target *ti)
1818 struct cache *cache = ti->private;
1820 destroy(cache);
1823 static sector_t get_dev_size(struct dm_dev *dev)
1825 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1828 /*----------------------------------------------------------------*/
1831 * Construct a cache device mapping.
1833 * cache <metadata dev> <cache dev> <origin dev> <block size>
1834 * <#feature args> [<feature arg>]*
1835 * <policy> <#policy args> [<policy arg>]*
1837 * metadata dev : fast device holding the persistent metadata
1838 * cache dev : fast device holding cached data blocks
1839 * origin dev : slow device holding original data blocks
1840 * block size : cache unit size in sectors
1842 * #feature args : number of feature arguments passed
1843 * feature args : writethrough. (The default is writeback.)
1845 * policy : the replacement policy to use
1846 * #policy args : an even number of policy arguments corresponding
1847 * to key/value pairs passed to the policy
1848 * policy args : key/value pairs passed to the policy
1849 * E.g. 'sequential_threshold 1024'
1850 * See cache-policies.txt for details.
1852 * Optional feature arguments are:
1853 * writethrough : write through caching that prohibits cache block
1854 * content from being different from origin block content.
1855 * Without this argument, the default behaviour is to write
1856 * back cache block contents later for performance reasons,
1857 * so they may differ from the corresponding origin blocks.
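 *
 * An example table line (device names and sizes are illustrative only),
 * using a 128 sector (64KB) block size, no feature args (so writeback) and
 * the 'default' policy with no policy args:
 *
 *    0 41943040 cache /dev/mapper/fast-meta /dev/mapper/fast-data \
 *        /dev/mapper/slow 128 0 default 0
 */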
1859 struct cache_args {
1860 struct dm_target *ti;
1862 struct dm_dev *metadata_dev;
1864 struct dm_dev *cache_dev;
1865 sector_t cache_sectors;
1867 struct dm_dev *origin_dev;
1868 sector_t origin_sectors;
1870 uint32_t block_size;
1872 const char *policy_name;
1873 int policy_argc;
1874 const char **policy_argv;
1876 struct cache_features features;
1879 static void destroy_cache_args(struct cache_args *ca)
1881 if (ca->metadata_dev)
1882 dm_put_device(ca->ti, ca->metadata_dev);
1884 if (ca->cache_dev)
1885 dm_put_device(ca->ti, ca->cache_dev);
1887 if (ca->origin_dev)
1888 dm_put_device(ca->ti, ca->origin_dev);
1890 kfree(ca);
1893 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1895 if (!as->argc) {
1896 *error = "Insufficient args";
1897 return false;
1900 return true;
1903 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1904 char **error)
1906 int r;
1907 sector_t metadata_dev_size;
1908 char b[BDEVNAME_SIZE];
1910 if (!at_least_one_arg(as, error))
1911 return -EINVAL;
1913 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1914 &ca->metadata_dev);
1915 if (r) {
1916 *error = "Error opening metadata device";
1917 return r;
1920 metadata_dev_size = get_dev_size(ca->metadata_dev);
1921 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1922 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1923 bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1925 return 0;
1928 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1929 char **error)
1931 int r;
1933 if (!at_least_one_arg(as, error))
1934 return -EINVAL;
1936 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1937 &ca->cache_dev);
1938 if (r) {
1939 *error = "Error opening cache device";
1940 return r;
1942 ca->cache_sectors = get_dev_size(ca->cache_dev);
1944 return 0;
1947 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1948 char **error)
1950 int r;
1952 if (!at_least_one_arg(as, error))
1953 return -EINVAL;
1955 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1956 &ca->origin_dev);
1957 if (r) {
1958 *error = "Error opening origin device";
1959 return r;
1962 ca->origin_sectors = get_dev_size(ca->origin_dev);
1963 if (ca->ti->len > ca->origin_sectors) {
1964 *error = "Device size larger than cached device";
1965 return -EINVAL;
1968 return 0;
1971 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1972 char **error)
1974 unsigned long block_size;
1976 if (!at_least_one_arg(as, error))
1977 return -EINVAL;
1979 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
1980 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1981 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1982 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1983 *error = "Invalid data block size";
1984 return -EINVAL;
1987 if (block_size > ca->cache_sectors) {
1988 *error = "Data block size is larger than the cache device";
1989 return -EINVAL;
1992 ca->block_size = block_size;
1994 return 0;
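/*
 * So, for example, 512 sectors (256KB) is accepted, whereas 100 is
 * rejected: the block size must be a multiple of 64 sectors and lie
 * between 64 and 2097152 sectors inclusive.
 */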
1997 static void init_features(struct cache_features *cf)
1999 cf->mode = CM_WRITE;
2000 cf->io_mode = CM_IO_WRITEBACK;
2003 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2004 char **error)
2006 static struct dm_arg _args[] = {
2007 {0, 1, "Invalid number of cache feature arguments"},
2010 int r;
2011 unsigned argc;
2012 const char *arg;
2013 struct cache_features *cf = &ca->features;
2015 init_features(cf);
2017 r = dm_read_arg_group(_args, as, &argc, error);
2018 if (r)
2019 return -EINVAL;
2021 while (argc--) {
2022 arg = dm_shift_arg(as);
2024 if (!strcasecmp(arg, "writeback"))
2025 cf->io_mode = CM_IO_WRITEBACK;
2027 else if (!strcasecmp(arg, "writethrough"))
2028 cf->io_mode = CM_IO_WRITETHROUGH;
2030 else if (!strcasecmp(arg, "passthrough"))
2031 cf->io_mode = CM_IO_PASSTHROUGH;
2033 else {
2034 *error = "Unrecognised cache feature requested";
2035 return -EINVAL;
2039 return 0;
2042 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2043 char **error)
2045 static struct dm_arg _args[] = {
2046 {0, 1024, "Invalid number of policy arguments"},
2049 int r;
2051 if (!at_least_one_arg(as, error))
2052 return -EINVAL;
2054 ca->policy_name = dm_shift_arg(as);
2056 r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2057 if (r)
2058 return -EINVAL;
2060 ca->policy_argv = (const char **)as->argv;
2061 dm_consume_args(as, ca->policy_argc);
2063 return 0;
2066 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2067 char **error)
2069 int r;
2070 struct dm_arg_set as;
2072 as.argc = argc;
2073 as.argv = argv;
2075 r = parse_metadata_dev(ca, &as, error);
2076 if (r)
2077 return r;
2079 r = parse_cache_dev(ca, &as, error);
2080 if (r)
2081 return r;
2083 r = parse_origin_dev(ca, &as, error);
2084 if (r)
2085 return r;
2087 r = parse_block_size(ca, &as, error);
2088 if (r)
2089 return r;
2091 r = parse_features(ca, &as, error);
2092 if (r)
2093 return r;
2095 r = parse_policy(ca, &as, error);
2096 if (r)
2097 return r;
2099 return 0;
2102 /*----------------------------------------------------------------*/
2104 static struct kmem_cache *migration_cache;
2106 #define NOT_CORE_OPTION 1
2108 static int process_config_option(struct cache *cache, const char *key, const char *value)
2110 unsigned long tmp;
2112 if (!strcasecmp(key, "migration_threshold")) {
2113 if (kstrtoul(value, 10, &tmp))
2114 return -EINVAL;
2116 cache->migration_threshold = tmp;
2117 return 0;
2120 return NOT_CORE_OPTION;
2123 static int set_config_value(struct cache *cache, const char *key, const char *value)
2125 int r = process_config_option(cache, key, value);
2127 if (r == NOT_CORE_OPTION)
2128 r = policy_set_config_value(cache->policy, key, value);
2130 if (r)
2131 DMWARN("bad config value for %s: %s", key, value);
2133 return r;
2136 static int set_config_values(struct cache *cache, int argc, const char **argv)
2138 int r = 0;
2140 if (argc & 1) {
2141 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2142 return -EINVAL;
2145 while (argc) {
2146 r = set_config_value(cache, argv[0], argv[1]);
2147 if (r)
2148 break;
2150 argc -= 2;
2151 argv += 2;
2154 return r;
2157 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2158 char **error)
2160 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2161 cache->cache_size,
2162 cache->origin_sectors,
2163 cache->sectors_per_block);
2164 if (IS_ERR(p)) {
2165 *error = "Error creating cache's policy";
2166 return PTR_ERR(p);
2168 cache->policy = p;
2170 return 0;
2173 #define DEFAULT_MIGRATION_THRESHOLD 2048
2175 static int cache_create(struct cache_args *ca, struct cache **result)
2177 int r = 0;
2178 char **error = &ca->ti->error;
2179 struct cache *cache;
2180 struct dm_target *ti = ca->ti;
2181 dm_block_t origin_blocks;
2182 struct dm_cache_metadata *cmd;
2183 bool may_format = ca->features.mode == CM_WRITE;
2185 cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2186 if (!cache)
2187 return -ENOMEM;
2189 cache->ti = ca->ti;
2190 ti->private = cache;
2191 ti->num_flush_bios = 2;
2192 ti->flush_supported = true;
2194 ti->num_discard_bios = 1;
2195 ti->discards_supported = true;
2196 ti->discard_zeroes_data_unsupported = true;
2197 /* Discard bios must be split on a block boundary */
2198 ti->split_discard_bios = true;
2200 cache->features = ca->features;
2201 ti->per_bio_data_size = get_per_bio_data_size(cache);
2203 cache->callbacks.congested_fn = cache_is_congested;
2204 dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2206 cache->metadata_dev = ca->metadata_dev;
2207 cache->origin_dev = ca->origin_dev;
2208 cache->cache_dev = ca->cache_dev;
2210 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2212 /* FIXME: factor out this whole section */
2213 origin_blocks = cache->origin_sectors = ca->origin_sectors;
2214 origin_blocks = block_div(origin_blocks, ca->block_size);
2215 cache->origin_blocks = to_oblock(origin_blocks);
2217 cache->sectors_per_block = ca->block_size;
2218 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2219 r = -EINVAL;
2220 goto bad;
2223 if (ca->block_size & (ca->block_size - 1)) {
2224 dm_block_t cache_size = ca->cache_sectors;
2226 cache->sectors_per_block_shift = -1;
2227 cache_size = block_div(cache_size, ca->block_size);
2228 cache->cache_size = to_cblock(cache_size);
2229 } else {
2230 cache->sectors_per_block_shift = __ffs(ca->block_size);
2231 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
2234 r = create_cache_policy(cache, ca, error);
2235 if (r)
2236 goto bad;
2238 cache->policy_nr_args = ca->policy_argc;
2239 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2241 r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2242 if (r) {
2243 *error = "Error setting cache policy's config values";
2244 goto bad;
2247 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2248 ca->block_size, may_format,
2249 dm_cache_policy_get_hint_size(cache->policy));
2250 if (IS_ERR(cmd)) {
2251 *error = "Error creating metadata object";
2252 r = PTR_ERR(cmd);
2253 goto bad;
2255 cache->cmd = cmd;
2257 if (passthrough_mode(&cache->features)) {
2258 bool all_clean;
2260 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2261 if (r) {
2262 *error = "dm_cache_metadata_all_clean() failed";
2263 goto bad;
2266 if (!all_clean) {
2267 *error = "Cannot enter passthrough mode unless all blocks are clean";
2268 r = -EINVAL;
2269 goto bad;
2273 spin_lock_init(&cache->lock);
2274 bio_list_init(&cache->deferred_bios);
2275 bio_list_init(&cache->deferred_flush_bios);
2276 bio_list_init(&cache->deferred_writethrough_bios);
2277 INIT_LIST_HEAD(&cache->quiesced_migrations);
2278 INIT_LIST_HEAD(&cache->completed_migrations);
2279 INIT_LIST_HEAD(&cache->need_commit_migrations);
2280 atomic_set(&cache->nr_allocated_migrations, 0);
2281 atomic_set(&cache->nr_io_migrations, 0);
2282 init_waitqueue_head(&cache->migration_wait);
2284 init_waitqueue_head(&cache->quiescing_wait);
2285 atomic_set(&cache->quiescing, 0);
2286 atomic_set(&cache->quiescing_ack, 0);
2288 r = -ENOMEM;
2289 atomic_set(&cache->nr_dirty, 0);
2290 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2291 if (!cache->dirty_bitset) {
2292 *error = "could not allocate dirty bitset";
2293 goto bad;
2295 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2297 cache->discard_nr_blocks = cache->origin_blocks;
2298 cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks));
2299 if (!cache->discard_bitset) {
2300 *error = "could not allocate discard bitset";
2301 goto bad;
2303 clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks));
2305 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2306 if (IS_ERR(cache->copier)) {
2307 *error = "could not create kcopyd client";
2308 r = PTR_ERR(cache->copier);
2309 goto bad;
2312 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2313 if (!cache->wq) {
2314 *error = "could not create workqueue";
2315 goto bad;
2317 INIT_WORK(&cache->worker, do_worker);
2318 INIT_DELAYED_WORK(&cache->waker, do_waker);
2319 cache->last_commit_jiffies = jiffies;
2321 cache->prison = dm_bio_prison_create(PRISON_CELLS);
2322 if (!cache->prison) {
2323 *error = "could not create bio prison";
2324 goto bad;
2327 cache->all_io_ds = dm_deferred_set_create();
2328 if (!cache->all_io_ds) {
2329 *error = "could not create all_io deferred set";
2330 goto bad;
2333 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2334 migration_cache);
2335 if (!cache->migration_pool) {
2336 *error = "Error creating cache's migration mempool";
2337 goto bad;
2340 cache->need_tick_bio = true;
2341 cache->sized = false;
2342 cache->invalidate = false;
2343 cache->commit_requested = false;
2344 cache->loaded_mappings = false;
2345 cache->loaded_discards = false;
2347 load_stats(cache);
2349 atomic_set(&cache->stats.demotion, 0);
2350 atomic_set(&cache->stats.promotion, 0);
2351 atomic_set(&cache->stats.copies_avoided, 0);
2352 atomic_set(&cache->stats.cache_cell_clash, 0);
2353 atomic_set(&cache->stats.commit_count, 0);
2354 atomic_set(&cache->stats.discard_count, 0);
2356 spin_lock_init(&cache->invalidation_lock);
2357 INIT_LIST_HEAD(&cache->invalidation_requests);
2359 *result = cache;
2360 return 0;
2362 bad:
2363 destroy(cache);
2364 return r;
2367 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2369 unsigned i;
2370 const char **copy;
2372 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2373 if (!copy)
2374 return -ENOMEM;
2375 for (i = 0; i < argc; i++) {
2376 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2377 if (!copy[i]) {
2378 while (i--)
2379 kfree(copy[i]);
2380 kfree(copy);
2381 return -ENOMEM;
2385 cache->nr_ctr_args = argc;
2386 cache->ctr_args = copy;
2388 return 0;
2391 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2393 int r = -EINVAL;
2394 struct cache_args *ca;
2395 struct cache *cache = NULL;
2397 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2398 if (!ca) {
2399 ti->error = "Error allocating memory for cache";
2400 return -ENOMEM;
2402 ca->ti = ti;
2404 r = parse_cache_args(ca, argc, argv, &ti->error);
2405 if (r)
2406 goto out;
2408 r = cache_create(ca, &cache);
2409 if (r)
2410 goto out;
2412 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2413 if (r) {
2414 destroy(cache);
2415 goto out;
2418 ti->private = cache;
2420 out:
2421 destroy_cache_args(ca);
2422 return r;
2425 static int cache_map(struct dm_target *ti, struct bio *bio)
2427 struct cache *cache = ti->private;
2429 int r;
2430 dm_oblock_t block = get_bio_block(cache, bio);
2431 size_t pb_data_size = get_per_bio_data_size(cache);
2432 bool can_migrate = false;
2433 bool discarded_block;
2434 struct dm_bio_prison_cell *cell;
2435 struct policy_result lookup_result;
2436 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2438 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2440 * This can only occur if the io goes to a partial block at
2441 * the end of the origin device. We don't cache these.
2442 * Just remap to the origin and carry on.
2444 remap_to_origin(cache, bio);
2445 return DM_MAPIO_REMAPPED;
2448 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2449 defer_bio(cache, bio);
2450 return DM_MAPIO_SUBMITTED;
2454 * Check to see if that block is currently migrating.
2456 cell = alloc_prison_cell(cache);
2457 if (!cell) {
2458 defer_bio(cache, bio);
2459 return DM_MAPIO_SUBMITTED;
2462 r = bio_detain(cache, block, bio, cell,
2463 (cell_free_fn) free_prison_cell,
2464 cache, &cell);
2465 if (r) {
2466 if (r < 0)
2467 defer_bio(cache, bio);
2469 return DM_MAPIO_SUBMITTED;
2472 discarded_block = is_discarded_oblock(cache, block);
2474 r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2475 bio, &lookup_result);
2476 if (r == -EWOULDBLOCK) {
2477 cell_defer(cache, cell, true);
2478 return DM_MAPIO_SUBMITTED;
2480 } else if (r) {
2481 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2482 bio_io_error(bio);
2483 return DM_MAPIO_SUBMITTED;
2486 r = DM_MAPIO_REMAPPED;
2487 switch (lookup_result.op) {
2488 case POLICY_HIT:
2489 if (passthrough_mode(&cache->features)) {
2490 if (bio_data_dir(bio) == WRITE) {
2492 * We need to invalidate this block, so
2493 * defer for the worker thread.
2495 cell_defer(cache, cell, true);
2496 r = DM_MAPIO_SUBMITTED;
2498 } else {
2499 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2500 inc_miss_counter(cache, bio);
2501 remap_to_origin_clear_discard(cache, bio, block);
2503 cell_defer(cache, cell, false);
2506 } else {
2507 inc_hit_counter(cache, bio);
2508 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2510 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2511 !is_dirty(cache, lookup_result.cblock))
2512 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2513 else
2514 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2516 cell_defer(cache, cell, false);
2518 break;
2520 case POLICY_MISS:
2521 inc_miss_counter(cache, bio);
2522 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2524 if (pb->req_nr != 0) {
2526 * This is a duplicate writethrough io that is no
2527 * longer needed because the block has been demoted.
2529 bio_endio(bio, 0);
2530 cell_defer(cache, cell, false);
2531 return DM_MAPIO_SUBMITTED;
2532 } else {
2533 remap_to_origin_clear_discard(cache, bio, block);
2534 cell_defer(cache, cell, false);
2536 break;
2538 default:
2539 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2540 (unsigned) lookup_result.op);
2541 bio_io_error(bio);
2542 r = DM_MAPIO_SUBMITTED;
2545 return r;
2548 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2550 struct cache *cache = ti->private;
2551 unsigned long flags;
2552 size_t pb_data_size = get_per_bio_data_size(cache);
2553 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2555 if (pb->tick) {
2556 policy_tick(cache->policy);
2558 spin_lock_irqsave(&cache->lock, flags);
2559 cache->need_tick_bio = true;
2560 spin_unlock_irqrestore(&cache->lock, flags);
2563 check_for_quiesced_migrations(cache, pb);
2565 return 0;
2568 static int write_dirty_bitset(struct cache *cache)
2570 unsigned i; int r;
2572 for (i = 0; i < from_cblock(cache->cache_size); i++) {
2573 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2574 is_dirty(cache, to_cblock(i)));
2575 if (r)
2576 return r;
2579 return 0;
2582 static int write_discard_bitset(struct cache *cache)
2584 unsigned i; int r;
2586 r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block,
2587 cache->origin_blocks);
2588 if (r) {
2589 DMERR("could not resize on-disk discard bitset");
2590 return r;
2593 for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) {
2594 r = dm_cache_set_discard(cache->cmd, to_oblock(i),
2595 is_discarded(cache, to_oblock(i)));
2596 if (r)
2597 return r;
2600 return 0;
2604 * returns true on success
2606 static bool sync_metadata(struct cache *cache)
2608 int r1, r2, r3, r4;
2610 r1 = write_dirty_bitset(cache);
2611 if (r1)
2612 DMERR("could not write dirty bitset");
2614 r2 = write_discard_bitset(cache);
2615 if (r2)
2616 DMERR("could not write discard bitset");
2618 save_stats(cache);
2620 r3 = dm_cache_write_hints(cache->cmd, cache->policy);
2621 if (r3)
2622 DMERR("could not write hints");
2625 * If writing the above metadata failed, we still commit, but don't
2626 * set the clean shutdown flag. This will effectively force every
2627 * dirty bit to be set on reload.
2629 r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2630 if (r4)
2631 DMERR("could not write cache metadata. Data loss may occur.");
2633 return !r1 && !r2 && !r3 && !r4;
2636 static void cache_postsuspend(struct dm_target *ti)
2638 struct cache *cache = ti->private;
2640 start_quiescing(cache);
2641 wait_for_migrations(cache);
2642 stop_worker(cache);
2643 requeue_deferred_io(cache);
2644 stop_quiescing(cache);
2646 (void) sync_metadata(cache);
2649 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2650 bool dirty, uint32_t hint, bool hint_valid)
2652 int r;
2653 struct cache *cache = context;
2655 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2656 if (r)
2657 return r;
2659 if (dirty)
2660 set_dirty(cache, oblock, cblock);
2661 else
2662 clear_dirty(cache, oblock, cblock);
2664 return 0;
2667 static int load_discard(void *context, sector_t discard_block_size,
2668 dm_oblock_t oblock, bool discard)
2670 struct cache *cache = context;
2672 if (discard)
2673 set_discard(cache, oblock);
2674 else
2675 clear_discard(cache, oblock);
2677 return 0;
2680 static dm_cblock_t get_cache_dev_size(struct cache *cache)
2682 sector_t size = get_dev_size(cache->cache_dev);
2683 (void) sector_div(size, cache->sectors_per_block);
2684 return to_cblock(size);
2687 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2689 if (from_cblock(new_size) > from_cblock(cache->cache_size))
2690 return true;
2693 * We can't drop a dirty block when shrinking the cache.
2695 while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2696 new_size = to_cblock(from_cblock(new_size) + 1);
2697 if (is_dirty(cache, new_size)) {
2698 DMERR("unable to shrink cache; cache block %llu is dirty",
2699 (unsigned long long) from_cblock(new_size));
2700 return false;
2704 return true;
2707 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2709 int r;
2711 r = dm_cache_resize(cache->cmd, new_size);
2712 if (r) {
2713 DMERR("could not resize cache metadata");
2714 return r;
2717 cache->cache_size = new_size;
2719 return 0;
2722 static int cache_preresume(struct dm_target *ti)
2724 int r = 0;
2725 struct cache *cache = ti->private;
2726 dm_cblock_t csize = get_cache_dev_size(cache);
2729 * Check to see if the cache has resized.
2731 if (!cache->sized) {
2732 r = resize_cache_dev(cache, csize);
2733 if (r)
2734 return r;
2736 cache->sized = true;
2738 } else if (csize != cache->cache_size) {
2739 if (!can_resize(cache, csize))
2740 return -EINVAL;
2742 r = resize_cache_dev(cache, csize);
2743 if (r)
2744 return r;
2747 if (!cache->loaded_mappings) {
2748 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2749 load_mapping, cache);
2750 if (r) {
2751 DMERR("could not load cache mappings");
2752 return r;
2755 cache->loaded_mappings = true;
2758 if (!cache->loaded_discards) {
2759 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2760 if (r) {
2761 DMERR("could not load origin discards");
2762 return r;
2765 cache->loaded_discards = true;
2768 return r;
2771 static void cache_resume(struct dm_target *ti)
2773 struct cache *cache = ti->private;
2775 cache->need_tick_bio = true;
2776 do_waker(&cache->waker.work);
2780 * Status format:
2782 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
2783 * <cache block size> <#used cache blocks>/<#total cache blocks>
2784 * <#read hits> <#read misses> <#write hits> <#write misses>
2785 * <#demotions> <#promotions> <#dirty>
2786 * <#features> <features>*
2787 * <#core args> <core args>
2788 * <policy name> <#policy args> <policy args>*
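/*
 * For illustration only (all numbers invented), the INFO output assembled
 * by the DMEMIT() calls below might look like:
 *
 *   8 72/4096 512 156/3276 241 2363 289 2920 133 29 59 \
 *     1 writeback 2 migration_threshold 2048 mq ...
 *
 * i.e. metadata block size, used/total metadata blocks, cache block size,
 * used/total cache blocks, read hits/misses, write hits/misses, demotions,
 * promotions, the dirty count, then the feature args, core args and finally
 * the policy name followed by whatever config values the policy emits.
 */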
2790 static void cache_status(struct dm_target *ti, status_type_t type,
2791 unsigned status_flags, char *result, unsigned maxlen)
2793 int r = 0;
2794 unsigned i;
2795 ssize_t sz = 0;
2796 dm_block_t nr_free_blocks_metadata = 0;
2797 dm_block_t nr_blocks_metadata = 0;
2798 char buf[BDEVNAME_SIZE];
2799 struct cache *cache = ti->private;
2800 dm_cblock_t residency;
2802 switch (type) {
2803 case STATUSTYPE_INFO:
2804 /* Commit to ensure statistics aren't out-of-date */
2805 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2806 r = dm_cache_commit(cache->cmd, false);
2807 if (r)
2808 DMERR("could not commit metadata for accurate status");
2811 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2812 &nr_free_blocks_metadata);
2813 if (r) {
2814 DMERR("could not get metadata free block count");
2815 goto err;
2818 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2819 if (r) {
2820 DMERR("could not get metadata device size");
2821 goto err;
2824 residency = policy_residency(cache->policy);
2826 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
2827 (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
2828 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2829 (unsigned long long)nr_blocks_metadata,
2830 cache->sectors_per_block,
2831 (unsigned long long) from_cblock(residency),
2832 (unsigned long long) from_cblock(cache->cache_size),
2833 (unsigned) atomic_read(&cache->stats.read_hit),
2834 (unsigned) atomic_read(&cache->stats.read_miss),
2835 (unsigned) atomic_read(&cache->stats.write_hit),
2836 (unsigned) atomic_read(&cache->stats.write_miss),
2837 (unsigned) atomic_read(&cache->stats.demotion),
2838 (unsigned) atomic_read(&cache->stats.promotion),
2839 (unsigned long) atomic_read(&cache->nr_dirty));
2841 if (writethrough_mode(&cache->features))
2842 DMEMIT("1 writethrough ");
2844 else if (passthrough_mode(&cache->features))
2845 DMEMIT("1 passthrough ");
2847 else if (writeback_mode(&cache->features))
2848 DMEMIT("1 writeback ");
2850 else {
2851 DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
2852 goto err;
2855 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2857 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
2858 if (sz < maxlen) {
2859 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2860 if (r)
2861 DMERR("policy_emit_config_values returned %d", r);
2864 break;
2866 case STATUSTYPE_TABLE:
2867 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2868 DMEMIT("%s ", buf);
2869 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2870 DMEMIT("%s ", buf);
2871 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2872 DMEMIT("%s", buf);
2874 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2875 DMEMIT(" %s", cache->ctr_args[i]);
2876 if (cache->nr_ctr_args)
2877 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2880 return;
2882 err:
2883 DMEMIT("Error");
2887 * A cache block range can take two forms:
2889 * i) A single cblock, eg. '3456'
2890 * ii) A begin and end cblock with a dash between, eg. 123-234
2892 static int parse_cblock_range(struct cache *cache, const char *str,
2893 struct cblock_range *result)
2895 char dummy;
2896 uint64_t b, e;
2897 int r;
2900 * Try and parse form (ii) first.
2902 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
2903 if (r < 0)
2904 return r;
2906 if (r == 2) {
2907 result->begin = to_cblock(b);
2908 result->end = to_cblock(e);
2909 return 0;
2913 * That didn't work, try form (i).
2915 r = sscanf(str, "%llu%c", &b, &dummy);
2916 if (r < 0)
2917 return r;
2919 if (r == 1) {
2920 result->begin = to_cblock(b);
2921 result->end = to_cblock(from_cblock(result->begin) + 1u);
2922 return 0;
2925 DMERR("invalid cblock range '%s'", str);
2926 return -EINVAL;
2929 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
2931 uint64_t b = from_cblock(range->begin);
2932 uint64_t e = from_cblock(range->end);
2933 uint64_t n = from_cblock(cache->cache_size);
2935 if (b >= n) {
2936 DMERR("begin cblock out of range: %llu >= %llu", b, n);
2937 return -EINVAL;
2940 if (e > n) {
2941 DMERR("end cblock out of range: %llu > %llu", e, n);
2942 return -EINVAL;
2945 if (b >= e) {
2946 DMERR("invalid cblock range: %llu >= %llu", b, e);
2947 return -EINVAL;
2950 return 0;
2953 static int request_invalidation(struct cache *cache, struct cblock_range *range)
2955 struct invalidation_request req;
2957 INIT_LIST_HEAD(&req.list);
2958 req.cblocks = range;
2959 atomic_set(&req.complete, 0);
2960 req.err = 0;
2961 init_waitqueue_head(&req.result_wait);
2963 spin_lock(&cache->invalidation_lock);
2964 list_add(&req.list, &cache->invalidation_requests);
2965 spin_unlock(&cache->invalidation_lock);
2966 wake_worker(cache);
2968 wait_event(req.result_wait, atomic_read(&req.complete));
2969 return req.err;
2972 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
2973 const char **cblock_ranges)
2975 int r = 0;
2976 unsigned i;
2977 struct cblock_range range;
2979 if (!passthrough_mode(&cache->features)) {
2980 DMERR("cache has to be in passthrough mode for invalidation");
2981 return -EPERM;
2984 for (i = 0; i < count; i++) {
2985 r = parse_cblock_range(cache, cblock_ranges[i], &range);
2986 if (r)
2987 break;
2989 r = validate_cblock_range(cache, &range);
2990 if (r)
2991 break;
2994 * Pass the begin and end cache blocks of the range to the worker and wake it.
2996 r = request_invalidation(cache, &range);
2997 if (r)
2998 break;
3001 return r;
3005 * Supports
3006 * "<key> <value>"
3007 * and
3008 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3010 * The key migration_threshold is supported by the cache target core.
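/*
 * For example (device name hypothetical), messages are normally sent via
 * dmsetup:
 *
 *   dmsetup message my-cache 0 migration_threshold 204800
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 7890-8900
 *
 * The invalidate_cblocks form is only honoured while the cache is in
 * passthrough mode (see process_invalidate_cblocks_message() above).
 */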
3012 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3014 struct cache *cache = ti->private;
3016 if (!argc)
3017 return -EINVAL;
3019 if (!strcasecmp(argv[0], "invalidate_cblocks"))
3020 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3022 if (argc != 2)
3023 return -EINVAL;
3025 return set_config_value(cache, argv[0], argv[1]);
3028 static int cache_iterate_devices(struct dm_target *ti,
3029 iterate_devices_callout_fn fn, void *data)
3031 int r = 0;
3032 struct cache *cache = ti->private;
3034 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3035 if (!r)
3036 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3038 return r;
3042 * We assume I/O is going to the origin (which is the volume
3043 * more likely to have restrictions e.g. by being striped).
3044 * (Looking up the exact location of the data would be expensive
3045 * and could always be out of date by the time the bio is submitted.)
3047 static int cache_bvec_merge(struct dm_target *ti,
3048 struct bvec_merge_data *bvm,
3049 struct bio_vec *biovec, int max_size)
3051 struct cache *cache = ti->private;
3052 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
3054 if (!q->merge_bvec_fn)
3055 return max_size;
3057 bvm->bi_bdev = cache->origin_dev->bdev;
3058 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3061 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3064 * FIXME: these limits may be incompatible with the cache device
3066 limits->max_discard_sectors = cache->sectors_per_block;
3067 limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT;
3070 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3072 struct cache *cache = ti->private;
3073 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3076 * If the system-determined stacked limits are compatible with the
3077 * cache's blocksize (io_opt is a factor) do not override them.
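/*
 * For example (figures illustrative): with 512-sector cache blocks, a
 * stacked io_opt of 1024 sectors divides evenly and is left alone, whereas
 * an io_opt of 768 sectors (768 % 512 == 256) triggers the override below.
 */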
3079 if (io_opt_sectors < cache->sectors_per_block ||
3080 do_div(io_opt_sectors, cache->sectors_per_block)) {
3081 blk_limits_io_min(limits, 0);
3082 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3084 set_discard_limits(cache, limits);
3087 /*----------------------------------------------------------------*/
3089 static struct target_type cache_target = {
3090 .name = "cache",
3091 .version = {1, 4, 0},
3092 .module = THIS_MODULE,
3093 .ctr = cache_ctr,
3094 .dtr = cache_dtr,
3095 .map = cache_map,
3096 .end_io = cache_end_io,
3097 .postsuspend = cache_postsuspend,
3098 .preresume = cache_preresume,
3099 .resume = cache_resume,
3100 .status = cache_status,
3101 .message = cache_message,
3102 .iterate_devices = cache_iterate_devices,
3103 .merge = cache_bvec_merge,
3104 .io_hints = cache_io_hints,
3107 static int __init dm_cache_init(void)
3109 int r;
3111 r = dm_register_target(&cache_target);
3112 if (r) {
3113 DMERR("cache target registration failed: %d", r);
3114 return r;
3117 migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3118 if (!migration_cache) {
3119 dm_unregister_target(&cache_target);
3120 return -ENOMEM;
3123 return 0;
3126 static void __exit dm_cache_exit(void)
3128 dm_unregister_target(&cache_target);
3129 kmem_cache_destroy(migration_cache);
3132 module_init(dm_cache_init);
3133 module_exit(dm_cache_exit);
3135 MODULE_DESCRIPTION(DM_NAME " cache target");
3136 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3137 MODULE_LICENSE("GPL");