drivers/md/dm-cache-target.c

   1 /*
   2  * Copyright (C) 2012 Red Hat. All rights reserved.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include "dm.h"
   8 #include "dm-bio-prison.h"
   9 #include "dm-bio-record.h"
  10 #include "dm-cache-metadata.h"
  11
  12 #include <linux/dm-io.h>
  13 #include <linux/dm-kcopyd.h>
  14 #include <linux/init.h>
  15 #include <linux/mempool.h>
  16 #include <linux/module.h>
  17 #include <linux/slab.h>
  18 #include <linux/vmalloc.h>
  19
  20 #define DM_MSG_PREFIX "cache"
  21
/*
 * Declares the kcopyd throttle object "cache_copy_throttle" along with a
 * module parameter described by the string below.
 * NOTE(review): presumably passed to the kcopyd client used for migrations;
 * the hookup is not visible in this part of the file -- confirm.
 */
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
        "A percentage of time allocated for copying to and/or from cache");
  24
  25 /*----------------------------------------------------------------*/
  26
  27 /*
  28  * Glossary:
  29  *
  30  * oblock: index of an origin block
  31  * cblock: index of a cache block
  32  * promotion: movement of a block from origin to cache
  33  * demotion: movement of a block from cache to origin
  34  * migration: movement of a block between the origin and cache device,
  35  *            either direction
  36  */
  37
  38 /*----------------------------------------------------------------*/
  39
  40 static size_t bitset_size_in_bytes(unsigned nr_entries)
  41 {
  42         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
  43 }
  44
  45 static unsigned long *alloc_bitset(unsigned nr_entries)
  46 {
  47         size_t s = bitset_size_in_bytes(nr_entries);
  48         return vzalloc(s);
  49 }
  50
  51 static void clear_bitset(void *bitset, unsigned nr_entries)
  52 {
  53         size_t s = bitset_size_in_bytes(nr_entries);
  54         memset(bitset, 0, s);
  55 }
  56
  57 static void free_bitset(unsigned long *bits)
  58 {
  59         vfree(bits);
  60 }
  61
  62 /*----------------------------------------------------------------*/
  63
/*
 * Tunables.  Their users are not visible in this part of the file, so the
 * notes below are assumptions to be confirmed against the constructor and
 * worker code.
 */
#define PRISON_CELLS 1024               /* presumably sizes the bio prison -- confirm */
#define MIGRATION_POOL_SIZE 128         /* presumably sizes migration_pool -- confirm */
#define COMMIT_PERIOD HZ                /* HZ jiffies, i.e. one second */
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.  Both limits are expressed in sectors.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
  75
/*
 * Whether the cache metadata may be modified.
 *
 * FIXME: the cache is read/write for the time being.
 */
enum cache_mode {
        CM_WRITE,               /* metadata may be changed */
        CM_READ_ONLY,           /* metadata may not be changed */
};
  83
/*
 * Optional behaviours of a cache instance.
 */
struct cache_features {
        enum cache_mode mode;
        /* Selects the larger per-bio data layout, see get_per_bio_data_size(). */
        bool write_through:1;
};
  88
/*
 * Runtime counters.  Only the four hit/miss counters are exchanged with the
 * metadata by load_stats()/save_stats().
 */
struct cache_stats {
        atomic_t read_hit;
        atomic_t read_miss;
        atomic_t write_hit;
        atomic_t write_miss;
        atomic_t demotion;
        atomic_t promotion;
        atomic_t copies_avoided;        /* bumped by avoid_copy() */
        atomic_t cache_cell_clash;
        atomic_t commit_count;
        atomic_t discard_count;         /* bumped by set_discard() */
};
 101
/*
 * Per-target state of one cache instance.
 */
struct cache {
        struct dm_target *ti;
        struct dm_target_callbacks callbacks;

        struct dm_cache_metadata *cmd;

        /*
         * Metadata is written to this device.
         */
        struct dm_dev *metadata_dev;

        /*
         * The slower of the two data devices.  Typically a spindle.
         */
        struct dm_dev *origin_dev;

        /*
         * The faster of the two data devices.  Typically an SSD.
         */
        struct dm_dev *cache_dev;

        /*
         * Size of the origin device in _complete_ blocks and native sectors.
         */
        dm_oblock_t origin_blocks;
        sector_t origin_sectors;

        /*
         * Size of the cache device in blocks.
         */
        dm_cblock_t cache_size;

        /*
         * Fields for converting from sectors to blocks.
         * sectors_per_block_shift is negative when the block size is not a
         * power of two (see block_size_is_power_of_two()).
         */
        uint32_t sectors_per_block;
        int sectors_per_block_shift;

        /*
         * In the code visible here, 'lock' is held around updates to the
         * deferred bio lists, the migration lists, the discard bitset and
         * the need_tick_bio/commit_requested flags.
         */
        spinlock_t lock;
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct bio_list deferred_writethrough_bios;
        struct list_head quiesced_migrations;
        struct list_head completed_migrations;
        struct list_head need_commit_migrations;
        sector_t migration_threshold;
        wait_queue_head_t migration_wait;       /* woken by dec_nr_migrations() */
        atomic_t nr_migrations;

        wait_queue_head_t quiescing_wait;
        atomic_t quiescing_ack;

        /*
         * cache_size entries, dirty if set
         */
        atomic_t nr_dirty;
        unsigned long *dirty_bitset;

        /*
         * origin_blocks entries, discarded if set.
         */
        dm_dblock_t discard_nr_blocks;
        unsigned long *discard_bitset;
        uint32_t discard_block_size;

        /*
         * Rather than reconstructing the table line for the status we just
         * save it and regurgitate.
         */
        unsigned nr_ctr_args;
        const char **ctr_args;

        struct dm_kcopyd_client *copier;
        struct workqueue_struct *wq;
        struct work_struct worker;      /* queued on 'wq' by wake_worker() */

        struct delayed_work waker;
        unsigned long last_commit_jiffies;

        struct dm_bio_prison *prison;
        struct dm_deferred_set *all_io_ds;

        mempool_t *migration_pool;
        struct dm_cache_migration *next_migration;

        struct dm_cache_policy *policy;
        unsigned policy_nr_args;

        bool need_tick_bio:1;
        bool sized:1;
        bool quiescing:1;
        bool commit_requested:1;
        bool loaded_mappings:1;
        bool loaded_discards:1;

        /*
         * Cache features such as write-through.
         */
        struct cache_features features;

        struct cache_stats stats;
};
 204
/*
 * Private data attached to each bio.  In writeback mode only the fields
 * before 'cache' are allocated (PB_DATA_SIZE_WB is offsetof 'cache'); in
 * writethrough mode the whole structure is used.
 */
struct per_bio_data {
        bool tick:1;
        unsigned req_nr:2;
        struct dm_deferred_entry *all_io_entry;

        /*
         * writethrough fields.  These MUST remain at the end of this
         * structure and the 'cache' member must be the first as it
         * is used to determine the offset of the writethrough fields.
         */
        struct cache *cache;
        dm_cblock_t cblock;
        bio_end_io_t *saved_bi_end_io;  /* restored by writethrough_endio() */
        struct dm_bio_details bio_details;
};
 220
/*
 * Describes one in-flight migration (writeback, demotion, promotion, or a
 * demotion followed by a promotion into the same cache block).
 */
struct dm_cache_migration {
        struct list_head list;          /* links into one of the cache's migration lists */
        struct cache *cache;

        unsigned long start_jiffies;
        dm_oblock_t old_oblock;         /* origin block being written back / demoted */
        dm_oblock_t new_oblock;         /* origin block being promoted */
        dm_cblock_t cblock;

        bool err:1;                     /* set by copy_complete() on a copy error */
        bool writeback:1;
        bool demote:1;
        bool promote:1;

        struct dm_bio_prison_cell *old_ocell;
        struct dm_bio_prison_cell *new_ocell;
};
 238
/*
 * Processing a bio in the worker thread may require these memory
 * allocations.  We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 *
 * A NULL member means that slot is currently empty; see
 * prealloc_data_structs() and the prealloc_get_*()/prealloc_put_cell()
 * helpers.
 */
struct prealloc {
        struct dm_cache_migration *mg;
        struct dm_bio_prison_cell *cell1;
        struct dm_bio_prison_cell *cell2;
};
 249
 250 static void wake_worker(struct cache *cache)
 251 {
 252         queue_work(cache->wq, &cache->worker);
 253 }
 254
 255 /*----------------------------------------------------------------*/
 256
 257 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
 258 {
 259         /* FIXME: change to use a local slab. */
 260         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
 261 }
 262
 263 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
 264 {
 265         dm_bio_prison_free_cell(cache->prison, cell);
 266 }
 267
 268 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
 269 {
 270         if (!p->mg) {
 271                 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
 272                 if (!p->mg)
 273                         return -ENOMEM;
 274         }
 275
 276         if (!p->cell1) {
 277                 p->cell1 = alloc_prison_cell(cache);
 278                 if (!p->cell1)
 279                         return -ENOMEM;
 280         }
 281
 282         if (!p->cell2) {
 283                 p->cell2 = alloc_prison_cell(cache);
 284                 if (!p->cell2)
 285                         return -ENOMEM;
 286         }
 287
 288         return 0;
 289 }
 290
 291 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
 292 {
 293         if (p->cell2)
 294                 free_prison_cell(cache, p->cell2);
 295
 296         if (p->cell1)
 297                 free_prison_cell(cache, p->cell1);
 298
 299         if (p->mg)
 300                 mempool_free(p->mg, cache->migration_pool);
 301 }
 302
 303 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
 304 {
 305         struct dm_cache_migration *mg = p->mg;
 306
 307         BUG_ON(!mg);
 308         p->mg = NULL;
 309
 310         return mg;
 311 }
 312
 313 /*
 314  * You must have a cell within the prealloc struct to return.  If not this
 315  * function will BUG() rather than returning NULL.
 316  */
 317 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
 318 {
 319         struct dm_bio_prison_cell *r = NULL;
 320
 321         if (p->cell1) {
 322                 r = p->cell1;
 323                 p->cell1 = NULL;
 324
 325         } else if (p->cell2) {
 326                 r = p->cell2;
 327                 p->cell2 = NULL;
 328         } else
 329                 BUG();
 330
 331         return r;
 332 }
 333
 334 /*
 335  * You can't have more than two cells in a prealloc struct.  BUG() will be
 336  * called if you try and overfill.
 337  */
 338 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
 339 {
 340         if (!p->cell2)
 341                 p->cell2 = cell;
 342
 343         else if (!p->cell1)
 344                 p->cell1 = cell;
 345
 346         else
 347                 BUG();
 348 }
 349
 350 /*----------------------------------------------------------------*/
 351
 352 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
 353 {
 354         key->virtual = 0;
 355         key->dev = 0;
 356         key->block = from_oblock(oblock);
 357 }
 358
 359 /*
 360  * The caller hands in a preallocated cell, and a free function for it.
 361  * The cell will be freed if there's an error, or if it wasn't used because
 362  * a cell with that key already exists.
 363  */
 364 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
 365
 366 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
 367                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
 368                       cell_free_fn free_fn, void *free_context,
 369                       struct dm_bio_prison_cell **cell_result)
 370 {
 371         int r;
 372         struct dm_cell_key key;
 373
 374         build_key(oblock, &key);
 375         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
 376         if (r)
 377                 free_fn(free_context, cell_prealloc);
 378
 379         return r;
 380 }
 381
 382 static int get_cell(struct cache *cache,
 383                     dm_oblock_t oblock,
 384                     struct prealloc *structs,
 385                     struct dm_bio_prison_cell **cell_result)
 386 {
 387         int r;
 388         struct dm_cell_key key;
 389         struct dm_bio_prison_cell *cell_prealloc;
 390
 391         cell_prealloc = prealloc_get_cell(structs);
 392
 393         build_key(oblock, &key);
 394         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
 395         if (r)
 396                 prealloc_put_cell(structs, cell_prealloc);
 397
 398         return r;
 399 }
 400
 401 /*----------------------------------------------------------------*/
 402
 403 static bool is_dirty(struct cache *cache, dm_cblock_t b)
 404 {
 405         return test_bit(from_cblock(b), cache->dirty_bitset);
 406 }
 407
 408 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
 409 {
 410         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
 411                 atomic_inc(&cache->nr_dirty);
 412                 policy_set_dirty(cache->policy, oblock);
 413         }
 414 }
 415
 416 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
 417 {
 418         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
 419                 policy_clear_dirty(cache->policy, oblock);
 420                 if (atomic_dec_return(&cache->nr_dirty) == 0)
 421                         dm_table_event(cache->ti->table);
 422         }
 423 }
 424
 425 /*----------------------------------------------------------------*/
 426
 427 static bool block_size_is_power_of_two(struct cache *cache)
 428 {
 429         return cache->sectors_per_block_shift >= 0;
 430 }
 431
 432 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
 433 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
 434 __always_inline
 435 #endif
 436 static dm_block_t block_div(dm_block_t b, uint32_t n)
 437 {
 438         do_div(b, n);
 439
 440         return b;
 441 }
 442
 443 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
 444 {
 445         uint32_t discard_blocks = cache->discard_block_size;
 446         dm_block_t b = from_oblock(oblock);
 447
 448         if (!block_size_is_power_of_two(cache))
 449                 discard_blocks = discard_blocks / cache->sectors_per_block;
 450         else
 451                 discard_blocks >>= cache->sectors_per_block_shift;
 452
 453         b = block_div(b, discard_blocks);
 454
 455         return to_dblock(b);
 456 }
 457
 458 static void set_discard(struct cache *cache, dm_dblock_t b)
 459 {
 460         unsigned long flags;
 461
 462         atomic_inc(&cache->stats.discard_count);
 463
 464         spin_lock_irqsave(&cache->lock, flags);
 465         set_bit(from_dblock(b), cache->discard_bitset);
 466         spin_unlock_irqrestore(&cache->lock, flags);
 467 }
 468
 469 static void clear_discard(struct cache *cache, dm_dblock_t b)
 470 {
 471         unsigned long flags;
 472
 473         spin_lock_irqsave(&cache->lock, flags);
 474         clear_bit(from_dblock(b), cache->discard_bitset);
 475         spin_unlock_irqrestore(&cache->lock, flags);
 476 }
 477
 478 static bool is_discarded(struct cache *cache, dm_dblock_t b)
 479 {
 480         int r;
 481         unsigned long flags;
 482
 483         spin_lock_irqsave(&cache->lock, flags);
 484         r = test_bit(from_dblock(b), cache->discard_bitset);
 485         spin_unlock_irqrestore(&cache->lock, flags);
 486
 487         return r;
 488 }
 489
 490 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
 491 {
 492         int r;
 493         unsigned long flags;
 494
 495         spin_lock_irqsave(&cache->lock, flags);
 496         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
 497                      cache->discard_bitset);
 498         spin_unlock_irqrestore(&cache->lock, flags);
 499
 500         return r;
 501 }
 502
 503 /*----------------------------------------------------------------*/
 504
 505 static void load_stats(struct cache *cache)
 506 {
 507         struct dm_cache_statistics stats;
 508
 509         dm_cache_metadata_get_stats(cache->cmd, &stats);
 510         atomic_set(&cache->stats.read_hit, stats.read_hits);
 511         atomic_set(&cache->stats.read_miss, stats.read_misses);
 512         atomic_set(&cache->stats.write_hit, stats.write_hits);
 513         atomic_set(&cache->stats.write_miss, stats.write_misses);
 514 }
 515
 516 static void save_stats(struct cache *cache)
 517 {
 518         struct dm_cache_statistics stats;
 519
 520         stats.read_hits = atomic_read(&cache->stats.read_hit);
 521         stats.read_misses = atomic_read(&cache->stats.read_miss);
 522         stats.write_hits = atomic_read(&cache->stats.write_hit);
 523         stats.write_misses = atomic_read(&cache->stats.write_miss);
 524
 525         dm_cache_metadata_set_stats(cache->cmd, &stats);
 526 }
 527
 528 /*----------------------------------------------------------------
 529  * Per bio data
 530  *--------------------------------------------------------------*/
 531
/*
 * If using writeback, leave out struct per_bio_data's writethrough fields:
 * the WB size stops just before the 'cache' member, the WT size covers the
 * whole structure.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
 537
 538 static size_t get_per_bio_data_size(struct cache *cache)
 539 {
 540         return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
 541 }
 542
 543 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
 544 {
 545         struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
 546         BUG_ON(!pb);
 547         return pb;
 548 }
 549
 550 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
 551 {
 552         struct per_bio_data *pb = get_per_bio_data(bio, data_size);
 553
 554         pb->tick = false;
 555         pb->req_nr = dm_bio_get_target_bio_nr(bio);
 556         pb->all_io_entry = NULL;
 557
 558         return pb;
 559 }
 560
 561 /*----------------------------------------------------------------
 562  * Remapping
 563  *--------------------------------------------------------------*/
 564 static void remap_to_origin(struct cache *cache, struct bio *bio)
 565 {
 566         bio->bi_bdev = cache->origin_dev->bdev;
 567 }
 568
 569 static void remap_to_cache(struct cache *cache, struct bio *bio,
 570                            dm_cblock_t cblock)
 571 {
 572         sector_t bi_sector = bio->bi_sector;
 573
 574         bio->bi_bdev = cache->cache_dev->bdev;
 575         if (!block_size_is_power_of_two(cache))
 576                 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
 577                                 sector_div(bi_sector, cache->sectors_per_block);
 578         else
 579                 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
 580                                 (bi_sector & (cache->sectors_per_block - 1));
 581 }
 582
 583 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 584 {
 585         unsigned long flags;
 586         size_t pb_data_size = get_per_bio_data_size(cache);
 587         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 588
 589         spin_lock_irqsave(&cache->lock, flags);
 590         if (cache->need_tick_bio &&
 591             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
 592                 pb->tick = true;
 593                 cache->need_tick_bio = false;
 594         }
 595         spin_unlock_irqrestore(&cache->lock, flags);
 596 }
 597
 598 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 599                                   dm_oblock_t oblock)
 600 {
 601         check_if_tick_bio_needed(cache, bio);
 602         remap_to_origin(cache, bio);
 603         if (bio_data_dir(bio) == WRITE)
 604                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 605 }
 606
 607 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
 608                                  dm_oblock_t oblock, dm_cblock_t cblock)
 609 {
 610         remap_to_cache(cache, bio, cblock);
 611         if (bio_data_dir(bio) == WRITE) {
 612                 set_dirty(cache, oblock, cblock);
 613                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 614         }
 615 }
 616
 617 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 618 {
 619         sector_t block_nr = bio->bi_sector;
 620
 621         if (!block_size_is_power_of_two(cache))
 622                 (void) sector_div(block_nr, cache->sectors_per_block);
 623         else
 624                 block_nr >>= cache->sectors_per_block_shift;
 625
 626         return to_oblock(block_nr);
 627 }
 628
 629 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
 630 {
 631         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 632 }
 633
 634 static void issue(struct cache *cache, struct bio *bio)
 635 {
 636         unsigned long flags;
 637
 638         if (!bio_triggers_commit(cache, bio)) {
 639                 generic_make_request(bio);
 640                 return;
 641         }
 642
 643         /*
 644          * Batch together any bios that trigger commits and then issue a
 645          * single commit for them in do_worker().
 646          */
 647         spin_lock_irqsave(&cache->lock, flags);
 648         cache->commit_requested = true;
 649         bio_list_add(&cache->deferred_flush_bios, bio);
 650         spin_unlock_irqrestore(&cache->lock, flags);
 651 }
 652
 653 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
 654 {
 655         unsigned long flags;
 656
 657         spin_lock_irqsave(&cache->lock, flags);
 658         bio_list_add(&cache->deferred_writethrough_bios, bio);
 659         spin_unlock_irqrestore(&cache->lock, flags);
 660
 661         wake_worker(cache);
 662 }
 663
 664 static void writethrough_endio(struct bio *bio, int err)
 665 {
 666         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
 667         bio->bi_end_io = pb->saved_bi_end_io;
 668
 669         if (err) {
 670                 bio_endio(bio, err);
 671                 return;
 672         }
 673
 674         dm_bio_restore(&pb->bio_details, bio);
 675         remap_to_cache(pb->cache, bio, pb->cblock);
 676
 677         /*
 678          * We can't issue this bio directly, since we're in interrupt
 679          * context.  So it gets put on a bio list for processing by the
 680          * worker thread.
 681          */
 682         defer_writethrough_bio(pb->cache, bio);
 683 }
 684
 685 /*
 686  * When running in writethrough mode we need to send writes to clean blocks
 687  * to both the cache and origin devices.  In future we'd like to clone the
 688  * bio and send them in parallel, but for now we're doing them in
 689  * series as this is easier.
 690  */
 691 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
 692                                        dm_oblock_t oblock, dm_cblock_t cblock)
 693 {
 694         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
 695
 696         pb->cache = cache;
 697         pb->cblock = cblock;
 698         pb->saved_bi_end_io = bio->bi_end_io;
 699         dm_bio_record(&pb->bio_details, bio);
 700         bio->bi_end_io = writethrough_endio;
 701
 702         remap_to_origin_clear_discard(pb->cache, bio, oblock);
 703 }
 704
 705 /*----------------------------------------------------------------
 706  * Migration processing
 707  *
 708  * Migration covers moving data from the origin device to the cache, or
 709  * vice versa.
 710  *--------------------------------------------------------------*/
 711 static void free_migration(struct dm_cache_migration *mg)
 712 {
 713         mempool_free(mg, mg->cache->migration_pool);
 714 }
 715
 716 static void inc_nr_migrations(struct cache *cache)
 717 {
 718         atomic_inc(&cache->nr_migrations);
 719 }
 720
 721 static void dec_nr_migrations(struct cache *cache)
 722 {
 723         atomic_dec(&cache->nr_migrations);
 724
 725         /*
 726          * Wake the worker in case we're suspending the target.
 727          */
 728         wake_up(&cache->migration_wait);
 729 }
 730
 731 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
 732                          bool holder)
 733 {
 734         (holder ? dm_cell_release : dm_cell_release_no_holder)
 735                 (cache->prison, cell, &cache->deferred_bios);
 736         free_prison_cell(cache, cell);
 737 }
 738
 739 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
 740                        bool holder)
 741 {
 742         unsigned long flags;
 743
 744         spin_lock_irqsave(&cache->lock, flags);
 745         __cell_defer(cache, cell, holder);
 746         spin_unlock_irqrestore(&cache->lock, flags);
 747
 748         wake_worker(cache);
 749 }
 750
 751 static void cleanup_migration(struct dm_cache_migration *mg)
 752 {
 753         struct cache *cache = mg->cache;
 754         free_migration(mg);
 755         dec_nr_migrations(cache);
 756 }
 757
 758 static void migration_failure(struct dm_cache_migration *mg)
 759 {
 760         struct cache *cache = mg->cache;
 761
 762         if (mg->writeback) {
 763                 DMWARN_LIMIT("writeback failed; couldn't copy block");
 764                 set_dirty(cache, mg->old_oblock, mg->cblock);
 765                 cell_defer(cache, mg->old_ocell, false);
 766
 767         } else if (mg->demote) {
 768                 DMWARN_LIMIT("demotion failed; couldn't copy block");
 769                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
 770
 771                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
 772                 if (mg->promote)
 773                         cell_defer(cache, mg->new_ocell, 1);
 774         } else {
 775                 DMWARN_LIMIT("promotion failed; couldn't copy block");
 776                 policy_remove_mapping(cache->policy, mg->new_oblock);
 777                 cell_defer(cache, mg->new_ocell, 1);
 778         }
 779
 780         cleanup_migration(mg);
 781 }
 782
 783 static void migration_success_pre_commit(struct dm_cache_migration *mg)
 784 {
 785         unsigned long flags;
 786         struct cache *cache = mg->cache;
 787
 788         if (mg->writeback) {
 789                 cell_defer(cache, mg->old_ocell, false);
 790                 clear_dirty(cache, mg->old_oblock, mg->cblock);
 791                 cleanup_migration(mg);
 792                 return;
 793
 794         } else if (mg->demote) {
 795                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
 796                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
 797                         policy_force_mapping(cache->policy, mg->new_oblock,
 798                                              mg->old_oblock);
 799                         if (mg->promote)
 800                                 cell_defer(cache, mg->new_ocell, true);
 801                         cleanup_migration(mg);
 802                         return;
 803                 }
 804         } else {
 805                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
 806                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
 807                         policy_remove_mapping(cache->policy, mg->new_oblock);
 808                         cleanup_migration(mg);
 809                         return;
 810                 }
 811         }
 812
 813         spin_lock_irqsave(&cache->lock, flags);
 814         list_add_tail(&mg->list, &cache->need_commit_migrations);
 815         cache->commit_requested = true;
 816         spin_unlock_irqrestore(&cache->lock, flags);
 817 }
 818
 819 static void migration_success_post_commit(struct dm_cache_migration *mg)
 820 {
 821         unsigned long flags;
 822         struct cache *cache = mg->cache;
 823
 824         if (mg->writeback) {
 825                 DMWARN("writeback unexpectedly triggered commit");
 826                 return;
 827
 828         } else if (mg->demote) {
 829                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
 830
 831                 if (mg->promote) {
 832                         mg->demote = false;
 833
 834                         spin_lock_irqsave(&cache->lock, flags);
 835                         list_add_tail(&mg->list, &cache->quiesced_migrations);
 836                         spin_unlock_irqrestore(&cache->lock, flags);
 837
 838                 } else
 839                         cleanup_migration(mg);
 840
 841         } else {
 842                 cell_defer(cache, mg->new_ocell, true);
 843                 clear_dirty(cache, mg->new_oblock, mg->cblock);
 844                 cleanup_migration(mg);
 845         }
 846 }
 847
 848 static void copy_complete(int read_err, unsigned long write_err, void *context)
 849 {
 850         unsigned long flags;
 851         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
 852         struct cache *cache = mg->cache;
 853
 854         if (read_err || write_err)
 855                 mg->err = true;
 856
 857         spin_lock_irqsave(&cache->lock, flags);
 858         list_add_tail(&mg->list, &cache->completed_migrations);
 859         spin_unlock_irqrestore(&cache->lock, flags);
 860
 861         wake_worker(cache);
 862 }
 863
 864 static void issue_copy_real(struct dm_cache_migration *mg)
 865 {
 866         int r;
 867         struct dm_io_region o_region, c_region;
 868         struct cache *cache = mg->cache;
 869         sector_t cblock = from_cblock(mg->cblock);
 870
 871         o_region.bdev = cache->origin_dev->bdev;
 872         o_region.count = cache->sectors_per_block;
 873
 874         c_region.bdev = cache->cache_dev->bdev;
 875         c_region.sector = cblock * cache->sectors_per_block;
 876         c_region.count = cache->sectors_per_block;
 877
 878         if (mg->writeback || mg->demote) {
 879                 /* demote */
 880                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
 881                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
 882         } else {
 883                 /* promote */
 884                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
 885                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
 886         }
 887
 888         if (r < 0)
 889                 migration_failure(mg);
 890 }
 891
 892 static void avoid_copy(struct dm_cache_migration *mg)
 893 {
 894         atomic_inc(&mg->cache->stats.copies_avoided);
 895         migration_success_pre_commit(mg);
 896 }
 897
 898 static void issue_copy(struct dm_cache_migration *mg)
 899 {
 900         bool avoid;
 901         struct cache *cache = mg->cache;
 902
 903         if (mg->writeback || mg->demote)
 904                 avoid = !is_dirty(cache, mg->cblock) ||
 905                         is_discarded_oblock(cache, mg->old_oblock);
 906         else
 907                 avoid = is_discarded_oblock(cache, mg->new_oblock);
 908
 909         avoid ? avoid_copy(mg) : issue_copy_real(mg);
 910 }
 911
 912 static void complete_migration(struct dm_cache_migration *mg)
 913 {
 914         if (mg->err)
 915                 migration_failure(mg);
 916         else
 917                 migration_success_pre_commit(mg);
 918 }
 919
 920 static void process_migrations(struct cache *cache, struct list_head *head,
 921                                void (*fn)(struct dm_cache_migration *))
 922 {
 923         unsigned long flags;
 924         struct list_head list;
 925         struct dm_cache_migration *mg, *tmp;
 926
 927         INIT_LIST_HEAD(&list);
 928         spin_lock_irqsave(&cache->lock, flags);
 929         list_splice_init(head, &list);
 930         spin_unlock_irqrestore(&cache->lock, flags);
 931
 932         list_for_each_entry_safe(mg, tmp, &list, list)
 933                 fn(mg);
 934 }
 935
 936 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
 937 {
 938         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
 939 }
 940
 941 static void queue_quiesced_migration(struct dm_cache_migration *mg)
 942 {
 943         unsigned long flags;
 944         struct cache *cache = mg->cache;
 945
 946         spin_lock_irqsave(&cache->lock, flags);
 947         __queue_quiesced_migration(mg);
 948         spin_unlock_irqrestore(&cache->lock, flags);
 949
 950         wake_worker(cache);
 951 }
 952
 953 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
 954 {
 955         unsigned long flags;
 956         struct dm_cache_migration *mg, *tmp;
 957
 958         spin_lock_irqsave(&cache->lock, flags);
 959         list_for_each_entry_safe(mg, tmp, work, list)
 960                 __queue_quiesced_migration(mg);
 961         spin_unlock_irqrestore(&cache->lock, flags);
 962
 963         wake_worker(cache);
 964 }
 965
 966 static void check_for_quiesced_migrations(struct cache *cache,
 967                                           struct per_bio_data *pb)
 968 {
 969         struct list_head work;
 970
 971         if (!pb->all_io_entry)
 972                 return;
 973
 974         INIT_LIST_HEAD(&work);
 975         if (pb->all_io_entry)
 976                 dm_deferred_entry_dec(pb->all_io_entry, &work);
 977
 978         if (!list_empty(&work))
 979                 queue_quiesced_migrations(cache, &work);
 980 }
 981
 982 static void quiesce_migration(struct dm_cache_migration *mg)
 983 {
 984         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
 985                 queue_quiesced_migration(mg);
 986 }
 987
 988 static void promote(struct cache *cache, struct prealloc *structs,
 989                     dm_oblock_t oblock, dm_cblock_t cblock,
 990                     struct dm_bio_prison_cell *cell)
 991 {
 992         struct dm_cache_migration *mg = prealloc_get_migration(structs);
 993
 994         mg->err = false;
 995         mg->writeback = false;
 996         mg->demote = false;
 997         mg->promote = true;
 998         mg->cache = cache;
 999         mg->new_oblock = oblock;
1000         mg->cblock = cblock;
1001         mg->old_ocell = NULL;
1002         mg->new_ocell = cell;
1003         mg->start_jiffies = jiffies;
1004
1005         inc_nr_migrations(cache);
1006         quiesce_migration(mg);
1007 }
1008
1009 static void writeback(struct cache *cache, struct prealloc *structs,
1010                       dm_oblock_t oblock, dm_cblock_t cblock,
1011                       struct dm_bio_prison_cell *cell)
1012 {
1013         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1014
1015         mg->err = false;
1016         mg->writeback = true;
1017         mg->demote = false;
1018         mg->promote = false;
1019         mg->cache = cache;
1020         mg->old_oblock = oblock;
1021         mg->cblock = cblock;
1022         mg->old_ocell = cell;
1023         mg->new_ocell = NULL;
1024         mg->start_jiffies = jiffies;
1025
1026         inc_nr_migrations(cache);
1027         quiesce_migration(mg);
1028 }
1029
1030 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1031                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1032                                 dm_cblock_t cblock,
1033                                 struct dm_bio_prison_cell *old_ocell,
1034                                 struct dm_bio_prison_cell *new_ocell)
1035 {
1036         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1037
1038         mg->err = false;
1039         mg->writeback = false;
1040         mg->demote = true;
1041         mg->promote = true;
1042         mg->cache = cache;
1043         mg->old_oblock = old_oblock;
1044         mg->new_oblock = new_oblock;
1045         mg->cblock = cblock;
1046         mg->old_ocell = old_ocell;
1047         mg->new_ocell = new_ocell;
1048         mg->start_jiffies = jiffies;
1049
1050         inc_nr_migrations(cache);
1051         quiesce_migration(mg);
1052 }
1053
1054 /*----------------------------------------------------------------
1055  * bio processing
1056  *--------------------------------------------------------------*/
1057 static void defer_bio(struct cache *cache, struct bio *bio)
1058 {
1059         unsigned long flags;
1060
1061         spin_lock_irqsave(&cache->lock, flags);
1062         bio_list_add(&cache->deferred_bios, bio);
1063         spin_unlock_irqrestore(&cache->lock, flags);
1064
1065         wake_worker(cache);
1066 }
1067
1068 static void process_flush_bio(struct cache *cache, struct bio *bio)
1069 {
1070         size_t pb_data_size = get_per_bio_data_size(cache);
1071         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1072
1073         BUG_ON(bio->bi_size);
1074         if (!pb->req_nr)
1075                 remap_to_origin(cache, bio);
1076         else
1077                 remap_to_cache(cache, bio, 0);
1078
1079         issue(cache, bio);
1080 }
1081
1082 /*
1083  * People generally discard large parts of a device, eg, the whole device
1084  * when formatting.  Splitting these large discards up into cache block
1085  * sized ios and then quiescing (always neccessary for discard) takes too
1086  * long.
1087  *
1088  * We keep it simple, and allow any size of discard to come in, and just
1089  * mark off blocks on the discard bitset.  No passdown occurs!
1090  *
1091  * To implement passdown we need to change the bio_prison such that a cell
1092  * can have a key that spans many blocks.
1093  */
1094 static void process_discard_bio(struct cache *cache, struct bio *bio)
1095 {
1096         dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1097                                                   cache->discard_block_size);
1098         dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1099         dm_block_t b;
1100
1101         end_block = block_div(end_block, cache->discard_block_size);
1102
1103         for (b = start_block; b < end_block; b++)
1104                 set_discard(cache, to_dblock(b));
1105
1106         bio_endio(bio, 0);
1107 }
1108
1109 static bool spare_migration_bandwidth(struct cache *cache)
1110 {
1111         sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1112                 cache->sectors_per_block;
1113         return current_volume < cache->migration_threshold;
1114 }
1115
1116 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1117                                dm_cblock_t cblock)
1118 {
1119         return bio_data_dir(bio) == WRITE &&
1120                 cache->features.write_through && !is_dirty(cache, cblock);
1121 }
1122
1123 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1124 {
1125         atomic_inc(bio_data_dir(bio) == READ ?
1126                    &cache->stats.read_hit : &cache->stats.write_hit);
1127 }
1128
1129 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1130 {
1131         atomic_inc(bio_data_dir(bio) == READ ?
1132                    &cache->stats.read_miss : &cache->stats.write_miss);
1133 }
1134
1135 static void process_bio(struct cache *cache, struct prealloc *structs,
1136                         struct bio *bio)
1137 {
1138         int r;
1139         bool release_cell = true;
1140         dm_oblock_t block = get_bio_block(cache, bio);
1141         struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1142         struct policy_result lookup_result;
1143         size_t pb_data_size = get_per_bio_data_size(cache);
1144         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1145         bool discarded_block = is_discarded_oblock(cache, block);
1146         bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1147
1148         /*
1149          * Check to see if that block is currently migrating.
1150          */
1151         cell_prealloc = prealloc_get_cell(structs);
1152         r = bio_detain(cache, block, bio, cell_prealloc,
1153                        (cell_free_fn) prealloc_put_cell,
1154                        structs, &new_ocell);
1155         if (r > 0)
1156                 return;
1157
1158         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1159                        bio, &lookup_result);
1160
1161         if (r == -EWOULDBLOCK)
1162                 /* migration has been denied */
1163                 lookup_result.op = POLICY_MISS;
1164
1165         switch (lookup_result.op) {
1166         case POLICY_HIT:
1167                 inc_hit_counter(cache, bio);
1168                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1169
1170                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
1171                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1172                 else
1173                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1174
1175                 issue(cache, bio);
1176                 break;
1177
1178         case POLICY_MISS:
1179                 inc_miss_counter(cache, bio);
1180                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1181                 remap_to_origin_clear_discard(cache, bio, block);
1182                 issue(cache, bio);
1183                 break;
1184
1185         case POLICY_NEW:
1186                 atomic_inc(&cache->stats.promotion);
1187                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1188                 release_cell = false;
1189                 break;
1190
1191         case POLICY_REPLACE:
1192                 cell_prealloc = prealloc_get_cell(structs);
1193                 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1194                                (cell_free_fn) prealloc_put_cell,
1195                                structs, &old_ocell);
1196                 if (r > 0) {
1197                         /*
1198                          * We have to be careful to avoid lock inversion of
1199                          * the cells.  So we back off, and wait for the
1200                          * old_ocell to become free.
1201                          */
1202                         policy_force_mapping(cache->policy, block,
1203                                              lookup_result.old_oblock);
1204                         atomic_inc(&cache->stats.cache_cell_clash);
1205                         break;
1206                 }
1207                 atomic_inc(&cache->stats.demotion);
1208                 atomic_inc(&cache->stats.promotion);
1209
1210                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1211                                     block, lookup_result.cblock,
1212                                     old_ocell, new_ocell);
1213                 release_cell = false;
1214                 break;
1215
1216         default:
1217                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1218                             (unsigned) lookup_result.op);
1219                 bio_io_error(bio);
1220         }
1221
1222         if (release_cell)
1223                 cell_defer(cache, new_ocell, false);
1224 }
1225
1226 static int need_commit_due_to_time(struct cache *cache)
1227 {
1228         return jiffies < cache->last_commit_jiffies ||
1229                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1230 }
1231
1232 static int commit_if_needed(struct cache *cache)
1233 {
1234         if (dm_cache_changed_this_transaction(cache->cmd) &&
1235             (cache->commit_requested || need_commit_due_to_time(cache))) {
1236                 atomic_inc(&cache->stats.commit_count);
1237                 cache->last_commit_jiffies = jiffies;
1238                 cache->commit_requested = false;
1239                 return dm_cache_commit(cache->cmd, false);
1240         }
1241
1242         return 0;
1243 }
1244
1245 static void process_deferred_bios(struct cache *cache)
1246 {
1247         unsigned long flags;
1248         struct bio_list bios;
1249         struct bio *bio;
1250         struct prealloc structs;
1251
1252         memset(&structs, 0, sizeof(structs));
1253         bio_list_init(&bios);
1254
1255         spin_lock_irqsave(&cache->lock, flags);
1256         bio_list_merge(&bios, &cache->deferred_bios);
1257         bio_list_init(&cache->deferred_bios);
1258         spin_unlock_irqrestore(&cache->lock, flags);
1259
1260         while (!bio_list_empty(&bios)) {
1261                 /*
1262                  * If we've got no free migration structs, and processing
1263                  * this bio might require one, we pause until there are some
1264                  * prepared mappings to process.
1265                  */
1266                 if (prealloc_data_structs(cache, &structs)) {
1267                         spin_lock_irqsave(&cache->lock, flags);
1268                         bio_list_merge(&cache->deferred_bios, &bios);
1269                         spin_unlock_irqrestore(&cache->lock, flags);
1270                         break;
1271                 }
1272
1273                 bio = bio_list_pop(&bios);
1274
1275                 if (bio->bi_rw & REQ_FLUSH)
1276                         process_flush_bio(cache, bio);
1277                 else if (bio->bi_rw & REQ_DISCARD)
1278                         process_discard_bio(cache, bio);
1279                 else
1280                         process_bio(cache, &structs, bio);
1281         }
1282
1283         prealloc_free_structs(cache, &structs);
1284 }
1285
1286 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1287 {
1288         unsigned long flags;
1289         struct bio_list bios;
1290         struct bio *bio;
1291
1292         bio_list_init(&bios);
1293
1294         spin_lock_irqsave(&cache->lock, flags);
1295         bio_list_merge(&bios, &cache->deferred_flush_bios);
1296         bio_list_init(&cache->deferred_flush_bios);
1297         spin_unlock_irqrestore(&cache->lock, flags);
1298
1299         while ((bio = bio_list_pop(&bios)))
1300                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1301 }
1302
1303 static void process_deferred_writethrough_bios(struct cache *cache)
1304 {
1305         unsigned long flags;
1306         struct bio_list bios;
1307         struct bio *bio;
1308
1309         bio_list_init(&bios);
1310
1311         spin_lock_irqsave(&cache->lock, flags);
1312         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1313         bio_list_init(&cache->deferred_writethrough_bios);
1314         spin_unlock_irqrestore(&cache->lock, flags);
1315
1316         while ((bio = bio_list_pop(&bios)))
1317                 generic_make_request(bio);
1318 }
1319
1320 static void writeback_some_dirty_blocks(struct cache *cache)
1321 {
1322         int r = 0;
1323         dm_oblock_t oblock;
1324         dm_cblock_t cblock;
1325         struct prealloc structs;
1326         struct dm_bio_prison_cell *old_ocell;
1327
1328         memset(&structs, 0, sizeof(structs));
1329
1330         while (spare_migration_bandwidth(cache)) {
1331                 if (prealloc_data_structs(cache, &structs))
1332                         break;
1333
1334                 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1335                 if (r)
1336                         break;
1337
1338                 r = get_cell(cache, oblock, &structs, &old_ocell);
1339                 if (r) {
1340                         policy_set_dirty(cache->policy, oblock);
1341                         break;
1342                 }
1343
1344                 writeback(cache, &structs, oblock, cblock, old_ocell);
1345         }
1346
1347         prealloc_free_structs(cache, &structs);
1348 }
1349
1350 /*----------------------------------------------------------------
1351  * Main worker loop
1352  *--------------------------------------------------------------*/
1353 static bool is_quiescing(struct cache *cache)
1354 {
1355         int r;
1356         unsigned long flags;
1357
1358         spin_lock_irqsave(&cache->lock, flags);
1359         r = cache->quiescing;
1360         spin_unlock_irqrestore(&cache->lock, flags);
1361
1362         return r;
1363 }
1364
1365 static void ack_quiescing(struct cache *cache)
1366 {
1367         if (is_quiescing(cache)) {
1368                 atomic_inc(&cache->quiescing_ack);
1369                 wake_up(&cache->quiescing_wait);
1370         }
1371 }
1372
1373 static void wait_for_quiescing_ack(struct cache *cache)
1374 {
1375         wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1376 }
1377
1378 static void start_quiescing(struct cache *cache)
1379 {
1380         unsigned long flags;
1381
1382         spin_lock_irqsave(&cache->lock, flags);
1383         cache->quiescing = true;
1384         spin_unlock_irqrestore(&cache->lock, flags);
1385
1386         wait_for_quiescing_ack(cache);
1387 }
1388
1389 static void stop_quiescing(struct cache *cache)
1390 {
1391         unsigned long flags;
1392
1393         spin_lock_irqsave(&cache->lock, flags);
1394         cache->quiescing = false;
1395         spin_unlock_irqrestore(&cache->lock, flags);
1396
1397         atomic_set(&cache->quiescing_ack, 0);
1398 }
1399
1400 static void wait_for_migrations(struct cache *cache)
1401 {
1402         wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1403 }
1404
1405 static void stop_worker(struct cache *cache)
1406 {
1407         cancel_delayed_work(&cache->waker);
1408         flush_workqueue(cache->wq);
1409 }
1410
1411 static void requeue_deferred_io(struct cache *cache)
1412 {
1413         struct bio *bio;
1414         struct bio_list bios;
1415
1416         bio_list_init(&bios);
1417         bio_list_merge(&bios, &cache->deferred_bios);
1418         bio_list_init(&cache->deferred_bios);
1419
1420         while ((bio = bio_list_pop(&bios)))
1421                 bio_endio(bio, DM_ENDIO_REQUEUE);
1422 }
1423
1424 static int more_work(struct cache *cache)
1425 {
1426         if (is_quiescing(cache))
1427                 return !list_empty(&cache->quiesced_migrations) ||
1428                         !list_empty(&cache->completed_migrations) ||
1429                         !list_empty(&cache->need_commit_migrations);
1430         else
1431                 return !bio_list_empty(&cache->deferred_bios) ||
1432                         !bio_list_empty(&cache->deferred_flush_bios) ||
1433                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
1434                         !list_empty(&cache->quiesced_migrations) ||
1435                         !list_empty(&cache->completed_migrations) ||
1436                         !list_empty(&cache->need_commit_migrations);
1437 }
1438
1439 static void do_worker(struct work_struct *ws)
1440 {
1441         struct cache *cache = container_of(ws, struct cache, worker);
1442
1443         do {
1444                 if (!is_quiescing(cache)) {
1445                         writeback_some_dirty_blocks(cache);
1446                         process_deferred_writethrough_bios(cache);
1447                         process_deferred_bios(cache);
1448                 }
1449
1450                 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1451                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1452
1453                 if (commit_if_needed(cache)) {
1454                         process_deferred_flush_bios(cache, false);
1455
1456                         /*
1457                          * FIXME: rollback metadata or just go into a
1458                          * failure mode and error everything
1459                          */
1460                 } else {
1461                         process_deferred_flush_bios(cache, true);
1462                         process_migrations(cache, &cache->need_commit_migrations,
1463                                            migration_success_post_commit);
1464                 }
1465
1466                 ack_quiescing(cache);
1467
1468         } while (more_work(cache));
1469 }
1470
1471 /*
1472  * We want to commit periodically so that not too much
1473  * unwritten metadata builds up.
1474  */
1475 static void do_waker(struct work_struct *ws)
1476 {
1477         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1478         policy_tick(cache->policy);
1479         wake_worker(cache);
1480         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1481 }
1482
1483 /*----------------------------------------------------------------*/
1484
1485 static int is_congested(struct dm_dev *dev, int bdi_bits)
1486 {
1487         struct request_queue *q = bdev_get_queue(dev->bdev);
1488         return bdi_congested(&q->backing_dev_info, bdi_bits);
1489 }
1490
1491 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1492 {
1493         struct cache *cache = container_of(cb, struct cache, callbacks);
1494
1495         return is_congested(cache->origin_dev, bdi_bits) ||
1496                 is_congested(cache->cache_dev, bdi_bits);
1497 }
1498
1499 /*----------------------------------------------------------------
1500  * Target methods
1501  *--------------------------------------------------------------*/
1502
1503 /*
1504  * This function gets called on the error paths of the constructor, so we
1505  * have to cope with a partially initialised struct.
1506  */
1507 static void destroy(struct cache *cache)
1508 {
1509         unsigned i;
1510
1511         if (cache->next_migration)
1512                 mempool_free(cache->next_migration, cache->migration_pool);
1513
1514         if (cache->migration_pool)
1515                 mempool_destroy(cache->migration_pool);
1516
1517         if (cache->all_io_ds)
1518                 dm_deferred_set_destroy(cache->all_io_ds);
1519
1520         if (cache->prison)
1521                 dm_bio_prison_destroy(cache->prison);
1522
1523         if (cache->wq)
1524                 destroy_workqueue(cache->wq);
1525
1526         if (cache->dirty_bitset)
1527                 free_bitset(cache->dirty_bitset);
1528
1529         if (cache->discard_bitset)
1530                 free_bitset(cache->discard_bitset);
1531
1532         if (cache->copier)
1533                 dm_kcopyd_client_destroy(cache->copier);
1534
1535         if (cache->cmd)
1536                 dm_cache_metadata_close(cache->cmd);
1537
1538         if (cache->metadata_dev)
1539                 dm_put_device(cache->ti, cache->metadata_dev);
1540
1541         if (cache->origin_dev)
1542                 dm_put_device(cache->ti, cache->origin_dev);
1543
1544         if (cache->cache_dev)
1545                 dm_put_device(cache->ti, cache->cache_dev);
1546
1547         if (cache->policy)
1548                 dm_cache_policy_destroy(cache->policy);
1549
1550         for (i = 0; i < cache->nr_ctr_args ; i++)
1551                 kfree(cache->ctr_args[i]);
1552         kfree(cache->ctr_args);
1553
1554         kfree(cache);
1555 }
1556
1557 static void cache_dtr(struct dm_target *ti)
1558 {
1559         struct cache *cache = ti->private;
1560
1561         destroy(cache);
1562 }
1563
1564 static sector_t get_dev_size(struct dm_dev *dev)
1565 {
1566         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1567 }
1568
1569 /*----------------------------------------------------------------*/
1570
/*
 * Construct a cache device mapping.
 *
 * cache <metadata dev> <cache dev> <origin dev> <block size>
 *       <#feature args> [<feature arg>]*
 *       <policy> <#policy args> [<policy arg>]*
 *
 * metadata dev    : fast device holding the persistent metadata
 * cache dev       : fast device holding cached data blocks
 * origin dev      : slow device holding original data blocks
 * block size      : cache unit size in sectors
 *
 * #feature args   : number of feature arguments passed
 * feature args    : writethrough.  (The default is writeback.)
 *
 * policy          : the replacement policy to use
 * #policy args    : an even number of policy arguments corresponding
 *                   to key/value pairs passed to the policy
 * policy args     : key/value pairs passed to the policy
 *                   E.g. 'sequential_threshold 1024'
 *                   See cache-policies.txt for details.
 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *                   content from being different from origin block content.
 *                   Without this argument, the default behaviour is to write
 *                   back cache block contents later for performance reasons,
 *                   so they may differ from the corresponding origin blocks.
 */
/*
 * Parsed form of the constructor arguments described above, filled in
 * field by field by the parse_*() helpers.
 */
struct cache_args {
	/* Target the devices are acquired against (dm_get_device/dm_put_device). */
	struct dm_target *ti;

	/* Opened by parse_metadata_dev(). */
	struct dm_dev *metadata_dev;

	/* Opened by parse_cache_dev(); size in sectors from get_dev_size(). */
	struct dm_dev *cache_dev;
	sector_t cache_sectors;

	/* Opened by parse_origin_dev(); size in sectors from get_dev_size(). */
	struct dm_dev *origin_dev;
	sector_t origin_sectors;

	/* Cache block size in sectors, validated by parse_block_size(). */
	uint32_t block_size;

	/*
	 * Set by parse_policy(): the name and argument vector point into
	 * the caller's argv; they are not copied.
	 */
	const char *policy_name;
	int policy_argc;
	const char **policy_argv;

	/* Defaults from init_features(), adjusted by parse_features(). */
	struct cache_features features;
};
1619
1620 static void destroy_cache_args(struct cache_args *ca)
1621 {
1622         if (ca->metadata_dev)
1623                 dm_put_device(ca->ti, ca->metadata_dev);
1624
1625         if (ca->cache_dev)
1626                 dm_put_device(ca->ti, ca->cache_dev);
1627
1628         if (ca->origin_dev)
1629                 dm_put_device(ca->ti, ca->origin_dev);
1630
1631         kfree(ca);
1632 }
1633
1634 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1635 {
1636         if (!as->argc) {
1637                 *error = "Insufficient args";
1638                 return false;
1639         }
1640
1641         return true;
1642 }
1643
1644 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1645                               char **error)
1646 {
1647         int r;
1648         sector_t metadata_dev_size;
1649         char b[BDEVNAME_SIZE];
1650
1651         if (!at_least_one_arg(as, error))
1652                 return -EINVAL;
1653
1654         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1655                           &ca->metadata_dev);
1656         if (r) {
1657                 *error = "Error opening metadata device";
1658                 return r;
1659         }
1660
1661         metadata_dev_size = get_dev_size(ca->metadata_dev);
1662         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1663                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1664                        bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1665
1666         return 0;
1667 }
1668
1669 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1670                            char **error)
1671 {
1672         int r;
1673
1674         if (!at_least_one_arg(as, error))
1675                 return -EINVAL;
1676
1677         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1678                           &ca->cache_dev);
1679         if (r) {
1680                 *error = "Error opening cache device";
1681                 return r;
1682         }
1683         ca->cache_sectors = get_dev_size(ca->cache_dev);
1684
1685         return 0;
1686 }
1687
1688 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1689                             char **error)
1690 {
1691         int r;
1692
1693         if (!at_least_one_arg(as, error))
1694                 return -EINVAL;
1695
1696         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1697                           &ca->origin_dev);
1698         if (r) {
1699                 *error = "Error opening origin device";
1700                 return r;
1701         }
1702
1703         ca->origin_sectors = get_dev_size(ca->origin_dev);
1704         if (ca->ti->len > ca->origin_sectors) {
1705                 *error = "Device size larger than cached device";
1706                 return -EINVAL;
1707         }
1708
1709         return 0;
1710 }
1711
1712 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1713                             char **error)
1714 {
1715         unsigned long block_size;
1716
1717         if (!at_least_one_arg(as, error))
1718                 return -EINVAL;
1719
1720         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
1721             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1722             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1723             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1724                 *error = "Invalid data block size";
1725                 return -EINVAL;
1726         }
1727
1728         if (block_size > ca->cache_sectors) {
1729                 *error = "Data block size is larger than the cache device";
1730                 return -EINVAL;
1731         }
1732
1733         ca->block_size = block_size;
1734
1735         return 0;
1736 }
1737
1738 static void init_features(struct cache_features *cf)
1739 {
1740         cf->mode = CM_WRITE;
1741         cf->write_through = false;
1742 }
1743
1744 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1745                           char **error)
1746 {
1747         static struct dm_arg _args[] = {
1748                 {0, 1, "Invalid number of cache feature arguments"},
1749         };
1750
1751         int r;
1752         unsigned argc;
1753         const char *arg;
1754         struct cache_features *cf = &ca->features;
1755
1756         init_features(cf);
1757
1758         r = dm_read_arg_group(_args, as, &argc, error);
1759         if (r)
1760                 return -EINVAL;
1761
1762         while (argc--) {
1763                 arg = dm_shift_arg(as);
1764
1765                 if (!strcasecmp(arg, "writeback"))
1766                         cf->write_through = false;
1767
1768                 else if (!strcasecmp(arg, "writethrough"))
1769                         cf->write_through = true;
1770
1771                 else {
1772                         *error = "Unrecognised cache feature requested";
1773                         return -EINVAL;
1774                 }
1775         }
1776
1777         return 0;
1778 }
1779
1780 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1781                         char **error)
1782 {
1783         static struct dm_arg _args[] = {
1784                 {0, 1024, "Invalid number of policy arguments"},
1785         };
1786
1787         int r;
1788
1789         if (!at_least_one_arg(as, error))
1790                 return -EINVAL;
1791
1792         ca->policy_name = dm_shift_arg(as);
1793
1794         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1795         if (r)
1796                 return -EINVAL;
1797
1798         ca->policy_argv = (const char **)as->argv;
1799         dm_consume_args(as, ca->policy_argc);
1800
1801         return 0;
1802 }
1803
1804 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1805                             char **error)
1806 {
1807         int r;
1808         struct dm_arg_set as;
1809
1810         as.argc = argc;
1811         as.argv = argv;
1812
1813         r = parse_metadata_dev(ca, &as, error);
1814         if (r)
1815                 return r;
1816
1817         r = parse_cache_dev(ca, &as, error);
1818         if (r)
1819                 return r;
1820
1821         r = parse_origin_dev(ca, &as, error);
1822         if (r)
1823                 return r;
1824
1825         r = parse_block_size(ca, &as, error);
1826         if (r)
1827                 return r;
1828
1829         r = parse_features(ca, &as, error);
1830         if (r)
1831                 return r;
1832
1833         r = parse_policy(ca, &as, error);
1834         if (r)
1835                 return r;
1836
1837         return 0;
1838 }
1839
1840 /*----------------------------------------------------------------*/
1841
1842 static struct kmem_cache *migration_cache;
1843
1844 #define NOT_CORE_OPTION 1
1845
1846 static int process_config_option(struct cache *cache, const char *key, const char *value)
1847 {
1848         unsigned long tmp;
1849
1850         if (!strcasecmp(key, "migration_threshold")) {
1851                 if (kstrtoul(value, 10, &tmp))
1852                         return -EINVAL;
1853
1854                 cache->migration_threshold = tmp;
1855                 return 0;
1856         }
1857
1858         return NOT_CORE_OPTION;
1859 }
1860
1861 static int set_config_value(struct cache *cache, const char *key, const char *value)
1862 {
1863         int r = process_config_option(cache, key, value);
1864
1865         if (r == NOT_CORE_OPTION)
1866                 r = policy_set_config_value(cache->policy, key, value);
1867
1868         if (r)
1869                 DMWARN("bad config value for %s: %s", key, value);
1870
1871         return r;
1872 }
1873
1874 static int set_config_values(struct cache *cache, int argc, const char **argv)
1875 {
1876         int r = 0;
1877
1878         if (argc & 1) {
1879                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1880                 return -EINVAL;
1881         }
1882
1883         while (argc) {
1884                 r = set_config_value(cache, argv[0], argv[1]);
1885                 if (r)
1886                         break;
1887
1888                 argc -= 2;
1889                 argv += 2;
1890         }
1891
1892         return r;
1893 }
1894
1895 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1896                                char **error)
1897 {
1898         cache->policy = dm_cache_policy_create(ca->policy_name,
1899                                                cache->cache_size,
1900                                                cache->origin_sectors,
1901                                                cache->sectors_per_block);
1902         if (!cache->policy) {
1903                 *error = "Error creating cache's policy";
1904                 return -ENOMEM;
1905         }
1906
1907         return 0;
1908 }
1909
1910 #define DEFAULT_MIGRATION_THRESHOLD 2048
1911
1912 static int cache_create(struct cache_args *ca, struct cache **result)
1913 {
1914         int r = 0;
1915         char **error = &ca->ti->error;
1916         struct cache *cache;
1917         struct dm_target *ti = ca->ti;
1918         dm_block_t origin_blocks;
1919         struct dm_cache_metadata *cmd;
1920         bool may_format = ca->features.mode == CM_WRITE;
1921
1922         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1923         if (!cache)
1924                 return -ENOMEM;
1925
1926         cache->ti = ca->ti;
1927         ti->private = cache;
1928         ti->num_flush_bios = 2;
1929         ti->flush_supported = true;
1930
1931         ti->num_discard_bios = 1;
1932         ti->discards_supported = true;
1933         ti->discard_zeroes_data_unsupported = true;
1934         /* Discard bios must be split on a block boundary */
1935         ti->split_discard_bios = true;
1936
1937         cache->features = ca->features;
1938         ti->per_bio_data_size = get_per_bio_data_size(cache);
1939
1940         cache->callbacks.congested_fn = cache_is_congested;
1941         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1942
1943         cache->metadata_dev = ca->metadata_dev;
1944         cache->origin_dev = ca->origin_dev;
1945         cache->cache_dev = ca->cache_dev;
1946
1947         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1948
1949         /* FIXME: factor out this whole section */
1950         origin_blocks = cache->origin_sectors = ca->origin_sectors;
1951         origin_blocks = block_div(origin_blocks, ca->block_size);
1952         cache->origin_blocks = to_oblock(origin_blocks);
1953
1954         cache->sectors_per_block = ca->block_size;
1955         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1956                 r = -EINVAL;
1957                 goto bad;
1958         }
1959
1960         if (ca->block_size & (ca->block_size - 1)) {
1961                 dm_block_t cache_size = ca->cache_sectors;
1962
1963                 cache->sectors_per_block_shift = -1;
1964                 cache_size = block_div(cache_size, ca->block_size);
1965                 cache->cache_size = to_cblock(cache_size);
1966         } else {
1967                 cache->sectors_per_block_shift = __ffs(ca->block_size);
1968                 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1969         }
1970
1971         r = create_cache_policy(cache, ca, error);
1972         if (r)
1973                 goto bad;
1974
1975         cache->policy_nr_args = ca->policy_argc;
1976         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1977
1978         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
1979         if (r) {
1980                 *error = "Error setting cache policy's config values";
1981                 goto bad;
1982         }
1983
1984         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1985                                      ca->block_size, may_format,
1986                                      dm_cache_policy_get_hint_size(cache->policy));
1987         if (IS_ERR(cmd)) {
1988                 *error = "Error creating metadata object";
1989                 r = PTR_ERR(cmd);
1990                 goto bad;
1991         }
1992         cache->cmd = cmd;
1993
1994         spin_lock_init(&cache->lock);
1995         bio_list_init(&cache->deferred_bios);
1996         bio_list_init(&cache->deferred_flush_bios);
1997         bio_list_init(&cache->deferred_writethrough_bios);
1998         INIT_LIST_HEAD(&cache->quiesced_migrations);
1999         INIT_LIST_HEAD(&cache->completed_migrations);
2000         INIT_LIST_HEAD(&cache->need_commit_migrations);
2001         atomic_set(&cache->nr_migrations, 0);
2002         init_waitqueue_head(&cache->migration_wait);
2003
2004         init_waitqueue_head(&cache->quiescing_wait);
2005         atomic_set(&cache->quiescing_ack, 0);
2006
2007         r = -ENOMEM;
2008         atomic_set(&cache->nr_dirty, 0);
2009         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2010         if (!cache->dirty_bitset) {
2011                 *error = "could not allocate dirty bitset";
2012                 goto bad;
2013         }
2014         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2015
2016         cache->discard_block_size = cache->sectors_per_block;
2017         cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
2018         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2019         if (!cache->discard_bitset) {
2020                 *error = "could not allocate discard bitset";
2021                 goto bad;
2022         }
2023         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2024
2025         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2026         if (IS_ERR(cache->copier)) {
2027                 *error = "could not create kcopyd client";
2028                 r = PTR_ERR(cache->copier);
2029                 goto bad;
2030         }
2031
2032         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2033         if (!cache->wq) {
2034                 *error = "could not create workqueue for metadata object";
2035                 goto bad;
2036         }
2037         INIT_WORK(&cache->worker, do_worker);
2038         INIT_DELAYED_WORK(&cache->waker, do_waker);
2039         cache->last_commit_jiffies = jiffies;
2040
2041         cache->prison = dm_bio_prison_create(PRISON_CELLS);
2042         if (!cache->prison) {
2043                 *error = "could not create bio prison";
2044                 goto bad;
2045         }
2046
2047         cache->all_io_ds = dm_deferred_set_create();
2048         if (!cache->all_io_ds) {
2049                 *error = "could not create all_io deferred set";
2050                 goto bad;
2051         }
2052
2053         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2054                                                          migration_cache);
2055         if (!cache->migration_pool) {
2056                 *error = "Error creating cache's migration mempool";
2057                 goto bad;
2058         }
2059
2060         cache->next_migration = NULL;
2061
2062         cache->need_tick_bio = true;
2063         cache->sized = false;
2064         cache->quiescing = false;
2065         cache->commit_requested = false;
2066         cache->loaded_mappings = false;
2067         cache->loaded_discards = false;
2068
2069         load_stats(cache);
2070
2071         atomic_set(&cache->stats.demotion, 0);
2072         atomic_set(&cache->stats.promotion, 0);
2073         atomic_set(&cache->stats.copies_avoided, 0);
2074         atomic_set(&cache->stats.cache_cell_clash, 0);
2075         atomic_set(&cache->stats.commit_count, 0);
2076         atomic_set(&cache->stats.discard_count, 0);
2077
2078         *result = cache;
2079         return 0;
2080
2081 bad:
2082         destroy(cache);
2083         return r;
2084 }
2085
2086 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2087 {
2088         unsigned i;
2089         const char **copy;
2090
2091         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2092         if (!copy)
2093                 return -ENOMEM;
2094         for (i = 0; i < argc; i++) {
2095                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2096                 if (!copy[i]) {
2097                         while (i--)
2098                                 kfree(copy[i]);
2099                         kfree(copy);
2100                         return -ENOMEM;
2101                 }
2102         }
2103
2104         cache->nr_ctr_args = argc;
2105         cache->ctr_args = copy;
2106
2107         return 0;
2108 }
2109
2110 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2111 {
2112         int r = -EINVAL;
2113         struct cache_args *ca;
2114         struct cache *cache = NULL;
2115
2116         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2117         if (!ca) {
2118                 ti->error = "Error allocating memory for cache";
2119                 return -ENOMEM;
2120         }
2121         ca->ti = ti;
2122
2123         r = parse_cache_args(ca, argc, argv, &ti->error);
2124         if (r)
2125                 goto out;
2126
2127         r = cache_create(ca, &cache);
2128         if (r)
2129                 goto out;
2130
2131         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2132         if (r) {
2133                 destroy(cache);
2134                 goto out;
2135         }
2136
2137         ti->private = cache;
2138
2139 out:
2140         destroy_cache_args(ca);
2141         return r;
2142 }
2143
2144 static int cache_map(struct dm_target *ti, struct bio *bio)
2145 {
2146         struct cache *cache = ti->private;
2147
2148         int r;
2149         dm_oblock_t block = get_bio_block(cache, bio);
2150         size_t pb_data_size = get_per_bio_data_size(cache);
2151         bool can_migrate = false;
2152         bool discarded_block;
2153         struct dm_bio_prison_cell *cell;
2154         struct policy_result lookup_result;
2155         struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2156
2157         if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2158                 /*
2159                  * This can only occur if the io goes to a partial block at
2160                  * the end of the origin device.  We don't cache these.
2161                  * Just remap to the origin and carry on.
2162                  */
2163                 remap_to_origin(cache, bio);
2164                 return DM_MAPIO_REMAPPED;
2165         }
2166
2167         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2168                 defer_bio(cache, bio);
2169                 return DM_MAPIO_SUBMITTED;
2170         }
2171
2172         /*
2173          * Check to see if that block is currently migrating.
2174          */
2175         cell = alloc_prison_cell(cache);
2176         if (!cell) {
2177                 defer_bio(cache, bio);
2178                 return DM_MAPIO_SUBMITTED;
2179         }
2180
2181         r = bio_detain(cache, block, bio, cell,
2182                        (cell_free_fn) free_prison_cell,
2183                        cache, &cell);
2184         if (r) {
2185                 if (r < 0)
2186                         defer_bio(cache, bio);
2187
2188                 return DM_MAPIO_SUBMITTED;
2189         }
2190
2191         discarded_block = is_discarded_oblock(cache, block);
2192
2193         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2194                        bio, &lookup_result);
2195         if (r == -EWOULDBLOCK) {
2196                 cell_defer(cache, cell, true);
2197                 return DM_MAPIO_SUBMITTED;
2198
2199         } else if (r) {
2200                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2201                 bio_io_error(bio);
2202                 return DM_MAPIO_SUBMITTED;
2203         }
2204
2205         switch (lookup_result.op) {
2206         case POLICY_HIT:
2207                 inc_hit_counter(cache, bio);
2208                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2209
2210                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
2211                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2212                 else
2213                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2214
2215                 cell_defer(cache, cell, false);
2216                 break;
2217
2218         case POLICY_MISS:
2219                 inc_miss_counter(cache, bio);
2220                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2221
2222                 if (pb->req_nr != 0) {
2223                         /*
2224                          * This is a duplicate writethrough io that is no
2225                          * longer needed because the block has been demoted.
2226                          */
2227                         bio_endio(bio, 0);
2228                         cell_defer(cache, cell, false);
2229                         return DM_MAPIO_SUBMITTED;
2230                 } else {
2231                         remap_to_origin_clear_discard(cache, bio, block);
2232                         cell_defer(cache, cell, false);
2233                 }
2234                 break;
2235
2236         default:
2237                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2238                             (unsigned) lookup_result.op);
2239                 bio_io_error(bio);
2240                 return DM_MAPIO_SUBMITTED;
2241         }
2242
2243         return DM_MAPIO_REMAPPED;
2244 }
2245
2246 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2247 {
2248         struct cache *cache = ti->private;
2249         unsigned long flags;
2250         size_t pb_data_size = get_per_bio_data_size(cache);
2251         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2252
2253         if (pb->tick) {
2254                 policy_tick(cache->policy);
2255
2256                 spin_lock_irqsave(&cache->lock, flags);
2257                 cache->need_tick_bio = true;
2258                 spin_unlock_irqrestore(&cache->lock, flags);
2259         }
2260
2261         check_for_quiesced_migrations(cache, pb);
2262
2263         return 0;
2264 }
2265
2266 static int write_dirty_bitset(struct cache *cache)
2267 {
2268         unsigned i, r;
2269
2270         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2271                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2272                                        is_dirty(cache, to_cblock(i)));
2273                 if (r)
2274                         return r;
2275         }
2276
2277         return 0;
2278 }
2279
2280 static int write_discard_bitset(struct cache *cache)
2281 {
2282         unsigned i, r;
2283
2284         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2285                                            cache->discard_nr_blocks);
2286         if (r) {
2287                 DMERR("could not resize on-disk discard bitset");
2288                 return r;
2289         }
2290
2291         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2292                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2293                                          is_discarded(cache, to_dblock(i)));
2294                 if (r)
2295                         return r;
2296         }
2297
2298         return 0;
2299 }
2300
2301 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2302                      uint32_t hint)
2303 {
2304         struct cache *cache = context;
2305         return dm_cache_save_hint(cache->cmd, cblock, hint);
2306 }
2307
2308 static int write_hints(struct cache *cache)
2309 {
2310         int r;
2311
2312         r = dm_cache_begin_hints(cache->cmd, cache->policy);
2313         if (r) {
2314                 DMERR("dm_cache_begin_hints failed");
2315                 return r;
2316         }
2317
2318         r = policy_walk_mappings(cache->policy, save_hint, cache);
2319         if (r)
2320                 DMERR("policy_walk_mappings failed");
2321
2322         return r;
2323 }
2324
2325 /*
2326  * returns true on success
2327  */
2328 static bool sync_metadata(struct cache *cache)
2329 {
2330         int r1, r2, r3, r4;
2331
2332         r1 = write_dirty_bitset(cache);
2333         if (r1)
2334                 DMERR("could not write dirty bitset");
2335
2336         r2 = write_discard_bitset(cache);
2337         if (r2)
2338                 DMERR("could not write discard bitset");
2339
2340         save_stats(cache);
2341
2342         r3 = write_hints(cache);
2343         if (r3)
2344                 DMERR("could not write hints");
2345
2346         /*
2347          * If writing the above metadata failed, we still commit, but don't
2348          * set the clean shutdown flag.  This will effectively force every
2349          * dirty bit to be set on reload.
2350          */
2351         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2352         if (r4)
2353                 DMERR("could not write cache metadata.  Data loss may occur.");
2354
2355         return !r1 && !r2 && !r3 && !r4;
2356 }
2357
2358 static void cache_postsuspend(struct dm_target *ti)
2359 {
2360         struct cache *cache = ti->private;
2361
2362         start_quiescing(cache);
2363         wait_for_migrations(cache);
2364         stop_worker(cache);
2365         requeue_deferred_io(cache);
2366         stop_quiescing(cache);
2367
2368         (void) sync_metadata(cache);
2369 }
2370
2371 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2372                         bool dirty, uint32_t hint, bool hint_valid)
2373 {
2374         int r;
2375         struct cache *cache = context;
2376
2377         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2378         if (r)
2379                 return r;
2380
2381         if (dirty)
2382                 set_dirty(cache, oblock, cblock);
2383         else
2384                 clear_dirty(cache, oblock, cblock);
2385
2386         return 0;
2387 }
2388
2389 static int load_discard(void *context, sector_t discard_block_size,
2390                         dm_dblock_t dblock, bool discard)
2391 {
2392         struct cache *cache = context;
2393
2394         /* FIXME: handle mis-matched block size */
2395
2396         if (discard)
2397                 set_discard(cache, dblock);
2398         else
2399                 clear_discard(cache, dblock);
2400
2401         return 0;
2402 }
2403
2404 static int cache_preresume(struct dm_target *ti)
2405 {
2406         int r = 0;
2407         struct cache *cache = ti->private;
2408         sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2409         (void) sector_div(actual_cache_size, cache->sectors_per_block);
2410
2411         /*
2412          * Check to see if the cache has resized.
2413          */
2414         if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2415                 cache->cache_size = to_cblock(actual_cache_size);
2416
2417                 r = dm_cache_resize(cache->cmd, cache->cache_size);
2418                 if (r) {
2419                         DMERR("could not resize cache metadata");
2420                         return r;
2421                 }
2422
2423                 cache->sized = true;
2424         }
2425
2426         if (!cache->loaded_mappings) {
2427                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2428                                            load_mapping, cache);
2429                 if (r) {
2430                         DMERR("could not load cache mappings");
2431                         return r;
2432                 }
2433
2434                 cache->loaded_mappings = true;
2435         }
2436
2437         if (!cache->loaded_discards) {
2438                 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2439                 if (r) {
2440                         DMERR("could not load origin discards");
2441                         return r;
2442                 }
2443
2444                 cache->loaded_discards = true;
2445         }
2446
2447         return r;
2448 }
2449
2450 static void cache_resume(struct dm_target *ti)
2451 {
2452         struct cache *cache = ti->private;
2453
2454         cache->need_tick_bio = true;
2455         do_waker(&cache->waker.work);
2456 }
2457
2458 /*
2459  * Status format:
2460  *
2461  * <#used metadata blocks>/<#total metadata blocks>
2462  * <#read hits> <#read misses> <#write hits> <#write misses>
2463  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2464  * <#features> <features>*
2465  * <#core args> <core args>
2466  * <#policy args> <policy args>*
2467  */
2468 static void cache_status(struct dm_target *ti, status_type_t type,
2469                          unsigned status_flags, char *result, unsigned maxlen)
2470 {
2471         int r = 0;
2472         unsigned i;
2473         ssize_t sz = 0;
2474         dm_block_t nr_free_blocks_metadata = 0;
2475         dm_block_t nr_blocks_metadata = 0;
2476         char buf[BDEVNAME_SIZE];
2477         struct cache *cache = ti->private;
2478         dm_cblock_t residency;
2479
2480         switch (type) {
2481         case STATUSTYPE_INFO:
2482                 /* Commit to ensure statistics aren't out-of-date */
2483                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2484                         r = dm_cache_commit(cache->cmd, false);
2485                         if (r)
2486                                 DMERR("could not commit metadata for accurate status");
2487                 }
2488
2489                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2490                                                            &nr_free_blocks_metadata);
2491                 if (r) {
2492                         DMERR("could not get metadata free block count");
2493                         goto err;
2494                 }
2495
2496                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2497                 if (r) {
2498                         DMERR("could not get metadata device size");
2499                         goto err;
2500                 }
2501
2502                 residency = policy_residency(cache->policy);
2503
2504                 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %lu ",
2505                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2506                        (unsigned long long)nr_blocks_metadata,
2507                        (unsigned) atomic_read(&cache->stats.read_hit),
2508                        (unsigned) atomic_read(&cache->stats.read_miss),
2509                        (unsigned) atomic_read(&cache->stats.write_hit),
2510                        (unsigned) atomic_read(&cache->stats.write_miss),
2511                        (unsigned) atomic_read(&cache->stats.demotion),
2512                        (unsigned) atomic_read(&cache->stats.promotion),
2513                        (unsigned long long) from_cblock(residency),
2514                        (unsigned long) atomic_read(&cache->nr_dirty));
2515
2516                 if (cache->features.write_through)
2517                         DMEMIT("1 writethrough ");
2518                 else
2519                         DMEMIT("0 ");
2520
2521                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2522                 if (sz < maxlen) {
2523                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2524                         if (r)
2525                                 DMERR("policy_emit_config_values returned %d", r);
2526                 }
2527
2528                 break;
2529
2530         case STATUSTYPE_TABLE:
2531                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2532                 DMEMIT("%s ", buf);
2533                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2534                 DMEMIT("%s ", buf);
2535                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2536                 DMEMIT("%s", buf);
2537
2538                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2539                         DMEMIT(" %s", cache->ctr_args[i]);
2540                 if (cache->nr_ctr_args)
2541                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2542         }
2543
2544         return;
2545
2546 err:
2547         DMEMIT("Error");
2548 }
2549
2550 /*
2551  * Supports <key> <value>.
2552  *
2553  * The key migration_threshold is supported by the cache target core.
2554  */
2555 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2556 {
2557         struct cache *cache = ti->private;
2558
2559         if (argc != 2)
2560                 return -EINVAL;
2561
2562         return set_config_value(cache, argv[0], argv[1]);
2563 }
2564
2565 static int cache_iterate_devices(struct dm_target *ti,
2566                                  iterate_devices_callout_fn fn, void *data)
2567 {
2568         int r = 0;
2569         struct cache *cache = ti->private;
2570
2571         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2572         if (!r)
2573                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2574
2575         return r;
2576 }
2577
2578 /*
2579  * We assume I/O is going to the origin (which is the volume
2580  * more likely to have restrictions e.g. by being striped).
2581  * (Looking up the exact location of the data would be expensive
2582  * and could always be out of date by the time the bio is submitted.)
2583  */
2584 static int cache_bvec_merge(struct dm_target *ti,
2585                             struct bvec_merge_data *bvm,
2586                             struct bio_vec *biovec, int max_size)
2587 {
2588         struct cache *cache = ti->private;
2589         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2590
2591         if (!q->merge_bvec_fn)
2592                 return max_size;
2593
2594         bvm->bi_bdev = cache->origin_dev->bdev;
2595         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2596 }
2597
2598 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2599 {
2600         /*
2601          * FIXME: these limits may be incompatible with the cache device
2602          */
2603         limits->max_discard_sectors = cache->discard_block_size;
2604         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2605 }
2606
2607 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2608 {
2609         struct cache *cache = ti->private;
2610         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2611
2612         /*
2613          * If the system-determined stacked limits are compatible with the
2614          * cache's blocksize (io_opt is a factor) do not override them.
2615          */
2616         if (io_opt_sectors < cache->sectors_per_block ||
2617             do_div(io_opt_sectors, cache->sectors_per_block)) {
2618                 blk_limits_io_min(limits, 0);
2619                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2620         }
2621         set_discard_limits(cache, limits);
2622 }
2623
2624 /*----------------------------------------------------------------*/
2625
2626 static struct target_type cache_target = {
2627         .name = "cache",
2628         .version = {1, 1, 1},
2629         .module = THIS_MODULE,
2630         .ctr = cache_ctr,
2631         .dtr = cache_dtr,
2632         .map = cache_map,
2633         .end_io = cache_end_io,
2634         .postsuspend = cache_postsuspend,
2635         .preresume = cache_preresume,
2636         .resume = cache_resume,
2637         .status = cache_status,
2638         .message = cache_message,
2639         .iterate_devices = cache_iterate_devices,
2640         .merge = cache_bvec_merge,
2641         .io_hints = cache_io_hints,
2642 };
2643
2644 static int __init dm_cache_init(void)
2645 {
2646         int r;
2647
2648         r = dm_register_target(&cache_target);
2649         if (r) {
2650                 DMERR("cache target registration failed: %d", r);
2651                 return r;
2652         }
2653
2654         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2655         if (!migration_cache) {
2656                 dm_unregister_target(&cache_target);
2657                 return -ENOMEM;
2658         }
2659
2660         return 0;
2661 }
2662
2663 static void __exit dm_cache_exit(void)
2664 {
2665         dm_unregister_target(&cache_target);
2666         kmem_cache_destroy(migration_cache);
2667 }
2668
2669 module_init(dm_cache_init);
2670 module_exit(dm_cache_exit);
2671
2672 MODULE_DESCRIPTION(DM_NAME " cache target");
2673 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2674 MODULE_LICENSE("GPL");