// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
 */
#include <linux/mm.h>
#include <linux/bio.h>
#include <linux/err.h>
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/dm-io.h>
#include <linux/mutex.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/mempool.h>
#include <linux/spinlock.h>
#include <linux/blk_types.h>
#include <linux/dm-kcopyd.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
#include <linux/device-mapper.h>

#include "dm.h"
#include "dm-clone-metadata.h"

#define DM_MSG_PREFIX "clone"
/*
 * Minimum and maximum allowed region sizes
 */
#define MIN_REGION_SIZE (1 << 3)  /* 4KB */
#define MAX_REGION_SIZE (1 << 21) /* 1GB */
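/*
 * Region sizes are expressed in 512-byte sectors: MIN_REGION_SIZE is
 * 1 << 3 = 8 sectors = 4KB, and MAX_REGION_SIZE is 1 << 21 = 2097152
 * sectors = 1GB.
 */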
#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */

#define COMMIT_PERIOD HZ /* 1 sec */
/*
 * Hydration hash table size: 1 << HASH_TABLE_BITS
 */
#define HASH_TABLE_BITS 15

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
	"A percentage of time allocated for hydrating regions");
/* Slab cache for struct dm_clone_region_hydration */
static struct kmem_cache *_hydration_cache;
/* dm-clone metadata modes */
enum clone_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL,		/* all metadata I/O fails */
};

struct hash_table_bucket;
struct clone {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_dev *metadata_dev;
	struct dm_dev *dest_dev;
	struct dm_dev *source_dev;

	unsigned long nr_regions;
	sector_t region_size;
	unsigned int region_shift;

	/*
	 * A metadata commit and the actions taken in case it fails should run
	 * as a single atomic step.
	 */
	struct mutex commit_lock;

	struct dm_clone_metadata *cmd;

	/*
	 * bio used to flush the destination device, before committing the
	 * metadata.
	 */
	struct bio flush_bio;

	/* Region hydration hash table */
	struct hash_table_bucket *ht;

	atomic_t ios_in_flight;

	wait_queue_head_t hydration_stopped;

	mempool_t hydration_pool;

	unsigned long last_commit_jiffies;

	/*
	 * We defer incoming WRITE bios for regions that are not hydrated,
	 * until after these regions have been hydrated.
	 *
	 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
	 * metadata have been committed.
	 */
	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_discard_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_flush_completions;

	/* Maximum number of regions being copied during background hydration. */
	unsigned int hydration_threshold;

	/* Number of regions to batch together during background hydration. */
	unsigned int hydration_batch_size;

	/* Which region to hydrate next */
	unsigned long hydration_offset;

	atomic_t hydrations_in_flight;

	/*
	 * Save a copy of the table line rather than reconstructing it for the
	 * status.
	 */
	unsigned int nr_ctr_args;
	const char **ctr_args;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	struct dm_kcopyd_client *kcopyd_client;

	enum clone_metadata_mode mode;
	unsigned long flags;
};

/*
 * dm-clone flags
 */
#define DM_CLONE_DISCARD_PASSDOWN 0
#define DM_CLONE_HYDRATION_ENABLED 1
#define DM_CLONE_HYDRATION_SUSPENDED 2
/*---------------------------------------------------------------------------*/

/*
 * Metadata failure handling.
 */
static enum clone_metadata_mode get_clone_mode(struct clone *clone)
{
	return READ_ONCE(clone->mode);
}

static const char *clone_device_name(struct clone *clone)
{
	return dm_table_device_name(clone->ti->table);
}
static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
{
	const char *descs[] = {
		"read-write",
		"read-only",
		"fail"
	};

	enum clone_metadata_mode old_mode = get_clone_mode(clone);

	/* Never move out of fail mode */
	if (old_mode == CM_FAIL)
		new_mode = CM_FAIL;

	switch (new_mode) {
	case CM_FAIL:
	case CM_READ_ONLY:
		dm_clone_metadata_set_read_only(clone->cmd);
		break;

	case CM_WRITE:
		dm_clone_metadata_set_read_write(clone->cmd);
		break;
	}

	WRITE_ONCE(clone->mode, new_mode);

	if (new_mode != old_mode) {
		dm_table_event(clone->ti->table);
		DMINFO("%s: Switching to %s mode", clone_device_name(clone),
		       descs[(int)new_mode]);
	}
}
static void __abort_transaction(struct clone *clone)
{
	const char *dev_name = clone_device_name(clone);

	if (get_clone_mode(clone) >= CM_READ_ONLY)
		return;

	DMERR("%s: Aborting current metadata transaction", dev_name);
	if (dm_clone_metadata_abort(clone->cmd)) {
		DMERR("%s: Failed to abort metadata transaction", dev_name);
		__set_clone_mode(clone, CM_FAIL);
	}
}
static void __reload_in_core_bitset(struct clone *clone)
{
	const char *dev_name = clone_device_name(clone);

	if (get_clone_mode(clone) == CM_FAIL)
		return;

	/* Reload the on-disk bitset */
	DMINFO("%s: Reloading on-disk bitmap", dev_name);
	if (dm_clone_reload_in_core_bitset(clone->cmd)) {
		DMERR("%s: Failed to reload on-disk bitmap", dev_name);
		__set_clone_mode(clone, CM_FAIL);
	}
}
static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
{
	DMERR("%s: Metadata operation `%s' failed: error = %d",
	      clone_device_name(clone), op, r);

	__abort_transaction(clone);
	__set_clone_mode(clone, CM_READ_ONLY);

	/*
	 * dm_clone_reload_in_core_bitset() may run concurrently with either
	 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
	 * it's safe as we have already set the metadata to read-only mode.
	 */
	__reload_in_core_bitset(clone);
}
/*---------------------------------------------------------------------------*/

/* Wake up anyone waiting for region hydrations to stop */
static inline void wakeup_hydration_waiters(struct clone *clone)
{
	wake_up_all(&clone->hydration_stopped);
}

static inline void wake_worker(struct clone *clone)
{
	queue_work(clone->wq, &clone->worker);
}
/*---------------------------------------------------------------------------*/

/*
 * bio helper functions.
 */
static inline void remap_to_source(struct clone *clone, struct bio *bio)
{
	bio_set_dev(bio, clone->source_dev->bdev);
}

static inline void remap_to_dest(struct clone *clone, struct bio *bio)
{
	bio_set_dev(bio, clone->dest_dev->bdev);
}
static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
{
	return op_is_flush(bio->bi_opf) &&
		dm_clone_changed_this_transaction(clone->cmd);
}
/* Get the address of the region in sectors */
static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
{
	return ((sector_t)region_nr << clone->region_shift);
}

/* Get the region number of the bio */
static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
{
	return (bio->bi_iter.bi_sector >> clone->region_shift);
}
/* Get the region range covered by the bio */
static void bio_region_range(struct clone *clone, struct bio *bio,
			     unsigned long *rs, unsigned long *nr_regions)
{
	unsigned long end;

	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
	end = bio_end_sector(bio) >> clone->region_shift;

	if (*rs >= end)
		*nr_regions = 0;
	else
		*nr_regions = end - *rs;
}
/* Check whether a bio overwrites a region */
static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
}
static void fail_bios(struct bio_list *bios, blk_status_t status)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bios))) {
		bio->bi_status = status;
		bio_endio(bio);
	}
}
static void submit_bios(struct bio_list *bios)
{
	struct bio *bio;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while ((bio = bio_list_pop(bios)))
		generic_make_request(bio);

	blk_finish_plug(&plug);
}
/*
 * Submit bio to the underlying device.
 *
 * If the bio triggers a commit, delay it, until after the metadata have been
 * committed.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void issue_bio(struct clone *clone, struct bio *bio)
{
	if (!bio_triggers_commit(clone, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to commit the
	 * metadata, so we complete the bio with an error.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a single
	 * commit for them in process_deferred_flush_bios().
	 */
	spin_lock_irq(&clone->lock);
	bio_list_add(&clone->deferred_flush_bios, bio);
	spin_unlock_irq(&clone->lock);

	wake_worker(clone);
}
/*
 * Remap bio to the destination device and submit it.
 *
 * If the bio triggers a commit, delay it, until after the metadata have been
 * committed.
 */
static void remap_and_issue(struct clone *clone, struct bio *bio)
{
	remap_to_dest(clone, bio);
	issue_bio(clone, bio);
}
/*
 * Issue bios that have been deferred until after their region has finished
 * hydrating.
 *
 * We delegate the bio submission to the worker thread, so this is safe to call
 * from interrupt context.
 */
static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
{
	struct bio *bio;
	unsigned long flags;
	struct bio_list flush_bios = BIO_EMPTY_LIST;
	struct bio_list normal_bios = BIO_EMPTY_LIST;

	if (bio_list_empty(bios))
		return;

	while ((bio = bio_list_pop(bios))) {
		if (bio_triggers_commit(clone, bio))
			bio_list_add(&flush_bios, bio);
		else
			bio_list_add(&normal_bios, bio);
	}

	spin_lock_irqsave(&clone->lock, flags);
	bio_list_merge(&clone->deferred_bios, &normal_bios);
	bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
	spin_unlock_irqrestore(&clone->lock, flags);

	wake_worker(clone);
}
static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
{
	unsigned long flags;

	/*
	 * If the bio has the REQ_FUA flag set we must commit the metadata
	 * before signaling its completion.
	 *
	 * complete_overwrite_bio() is only called by hydration_complete(),
	 * after having successfully updated the metadata. This means we don't
	 * need to call dm_clone_changed_this_transaction() to check if the
	 * metadata has changed and thus we can avoid taking the metadata spin
	 * lock.
	 */
	if (!(bio->bi_opf & REQ_FUA)) {
		bio_endio(bio);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to commit the
	 * metadata, so we complete the bio with an error.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a single
	 * commit for them in process_deferred_flush_bios().
	 */
	spin_lock_irqsave(&clone->lock, flags);
	bio_list_add(&clone->deferred_flush_completions, bio);
	spin_unlock_irqrestore(&clone->lock, flags);

	wake_worker(clone);
}
static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
{
	bio->bi_iter.bi_sector = sector;
	bio->bi_iter.bi_size = to_bytes(len);
}
static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
{
	unsigned long rs, nr_regions;

	/*
	 * If the destination device supports discards, remap and trim the
	 * discard bio and pass it down. Otherwise complete the bio with an
	 * error.
	 */
	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
		remap_to_dest(clone, bio);
		bio_region_range(clone, bio, &rs, &nr_regions);
		trim_bio(bio, region_to_sector(clone, rs),
			 nr_regions << clone->region_shift);
		generic_make_request(bio);
	} else
		bio_endio(bio);
}
static void process_discard_bio(struct clone *clone, struct bio *bio)
{
	unsigned long rs, nr_regions;

	bio_region_range(clone, bio, &rs, &nr_regions);
	if (!nr_regions) {
		bio_endio(bio);
		return;
	}

	if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
		    (rs + nr_regions) > clone->nr_regions)) {
		DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
		      clone_device_name(clone), rs, nr_regions,
		      clone->nr_regions,
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio_sectors(bio));
		bio_endio(bio);
		return;
	}

	/*
	 * The covered regions are already hydrated so we just need to pass
	 * down the discard.
	 */
	if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
		complete_discard_bio(clone, bio, true);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to update the
	 * metadata for the regions covered by the discard so we just ignore
	 * it.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_endio(bio);
		return;
	}

	/*
	 * Defer discard processing.
	 */
	spin_lock_irq(&clone->lock);
	bio_list_add(&clone->deferred_discard_bios, bio);
	spin_unlock_irq(&clone->lock);

	wake_worker(clone);
}
/*---------------------------------------------------------------------------*/

/*
 * dm-clone region hydrations.
 */
struct dm_clone_region_hydration {
	struct clone *clone;
	unsigned long region_nr;

	struct bio *overwrite_bio;
	bio_end_io_t *overwrite_bio_end_io;

	struct bio_list deferred_bios;

	blk_status_t status;

	/* Used by hydration batching */
	struct list_head list;

	/* Used by hydration hash table */
	struct hlist_node h;
};
/*
 * Hydration hash table implementation.
 *
 * Ideally we would like to use list_bl, which uses bit spin locks and employs
 * the least significant bit of the list head to lock the corresponding bucket,
 * reducing the memory overhead for the locks. But, currently, list_bl and bit
 * spin locks don't support IRQ safe versions. Since we have to take the lock
 * in both process and interrupt context, we must fall back to using regular
 * spin locks; one per hash table bucket.
 */
struct hash_table_bucket {
	struct hlist_head head;

	/* Spinlock protecting the bucket */
	spinlock_t lock;
};

#define bucket_lock_irqsave(bucket, flags) \
	spin_lock_irqsave(&(bucket)->lock, flags)

#define bucket_unlock_irqrestore(bucket, flags) \
	spin_unlock_irqrestore(&(bucket)->lock, flags)

#define bucket_lock_irq(bucket) \
	spin_lock_irq(&(bucket)->lock)

#define bucket_unlock_irq(bucket) \
	spin_unlock_irq(&(bucket)->lock)
static int hash_table_init(struct clone *clone)
{
	unsigned int i, sz;
	struct hash_table_bucket *bucket;

	sz = 1 << HASH_TABLE_BITS;

	clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
	if (!clone->ht)
		return -ENOMEM;

	for (i = 0; i < sz; i++) {
		bucket = clone->ht + i;

		INIT_HLIST_HEAD(&bucket->head);
		spin_lock_init(&bucket->lock);
	}

	return 0;
}
static void hash_table_exit(struct clone *clone)
{
	kvfree(clone->ht);
}
static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
							unsigned long region_nr)
{
	return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
}
/*
 * Search hash table for a hydration with hd->region_nr == region_nr
 *
 * NOTE: Must be called with the bucket lock held
 */
static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
						     unsigned long region_nr)
{
	struct dm_clone_region_hydration *hd;

	hlist_for_each_entry(hd, &bucket->head, h) {
		if (hd->region_nr == region_nr)
			return hd;
	}

	return NULL;
}
/*
 * Insert a hydration into the hash table.
 *
 * NOTE: Must be called with the bucket lock held.
 */
static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
					     struct dm_clone_region_hydration *hd)
{
	hlist_add_head(&hd->h, &bucket->head);
}
/*
 * This function inserts a hydration into the hash table, unless someone else
 * managed to insert a hydration for the same region first. In the latter case
 * it returns the existing hydration descriptor for this region.
 *
 * NOTE: Must be called with the hydration hash table lock held.
 */
static struct dm_clone_region_hydration *
__find_or_insert_region_hydration(struct hash_table_bucket *bucket,
				  struct dm_clone_region_hydration *hd)
{
	struct dm_clone_region_hydration *hd2;

	hd2 = __hash_find(bucket, hd->region_nr);
	if (hd2)
		return hd2;

	__insert_region_hydration(bucket, hd);

	return hd;
}
/*---------------------------------------------------------------------------*/

/* Allocate a hydration */
static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
{
	struct dm_clone_region_hydration *hd;

	/*
	 * Allocate a hydration from the hydration mempool.
	 * This might block but it can't fail.
	 */
	hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
	hd->clone = clone;

	return hd;
}
static inline void free_hydration(struct dm_clone_region_hydration *hd)
{
	mempool_free(hd, &hd->clone->hydration_pool);
}
/* Initialize a hydration */
static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
{
	hd->region_nr = region_nr;
	hd->overwrite_bio = NULL;
	bio_list_init(&hd->deferred_bios);
	hd->status = 0;

	INIT_LIST_HEAD(&hd->list);
	INIT_HLIST_NODE(&hd->h);
}
/*---------------------------------------------------------------------------*/

/*
 * Update dm-clone's metadata after a region has finished hydrating and remove
 * hydration from the hash table.
 */
static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
{
	int r = 0;
	unsigned long flags;
	struct hash_table_bucket *bucket;
	struct clone *clone = hd->clone;

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		r = -EPERM;

	/* Update the metadata */
	if (likely(!r) && hd->status == BLK_STS_OK)
		r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);

	bucket = get_hash_table_bucket(clone, hd->region_nr);

	/* Remove hydration from hash table */
	bucket_lock_irqsave(bucket, flags);
	hlist_del(&hd->h);
	bucket_unlock_irqrestore(bucket, flags);

	return r;
}
/*
 * Complete a region's hydration:
 *
 *	1. Update dm-clone's metadata.
 *	2. Remove hydration from hash table.
 *	3. Complete overwrite bio.
 *	4. Issue deferred bios.
 *	5. If this was the last hydration, wake up anyone waiting for
 *	   hydrations to finish.
 */
static void hydration_complete(struct dm_clone_region_hydration *hd)
{
	int r;
	blk_status_t status;
	struct clone *clone = hd->clone;

	r = hydration_update_metadata(hd);

	if (hd->status == BLK_STS_OK && likely(!r)) {
		if (hd->overwrite_bio)
			complete_overwrite_bio(clone, hd->overwrite_bio);

		issue_deferred_bios(clone, &hd->deferred_bios);
	} else {
		status = r ? BLK_STS_IOERR : hd->status;

		if (hd->overwrite_bio)
			bio_list_add(&hd->deferred_bios, hd->overwrite_bio);

		fail_bios(&hd->deferred_bios, status);
	}

	free_hydration(hd);

	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}
static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
{
	blk_status_t status;
	struct dm_clone_region_hydration *tmp, *hd = context;
	struct clone *clone = hd->clone;

	LIST_HEAD(batched_hydrations);

	if (read_err || write_err) {
		DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
		status = BLK_STS_IOERR;
	} else
		status = BLK_STS_OK;

	list_splice_tail(&hd->list, &batched_hydrations);

	hd->status = status;
	hydration_complete(hd);

	/* Complete batched hydrations */
	list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
		hd->status = status;
		hydration_complete(hd);
	}

	/* Continue background hydration, if there is no I/O in-flight */
	if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	    !atomic_read(&clone->ios_in_flight))
		wake_worker(clone);
}
static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
{
	unsigned long region_start, region_end;
	sector_t tail_size, region_size, total_size;
	struct dm_io_region from, to;
	struct clone *clone = hd->clone;

	if (WARN_ON(!nr_regions))
		return;

	region_size = clone->region_size;
	region_start = hd->region_nr;
	region_end = region_start + nr_regions - 1;

	total_size = region_to_sector(clone, nr_regions - 1);

	if (region_end == clone->nr_regions - 1) {
		/*
		 * The last region of the target might be smaller than
		 * region_size.
		 */
		tail_size = clone->ti->len & (region_size - 1);
		if (!tail_size)
			tail_size = region_size;
	} else {
		tail_size = region_size;
	}

	total_size += tail_size;

	from.bdev = clone->source_dev->bdev;
	from.sector = region_to_sector(clone, region_start);
	from.count = total_size;

	to.bdev = clone->dest_dev->bdev;
	to.sector = from.sector;
	to.count = from.count;

	/* Issue copy */
	atomic_add(nr_regions, &clone->hydrations_in_flight);
	dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
		       hydration_kcopyd_callback, hd);
}
static void overwrite_endio(struct bio *bio)
{
	struct dm_clone_region_hydration *hd = bio->bi_private;

	bio->bi_end_io = hd->overwrite_bio_end_io;
	hd->status = bio->bi_status;

	hydration_complete(hd);
}
static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
{
	/*
	 * We don't need to save and restore bio->bi_private because device
	 * mapper core generates a new bio for us to use, with clean
	 * bi_private.
	 */
	hd->overwrite_bio = bio;
	hd->overwrite_bio_end_io = bio->bi_end_io;

	bio->bi_end_io = overwrite_endio;
	bio->bi_private = hd;

	atomic_inc(&hd->clone->hydrations_in_flight);
	generic_make_request(bio);
}
/*
 * Hydrate bio's region.
 *
 * This function starts the hydration of the bio's region and puts the bio in
 * the list of deferred bios for this region. In case, by the time this
 * function is called, the region has finished hydrating it's submitted to the
 * destination device.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
	unsigned long region_nr;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd, *hd2;

	region_nr = bio_to_region(clone, bio);
	bucket = get_hash_table_bucket(clone, region_nr);

	bucket_lock_irq(bucket);

	hd = __hash_find(bucket, region_nr);
	if (hd) {
		/* Someone else is hydrating the region */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		return;
	}

	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		/* The region has been hydrated */
		bucket_unlock_irq(bucket);
		issue_bio(clone, bio);
		return;
	}

	/*
	 * We must allocate a hydration descriptor and start the hydration of
	 * the corresponding region.
	 */
	bucket_unlock_irq(bucket);

	hd = alloc_hydration(clone);
	hydration_init(hd, region_nr);

	bucket_lock_irq(bucket);

	/* Check if the region has been hydrated in the meantime. */
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		issue_bio(clone, bio);
		return;
	}

	hd2 = __find_or_insert_region_hydration(bucket, hd);
	if (hd2 != hd) {
		/* Someone else started the region's hydration. */
		bio_list_add(&hd2->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL then there is no point starting a
	 * hydration, since we will not be able to update the metadata when the
	 * hydration finishes.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		hlist_del(&hd->h);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		bio_io_error(bio);
		return;
	}

	/*
	 * Start region hydration.
	 *
	 * If a bio overwrites a region, i.e., its size is equal to the
	 * region's size, then we don't need to copy the region from the source
	 * to the destination device.
	 */
	if (is_overwrite_bio(clone, bio)) {
		bucket_unlock_irq(bucket);
		hydration_overwrite(hd, bio);
	} else {
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		hydration_copy(hd, 1);
	}
}
/*---------------------------------------------------------------------------*/

/*
 * Background hydrations.
 */

/*
 * Batch region hydrations.
 *
 * To better utilize device bandwidth we batch together the hydration of
 * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
 * is good for small, random write performance (because of the overwriting of
 * un-hydrated regions) and at the same time issue big copy requests to kcopyd
 * to achieve high hydration bandwidth.
 */
struct batch_info {
	struct dm_clone_region_hydration *head;
	unsigned int nr_batched_regions;
};
static void __batch_hydration(struct batch_info *batch,
			      struct dm_clone_region_hydration *hd)
{
	struct clone *clone = hd->clone;
	unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);

	if (batch->head) {
		/* Try to extend the current batch */
		if (batch->nr_batched_regions < max_batch_size &&
		    (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
			list_add_tail(&hd->list, &batch->head->list);
			batch->nr_batched_regions++;
			hd = NULL;
		}

		/* Check if we should issue the current batch */
		if (batch->nr_batched_regions >= max_batch_size || hd) {
			hydration_copy(batch->head, batch->nr_batched_regions);
			batch->head = NULL;
			batch->nr_batched_regions = 0;
		}
	}

	if (!hd)
		return;

	/* We treat max batch sizes of zero and one equivalently */
	if (max_batch_size <= 1) {
		hydration_copy(hd, 1);
		return;
	}

	/* Start a new batch */
	BUG_ON(!list_empty(&hd->list));
	batch->head = hd;
	batch->nr_batched_regions = 1;
}
static unsigned long __start_next_hydration(struct clone *clone,
					    unsigned long offset,
					    struct batch_info *batch)
{
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd;
	unsigned long nr_regions = clone->nr_regions;

	hd = alloc_hydration(clone);

	/* Try to find a region to hydrate. */
	do {
		offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
		if (offset == nr_regions)
			break;

		bucket = get_hash_table_bucket(clone, offset);
		bucket_lock_irq(bucket);

		if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
		    !__hash_find(bucket, offset)) {
			hydration_init(hd, offset);
			__insert_region_hydration(bucket, hd);
			bucket_unlock_irq(bucket);

			/* Batch hydration */
			__batch_hydration(batch, hd);

			return (offset + 1);
		}

		bucket_unlock_irq(bucket);

	} while (++offset < nr_regions);

	if (hd)
		free_hydration(hd);

	return nr_regions;
}
/*
 * This function searches for regions that still reside in the source device
 * and starts their hydration.
 */
static void do_hydration(struct clone *clone)
{
	unsigned int current_volume;
	unsigned long offset, nr_regions = clone->nr_regions;

	struct batch_info batch = {
		.head = NULL,
		.nr_batched_regions = 0,
	};

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		return;

	if (dm_clone_is_hydration_done(clone->cmd))
		return;

	/*
	 * Avoid race with device suspension.
	 */
	atomic_inc(&clone->hydrations_in_flight);

	/*
	 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
	 * might race with clone_postsuspend() and start a region hydration
	 * after the target has been suspended.
	 *
	 * This is paired with the smp_mb__after_atomic() in
	 * clone_postsuspend().
	 */
	smp_mb__after_atomic();

	offset = clone->hydration_offset;
	while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
	       !atomic_read(&clone->ios_in_flight) &&
	       test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	       offset < nr_regions) {
		current_volume = atomic_read(&clone->hydrations_in_flight);
		current_volume += batch.nr_batched_regions;

		if (current_volume > READ_ONCE(clone->hydration_threshold))
			break;

		offset = __start_next_hydration(clone, offset, &batch);
	}

	if (batch.head)
		hydration_copy(batch.head, batch.nr_batched_regions);

	if (offset >= nr_regions)
		offset = 0;

	clone->hydration_offset = offset;

	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}
/*---------------------------------------------------------------------------*/

static bool need_commit_due_to_time(struct clone *clone)
{
	return !time_in_range(jiffies, clone->last_commit_jiffies,
			      clone->last_commit_jiffies + COMMIT_PERIOD);
}
/*
 * A non-zero return indicates read-only or fail mode.
 */
static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
{
	int r = 0;

	if (dest_dev_flushed)
		*dest_dev_flushed = false;

	mutex_lock(&clone->commit_lock);

	if (!dm_clone_changed_this_transaction(clone->cmd))
		goto out;

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		r = -EPERM;
		goto out;
	}

	r = dm_clone_metadata_pre_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
		goto out;
	}

	bio_reset(&clone->flush_bio);
	bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
	clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

	r = submit_bio_wait(&clone->flush_bio);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "flush destination device", r);
		goto out;
	}

	if (dest_dev_flushed)
		*dest_dev_flushed = true;

	r = dm_clone_metadata_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
		goto out;
	}

	if (dm_clone_is_hydration_done(clone->cmd))
		dm_table_event(clone->ti->table);
out:
	mutex_unlock(&clone->commit_lock);

	return r;
}
static void process_deferred_discards(struct clone *clone)
{
	int r = -EPERM;
	struct bio *bio;
	struct blk_plug plug;
	unsigned long rs, nr_regions;
	struct bio_list discards = BIO_EMPTY_LIST;

	spin_lock_irq(&clone->lock);
	bio_list_merge(&discards, &clone->deferred_discard_bios);
	bio_list_init(&clone->deferred_discard_bios);
	spin_unlock_irq(&clone->lock);

	if (bio_list_empty(&discards))
		return;

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		goto out;

	/* Update the metadata */
	bio_list_for_each(bio, &discards) {
		bio_region_range(clone, bio, &rs, &nr_regions);
		/*
		 * A discard request might cover regions that have been already
		 * hydrated. There is no need to update the metadata for these
		 * regions.
		 */
		r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
		if (unlikely(r))
			break;
	}
out:
	blk_start_plug(&plug);
	while ((bio = bio_list_pop(&discards)))
		complete_discard_bio(clone, bio, r == 0);
	blk_finish_plug(&plug);
}
static void process_deferred_bios(struct clone *clone)
{
	struct bio_list bios = BIO_EMPTY_LIST;

	spin_lock_irq(&clone->lock);
	bio_list_merge(&bios, &clone->deferred_bios);
	bio_list_init(&clone->deferred_bios);
	spin_unlock_irq(&clone->lock);

	if (bio_list_empty(&bios))
		return;

	submit_bios(&bios);
}
static void process_deferred_flush_bios(struct clone *clone)
{
	struct bio *bio;
	bool dest_dev_flushed;
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio_list bio_completions = BIO_EMPTY_LIST;

	/*
	 * If there are any deferred flush bios, we must commit the metadata
	 * before issuing them or signaling their completion.
	 */
	spin_lock_irq(&clone->lock);
	bio_list_merge(&bios, &clone->deferred_flush_bios);
	bio_list_init(&clone->deferred_flush_bios);

	bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
	bio_list_init(&clone->deferred_flush_completions);
	spin_unlock_irq(&clone->lock);

	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
		return;

	if (commit_metadata(clone, &dest_dev_flushed)) {
		bio_list_merge(&bios, &bio_completions);

		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);

		return;
	}

	clone->last_commit_jiffies = jiffies;

	while ((bio = bio_list_pop(&bio_completions)))
		bio_endio(bio);

	while ((bio = bio_list_pop(&bios))) {
		if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
			/* We just flushed the destination device as part of
			 * the metadata commit, so there is no reason to send
			 * another flush.
			 */
			bio_endio(bio);
		} else {
			generic_make_request(bio);
		}
	}
}
static void do_worker(struct work_struct *work)
{
	struct clone *clone = container_of(work, typeof(*clone), worker);

	process_deferred_bios(clone);
	process_deferred_discards(clone);

	/*
	 * process_deferred_flush_bios():
	 *
	 *   - Commit metadata
	 *
	 *   - Process deferred REQ_FUA completions
	 *
	 *   - Process deferred REQ_PREFLUSH bios
	 */
	process_deferred_flush_bios(clone);

	/* Background hydration */
	do_hydration(clone);
}
/*
 * Commit periodically so that not too much unwritten data builds up.
 *
 * Also, restart background hydration, if it has been stopped by in-flight I/O.
 */
static void do_waker(struct work_struct *work)
{
	struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);

	wake_worker(clone);
	queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
}
/*---------------------------------------------------------------------------*/

/*
 * Target methods
 */
static int clone_map(struct dm_target *ti, struct bio *bio)
{
	struct clone *clone = ti->private;
	unsigned long region_nr;

	atomic_inc(&clone->ios_in_flight);

	if (unlikely(get_clone_mode(clone) == CM_FAIL))
		return DM_MAPIO_KILL;

	/*
	 * REQ_PREFLUSH bios carry no data:
	 *
	 * - Commit metadata, if changed
	 *
	 * - Pass down to destination device
	 */
	if (bio->bi_opf & REQ_PREFLUSH) {
		remap_and_issue(clone, bio);
		return DM_MAPIO_SUBMITTED;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	/*
	 * dm-clone interprets discards and performs a fast hydration of the
	 * discarded regions, i.e., we skip the copy from the source device and
	 * just mark the regions as hydrated.
	 */
	if (bio_op(bio) == REQ_OP_DISCARD) {
		process_discard_bio(clone, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * If the bio's region is hydrated, redirect it to the destination
	 * device.
	 *
	 * If the region is not hydrated and the bio is a READ, redirect it to
	 * the source device.
	 *
	 * Else, defer WRITE bio until after its region has been hydrated and
	 * start the region's hydration immediately.
	 */
	region_nr = bio_to_region(clone, bio);
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		remap_and_issue(clone, bio);
		return DM_MAPIO_SUBMITTED;
	} else if (bio_data_dir(bio) == READ) {
		remap_to_source(clone, bio);
		return DM_MAPIO_REMAPPED;
	}

	remap_to_dest(clone, bio);
	hydrate_bio_region(clone, bio);

	return DM_MAPIO_SUBMITTED;
}
static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
{
	struct clone *clone = ti->private;

	atomic_dec(&clone->ios_in_flight);

	return DM_ENDIO_DONE;
}
static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
		       ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;
	unsigned int count;

	count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
	count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);

	DMEMIT("%u ", count);

	if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
		DMEMIT("no_hydration ");

	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
		DMEMIT("no_discard_passdown ");

	*sz_ptr = sz;
}
static void emit_core_args(struct clone *clone, char *result,
			   unsigned int maxlen, ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;
	unsigned int count = 4;

	DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
	       READ_ONCE(clone->hydration_threshold),
	       READ_ONCE(clone->hydration_batch_size));

	*sz_ptr = sz;
}
/*
 * Status format:
 *
 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
 * <#features> <features>* <#core args> <core args>* <clone metadata mode>
 */
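/*
 * An illustrative STATUSTYPE_INFO line (the values are made up):
 *
 *   8 72/4096 8 1024/65536 1 0 4 hydration_threshold 1 hydration_batch_size 1 rw
 *
 * i.e. 8-sector metadata blocks with 72 of 4096 used, 8-sector regions with
 * 1024 of 65536 hydrated, 1 region hydrating, no feature args, the two core
 * args, and read-write metadata mode.
 */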
static void clone_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result,
			 unsigned int maxlen)
{
	int r;
	unsigned int i;
	ssize_t sz = 0;
	dm_block_t nr_free_metadata_blocks = 0;
	dm_block_t nr_metadata_blocks = 0;
	char buf[BDEVNAME_SIZE];
	struct clone *clone = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (get_clone_mode(clone) == CM_FAIL) {
			DMEMIT("Fail");
			break;
		}

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit_metadata(clone, NULL);

		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
		       DM_CLONE_METADATA_BLOCK_SIZE,
		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
		       (unsigned long long)nr_metadata_blocks,
		       (unsigned long long)clone->region_size,
		       dm_clone_nr_of_hydrated_regions(clone->cmd),
		       clone->nr_regions,
		       atomic_read(&clone->hydrations_in_flight));

		emit_flags(clone, result, maxlen, &sz);
		emit_core_args(clone, result, maxlen, &sz);

		switch (get_clone_mode(clone)) {
		case CM_WRITE:
			DMEMIT("rw");
			break;
		default:
			DMEMIT("ro");
		}

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->source_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < clone->nr_ctr_args; i++)
			DMEMIT(" %s", clone->ctr_args[i]);
	}

	return;

error:
	DMEMIT("Error");
}
static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct request_queue *dest_q, *source_q;
	struct clone *clone = container_of(cb, struct clone, callbacks);

	source_q = bdev_get_queue(clone->source_dev->bdev);
	dest_q = bdev_get_queue(clone->dest_dev->bdev);

	return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
		bdi_congested(source_q->backing_dev_info, bdi_bits));
}
static sector_t get_dev_size(struct dm_dev *dev)
{
	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
}
/*---------------------------------------------------------------------------*/

/*
 * Construct a clone device mapping:
 *
 * clone <metadata dev> <destination dev> <source dev> <region size>
 *	[<#feature args> [<feature arg>]* [<#core args> [key value]*]]
 *
 * metadata dev: Fast device holding the persistent metadata
 * destination dev: The destination device, which will become a clone of the
 *                  source device
 * source dev: The read-only source device that gets cloned
 * region size: dm-clone unit size in sectors
 *
 * #feature args: Number of feature arguments passed
 * feature args: E.g. no_hydration, no_discard_passdown
 *
 * #core arguments: An even number of core arguments
 * core arguments: Key/value pairs for tuning the core
 *		   E.g. 'hydration_threshold 256'
 */
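/*
 * For example (hypothetical devices; sizes in 512-byte sectors), the
 * following creates a 512MB clone of /dev/sdc onto /dev/sdb, keeping the
 * metadata on /dev/sda, with a region size of 8 sectors (4KB):
 *
 *   dmsetup create clone --table "0 1048576 clone /dev/sda /dev/sdb /dev/sdc 8"
 */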
static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
{
	int r;
	unsigned int argc;
	const char *arg_name;
	struct dm_target *ti = clone->ti;

	const struct dm_arg args = {
		.min = 0,
		.max = 2,
		.error = "Invalid number of feature arguments"
	};

	/* No feature arguments supplied */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(&args, as, &argc, &ti->error);
	if (r)
		return r;

	while (argc) {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "no_hydration")) {
			__clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
			__clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
		} else {
			ti->error = "Invalid feature argument";
			return -EINVAL;
		}
	}

	return 0;
}
static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
{
	int r;
	unsigned int argc;
	unsigned int value;
	const char *arg_name;
	struct dm_target *ti = clone->ti;

	const struct dm_arg args = {
		.min = 0,
		.max = 4,
		.error = "Invalid number of core arguments"
	};

	/* Initialize core arguments */
	clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
	clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;

	/* No core arguments supplied */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(&args, as, &argc, &ti->error);
	if (r)
		return r;

	if (argc & 1) {
		ti->error = "Number of core arguments must be even";
		return -EINVAL;
	}

	while (argc) {
		arg_name = dm_shift_arg(as);
		argc -= 2;

		if (!strcasecmp(arg_name, "hydration_threshold")) {
			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
				ti->error = "Invalid value for argument `hydration_threshold'";
				return -EINVAL;
			}
			clone->hydration_threshold = value;
		} else if (!strcasecmp(arg_name, "hydration_batch_size")) {
			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
				ti->error = "Invalid value for argument `hydration_batch_size'";
				return -EINVAL;
			}
			clone->hydration_batch_size = value;
		} else {
			ti->error = "Invalid core argument";
			return -EINVAL;
		}
	}

	return 0;
}
static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	unsigned int region_size;
	struct dm_arg arg;

	arg.min = MIN_REGION_SIZE;
	arg.max = MAX_REGION_SIZE;
	arg.error = "Invalid region size";

	r = dm_read_arg(&arg, as, &region_size, error);
	if (r)
		return r;

	/* Check region size is a power of 2 */
	if (!is_power_of_2(region_size)) {
		*error = "Region size is not a power of 2";
		return -EINVAL;
	}

	/* Validate the region size against the device logical block size */
	if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
	    region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
		*error = "Region size is not a multiple of device logical block size";
		return -EINVAL;
	}

	clone->region_size = region_size;

	return 0;
}
static int validate_nr_regions(unsigned long n, char **error)
{
	/*
	 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
	 * further to 2^31 regions.
	 */
	if (n > (1UL << 31)) {
		*error = "Too many regions. Consider increasing the region size";
		return -EINVAL;
	}

	return 0;
}
static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &clone->metadata_dev);
	if (r) {
		*error = "Error opening metadata device";
		return r;
	}

	metadata_dev_size = get_dev_size(clone->metadata_dev);
	if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);

	return 0;
}
static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	sector_t dest_dev_size;

	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &clone->dest_dev);
	if (r) {
		*error = "Error opening destination device";
		return r;
	}

	dest_dev_size = get_dev_size(clone->dest_dev);
	if (dest_dev_size < clone->ti->len) {
		dm_put_device(clone->ti, clone->dest_dev);
		*error = "Device size larger than destination device";
		return -EINVAL;
	}

	return 0;
}
static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	sector_t source_dev_size;

	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
			  &clone->source_dev);
	if (r) {
		*error = "Error opening source device";
		return r;
	}

	source_dev_size = get_dev_size(clone->source_dev);
	if (source_dev_size < clone->ti->len) {
		dm_put_device(clone->ti, clone->source_dev);
		*error = "Device size larger than source device";
		return -EINVAL;
	}

	return 0;
}
static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
{
	int i;
	const char **copy;

	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
	if (!copy)
		goto error;

	for (i = 0; i < argc; i++) {
		copy[i] = kstrdup(argv[i], GFP_KERNEL);

		if (!copy[i]) {
			while (i--)
				kfree(copy[i]);
			kfree(copy);
			goto error;
		}
	}

	clone->nr_ctr_args = argc;
	clone->ctr_args = copy;
	return 0;

error:
	*error = "Failed to allocate memory for table line";
	return -ENOMEM;
}
static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	sector_t nr_regions;
	struct clone *clone;
	struct dm_arg_set as;

	if (argc < 4) {
		ti->error = "Invalid number of arguments";
		return -EINVAL;
	}

	as.argc = argc;
	as.argv = argv;

	clone = kzalloc(sizeof(*clone), GFP_KERNEL);
	if (!clone) {
		ti->error = "Failed to allocate clone structure";
		return -ENOMEM;
	}

	clone->ti = ti;

	/* Initialize dm-clone flags */
	__set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
	__set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
	__set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);

	r = parse_metadata_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_clone;

	r = parse_dest_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_meta_dev;

	r = parse_source_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_dest_dev;

	r = parse_region_size(clone, &as, &ti->error);
	if (r)
		goto out_with_source_dev;

	clone->region_shift = __ffs(clone->region_size);
	nr_regions = dm_sector_div_up(ti->len, clone->region_size);

	/* Check for overflow */
	if (nr_regions != (unsigned long)nr_regions) {
		ti->error = "Too many regions. Consider increasing the region size";
		r = -EOVERFLOW;
		goto out_with_source_dev;
	}

	clone->nr_regions = nr_regions;

	r = validate_nr_regions(clone->nr_regions, &ti->error);
	if (r)
		goto out_with_source_dev;

	r = dm_set_target_max_io_len(ti, clone->region_size);
	if (r) {
		ti->error = "Failed to set max io len";
		goto out_with_source_dev;
	}

	r = parse_feature_args(&as, clone);
	if (r)
		goto out_with_source_dev;

	r = parse_core_args(&as, clone);
	if (r)
		goto out_with_source_dev;

	clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
					    clone->region_size);
	if (IS_ERR(clone->cmd)) {
		ti->error = "Failed to load metadata";
		r = PTR_ERR(clone->cmd);
		goto out_with_source_dev;
	}

	__set_clone_mode(clone, CM_WRITE);

	if (get_clone_mode(clone) != CM_WRITE) {
		ti->error = "Unable to get write access to metadata, please check/repair metadata";
		r = -EPERM;
		goto out_with_metadata;
	}

	clone->last_commit_jiffies = jiffies;

	/* Allocate hydration hash table */
	r = hash_table_init(clone);
	if (r) {
		ti->error = "Failed to allocate hydration hash table";
		goto out_with_metadata;
	}

	atomic_set(&clone->ios_in_flight, 0);
	init_waitqueue_head(&clone->hydration_stopped);
	spin_lock_init(&clone->lock);
	bio_list_init(&clone->deferred_bios);
	bio_list_init(&clone->deferred_discard_bios);
	bio_list_init(&clone->deferred_flush_bios);
	bio_list_init(&clone->deferred_flush_completions);
	clone->hydration_offset = 0;
	atomic_set(&clone->hydrations_in_flight, 0);
	bio_init(&clone->flush_bio, NULL, 0);

	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
	if (!clone->wq) {
		ti->error = "Failed to allocate workqueue";
		r = -ENOMEM;
		goto out_with_ht;
	}

	INIT_WORK(&clone->worker, do_worker);
	INIT_DELAYED_WORK(&clone->waker, do_waker);

	clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(clone->kcopyd_client)) {
		r = PTR_ERR(clone->kcopyd_client);
		goto out_with_wq;
	}

	r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
				   _hydration_cache);
	if (r) {
		ti->error = "Failed to create dm_clone_region_hydration memory pool";
		goto out_with_kcopyd;
	}

	/* Save a copy of the table line */
	r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
	if (r)
		goto out_with_mempool;

	mutex_init(&clone->commit_lock);
	clone->callbacks.congested_fn = clone_is_congested;
	dm_table_add_target_callbacks(ti->table, &clone->callbacks);

	/* Enable flushes */
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/* Enable discards */
	ti->discards_supported = true;
	ti->num_discard_bios = 1;

	ti->private = clone;

	return 0;

out_with_mempool:
	mempool_exit(&clone->hydration_pool);
out_with_kcopyd:
	dm_kcopyd_client_destroy(clone->kcopyd_client);
out_with_wq:
	destroy_workqueue(clone->wq);
out_with_ht:
	hash_table_exit(clone);
out_with_metadata:
	dm_clone_metadata_close(clone->cmd);
out_with_source_dev:
	dm_put_device(ti, clone->source_dev);
out_with_dest_dev:
	dm_put_device(ti, clone->dest_dev);
out_with_meta_dev:
	dm_put_device(ti, clone->metadata_dev);
out_with_clone:
	kfree(clone);

	return r;
}
static void clone_dtr(struct dm_target *ti)
{
	unsigned int i;
	struct clone *clone = ti->private;

	mutex_destroy(&clone->commit_lock);
	bio_uninit(&clone->flush_bio);

	for (i = 0; i < clone->nr_ctr_args; i++)
		kfree(clone->ctr_args[i]);
	kfree(clone->ctr_args);

	mempool_exit(&clone->hydration_pool);
	dm_kcopyd_client_destroy(clone->kcopyd_client);
	destroy_workqueue(clone->wq);
	hash_table_exit(clone);
	dm_clone_metadata_close(clone->cmd);
	dm_put_device(ti, clone->source_dev);
	dm_put_device(ti, clone->dest_dev);
	dm_put_device(ti, clone->metadata_dev);

	kfree(clone);
}
/*---------------------------------------------------------------------------*/

static void clone_postsuspend(struct dm_target *ti)
{
	struct clone *clone = ti->private;

	/*
	 * To successfully suspend the device:
	 *
	 *	- We cancel the delayed work for periodic commits and wait for
	 *	  it to finish.
	 *
	 *	- We stop the background hydration, i.e. we prevent new region
	 *	  hydrations from starting.
	 *
	 *	- We wait for any in-flight hydrations to finish.
	 *
	 *	- We flush the workqueue.
	 *
	 *	- We commit the metadata.
	 */
	cancel_delayed_work_sync(&clone->waker);

	set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);

	/*
	 * Make sure set_bit() is ordered before atomic_read(), otherwise we
	 * might race with do_hydration() and miss some started region
	 * hydrations.
	 *
	 * This is paired with smp_mb__after_atomic() in do_hydration().
	 */
	smp_mb__after_atomic();

	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
	flush_workqueue(clone->wq);

	(void) commit_metadata(clone, NULL);
}
static void clone_resume(struct dm_target *ti)
{
	struct clone *clone = ti->private;

	clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
	do_waker(&clone->waker.work);
}
static bool bdev_supports_discards(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	return (q && blk_queue_discard(q));
}
/*
 * If discard_passdown was enabled verify that the destination device supports
 * discards. Disable discard_passdown if not.
 */
static void disable_passdown_if_not_supported(struct clone *clone)
{
	struct block_device *dest_dev = clone->dest_dev->bdev;
	struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
	const char *reason = NULL;
	char buf[BDEVNAME_SIZE];

	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
		return;

	if (!bdev_supports_discards(dest_dev))
		reason = "discard unsupported";
	else if (dest_limits->max_discard_sectors < clone->region_size)
		reason = "max discard sectors smaller than a region";

	if (reason) {
		DMWARN("Destination device (%s) %s: Disabling discard passdown.",
		       bdevname(dest_dev, buf), reason);
		clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
	}
}
static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
{
	struct block_device *dest_bdev = clone->dest_dev->bdev;
	struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;

	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
		/* No passdown is done so we set our own virtual limits */
		limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
		limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
		return;
	}

	/*
	 * clone_iterate_devices() is stacking both the source and destination
	 * device limits but discards aren't passed to the source device, so
	 * inherit destination's limits.
	 */
	limits->max_discard_sectors = dest_limits->max_discard_sectors;
	limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
	limits->discard_granularity = dest_limits->discard_granularity;
	limits->discard_alignment = dest_limits->discard_alignment;
	limits->discard_misaligned = dest_limits->discard_misaligned;
	limits->max_discard_segments = dest_limits->max_discard_segments;
}
static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct clone *clone = ti->private;
	u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with
	 * dm-clone's region size (io_opt is a factor) do not override them.
	 */
	if (io_opt_sectors < clone->region_size ||
	    do_div(io_opt_sectors, clone->region_size)) {
		blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
		blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
	}

	disable_passdown_if_not_supported(clone);
	set_discard_limits(clone, limits);
}
static int clone_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int ret;
	struct clone *clone = ti->private;
	struct dm_dev *dest_dev = clone->dest_dev;
	struct dm_dev *source_dev = clone->source_dev;

	ret = fn(ti, source_dev, 0, ti->len, data);
	if (!ret)
		ret = fn(ti, dest_dev, 0, ti->len, data);
	return ret;
}
/*
 * dm-clone message functions.
 */
static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
{
	WRITE_ONCE(clone->hydration_threshold, nr_regions);

	/*
	 * If user space sets hydration_threshold to zero then the hydration
	 * will stop. If at a later time the hydration_threshold is increased
	 * we must restart the hydration process by waking up the worker.
	 */
	wake_worker(clone);
}
static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
{
	WRITE_ONCE(clone->hydration_batch_size, nr_regions);
}
static void enable_hydration(struct clone *clone)
{
	if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
		wake_worker(clone);
}

static void disable_hydration(struct clone *clone)
{
	clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
}
static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
			 char *result, unsigned int maxlen)
{
	struct clone *clone = ti->private;
	unsigned int value;

	if (!argc)
		return -EINVAL;

	if (!strcasecmp(argv[0], "enable_hydration")) {
		enable_hydration(clone);
		return 0;
	}

	if (!strcasecmp(argv[0], "disable_hydration")) {
		disable_hydration(clone);
		return 0;
	}

	if (argc != 2)
		return -EINVAL;

	if (!strcasecmp(argv[0], "hydration_threshold")) {
		if (kstrtouint(argv[1], 10, &value))
			return -EINVAL;

		set_hydration_threshold(clone, value);

		return 0;
	}

	if (!strcasecmp(argv[0], "hydration_batch_size")) {
		if (kstrtouint(argv[1], 10, &value))
			return -EINVAL;

		set_hydration_batch_size(clone, value);

		return 0;
	}

	DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
	return -EINVAL;
}
static struct target_type clone_target = {
	.name = "clone",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = clone_ctr,
	.dtr = clone_dtr,
	.map = clone_map,
	.end_io = clone_endio,
	.postsuspend = clone_postsuspend,
	.resume = clone_resume,
	.status = clone_status,
	.message = clone_message,
	.io_hints = clone_io_hints,
	.iterate_devices = clone_iterate_devices,
};
/*---------------------------------------------------------------------------*/

/* Module functions */
static int __init dm_clone_init(void)
{
	int r;

	_hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
	if (!_hydration_cache)
		return -ENOMEM;

	r = dm_register_target(&clone_target);
	if (r < 0) {
		DMERR("Failed to register clone target");
		return r;
	}

	return 0;
}
static void __exit dm_clone_exit(void)
{
	dm_unregister_target(&clone_target);

	kmem_cache_destroy(_hydration_cache);
	_hydration_cache = NULL;
}
/* Module hooks */
module_init(dm_clone_init);
module_exit(dm_clone_exit);

MODULE_DESCRIPTION(DM_NAME " clone target");
MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
MODULE_LICENSE("GPL");