drivers/md/dm-exception-store.c

   1 /*
   2  * dm-exception-store.c
   3  *
   4  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
   5  *
   6  * This file is released under the GPL.
   7  */
   8
   9 #include "dm.h"
  10 #include "dm-snap.h"
  11 #include "dm-io.h"
  12 #include "kcopyd.h"
  13
  14 #include <linux/mm.h>
  15 #include <linux/pagemap.h>
  16 #include <linux/vmalloc.h>
  17 #include <linux/slab.h>
  18
  /*
   * Message prefix for this file (presumably consumed by the DM logging
   * macros such as DMWARN, which are defined elsewhere).
   */
  #define DM_MSG_PREFIX "snapshots"

  /* Chunk size used when the table supplies none: 32 sectors, i.e. 16KB. */
  #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32
  21
  22 /*-----------------------------------------------------------------
  23  * Persistent snapshots, by persistent we mean that the snapshot
  24  * will survive a reboot.
  25  *---------------------------------------------------------------*/
  26
  27 /*
  28  * We need to store a record of which parts of the origin have
  29  * been copied to the snapshot device.  The snapshot code
  30  * requires that we copy exception chunks to chunk aligned areas
  31  * of the COW store.  It makes sense therefore, to store the
  32  * metadata in chunk size blocks.
  33  *
  34  * There is no backward or forward compatibility implemented,
  35  * snapshots with different disk versions than the kernel will
  36  * not be usable.  It is expected that "lvcreate" will blank out
  37  * the start of a fresh COW device before calling the snapshot
  38  * constructor.
  39  *
  40  * The first chunk of the COW device just contains the header.
  41  * After this there is a chunk filled with exception metadata,
  42  * followed by as many exception chunks as can fit in the
  43  * metadata areas.
  44  *
  45  * All on disk structures are in little-endian format.  The end
  46  * of the exceptions info is indicated by an exception with a
  47  * new_chunk of 0, which is invalid since it would point to the
  48  * header chunk.
  49  */
  50
  /*
   * Magic number identifying a persistent snapshot header.  Stored
   * little-endian, its four bytes spell "SnAp".
   */
  #define SNAP_MAGIC 0x70416e53

  /*
   * Version number of the on-disk metadata format.
   */
  #define SNAPSHOT_DISK_VERSION 1
  60
  /*
   * Header stored in the first chunk of the COW device.  The fields are
   * converted with le32_to_cpu()/cpu_to_le32() by the code that reads
   * and writes the header, i.e. they are little-endian on disk.
   */
  struct disk_header {
          /* SNAP_MAGIC for an initialised store; 0 means a fresh device. */
          uint32_t magic;

          /*
           * Non-zero while the snapshot is usable.  Once a snapshot has
           * been marked invalid there is no way of recovering it.
           */
          uint32_t valid;

          /*
           * Plain incrementing format version; no backward
           * compatibility is provided between versions.
           */
          uint32_t version;

          /* Chunk size, in sectors. */
          uint32_t chunk_size;
  };
  79
  /*
   * One on-disk exception record: maps a chunk of the origin to the chunk
   * of the COW device that holds its copy.  A record whose new_chunk is 0
   * marks the end of the exceptions in a metadata area.
   */
  struct disk_exception {
          uint64_t old_chunk;     /* chunk number on the origin */
          uint64_t new_chunk;     /* chunk number on the COW device */
  };
  84
  /*
   * A completion callback queued by a commit, together with the opaque
   * context pointer that is handed back to it.
   */
  struct commit_callback {
          void (*callback)(void *, int success);  /* invoked with (context, success) */
          void *context;                          /* passed through untouched */
  };
  89
  90 /*
  91  * The top level structure for a persistent exception store.
  92  */
  93 struct pstore {
  94         struct dm_snapshot *snap;       /* up pointer to my snapshot */
  95         int version;
  96         int valid;
  97         uint32_t exceptions_per_area;
  98
  99         /*
 100          * Now that we have an asynchronous kcopyd there is no
 101          * need for large chunk sizes, so it wont hurt to have a
 102          * whole chunks worth of metadata in memory at once.
 103          */
 104         void *area;
 105
 106         /*
 107          * Used to keep track of which metadata area the data in
 108          * 'chunk' refers to.
 109          */
 110         uint32_t current_area;
 111
 112         /*
 113          * The next free chunk for an exception.
 114          */
 115         uint32_t next_free;
 116
 117         /*
 118          * The index of next free exception in the current
 119          * metadata area.
 120          */
 121         uint32_t current_committed;
 122
 123         atomic_t pending_count;
 124         uint32_t callback_count;
 125         struct commit_callback *callbacks;
 126 };
 127
 128 static inline unsigned int sectors_to_pages(unsigned int sectors)
 129 {
 130         return sectors / (PAGE_SIZE >> 9);
 131 }
 132
 133 static int alloc_area(struct pstore *ps)
 134 {
 135         int r = -ENOMEM;
 136         size_t len;
 137
 138         len = ps->snap->chunk_size << SECTOR_SHIFT;
 139
 140         /*
 141          * Allocate the chunk_size block of memory that will hold
 142          * a single metadata area.
 143          */
 144         ps->area = vmalloc(len);
 145         if (!ps->area)
 146                 return r;
 147
 148         return 0;
 149 }
 150
 151 static void free_area(struct pstore *ps)
 152 {
 153         vfree(ps->area);
 154         ps->area = NULL;
 155 }
 156
 157 /*
 158  * Read or write a chunk aligned and sized block of data from a device.
 159  */
 160 static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
 161 {
 162         struct io_region where;
 163         unsigned long bits;
 164
 165         where.bdev = ps->snap->cow->bdev;
 166         where.sector = ps->snap->chunk_size * chunk;
 167         where.count = ps->snap->chunk_size;
 168
 169         return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
 170 }
 171
 172 /*
 173  * Read or write a metadata area.  Remembering to skip the first
 174  * chunk which holds the header.
 175  */
 176 static int area_io(struct pstore *ps, uint32_t area, int rw)
 177 {
 178         int r;
 179         uint32_t chunk;
 180
 181         /* convert a metadata area index to a chunk index */
 182         chunk = 1 + ((ps->exceptions_per_area + 1) * area);
 183
 184         r = chunk_io(ps, chunk, rw);
 185         if (r)
 186                 return r;
 187
 188         ps->current_area = area;
 189         return 0;
 190 }
 191
 192 static int zero_area(struct pstore *ps, uint32_t area)
 193 {
 194         memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
 195         return area_io(ps, area, WRITE);
 196 }
 197
 198 static int read_header(struct pstore *ps, int *new_snapshot)
 199 {
 200         int r;
 201         struct disk_header *dh;
 202         chunk_t chunk_size;
 203         int chunk_size_supplied = 1;
 204
 205         /*
 206          * Use default chunk size (or hardsect_size, if larger) if none supplied
 207          */
 208         if (!ps->snap->chunk_size) {
 209                 ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
 210                     bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
 211                 ps->snap->chunk_mask = ps->snap->chunk_size - 1;
 212                 ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
 213                 chunk_size_supplied = 0;
 214         }
 215
 216         r = dm_io_get(sectors_to_pages(ps->snap->chunk_size));
 217         if (r)
 218                 return r;
 219
 220         r = alloc_area(ps);
 221         if (r)
 222                 goto bad1;
 223
 224         r = chunk_io(ps, 0, READ);
 225         if (r)
 226                 goto bad2;
 227
 228         dh = (struct disk_header *) ps->area;
 229
 230         if (le32_to_cpu(dh->magic) == 0) {
 231                 *new_snapshot = 1;
 232                 return 0;
 233         }
 234
 235         if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
 236                 DMWARN("Invalid or corrupt snapshot");
 237                 r = -ENXIO;
 238                 goto bad2;
 239         }
 240
 241         *new_snapshot = 0;
 242         ps->valid = le32_to_cpu(dh->valid);
 243         ps->version = le32_to_cpu(dh->version);
 244         chunk_size = le32_to_cpu(dh->chunk_size);
 245
 246         if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
 247                 return 0;
 248
 249         DMWARN("chunk size %llu in device metadata overrides "
 250                "table chunk size of %llu.",
 251                (unsigned long long)chunk_size,
 252                (unsigned long long)ps->snap->chunk_size);
 253
 254         /* We had a bogus chunk_size. Fix stuff up. */
 255         dm_io_put(sectors_to_pages(ps->snap->chunk_size));
 256         free_area(ps);
 257
 258         ps->snap->chunk_size = chunk_size;
 259         ps->snap->chunk_mask = chunk_size - 1;
 260         ps->snap->chunk_shift = ffs(chunk_size) - 1;
 261
 262         r = dm_io_get(sectors_to_pages(chunk_size));
 263         if (r)
 264                 return r;
 265
 266         r = alloc_area(ps);
 267         if (r)
 268                 goto bad1;
 269
 270         return 0;
 271
 272 bad2:
 273         free_area(ps);
 274 bad1:
 275         dm_io_put(sectors_to_pages(ps->snap->chunk_size));
 276         return r;
 277 }
 278
 279 static int write_header(struct pstore *ps)
 280 {
 281         struct disk_header *dh;
 282
 283         memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
 284
 285         dh = (struct disk_header *) ps->area;
 286         dh->magic = cpu_to_le32(SNAP_MAGIC);
 287         dh->valid = cpu_to_le32(ps->valid);
 288         dh->version = cpu_to_le32(ps->version);
 289         dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
 290
 291         return chunk_io(ps, 0, WRITE);
 292 }
 293
 294 /*
 295  * Access functions for the disk exceptions, these do the endian conversions.
 296  */
 297 static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
 298 {
 299         BUG_ON(index >= ps->exceptions_per_area);
 300
 301         return ((struct disk_exception *) ps->area) + index;
 302 }
 303
 304 static void read_exception(struct pstore *ps,
 305                            uint32_t index, struct disk_exception *result)
 306 {
 307         struct disk_exception *e = get_exception(ps, index);
 308
 309         /* copy it */
 310         result->old_chunk = le64_to_cpu(e->old_chunk);
 311         result->new_chunk = le64_to_cpu(e->new_chunk);
 312 }
 313
 314 static void write_exception(struct pstore *ps,
 315                             uint32_t index, struct disk_exception *de)
 316 {
 317         struct disk_exception *e = get_exception(ps, index);
 318
 319         /* copy it */
 320         e->old_chunk = cpu_to_le64(de->old_chunk);
 321         e->new_chunk = cpu_to_le64(de->new_chunk);
 322 }
 323
 324 /*
 325  * Registers the exceptions that are present in the current area.
 326  * 'full' is filled in to indicate if the area has been
 327  * filled.
 328  */
 329 static int insert_exceptions(struct pstore *ps, int *full)
 330 {
 331         int r;
 332         unsigned int i;
 333         struct disk_exception de;
 334
 335         /* presume the area is full */
 336         *full = 1;
 337
 338         for (i = 0; i < ps->exceptions_per_area; i++) {
 339                 read_exception(ps, i, &de);
 340
 341                 /*
 342                  * If the new_chunk is pointing at the start of
 343                  * the COW device, where the first metadata area
 344                  * is we know that we've hit the end of the
 345                  * exceptions.  Therefore the area is not full.
 346                  */
 347                 if (de.new_chunk == 0LL) {
 348                         ps->current_committed = i;
 349                         *full = 0;
 350                         break;
 351                 }
 352
 353                 /*
 354                  * Keep track of the start of the free chunks.
 355                  */
 356                 if (ps->next_free <= de.new_chunk)
 357                         ps->next_free = de.new_chunk + 1;
 358
 359                 /*
 360                  * Otherwise we add the exception to the snapshot.
 361                  */
 362                 r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
 363                 if (r)
 364                         return r;
 365         }
 366
 367         return 0;
 368 }
 369
 370 static int read_exceptions(struct pstore *ps)
 371 {
 372         uint32_t area;
 373         int r, full = 1;
 374
 375         /*
 376          * Keeping reading chunks and inserting exceptions until
 377          * we find a partially full area.
 378          */
 379         for (area = 0; full; area++) {
 380                 r = area_io(ps, area, READ);
 381                 if (r)
 382                         return r;
 383
 384                 r = insert_exceptions(ps, &full);
 385                 if (r)
 386                         return r;
 387         }
 388
 389         return 0;
 390 }
 391
 392 static inline struct pstore *get_info(struct exception_store *store)
 393 {
 394         return (struct pstore *) store->context;
 395 }
 396
 397 static void persistent_fraction_full(struct exception_store *store,
 398                                      sector_t *numerator, sector_t *denominator)
 399 {
 400         *numerator = get_info(store)->next_free * store->snap->chunk_size;
 401         *denominator = get_dev_size(store->snap->cow->bdev);
 402 }
 403
 404 static void persistent_destroy(struct exception_store *store)
 405 {
 406         struct pstore *ps = get_info(store);
 407
 408         dm_io_put(sectors_to_pages(ps->snap->chunk_size));
 409         vfree(ps->callbacks);
 410         free_area(ps);
 411         kfree(ps);
 412 }
 413
 414 static int persistent_read_metadata(struct exception_store *store)
 415 {
 416         int r, new_snapshot;
 417         struct pstore *ps = get_info(store);
 418
 419         /*
 420          * Read the snapshot header.
 421          */
 422         r = read_header(ps, &new_snapshot);
 423         if (r)
 424                 return r;
 425
 426         /*
 427          * Now we know correct chunk_size, complete the initialisation.
 428          */
 429         ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
 430                                   sizeof(struct disk_exception);
 431         ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
 432                         sizeof(*ps->callbacks));
 433         if (!ps->callbacks)
 434                 return -ENOMEM;
 435
 436         /*
 437          * Do we need to setup a new snapshot ?
 438          */
 439         if (new_snapshot) {
 440                 r = write_header(ps);
 441                 if (r) {
 442                         DMWARN("write_header failed");
 443                         return r;
 444                 }
 445
 446                 r = zero_area(ps, 0);
 447                 if (r) {
 448                         DMWARN("zero_area(0) failed");
 449                         return r;
 450                 }
 451
 452         } else {
 453                 /*
 454                  * Sanity checks.
 455                  */
 456                 if (!ps->valid) {
 457                         DMWARN("snapshot is marked invalid");
 458                         return -EINVAL;
 459                 }
 460
 461                 if (ps->version != SNAPSHOT_DISK_VERSION) {
 462                         DMWARN("unable to handle snapshot disk version %d",
 463                                ps->version);
 464                         return -EINVAL;
 465                 }
 466
 467                 /*
 468                  * Read the metadata.
 469                  */
 470                 r = read_exceptions(ps);
 471                 if (r)
 472                         return r;
 473         }
 474
 475         return 0;
 476 }
 477
 478 static int persistent_prepare(struct exception_store *store,
 479                               struct exception *e)
 480 {
 481         struct pstore *ps = get_info(store);
 482         uint32_t stride;
 483         sector_t size = get_dev_size(store->snap->cow->bdev);
 484
 485         /* Is there enough room ? */
 486         if (size < ((ps->next_free + 1) * store->snap->chunk_size))
 487                 return -ENOSPC;
 488
 489         e->new_chunk = ps->next_free;
 490
 491         /*
 492          * Move onto the next free pending, making sure to take
 493          * into account the location of the metadata chunks.
 494          */
 495         stride = (ps->exceptions_per_area + 1);
 496         if ((++ps->next_free % stride) == 1)
 497                 ps->next_free++;
 498
 499         atomic_inc(&ps->pending_count);
 500         return 0;
 501 }
 502
 503 static void persistent_commit(struct exception_store *store,
 504                               struct exception *e,
 505                               void (*callback) (void *, int success),
 506                               void *callback_context)
 507 {
 508         int r;
 509         unsigned int i;
 510         struct pstore *ps = get_info(store);
 511         struct disk_exception de;
 512         struct commit_callback *cb;
 513
 514         de.old_chunk = e->old_chunk;
 515         de.new_chunk = e->new_chunk;
 516         write_exception(ps, ps->current_committed++, &de);
 517
 518         /*
 519          * Add the callback to the back of the array.  This code
 520          * is the only place where the callback array is
 521          * manipulated, and we know that it will never be called
 522          * multiple times concurrently.
 523          */
 524         cb = ps->callbacks + ps->callback_count++;
 525         cb->callback = callback;
 526         cb->context = callback_context;
 527
 528         /*
 529          * If there are no more exceptions in flight, or we have
 530          * filled this metadata area we commit the exceptions to
 531          * disk.
 532          */
 533         if (atomic_dec_and_test(&ps->pending_count) ||
 534             (ps->current_committed == ps->exceptions_per_area)) {
 535                 r = area_io(ps, ps->current_area, WRITE);
 536                 if (r)
 537                         ps->valid = 0;
 538
 539                 /*
 540                  * Have we completely filled the current area ?
 541                  */
 542                 if (ps->current_committed == ps->exceptions_per_area) {
 543                         ps->current_committed = 0;
 544                         r = zero_area(ps, ps->current_area + 1);
 545                         if (r)
 546                                 ps->valid = 0;
 547                 }
 548
 549                 for (i = 0; i < ps->callback_count; i++) {
 550                         cb = ps->callbacks + i;
 551                         cb->callback(cb->context, r == 0 ? 1 : 0);
 552                 }
 553
 554                 ps->callback_count = 0;
 555         }
 556 }
 557
 558 static void persistent_drop(struct exception_store *store)
 559 {
 560         struct pstore *ps = get_info(store);
 561
 562         ps->valid = 0;
 563         if (write_header(ps))
 564                 DMWARN("write header failed");
 565 }
 566
 567 int dm_create_persistent(struct exception_store *store)
 568 {
 569         struct pstore *ps;
 570
 571         /* allocate the pstore */
 572         ps = kmalloc(sizeof(*ps), GFP_KERNEL);
 573         if (!ps)
 574                 return -ENOMEM;
 575
 576         ps->snap = store->snap;
 577         ps->valid = 1;
 578         ps->version = SNAPSHOT_DISK_VERSION;
 579         ps->area = NULL;
 580         ps->next_free = 2;      /* skipping the header and first area */
 581         ps->current_committed = 0;
 582
 583         ps->callback_count = 0;
 584         atomic_set(&ps->pending_count, 0);
 585         ps->callbacks = NULL;
 586
 587         store->destroy = persistent_destroy;
 588         store->read_metadata = persistent_read_metadata;
 589         store->prepare_exception = persistent_prepare;
 590         store->commit_exception = persistent_commit;
 591         store->drop_snapshot = persistent_drop;
 592         store->fraction_full = persistent_fraction_full;
 593         store->context = ps;
 594
 595         return 0;
 596 }
 597
 598 /*-----------------------------------------------------------------
 599  * Implementation of the store for non-persistent snapshots.
 600  *---------------------------------------------------------------*/
 601 struct transient_c {
 602         sector_t next_free;
 603 };
 604
 605 static void transient_destroy(struct exception_store *store)
 606 {
 607         kfree(store->context);
 608 }
 609
 /*
  * A transient store keeps nothing on disk, so there is no metadata to
  * load.  @store is not used; always returns 0.
  */
 static int transient_read_metadata(struct exception_store *store)
 {
         const int nothing_to_do = 0;

         return nothing_to_do;
 }
 614
 615 static int transient_prepare(struct exception_store *store, struct exception *e)
 616 {
 617         struct transient_c *tc = (struct transient_c *) store->context;
 618         sector_t size = get_dev_size(store->snap->cow->bdev);
 619
 620         if (size < (tc->next_free + store->snap->chunk_size))
 621                 return -1;
 622
 623         e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
 624         tc->next_free += store->snap->chunk_size;
 625
 626         return 0;
 627 }
 628
 /*
  * There is no metadata to write for a transient store, so a commit
  * completes at once: @callback is invoked with (@callback_context, 1).
  * @store and @e are not used.
  */
 static void transient_commit(struct exception_store *store,
                              struct exception *e,
                              void (*callback) (void *, int success),
                              void *callback_context)
 {
         const int success = 1;

         callback(callback_context, success);
 }
 637
 638 static void transient_fraction_full(struct exception_store *store,
 639                                     sector_t *numerator, sector_t *denominator)
 640 {
 641         *numerator = ((struct transient_c *) store->context)->next_free;
 642         *denominator = get_dev_size(store->snap->cow->bdev);
 643 }
 644
 645 int dm_create_transient(struct exception_store *store)
 646 {
 647         struct transient_c *tc;
 648
 649         store->destroy = transient_destroy;
 650         store->read_metadata = transient_read_metadata;
 651         store->prepare_exception = transient_prepare;
 652         store->commit_exception = transient_commit;
 653         store->drop_snapshot = NULL;
 654         store->fraction_full = transient_fraction_full;
 655
 656         tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
 657         if (!tc)
 658                 return -ENOMEM;
 659
 660         tc->next_free = 0;
 661         store->context = tc;
 662
 663         return 0;
 664 }