drivers/md/dm-exception-store.c

   1 /*
   2  * dm-exception-store.c
   3  *
   4  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
   5  * Copyright (C) 2006 Red Hat GmbH
   6  *
   7  * This file is released under the GPL.
   8  */
   9
  10 #include "dm.h"
  11 #include "dm-snap.h"
  12 #include "dm-io.h"
  13 #include "kcopyd.h"
  14
  15 #include <linux/mm.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/vmalloc.h>
  18 #include <linux/slab.h>
  19
  20 #define DM_MSG_PREFIX "snapshots"
  21 #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32        /* 16KB */
  22
  23 /*-----------------------------------------------------------------
  24  * Persistent snapshots, by persistent we mean that the snapshot
  25  * will survive a reboot.
  26  *---------------------------------------------------------------*/
  27
  28 /*
  29  * We need to store a record of which parts of the origin have
  30  * been copied to the snapshot device.  The snapshot code
  31  * requires that we copy exception chunks to chunk aligned areas
  32  * of the COW store.  It makes sense therefore, to store the
  33  * metadata in chunk size blocks.
  34  *
  35  * There is no backward or forward compatibility implemented,
  36  * snapshots with different disk versions than the kernel will
  37  * not be usable.  It is expected that "lvcreate" will blank out
  38  * the start of a fresh COW device before calling the snapshot
  39  * constructor.
  40  *
  41  * The first chunk of the COW device just contains the header.
  42  * After this there is a chunk filled with exception metadata,
  43  * followed by as many exception chunks as can fit in the
  44  * metadata areas.
  45  *
  46  * All on disk structures are in little-endian format.  The end
  47  * of the exceptions info is indicated by an exception with a
  48  * new_chunk of 0, which is invalid since it would point to the
  49  * header chunk.
  50  */
  51
  52 /*
  53  * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
  54  */
  55 #define SNAP_MAGIC 0x70416e53
  56
  57 /*
  58  * The on-disk version of the metadata.
  59  */
  60 #define SNAPSHOT_DISK_VERSION 1
  61
  62 struct disk_header {
  63         uint32_t magic;
  64
  65         /*
  66          * Is this snapshot valid.  There is no way of recovering
  67          * an invalid snapshot.
  68          */
  69         uint32_t valid;
  70
  71         /*
  72          * Simple, incrementing version. no backward
  73          * compatibility.
  74          */
  75         uint32_t version;
  76
  77         /* In sectors */
  78         uint32_t chunk_size;
  79 };
  80
  81 struct disk_exception {
  82         uint64_t old_chunk;
  83         uint64_t new_chunk;
  84 };
  85
  86 struct commit_callback {
  87         void (*callback)(void *, int success);
  88         void *context;
  89 };
  90
  91 /*
  92  * The top level structure for a persistent exception store.
  93  */
  94 struct pstore {
  95         struct dm_snapshot *snap;       /* up pointer to my snapshot */
  96         int version;
  97         int valid;
  98         uint32_t exceptions_per_area;
  99
 100         /*
 101          * Now that we have an asynchronous kcopyd there is no
 102          * need for large chunk sizes, so it wont hurt to have a
 103          * whole chunks worth of metadata in memory at once.
 104          */
 105         void *area;
 106
 107         /*
 108          * Used to keep track of which metadata area the data in
 109          * 'chunk' refers to.
 110          */
 111         uint32_t current_area;
 112
 113         /*
 114          * The next free chunk for an exception.
 115          */
 116         uint32_t next_free;
 117
 118         /*
 119          * The index of next free exception in the current
 120          * metadata area.
 121          */
 122         uint32_t current_committed;
 123
 124         atomic_t pending_count;
 125         uint32_t callback_count;
 126         struct commit_callback *callbacks;
 127         struct dm_io_client *io_client;
 128 };
 129
 130 static inline unsigned int sectors_to_pages(unsigned int sectors)
 131 {
 132         return sectors / (PAGE_SIZE >> 9);
 133 }
 134
 135 static int alloc_area(struct pstore *ps)
 136 {
 137         int r = -ENOMEM;
 138         size_t len;
 139
 140         len = ps->snap->chunk_size << SECTOR_SHIFT;
 141
 142         /*
 143          * Allocate the chunk_size block of memory that will hold
 144          * a single metadata area.
 145          */
 146         ps->area = vmalloc(len);
 147         if (!ps->area)
 148                 return r;
 149
 150         return 0;
 151 }
 152
 153 static void free_area(struct pstore *ps)
 154 {
 155         vfree(ps->area);
 156         ps->area = NULL;
 157 }
 158
 159 /*
 160  * Read or write a chunk aligned and sized block of data from a device.
 161  */
 162 static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
 163 {
 164         struct io_region where = {
 165                 .bdev = ps->snap->cow->bdev,
 166                 .sector = ps->snap->chunk_size * chunk,
 167                 .count = ps->snap->chunk_size,
 168         };
 169         struct dm_io_request io_req = {
 170                 .bi_rw = rw,
 171                 .mem.type = DM_IO_VMA,
 172                 .mem.ptr.vma = ps->area,
 173                 .client = ps->io_client,
 174                 .notify.fn = NULL,
 175         };
 176
 177         return dm_io(&io_req, 1, &where, NULL);
 178 }
 179
 180 /*
 181  * Read or write a metadata area.  Remembering to skip the first
 182  * chunk which holds the header.
 183  */
 184 static int area_io(struct pstore *ps, uint32_t area, int rw)
 185 {
 186         int r;
 187         uint32_t chunk;
 188
 189         /* convert a metadata area index to a chunk index */
 190         chunk = 1 + ((ps->exceptions_per_area + 1) * area);
 191
 192         r = chunk_io(ps, chunk, rw);
 193         if (r)
 194                 return r;
 195
 196         ps->current_area = area;
 197         return 0;
 198 }
 199
 200 static int zero_area(struct pstore *ps, uint32_t area)
 201 {
 202         memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
 203         return area_io(ps, area, WRITE);
 204 }
 205
 206 static int read_header(struct pstore *ps, int *new_snapshot)
 207 {
 208         int r;
 209         struct disk_header *dh;
 210         chunk_t chunk_size;
 211         int chunk_size_supplied = 1;
 212
 213         /*
 214          * Use default chunk size (or hardsect_size, if larger) if none supplied
 215          */
 216         if (!ps->snap->chunk_size) {
 217                 ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
 218                     bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
 219                 ps->snap->chunk_mask = ps->snap->chunk_size - 1;
 220                 ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
 221                 chunk_size_supplied = 0;
 222         }
 223
 224         ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
 225                                                              chunk_size));
 226         if (IS_ERR(ps->io_client))
 227                 return PTR_ERR(ps->io_client);
 228
 229         r = alloc_area(ps);
 230         if (r)
 231                 return r;
 232
 233         r = chunk_io(ps, 0, READ);
 234         if (r)
 235                 goto bad;
 236
 237         dh = (struct disk_header *) ps->area;
 238
 239         if (le32_to_cpu(dh->magic) == 0) {
 240                 *new_snapshot = 1;
 241                 return 0;
 242         }
 243
 244         if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
 245                 DMWARN("Invalid or corrupt snapshot");
 246                 r = -ENXIO;
 247                 goto bad;
 248         }
 249
 250         *new_snapshot = 0;
 251         ps->valid = le32_to_cpu(dh->valid);
 252         ps->version = le32_to_cpu(dh->version);
 253         chunk_size = le32_to_cpu(dh->chunk_size);
 254
 255         if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
 256                 return 0;
 257
 258         DMWARN("chunk size %llu in device metadata overrides "
 259                "table chunk size of %llu.",
 260                (unsigned long long)chunk_size,
 261                (unsigned long long)ps->snap->chunk_size);
 262
 263         /* We had a bogus chunk_size. Fix stuff up. */
 264         free_area(ps);
 265
 266         ps->snap->chunk_size = chunk_size;
 267         ps->snap->chunk_mask = chunk_size - 1;
 268         ps->snap->chunk_shift = ffs(chunk_size) - 1;
 269
 270         r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
 271                                 ps->io_client);
 272         if (r)
 273                 return r;
 274
 275         r = alloc_area(ps);
 276         return r;
 277
 278 bad:
 279         free_area(ps);
 280         return r;
 281 }
 282
 283 static int write_header(struct pstore *ps)
 284 {
 285         struct disk_header *dh;
 286
 287         memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
 288
 289         dh = (struct disk_header *) ps->area;
 290         dh->magic = cpu_to_le32(SNAP_MAGIC);
 291         dh->valid = cpu_to_le32(ps->valid);
 292         dh->version = cpu_to_le32(ps->version);
 293         dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
 294
 295         return chunk_io(ps, 0, WRITE);
 296 }
 297
 298 /*
 299  * Access functions for the disk exceptions, these do the endian conversions.
 300  */
 301 static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
 302 {
 303         BUG_ON(index >= ps->exceptions_per_area);
 304
 305         return ((struct disk_exception *) ps->area) + index;
 306 }
 307
 308 static void read_exception(struct pstore *ps,
 309                            uint32_t index, struct disk_exception *result)
 310 {
 311         struct disk_exception *e = get_exception(ps, index);
 312
 313         /* copy it */
 314         result->old_chunk = le64_to_cpu(e->old_chunk);
 315         result->new_chunk = le64_to_cpu(e->new_chunk);
 316 }
 317
 318 static void write_exception(struct pstore *ps,
 319                             uint32_t index, struct disk_exception *de)
 320 {
 321         struct disk_exception *e = get_exception(ps, index);
 322
 323         /* copy it */
 324         e->old_chunk = cpu_to_le64(de->old_chunk);
 325         e->new_chunk = cpu_to_le64(de->new_chunk);
 326 }
 327
 328 /*
 329  * Registers the exceptions that are present in the current area.
 330  * 'full' is filled in to indicate if the area has been
 331  * filled.
 332  */
 333 static int insert_exceptions(struct pstore *ps, int *full)
 334 {
 335         int r;
 336         unsigned int i;
 337         struct disk_exception de;
 338
 339         /* presume the area is full */
 340         *full = 1;
 341
 342         for (i = 0; i < ps->exceptions_per_area; i++) {
 343                 read_exception(ps, i, &de);
 344
 345                 /*
 346                  * If the new_chunk is pointing at the start of
 347                  * the COW device, where the first metadata area
 348                  * is we know that we've hit the end of the
 349                  * exceptions.  Therefore the area is not full.
 350                  */
 351                 if (de.new_chunk == 0LL) {
 352                         ps->current_committed = i;
 353                         *full = 0;
 354                         break;
 355                 }
 356
 357                 /*
 358                  * Keep track of the start of the free chunks.
 359                  */
 360                 if (ps->next_free <= de.new_chunk)
 361                         ps->next_free = de.new_chunk + 1;
 362
 363                 /*
 364                  * Otherwise we add the exception to the snapshot.
 365                  */
 366                 r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
 367                 if (r)
 368                         return r;
 369         }
 370
 371         return 0;
 372 }
 373
 374 static int read_exceptions(struct pstore *ps)
 375 {
 376         uint32_t area;
 377         int r, full = 1;
 378
 379         /*
 380          * Keeping reading chunks and inserting exceptions until
 381          * we find a partially full area.
 382          */
 383         for (area = 0; full; area++) {
 384                 r = area_io(ps, area, READ);
 385                 if (r)
 386                         return r;
 387
 388                 r = insert_exceptions(ps, &full);
 389                 if (r)
 390                         return r;
 391         }
 392
 393         return 0;
 394 }
 395
 396 static inline struct pstore *get_info(struct exception_store *store)
 397 {
 398         return (struct pstore *) store->context;
 399 }
 400
 401 static void persistent_fraction_full(struct exception_store *store,
 402                                      sector_t *numerator, sector_t *denominator)
 403 {
 404         *numerator = get_info(store)->next_free * store->snap->chunk_size;
 405         *denominator = get_dev_size(store->snap->cow->bdev);
 406 }
 407
 408 static void persistent_destroy(struct exception_store *store)
 409 {
 410         struct pstore *ps = get_info(store);
 411
 412         dm_io_client_destroy(ps->io_client);
 413         vfree(ps->callbacks);
 414         free_area(ps);
 415         kfree(ps);
 416 }
 417
 418 static int persistent_read_metadata(struct exception_store *store)
 419 {
 420         int r, new_snapshot;
 421         struct pstore *ps = get_info(store);
 422
 423         /*
 424          * Read the snapshot header.
 425          */
 426         r = read_header(ps, &new_snapshot);
 427         if (r)
 428                 return r;
 429
 430         /*
 431          * Now we know correct chunk_size, complete the initialisation.
 432          */
 433         ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
 434                                   sizeof(struct disk_exception);
 435         ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
 436                         sizeof(*ps->callbacks));
 437         if (!ps->callbacks)
 438                 return -ENOMEM;
 439
 440         /*
 441          * Do we need to setup a new snapshot ?
 442          */
 443         if (new_snapshot) {
 444                 r = write_header(ps);
 445                 if (r) {
 446                         DMWARN("write_header failed");
 447                         return r;
 448                 }
 449
 450                 r = zero_area(ps, 0);
 451                 if (r) {
 452                         DMWARN("zero_area(0) failed");
 453                         return r;
 454                 }
 455
 456         } else {
 457                 /*
 458                  * Sanity checks.
 459                  */
 460                 if (!ps->valid) {
 461                         DMWARN("snapshot is marked invalid");
 462                         return -EINVAL;
 463                 }
 464
 465                 if (ps->version != SNAPSHOT_DISK_VERSION) {
 466                         DMWARN("unable to handle snapshot disk version %d",
 467                                ps->version);
 468                         return -EINVAL;
 469                 }
 470
 471                 /*
 472                  * Read the metadata.
 473                  */
 474                 r = read_exceptions(ps);
 475                 if (r)
 476                         return r;
 477         }
 478
 479         return 0;
 480 }
 481
 482 static int persistent_prepare(struct exception_store *store,
 483                               struct exception *e)
 484 {
 485         struct pstore *ps = get_info(store);
 486         uint32_t stride;
 487         sector_t size = get_dev_size(store->snap->cow->bdev);
 488
 489         /* Is there enough room ? */
 490         if (size < ((ps->next_free + 1) * store->snap->chunk_size))
 491                 return -ENOSPC;
 492
 493         e->new_chunk = ps->next_free;
 494
 495         /*
 496          * Move onto the next free pending, making sure to take
 497          * into account the location of the metadata chunks.
 498          */
 499         stride = (ps->exceptions_per_area + 1);
 500         if ((++ps->next_free % stride) == 1)
 501                 ps->next_free++;
 502
 503         atomic_inc(&ps->pending_count);
 504         return 0;
 505 }
 506
 507 static void persistent_commit(struct exception_store *store,
 508                               struct exception *e,
 509                               void (*callback) (void *, int success),
 510                               void *callback_context)
 511 {
 512         int r;
 513         unsigned int i;
 514         struct pstore *ps = get_info(store);
 515         struct disk_exception de;
 516         struct commit_callback *cb;
 517
 518         de.old_chunk = e->old_chunk;
 519         de.new_chunk = e->new_chunk;
 520         write_exception(ps, ps->current_committed++, &de);
 521
 522         /*
 523          * Add the callback to the back of the array.  This code
 524          * is the only place where the callback array is
 525          * manipulated, and we know that it will never be called
 526          * multiple times concurrently.
 527          */
 528         cb = ps->callbacks + ps->callback_count++;
 529         cb->callback = callback;
 530         cb->context = callback_context;
 531
 532         /*
 533          * If there are no more exceptions in flight, or we have
 534          * filled this metadata area we commit the exceptions to
 535          * disk.
 536          */
 537         if (atomic_dec_and_test(&ps->pending_count) ||
 538             (ps->current_committed == ps->exceptions_per_area)) {
 539                 r = area_io(ps, ps->current_area, WRITE);
 540                 if (r)
 541                         ps->valid = 0;
 542
 543                 /*
 544                  * Have we completely filled the current area ?
 545                  */
 546                 if (ps->current_committed == ps->exceptions_per_area) {
 547                         ps->current_committed = 0;
 548                         r = zero_area(ps, ps->current_area + 1);
 549                         if (r)
 550                                 ps->valid = 0;
 551                 }
 552
 553                 for (i = 0; i < ps->callback_count; i++) {
 554                         cb = ps->callbacks + i;
 555                         cb->callback(cb->context, r == 0 ? 1 : 0);
 556                 }
 557
 558                 ps->callback_count = 0;
 559         }
 560 }
 561
 562 static void persistent_drop(struct exception_store *store)
 563 {
 564         struct pstore *ps = get_info(store);
 565
 566         ps->valid = 0;
 567         if (write_header(ps))
 568                 DMWARN("write header failed");
 569 }
 570
 571 int dm_create_persistent(struct exception_store *store)
 572 {
 573         struct pstore *ps;
 574
 575         /* allocate the pstore */
 576         ps = kmalloc(sizeof(*ps), GFP_KERNEL);
 577         if (!ps)
 578                 return -ENOMEM;
 579
 580         ps->snap = store->snap;
 581         ps->valid = 1;
 582         ps->version = SNAPSHOT_DISK_VERSION;
 583         ps->area = NULL;
 584         ps->next_free = 2;      /* skipping the header and first area */
 585         ps->current_committed = 0;
 586
 587         ps->callback_count = 0;
 588         atomic_set(&ps->pending_count, 0);
 589         ps->callbacks = NULL;
 590
 591         store->destroy = persistent_destroy;
 592         store->read_metadata = persistent_read_metadata;
 593         store->prepare_exception = persistent_prepare;
 594         store->commit_exception = persistent_commit;
 595         store->drop_snapshot = persistent_drop;
 596         store->fraction_full = persistent_fraction_full;
 597         store->context = ps;
 598
 599         return 0;
 600 }
 601
 602 /*-----------------------------------------------------------------
 603  * Implementation of the store for non-persistent snapshots.
 604  *---------------------------------------------------------------*/
 605 struct transient_c {
 606         sector_t next_free;
 607 };
 608
 609 static void transient_destroy(struct exception_store *store)
 610 {
 611         kfree(store->context);
 612 }
 613
 614 static int transient_read_metadata(struct exception_store *store)
 615 {
 616         return 0;
 617 }
 618
 619 static int transient_prepare(struct exception_store *store, struct exception *e)
 620 {
 621         struct transient_c *tc = (struct transient_c *) store->context;
 622         sector_t size = get_dev_size(store->snap->cow->bdev);
 623
 624         if (size < (tc->next_free + store->snap->chunk_size))
 625                 return -1;
 626
 627         e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
 628         tc->next_free += store->snap->chunk_size;
 629
 630         return 0;
 631 }
 632
 633 static void transient_commit(struct exception_store *store,
 634                       struct exception *e,
 635                       void (*callback) (void *, int success),
 636                       void *callback_context)
 637 {
 638         /* Just succeed */
 639         callback(callback_context, 1);
 640 }
 641
 642 static void transient_fraction_full(struct exception_store *store,
 643                                     sector_t *numerator, sector_t *denominator)
 644 {
 645         *numerator = ((struct transient_c *) store->context)->next_free;
 646         *denominator = get_dev_size(store->snap->cow->bdev);
 647 }
 648
 649 int dm_create_transient(struct exception_store *store)
 650 {
 651         struct transient_c *tc;
 652
 653         store->destroy = transient_destroy;
 654         store->read_metadata = transient_read_metadata;
 655         store->prepare_exception = transient_prepare;
 656         store->commit_exception = transient_commit;
 657         store->drop_snapshot = NULL;
 658         store->fraction_full = transient_fraction_full;
 659
 660         tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
 661         if (!tc)
 662                 return -ENOMEM;
 663
 664         tc->next_free = 0;
 665         store->context = tc;
 666
 667         return 0;
 668 }