[NETFILTER]: nf_conntrack: export hash allocation/destruction functions
[linux-2.6/verdex.git] / drivers / md / dm-exception-store.c
blob07e0a0c84f6ef9f8d93da5b914f22dfc0f9c47fc
1 /*
2 * dm-exception-store.c
4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5 * Copyright (C) 2006 Red Hat GmbH
7 * This file is released under the GPL.
8 */
10 #include "dm.h"
11 #include "dm-snap.h"
12 #include "dm-io.h"
13 #include "kcopyd.h"
15 #include <linux/mm.h>
16 #include <linux/pagemap.h>
17 #include <linux/vmalloc.h>
18 #include <linux/slab.h>
/* Prefix used by the DM logging macros in this file. */
#define DM_MSG_PREFIX "snapshots"

/* Default chunk size, in 512-byte sectors: 32 sectors = 16KB. */
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32
/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/
/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk aligned areas
 * of the COW store.  It makes sense therefore, to store the
 * metadata in chunk size blocks.
 *
 * There is no backward or forward compatibility implemented,
 * snapshots with different disk versions than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */
/*
 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
 * (0x53 'S', 0x6e 'n', 0x41 'A', 0x70 'p' when stored little-endian.)
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1
/*
 * Header stored at the start of chunk 0 of the COW device.
 * Fields are converted with le32_to_cpu()/cpu_to_le32() by
 * read_header() and write_header().
 */
struct disk_header {
	/* SNAP_MAGIC for an initialised store; 0 means a fresh device. */
	uint32_t magic;

	/*
	 * Non-zero while the snapshot is usable.  There is no way of
	 * recovering an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version; no backward compatibility.
	 */
	uint32_t version;

	/* Chunk size, in sectors. */
	uint32_t chunk_size;
};
/*
 * One exception record: maps a chunk number on the origin
 * (old_chunk) to the chunk on the COW device holding the copy
 * (new_chunk).  A new_chunk of 0 terminates the list.
 */
struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};
/*
 * A completion callback queued by persistent_commit() until the
 * metadata area has been written; 'callback' is then invoked with
 * 'context' and a success flag.
 */
struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};
92 * The top level structure for a persistent exception store.
94 struct pstore {
95 struct dm_snapshot *snap; /* up pointer to my snapshot */
96 int version;
97 int valid;
98 uint32_t exceptions_per_area;
101 * Now that we have an asynchronous kcopyd there is no
102 * need for large chunk sizes, so it wont hurt to have a
103 * whole chunks worth of metadata in memory at once.
105 void *area;
108 * Used to keep track of which metadata area the data in
109 * 'chunk' refers to.
111 uint32_t current_area;
114 * The next free chunk for an exception.
116 uint32_t next_free;
119 * The index of next free exception in the current
120 * metadata area.
122 uint32_t current_committed;
124 atomic_t pending_count;
125 uint32_t callback_count;
126 struct commit_callback *callbacks;
127 struct dm_io_client *io_client;
130 static inline unsigned int sectors_to_pages(unsigned int sectors)
132 return sectors / (PAGE_SIZE >> 9);
135 static int alloc_area(struct pstore *ps)
137 int r = -ENOMEM;
138 size_t len;
140 len = ps->snap->chunk_size << SECTOR_SHIFT;
143 * Allocate the chunk_size block of memory that will hold
144 * a single metadata area.
146 ps->area = vmalloc(len);
147 if (!ps->area)
148 return r;
150 return 0;
153 static void free_area(struct pstore *ps)
155 vfree(ps->area);
156 ps->area = NULL;
160 * Read or write a chunk aligned and sized block of data from a device.
162 static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
164 struct io_region where = {
165 .bdev = ps->snap->cow->bdev,
166 .sector = ps->snap->chunk_size * chunk,
167 .count = ps->snap->chunk_size,
169 struct dm_io_request io_req = {
170 .bi_rw = rw,
171 .mem.type = DM_IO_VMA,
172 .mem.ptr.vma = ps->area,
173 .client = ps->io_client,
174 .notify.fn = NULL,
177 return dm_io(&io_req, 1, &where, NULL);
181 * Read or write a metadata area. Remembering to skip the first
182 * chunk which holds the header.
184 static int area_io(struct pstore *ps, uint32_t area, int rw)
186 int r;
187 uint32_t chunk;
189 /* convert a metadata area index to a chunk index */
190 chunk = 1 + ((ps->exceptions_per_area + 1) * area);
192 r = chunk_io(ps, chunk, rw);
193 if (r)
194 return r;
196 ps->current_area = area;
197 return 0;
200 static int zero_area(struct pstore *ps, uint32_t area)
202 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
203 return area_io(ps, area, WRITE);
206 static int read_header(struct pstore *ps, int *new_snapshot)
208 int r;
209 struct disk_header *dh;
210 chunk_t chunk_size;
211 int chunk_size_supplied = 1;
214 * Use default chunk size (or hardsect_size, if larger) if none supplied
216 if (!ps->snap->chunk_size) {
217 ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
218 bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
219 ps->snap->chunk_mask = ps->snap->chunk_size - 1;
220 ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
221 chunk_size_supplied = 0;
224 ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
225 chunk_size));
226 if (IS_ERR(ps->io_client))
227 return PTR_ERR(ps->io_client);
229 r = alloc_area(ps);
230 if (r)
231 return r;
233 r = chunk_io(ps, 0, READ);
234 if (r)
235 goto bad;
237 dh = (struct disk_header *) ps->area;
239 if (le32_to_cpu(dh->magic) == 0) {
240 *new_snapshot = 1;
241 return 0;
244 if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
245 DMWARN("Invalid or corrupt snapshot");
246 r = -ENXIO;
247 goto bad;
250 *new_snapshot = 0;
251 ps->valid = le32_to_cpu(dh->valid);
252 ps->version = le32_to_cpu(dh->version);
253 chunk_size = le32_to_cpu(dh->chunk_size);
255 if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
256 return 0;
258 DMWARN("chunk size %llu in device metadata overrides "
259 "table chunk size of %llu.",
260 (unsigned long long)chunk_size,
261 (unsigned long long)ps->snap->chunk_size);
263 /* We had a bogus chunk_size. Fix stuff up. */
264 free_area(ps);
266 ps->snap->chunk_size = chunk_size;
267 ps->snap->chunk_mask = chunk_size - 1;
268 ps->snap->chunk_shift = ffs(chunk_size) - 1;
270 r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
271 ps->io_client);
272 if (r)
273 return r;
275 r = alloc_area(ps);
276 return r;
278 bad:
279 free_area(ps);
280 return r;
283 static int write_header(struct pstore *ps)
285 struct disk_header *dh;
287 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
289 dh = (struct disk_header *) ps->area;
290 dh->magic = cpu_to_le32(SNAP_MAGIC);
291 dh->valid = cpu_to_le32(ps->valid);
292 dh->version = cpu_to_le32(ps->version);
293 dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
295 return chunk_io(ps, 0, WRITE);
299 * Access functions for the disk exceptions, these do the endian conversions.
301 static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
303 BUG_ON(index >= ps->exceptions_per_area);
305 return ((struct disk_exception *) ps->area) + index;
308 static void read_exception(struct pstore *ps,
309 uint32_t index, struct disk_exception *result)
311 struct disk_exception *e = get_exception(ps, index);
313 /* copy it */
314 result->old_chunk = le64_to_cpu(e->old_chunk);
315 result->new_chunk = le64_to_cpu(e->new_chunk);
318 static void write_exception(struct pstore *ps,
319 uint32_t index, struct disk_exception *de)
321 struct disk_exception *e = get_exception(ps, index);
323 /* copy it */
324 e->old_chunk = cpu_to_le64(de->old_chunk);
325 e->new_chunk = cpu_to_le64(de->new_chunk);
329 * Registers the exceptions that are present in the current area.
330 * 'full' is filled in to indicate if the area has been
331 * filled.
333 static int insert_exceptions(struct pstore *ps, int *full)
335 int r;
336 unsigned int i;
337 struct disk_exception de;
339 /* presume the area is full */
340 *full = 1;
342 for (i = 0; i < ps->exceptions_per_area; i++) {
343 read_exception(ps, i, &de);
346 * If the new_chunk is pointing at the start of
347 * the COW device, where the first metadata area
348 * is we know that we've hit the end of the
349 * exceptions. Therefore the area is not full.
351 if (de.new_chunk == 0LL) {
352 ps->current_committed = i;
353 *full = 0;
354 break;
358 * Keep track of the start of the free chunks.
360 if (ps->next_free <= de.new_chunk)
361 ps->next_free = de.new_chunk + 1;
364 * Otherwise we add the exception to the snapshot.
366 r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
367 if (r)
368 return r;
371 return 0;
374 static int read_exceptions(struct pstore *ps)
376 uint32_t area;
377 int r, full = 1;
380 * Keeping reading chunks and inserting exceptions until
381 * we find a partially full area.
383 for (area = 0; full; area++) {
384 r = area_io(ps, area, READ);
385 if (r)
386 return r;
388 r = insert_exceptions(ps, &full);
389 if (r)
390 return r;
393 return 0;
396 static inline struct pstore *get_info(struct exception_store *store)
398 return (struct pstore *) store->context;
401 static void persistent_fraction_full(struct exception_store *store,
402 sector_t *numerator, sector_t *denominator)
404 *numerator = get_info(store)->next_free * store->snap->chunk_size;
405 *denominator = get_dev_size(store->snap->cow->bdev);
408 static void persistent_destroy(struct exception_store *store)
410 struct pstore *ps = get_info(store);
412 dm_io_client_destroy(ps->io_client);
413 vfree(ps->callbacks);
414 free_area(ps);
415 kfree(ps);
418 static int persistent_read_metadata(struct exception_store *store)
420 int r, new_snapshot;
421 struct pstore *ps = get_info(store);
424 * Read the snapshot header.
426 r = read_header(ps, &new_snapshot);
427 if (r)
428 return r;
431 * Now we know correct chunk_size, complete the initialisation.
433 ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
434 sizeof(struct disk_exception);
435 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
436 sizeof(*ps->callbacks));
437 if (!ps->callbacks)
438 return -ENOMEM;
441 * Do we need to setup a new snapshot ?
443 if (new_snapshot) {
444 r = write_header(ps);
445 if (r) {
446 DMWARN("write_header failed");
447 return r;
450 r = zero_area(ps, 0);
451 if (r) {
452 DMWARN("zero_area(0) failed");
453 return r;
456 } else {
458 * Sanity checks.
460 if (!ps->valid) {
461 DMWARN("snapshot is marked invalid");
462 return -EINVAL;
465 if (ps->version != SNAPSHOT_DISK_VERSION) {
466 DMWARN("unable to handle snapshot disk version %d",
467 ps->version);
468 return -EINVAL;
472 * Read the metadata.
474 r = read_exceptions(ps);
475 if (r)
476 return r;
479 return 0;
482 static int persistent_prepare(struct exception_store *store,
483 struct exception *e)
485 struct pstore *ps = get_info(store);
486 uint32_t stride;
487 sector_t size = get_dev_size(store->snap->cow->bdev);
489 /* Is there enough room ? */
490 if (size < ((ps->next_free + 1) * store->snap->chunk_size))
491 return -ENOSPC;
493 e->new_chunk = ps->next_free;
496 * Move onto the next free pending, making sure to take
497 * into account the location of the metadata chunks.
499 stride = (ps->exceptions_per_area + 1);
500 if ((++ps->next_free % stride) == 1)
501 ps->next_free++;
503 atomic_inc(&ps->pending_count);
504 return 0;
507 static void persistent_commit(struct exception_store *store,
508 struct exception *e,
509 void (*callback) (void *, int success),
510 void *callback_context)
512 int r;
513 unsigned int i;
514 struct pstore *ps = get_info(store);
515 struct disk_exception de;
516 struct commit_callback *cb;
518 de.old_chunk = e->old_chunk;
519 de.new_chunk = e->new_chunk;
520 write_exception(ps, ps->current_committed++, &de);
523 * Add the callback to the back of the array. This code
524 * is the only place where the callback array is
525 * manipulated, and we know that it will never be called
526 * multiple times concurrently.
528 cb = ps->callbacks + ps->callback_count++;
529 cb->callback = callback;
530 cb->context = callback_context;
533 * If there are no more exceptions in flight, or we have
534 * filled this metadata area we commit the exceptions to
535 * disk.
537 if (atomic_dec_and_test(&ps->pending_count) ||
538 (ps->current_committed == ps->exceptions_per_area)) {
539 r = area_io(ps, ps->current_area, WRITE);
540 if (r)
541 ps->valid = 0;
544 * Have we completely filled the current area ?
546 if (ps->current_committed == ps->exceptions_per_area) {
547 ps->current_committed = 0;
548 r = zero_area(ps, ps->current_area + 1);
549 if (r)
550 ps->valid = 0;
553 for (i = 0; i < ps->callback_count; i++) {
554 cb = ps->callbacks + i;
555 cb->callback(cb->context, r == 0 ? 1 : 0);
558 ps->callback_count = 0;
562 static void persistent_drop(struct exception_store *store)
564 struct pstore *ps = get_info(store);
566 ps->valid = 0;
567 if (write_header(ps))
568 DMWARN("write header failed");
571 int dm_create_persistent(struct exception_store *store)
573 struct pstore *ps;
575 /* allocate the pstore */
576 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
577 if (!ps)
578 return -ENOMEM;
580 ps->snap = store->snap;
581 ps->valid = 1;
582 ps->version = SNAPSHOT_DISK_VERSION;
583 ps->area = NULL;
584 ps->next_free = 2; /* skipping the header and first area */
585 ps->current_committed = 0;
587 ps->callback_count = 0;
588 atomic_set(&ps->pending_count, 0);
589 ps->callbacks = NULL;
591 store->destroy = persistent_destroy;
592 store->read_metadata = persistent_read_metadata;
593 store->prepare_exception = persistent_prepare;
594 store->commit_exception = persistent_commit;
595 store->drop_snapshot = persistent_drop;
596 store->fraction_full = persistent_fraction_full;
597 store->context = ps;
599 return 0;
602 /*-----------------------------------------------------------------
603 * Implementation of the store for non-persistent snapshots.
604 *---------------------------------------------------------------*/
605 struct transient_c {
606 sector_t next_free;
609 static void transient_destroy(struct exception_store *store)
611 kfree(store->context);
/* A transient store keeps nothing on disk, so there is nothing to read. */
static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}
619 static int transient_prepare(struct exception_store *store, struct exception *e)
621 struct transient_c *tc = (struct transient_c *) store->context;
622 sector_t size = get_dev_size(store->snap->cow->bdev);
624 if (size < (tc->next_free + store->snap->chunk_size))
625 return -1;
627 e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
628 tc->next_free += store->snap->chunk_size;
630 return 0;
/*
 * Nothing needs writing for a transient store, so the callback is
 * invoked straight away with success == 1.
 */
static void transient_commit(struct exception_store *store,
			     struct exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	const int success = 1;

	callback(callback_context, success);
}
642 static void transient_fraction_full(struct exception_store *store,
643 sector_t *numerator, sector_t *denominator)
645 *numerator = ((struct transient_c *) store->context)->next_free;
646 *denominator = get_dev_size(store->snap->cow->bdev);
649 int dm_create_transient(struct exception_store *store)
651 struct transient_c *tc;
653 store->destroy = transient_destroy;
654 store->read_metadata = transient_read_metadata;
655 store->prepare_exception = transient_prepare;
656 store->commit_exception = transient_commit;
657 store->drop_snapshot = NULL;
658 store->fraction_full = transient_fraction_full;
660 tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
661 if (!tc)
662 return -ENOMEM;
664 tc->next_free = 0;
665 store->context = tc;
667 return 0;