drivers/md/dm-raid45.c
1 /*
2 * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
4 * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
6 * This file is released under the GPL.
9 * Linux 2.6 Device Mapper RAID4 and RAID5 target.
11 * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
14 * Supports the following ATARAID vendor solutions (and SNIA DDF):
16 * Adaptec HostRAID ASR
17 * SNIA DDF1
18 * Highpoint 37x
19 * Highpoint 45x
20 * Intel IMSM
21 * Jmicron ATARAID
22 * LSI Logic MegaRAID
23 * NVidia RAID
24 * Promise FastTrack
25 * Silicon Image Medley
26 * VIA Software RAID
28 * via the dmraid application.
31 * Features:
33 * o RAID4 with dedicated and selectable parity device
34 * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
35 * o recovery of out of sync device for initial
36 * RAID set creation or after dead drive replacement
37 * o run time optimization of xor algorithm used to calculate parity
40 * Thanks to MD for:
41 * o the raid address calculation algorithm
42 * o the base of the biovec <-> page list copier.
45 * Uses region hash to keep track of how many writes are in flight to
46 * regions in order to use dirty log to keep state of regions to recover:
48 * o clean regions (those which are synchronized
49 * and don't have write io in flight)
50 * o dirty regions (those with write io in flight)
53 * On startup, any dirty regions are migrated to the
54 * 'nosync' state and are subject to recovery by the daemon.
56 * See raid_ctr() for table definition.
58 * ANALYZEME: recovery bandwidth
61 static const char *version = "v0.2597k";
63 #include "dm.h"
64 #include "dm-memcache.h"
65 #include "dm-raid45.h"
67 #include <linux/kernel.h>
68 #include <linux/vmalloc.h>
69 #include <linux/raid/xor.h>
70 #include <linux/slab.h>
71 #include <linux/module.h>
73 #include <linux/bio.h>
74 #include <linux/dm-io.h>
75 #include <linux/dm-dirty-log.h>
76 #include <linux/dm-region-hash.h>
80 * Configurable parameters
83 /* Minimum/maximum and default # of selectable stripes. */
84 #define STRIPES_MIN 8
85 #define STRIPES_MAX 16384
86 #define STRIPES_DEFAULT 80
88 /* Maximum and default chunk size in sectors if not set in constructor. */
89 #define CHUNK_SIZE_MIN 8
90 #define CHUNK_SIZE_MAX 16384
91 #define CHUNK_SIZE_DEFAULT 64
93 /* Default io size in sectors if not set in constructor. */
94 #define IO_SIZE_MIN CHUNK_SIZE_MIN
95 #define IO_SIZE_DEFAULT IO_SIZE_MIN
97 /* Recover io size default in sectors. */
98 #define RECOVER_IO_SIZE_MIN 64
99 #define RECOVER_IO_SIZE_DEFAULT 256
101 /* Default, minimum and maximum percentage of recover io bandwidth. */
102 #define BANDWIDTH_DEFAULT 10
103 #define BANDWIDTH_MIN 1
104 #define BANDWIDTH_MAX 100
106 /* # of parallel recovered regions */
107 #define RECOVERY_STRIPES_MIN 1
108 #define RECOVERY_STRIPES_MAX 64
109 #define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN
111 * END Configurable parameters
114 #define TARGET "dm-raid45"
115 #define DAEMON "kraid45d"
116 #define DM_MSG_PREFIX TARGET
118 #define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
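/*
 * Note (added for clarity): with 4 KiB pages and 512-byte sectors
 * (SECTOR_SHIFT == 9) this evaluates to 4096 >> 9 == 8.
 */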
120 /* Amount/size for __xor(). */
121 #define XOR_SIZE PAGE_SIZE
123 /* Ticks to run xor_speed() test for. */
124 #define XOR_SPEED_TICKS 5
126 /* Check value in range. */
127 #define range_ok(i, min, max) (i >= min && i <= max)
129 /* Structure access macros. */
130 /* Derive raid_set from stripe_cache pointer. */
131 #define RS(x) container_of(x, struct raid_set, sc)
133 /* Page reference. */
134 #define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
136 /* Stripe chunk reference. */
137 #define CHUNK(stripe, p) ((stripe)->chunk + p)
139 /* Bio list reference. */
140 #define BL(stripe, p, rw) (stripe->chunk[p].bl + rw)
141 #define BL_CHUNK(chunk, rw) (chunk->bl + rw)
143 /* Page list reference. */
144 #define PL(stripe, p) (stripe->obj[p].pl)
145 /* END: structure access macros. */
147 /* Factor out to dm-bio-list.h */
148 static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
150 bio->bi_next = bl->head;
151 bl->head = bio;
153 if (!bl->tail)
154 bl->tail = bio;
157 /* Factor out to dm.h */
158 #define TI_ERR_RET(str, ret) \
159 do { ti->error = str; return ret; } while (0);
160 #define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
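/*
 * Illustrative sketch (not part of the driver): range_ok() and the
 * TI_ERR*() macros are meant to be used together inside a target
 * constructor, where a "struct dm_target *ti" is in scope and the
 * function returns int.  A hypothetical parameter check (names are
 * illustrative only) could look like:
 *
 *	static int example_check_chunk_size(struct dm_target *ti,
 *					    unsigned chunk_size)
 *	{
 *		if (!range_ok(chunk_size, CHUNK_SIZE_MIN, CHUNK_SIZE_MAX))
 *			TI_ERR("Invalid chunk size");
 *
 *		return 0;
 *	}
 *
 * TI_ERR() sets ti->error to the message and returns -EINVAL from the
 * enclosing function.
 */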
162 /* Macro to define IO flags access inline functions. */
163 #define BITOPS(name, what, var, flag) \
164 static inline int TestClear ## name ## what(struct var *v) \
165 { return test_and_clear_bit(flag, &v->io.flags); } \
166 static inline int TestSet ## name ## what(struct var *v) \
167 { return test_and_set_bit(flag, &v->io.flags); } \
168 static inline void Clear ## name ## what(struct var *v) \
169 { clear_bit(flag, &v->io.flags); } \
170 static inline void Set ## name ## what(struct var *v) \
171 { set_bit(flag, &v->io.flags); } \
172 static inline int name ## what(struct var *v) \
173 { return test_bit(flag, &v->io.flags); }
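/*
 * For reference (expansion added for clarity): one invocation such as
 * BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY) below generates five
 * inline helpers operating on v->io.flags:
 *
 *	static inline int TestClearChunkDirty(struct stripe_chunk *v)
 *	{ return test_and_clear_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline int TestSetChunkDirty(struct stripe_chunk *v)
 *	{ return test_and_set_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline void ClearChunkDirty(struct stripe_chunk *v)
 *	{ clear_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline void SetChunkDirty(struct stripe_chunk *v)
 *	{ set_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline int ChunkDirty(struct stripe_chunk *v)
 *	{ return test_bit(CHUNK_DIRTY, &v->io.flags); }
 */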
175 /*-----------------------------------------------------------------
176 * Stripe cache
178 * Cache for all reads and writes to raid sets (operational or degraded)
180 * We need to run all data to and from a RAID set through this cache,
181 * because parity chunks need to get calculated from data chunks
182 * or, in the degraded/resynchronization case, missing chunks need
183 * to be reconstructed using the other chunks of the stripe.
184 *---------------------------------------------------------------*/
185 /* Unique kmem cache name suffix # counter. */
186 static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
188 /* A chunk within a stripe (holds bios hanging off). */
189 /* IO status flags for chunks of a stripe. */
190 enum chunk_flags {
191 CHUNK_DIRTY, /* Pages of chunk dirty; need writing. */
192 CHUNK_ERROR, /* IO error on any chunk page. */
193 CHUNK_IO, /* Allow/prohibit IO on chunk pages. */
194 CHUNK_LOCKED, /* Chunk pages locked during IO. */
195 CHUNK_MUST_IO, /* Chunk must io. */
196 CHUNK_UNLOCK, /* Enforce chunk unlock. */
197 CHUNK_UPTODATE, /* Chunk pages are uptodate. */
200 enum bl_type {
201 WRITE_QUEUED = WRITE + 1,
202 WRITE_MERGED,
203 NR_BL_TYPES, /* Must be last one! */
205 struct stripe_chunk {
206 atomic_t cnt; /* Reference count. */
207 struct stripe *stripe; /* Backpointer to stripe for endio(). */
208 /* Bio lists for reads, writes, and writes merged. */
209 struct bio_list bl[NR_BL_TYPES];
210 struct {
211 unsigned long flags; /* IO status flags. */
212 } io;
215 /* Define chunk bit operations. */
216 BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
217 BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR)
218 BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO)
219 BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED)
220 BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO)
221 BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK)
222 BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE)
225 * Stripe linked list indexes. Keep order, because the stripe
226 * and the stripe cache rely on the first 3!
228 enum list_types {
229 LIST_FLUSH, /* Stripes to flush for io. */
230 LIST_ENDIO, /* Stripes to endio. */
231 LIST_LRU, /* Least recently used stripes. */
232 SC_NR_LISTS, /* # of lists in stripe cache. */
233 LIST_HASH = SC_NR_LISTS, /* Hashed stripes. */
234 LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
235 STRIPE_NR_LISTS,/* To size array in struct stripe. */
238 /* Addressing region recovery. */
239 struct recover_addr {
240 struct dm_region *reg; /* Actual region to recover. */
241 sector_t pos; /* Position within region to recover. */
242 sector_t end; /* End of region to recover. */
245 /* A stripe: the io object to handle all reads and writes to a RAID set. */
246 struct stripe {
247 atomic_t cnt; /* Reference count. */
248 struct stripe_cache *sc; /* Backpointer to stripe cache. */
251 * 4 linked lists:
252 * o io list to flush io
253 * o endio list
254 * o LRU list to put stripes w/o reference count on
255 * o stripe cache hash
257 struct list_head lists[STRIPE_NR_LISTS];
259 sector_t key; /* Hash key. */
260 region_t region; /* Region stripe is mapped to. */
262 struct {
263 unsigned long flags; /* Stripe state flags (see below). */
266 * Pending ios in flight:
268 * used to control move of stripe to endio list
270 atomic_t pending;
272 /* Sectors to read and write for multi page stripe sets. */
273 unsigned size;
274 } io;
276 /* Address region recovery. */
277 struct recover_addr *recover;
279 /* Lock on stripe (Future: for clustering). */
280 void *lock;
282 struct {
283 unsigned short parity; /* Parity chunk index. */
284 short recover; /* Recovery chunk index. */
285 } idx;
288 * This stripe's memory cache object (dm-mem-cache);
289 * i.e. the io chunk pages.
291 struct dm_mem_cache_object *obj;
293 /* Array of stripe sets (dynamically allocated). */
294 struct stripe_chunk chunk[0];
297 /* States stripes can be in (flags field). */
298 enum stripe_states {
299 STRIPE_ERROR, /* io error on stripe. */
300 STRIPE_MERGED, /* Writes got merged to be written. */
301 STRIPE_RBW, /* Read-before-write stripe. */
302 STRIPE_RECONSTRUCT, /* Reconstruct of a missing chunk required. */
303 STRIPE_RECONSTRUCTED, /* Missing chunk reconstructed. */
304 STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
307 /* Define stripe bit operations. */
308 BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
309 BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
310 BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
311 BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
312 BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
313 BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
315 /* A stripe hash. */
316 struct stripe_hash {
317 struct list_head *hash;
318 unsigned buckets;
319 unsigned mask;
320 unsigned prime;
321 unsigned shift;
324 enum sc_lock_types {
325 LOCK_ENDIO, /* Protect endio list. */
326 NR_LOCKS, /* To size array in struct stripe_cache. */
329 /* A stripe cache. */
330 struct stripe_cache {
331 /* Stripe hash. */
332 struct stripe_hash hash;
334 spinlock_t locks[NR_LOCKS]; /* Locks to protect lists. */
336 /* Stripes with io to flush, stripes to endio and LRU lists. */
337 struct list_head lists[SC_NR_LISTS];
339 /* Slab cache to allocate stripes from. */
340 struct {
341 struct kmem_cache *cache; /* Cache itself. */
342 char name[32]; /* Unique name. */
343 } kc;
345 struct dm_io_client *dm_io_client; /* dm-io client resource context. */
347 /* dm-mem-cache client resource context. */
348 struct dm_mem_cache_client *mem_cache_client;
350 int stripes_parm; /* # stripes parameter from constructor. */
351 atomic_t stripes; /* actual # of stripes in cache. */
352 atomic_t stripes_to_set; /* # of stripes to resize cache to. */
353 atomic_t stripes_last; /* last # of stripes in cache. */
354 atomic_t active_stripes; /* actual # of active stripes in cache. */
356 /* REMOVEME: */
357 atomic_t active_stripes_max; /* actual # of active stripes in cache. */
360 /* Flag specs for raid_dev. */
361 enum raid_dev_flags {
362 DEV_FAILED, /* Device failed. */
363 DEV_IO_QUEUED, /* Io got queued to device. */
366 /* The raid device in a set. */
367 struct raid_dev {
368 struct dm_dev *dev;
369 sector_t start; /* Offset to map to. */
370 struct { /* Using struct to be able to BITOPS(). */
371 unsigned long flags; /* raid_dev_flags. */
372 } io;
375 BITOPS(Dev, Failed, raid_dev, DEV_FAILED)
376 BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
378 /* Flags spec for raid_set. */
379 enum raid_set_flags {
380 RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
381 RS_DEAD, /* RAID set inoperative. */
382 RS_DEAD_ENDIO_MESSAGE, /* RAID set dead endio one-off message. */
383 RS_DEGRADED, /* Io errors on RAID device. */
384 RS_DEVEL_STATS, /* REMOVEME: display status information. */
385 RS_ENFORCE_PARITY_CREATION,/* Enforce parity creation. */
386 RS_PROHIBIT_WRITES, /* Prohibit writes on device failure. */
387 RS_RECOVER, /* Do recovery. */
388 RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
389 RS_SC_BUSY, /* Stripe cache busy -> send an event. */
390 RS_SUSPEND, /* Suspend RAID set. */
393 /* REMOVEME: devel stats counters. */
394 enum stats_types {
395 S_BIOS_READ,
396 S_BIOS_ADDED_READ,
397 S_BIOS_ENDIO_READ,
398 S_BIOS_WRITE,
399 S_BIOS_ADDED_WRITE,
400 S_BIOS_ENDIO_WRITE,
401 S_CAN_MERGE,
402 S_CANT_MERGE,
403 S_CONGESTED,
404 S_DM_IO_READ,
405 S_DM_IO_WRITE,
406 S_BANDWIDTH,
407 S_BARRIER,
408 S_BIO_COPY_PL_NEXT,
409 S_DEGRADED,
410 S_DELAYED_BIOS,
411 S_FLUSHS,
412 S_HITS_1ST,
413 S_IOS_POST,
414 S_INSCACHE,
415 S_MAX_LOOKUP,
416 S_CHUNK_LOCKED,
417 S_NO_BANDWIDTH,
418 S_NOT_CONGESTED,
419 S_NO_RW,
420 S_NOSYNC,
421 S_OVERWRITE,
422 S_PROHIBITCHUNKIO,
423 S_RECONSTRUCT_EI,
424 S_RECONSTRUCT_DEV,
425 S_RECONSTRUCT_SET,
426 S_RECONSTRUCTED,
427 S_REQUEUE,
428 S_STRIPE_ERROR,
429 S_SUM_DELAYED_BIOS,
430 S_XORS,
431 S_NR_STATS, /* # of stats counters. Must be last! */
434 /* Status type -> string mappings. */
435 struct stats_map {
436 const enum stats_types type;
437 const char *str;
440 static struct stats_map stats_map[] = {
441 { S_BIOS_READ, "r=" },
442 { S_BIOS_ADDED_READ, "/" },
443 { S_BIOS_ENDIO_READ, "/" },
444 { S_BIOS_WRITE, " w=" },
445 { S_BIOS_ADDED_WRITE, "/" },
446 { S_BIOS_ENDIO_WRITE, "/" },
447 { S_DM_IO_READ, " rc=" },
448 { S_DM_IO_WRITE, " wc=" },
449 { S_BANDWIDTH, "\nbw=" },
450 { S_NO_BANDWIDTH, " no_bw=" },
451 { S_BARRIER, "\nbarrier=" },
452 { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
453 { S_CAN_MERGE, "\nmerge=" },
454 { S_CANT_MERGE, "/no_merge=" },
455 { S_CHUNK_LOCKED, "\nchunk_locked=" },
456 { S_CONGESTED, "\ncgst=" },
457 { S_NOT_CONGESTED, "/not_cgst=" },
458 { S_DEGRADED, "\ndegraded=" },
459 { S_DELAYED_BIOS, "\ndel_bios=" },
460 { S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
461 { S_FLUSHS, "\nflushs=" },
462 { S_HITS_1ST, "\nhits_1st=" },
463 { S_IOS_POST, " ios_post=" },
464 { S_INSCACHE, " inscache=" },
465 { S_MAX_LOOKUP, " maxlookup=" },
466 { S_NO_RW, "\nno_rw=" },
467 { S_NOSYNC, " nosync=" },
468 { S_OVERWRITE, " ovr=" },
469 { S_PROHIBITCHUNKIO, " prhbt_io=" },
470 { S_RECONSTRUCT_EI, "\nrec_ei=" },
471 { S_RECONSTRUCT_DEV, " rec_dev=" },
472 { S_RECONSTRUCT_SET, " rec_set=" },
473 { S_RECONSTRUCTED, " rec=" },
474 { S_REQUEUE, " requeue=" },
475 { S_STRIPE_ERROR, " stripe_err=" },
476 { S_XORS, " xors=" },
480 * A RAID set.
482 #define dm_rh_client dm_region_hash
483 enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
484 typedef void (*xor_function_t)(unsigned count, unsigned long **data);
485 struct raid_set {
486 struct dm_target *ti; /* Target pointer. */
488 struct {
489 unsigned long flags; /* State flags. */
490 struct mutex in_lock; /* Protects central input list below. */
491 struct mutex xor_lock; /* Protects xor algorithm set. */
492 struct bio_list in; /* Pending ios (central input list). */
493 struct bio_list work; /* ios work set. */
494 wait_queue_head_t suspendq; /* suspend synchronization. */
495 atomic_t in_process; /* counter of queued bios (suspendq). */
496 atomic_t in_process_max;/* counter of queued bios max. */
498 /* io work. */
499 struct workqueue_struct *wq;
500 struct delayed_work dws_do_raid; /* For main worker. */
501 struct work_struct ws_do_table_event; /* For event worker. */
502 } io;
504 /* Stripe locking abstraction. */
505 struct dm_raid45_locking_type *locking;
507 struct stripe_cache sc; /* Stripe cache for this set. */
509 /* Xor optimization. */
510 struct {
511 struct xor_func *f;
512 unsigned chunks;
513 unsigned speed;
514 } xor;
516 /* Recovery parameters. */
517 struct recover {
518 struct dm_dirty_log *dl; /* Dirty log. */
519 struct dm_rh_client *rh; /* Region hash. */
521 struct dm_io_client *dm_io_client; /* recovery dm-io client. */
522 /* dm-mem-cache client resource context for recovery stripes. */
523 struct dm_mem_cache_client *mem_cache_client;
525 struct list_head stripes; /* List of recovery stripes. */
527 region_t nr_regions;
528 region_t nr_regions_to_recover;
529 region_t nr_regions_recovered;
530 unsigned long start_jiffies;
531 unsigned long end_jiffies;
533 unsigned bandwidth; /* Recovery bandwidth [%]. */
534 unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
535 unsigned bandwidth_parm; /* " constructor parm. */
536 unsigned io_size; /* recovery io size <= region size. */
537 unsigned io_size_parm; /* recovery io size ctr parameter. */
538 unsigned recovery; /* Recovery allowed/prohibited. */
539 unsigned recovery_stripes; /* # of parallel recovery stripes. */
541 /* recovery io throttling. */
542 atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
543 unsigned long last_jiffies;
544 } recover;
546 /* RAID set parameters. */
547 struct {
548 struct raid_type *raid_type; /* RAID type (eg, RAID4). */
549 unsigned raid_parms; /* # variable raid parameters. */
551 unsigned chunk_size; /* Sectors per chunk. */
552 unsigned chunk_size_parm;
553 unsigned chunk_shift; /* rsector chunk size shift. */
555 unsigned io_size; /* Sectors per io. */
556 unsigned io_size_parm;
557 unsigned io_mask; /* Mask for bio_copy_page_list(). */
558 unsigned io_inv_mask; /* Mask for raid_address(). */
560 sector_t sectors_per_dev; /* Sectors per device. */
562 atomic_t failed_devs; /* Amount of devices failed. */
564 /* Index of device to initialize. */
565 int dev_to_init;
566 int dev_to_init_parm;
568 /* Raid devices dynamically allocated. */
569 unsigned raid_devs; /* # of RAID devices below. */
570 unsigned data_devs; /* # of RAID data devices. */
572 int ei; /* index of failed RAID device. */
574 /* Index of dedicated parity device (i.e. RAID4). */
575 int pi;
576 int pi_parm; /* constructor parm for status output. */
577 } set;
579 /* REMOVEME: devel stats counters. */
580 atomic_t stats[S_NR_STATS];
582 /* Dynamically allocated temporary pointers for xor(). */
583 unsigned long **data;
585 /* Dynamically allocated RAID devices. Alignment? */
586 struct raid_dev dev[0];
589 /* Define RAID set bit operations. */
590 BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
591 BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
592 BITOPS(RS, Dead, raid_set, RS_DEAD)
593 BITOPS(RS, DeadEndioMessage, raid_set, RS_DEAD_ENDIO_MESSAGE)
594 BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
595 BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
596 BITOPS(RS, EnforceParityCreation, raid_set, RS_ENFORCE_PARITY_CREATION)
597 BITOPS(RS, ProhibitWrites, raid_set, RS_PROHIBIT_WRITES)
598 BITOPS(RS, Recover, raid_set, RS_RECOVER)
599 BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
600 BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
601 #undef BITOPS
603 /*-----------------------------------------------------------------
604 * Raid-4/5 set structures.
605 *---------------------------------------------------------------*/
606 /* RAID level definitions. */
607 enum raid_level {
608 raid4,
609 raid5,
612 /* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
613 enum raid_algorithm {
614 none,
615 left_asym,
616 right_asym,
617 left_sym,
618 right_sym,
621 struct raid_type {
622 const char *name; /* RAID algorithm. */
623 const char *descr; /* Descriptor text for logging. */
624 const unsigned parity_devs; /* # of parity devices. */
625 const unsigned minimal_devs; /* minimal # of devices in set. */
626 const enum raid_level level; /* RAID level. */
627 const enum raid_algorithm algorithm; /* RAID algorithm. */
630 /* Supported raid types and properties. */
631 static struct raid_type raid_types[] = {
632 {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
633 {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
634 {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
635 {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
636 {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
639 /* Address as calculated by raid_address(). */
640 struct raid_address {
641 sector_t key; /* Hash key (address of stripe % chunk_size). */
642 unsigned di, pi; /* Data and parity disks index. */
645 /* REMOVEME: reset statistics counters. */
646 static void stats_reset(struct raid_set *rs)
648 unsigned s = S_NR_STATS;
650 while (s--)
651 atomic_set(rs->stats + s, 0);
654 /*----------------------------------------------------------------
655 * RAID set management routines.
656 *--------------------------------------------------------------*/
658 * Begin small helper functions.
660 /* No need to be called from region hash indirectly at dm_rh_dec(). */
661 static void wake_dummy(void *context) {}
663 /* Return # of io reference. */
664 static int io_ref(struct raid_set *rs)
666 return atomic_read(&rs->io.in_process);
669 /* Get an io reference. */
670 static void io_get(struct raid_set *rs)
672 int p = atomic_inc_return(&rs->io.in_process);
674 if (p > atomic_read(&rs->io.in_process_max))
675 atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
678 /* Put the io reference and conditionally wake io waiters. */
679 static void io_put(struct raid_set *rs)
681 /* Intel: rebuild data corrupter? */
682 if (atomic_dec_and_test(&rs->io.in_process))
683 wake_up(&rs->io.suspendq);
684 else
685 BUG_ON(io_ref(rs) < 0);
688 /* Wait until all io has been processed. */
689 static void wait_ios(struct raid_set *rs)
691 wait_event(rs->io.suspendq, !io_ref(rs));
694 /* Queue (optionally delayed) io work. */
695 static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
697 queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
700 /* Queue io work immediately (called from region hash too). */
701 static void wake_do_raid(void *context)
703 struct raid_set *rs = context;
705 queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
708 /* Calculate device sector offset. */
709 static sector_t _sector(struct raid_set *rs, struct bio *bio)
711 sector_t sector = bio->bi_sector;
713 sector_div(sector, rs->set.data_devs);
714 return sector;
717 /* Return # of active stripes in stripe cache. */
718 static int sc_active(struct stripe_cache *sc)
720 return atomic_read(&sc->active_stripes);
723 /* Stripe cache busy indicator. */
724 static int sc_busy(struct raid_set *rs)
726 return sc_active(&rs->sc) >
727 atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
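/*
 * Example (added for clarity): with the default STRIPES_DEFAULT (80)
 * stripes in the cache, sc_busy() reports busy once more than
 * 80 - STRIPES_MIN / 2 = 76 stripes are active, i.e. as soon as fewer
 * than STRIPES_MIN / 2 inactive stripes remain.
 */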
730 /* Set chunks states. */
731 enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
732 static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
734 switch (type) {
735 case CLEAN:
736 ClearChunkDirty(chunk);
737 break;
738 case DIRTY:
739 SetChunkDirty(chunk);
740 break;
741 case ERROR:
742 SetChunkError(chunk);
743 SetStripeError(chunk->stripe);
744 return;
745 default:
746 BUG();
749 SetChunkUptodate(chunk);
750 SetChunkIo(chunk);
751 ClearChunkError(chunk);
754 /* Return region state for a sector. */
755 static int region_state(struct raid_set *rs, sector_t sector,
756 enum dm_rh_region_states state)
758 struct dm_rh_client *rh = rs->recover.rh;
759 region_t region = dm_rh_sector_to_region(rh, sector);
761 return !!(dm_rh_get_state(rh, region, 1) & state);
765 * Return true in case a chunk should be read/written
767 * Conditions to read/write:
768 * o chunk not uptodate
769 * o chunk dirty
771 * Conditions to avoid io:
772 * o io already ongoing on chunk
773 * o io explicitly prohibited
775 static int chunk_io(struct stripe_chunk *chunk)
777 /* 2nd run optimization (flag set below on first run). */
778 if (TestClearChunkMustIo(chunk))
779 return 1;
781 /* Avoid io if prohibited or a locked chunk. */
782 if (!ChunkIo(chunk) || ChunkLocked(chunk))
783 return 0;
785 if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
786 SetChunkMustIo(chunk); /* 2nd run optimization. */
787 return 1;
790 return 0;
793 /* Call a function on each chunk needing io unless device failed. */
794 static unsigned for_each_io_dev(struct stripe *stripe,
795 void (*f_io)(struct stripe *stripe, unsigned p))
797 struct raid_set *rs = RS(stripe->sc);
798 unsigned p, r = 0;
800 for (p = 0; p < rs->set.raid_devs; p++) {
801 if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) {
802 f_io(stripe, p);
803 r++;
807 return r;
811 * Index of device to calculate parity on.
813 * Either the parity device index *or* the selected
814 * device to init after a spare replacement.
816 static int dev_for_parity(struct stripe *stripe, int *sync)
818 struct raid_set *rs = RS(stripe->sc);
819 int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING);
821 *sync = !r;
823 /* Reconstruct a particular device? */
824 if (r && rs->set.dev_to_init > -1)
825 return rs->set.dev_to_init;
826 else if (rs->set.raid_type->level == raid4)
827 return rs->set.pi;
828 else if (!StripeRecover(stripe))
829 return stripe->idx.parity;
830 else
831 return -1;
834 /* RAID set congested function. */
835 static int rs_congested(void *congested_data, int bdi_bits)
837 int r;
838 unsigned p;
839 struct raid_set *rs = congested_data;
841 if (sc_busy(rs) || RSSuspend(rs) || RSProhibitWrites(rs))
842 r = 1;
843 else for (r = 0, p = rs->set.raid_devs; !r && p--; ) {
844 /* If any of our component devices are overloaded. */
845 struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
847 r |= bdi_congested(&q->backing_dev_info, bdi_bits);
850 /* REMOVEME: statistics. */
851 atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
852 return r;
855 /* RAID device degrade check. */
856 static void rs_check_degrade_dev(struct raid_set *rs,
857 struct stripe *stripe, unsigned p)
859 if (TestSetDevFailed(rs->dev + p))
860 return;
862 /* Throw an event in case of member device errors. */
863 if ((atomic_inc_return(&rs->set.failed_devs) >
864 rs->set.raid_type->parity_devs) &&
865 !TestSetRSDead(rs)) {
866 /* Display RAID set dead message once. */
867 unsigned p;
868 char buf[BDEVNAME_SIZE];
870 DMERR("FATAL: too many devices failed -> RAID set broken");
871 for (p = 0; p < rs->set.raid_devs; p++) {
872 if (DevFailed(rs->dev + p))
873 DMERR("device /dev/%s failed",
874 bdevname(rs->dev[p].dev->bdev, buf));
878 /* Only log the first member error. */
879 if (!TestSetRSDegraded(rs)) {
880 char buf[BDEVNAME_SIZE];
882 /* Store index for recovery. */
883 rs->set.ei = p;
884 DMERR("CRITICAL: %sio error on device /dev/%s "
885 "in region=%llu; DEGRADING RAID set\n",
886 stripe ? "" : "FAKED ",
887 bdevname(rs->dev[p].dev->bdev, buf),
888 (unsigned long long) (stripe ? stripe->key : 0));
889 DMERR("further device error messages suppressed");
892 /* Prohibit further writes to allow userspace to update metadata. */
893 SetRSProhibitWrites(rs);
894 schedule_work(&rs->io.ws_do_table_event);
897 /* RAID set degrade check. */
898 static void rs_check_degrade(struct stripe *stripe)
900 struct raid_set *rs = RS(stripe->sc);
901 unsigned p = rs->set.raid_devs;
903 while (p--) {
904 if (ChunkError(CHUNK(stripe, p)))
905 rs_check_degrade_dev(rs, stripe, p);
909 /* Lookup a RAID device by name or by major:minor number. */
910 static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
912 unsigned p;
913 struct raid_dev *dev;
916 * Must be an incremental loop, because the device array
917 * can have empty slots still on calls from raid_ctr()
919 for (dev = rs->dev, p = 0;
920 dev->dev && p < rs->set.raid_devs;
921 dev++, p++) {
922 if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
923 return p;
926 return -ENODEV;
929 * End small helper functions.
933 * Stripe hash functions
935 /* Initialize/destroy stripe hash. */
936 static int hash_init(struct stripe_hash *hash, unsigned stripes)
938 unsigned buckets = roundup_pow_of_two(stripes >> 1);
939 static unsigned hash_primes[] = {
940 /* Table of primes for hash_fn/table size optimization. */
941 1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
942 1543, 3079, 6151, 12289, 24593, 49157, 98317,
945 /* Allocate stripe hash buckets. */
946 hash->hash = vmalloc(buckets * sizeof(*hash->hash));
947 if (!hash->hash)
948 return -ENOMEM;
950 hash->buckets = buckets;
951 hash->mask = buckets - 1;
952 hash->shift = ffs(buckets);
953 if (hash->shift > ARRAY_SIZE(hash_primes))
954 hash->shift = ARRAY_SIZE(hash_primes) - 1;
956 BUG_ON(hash->shift < 2);
957 hash->prime = hash_primes[hash->shift];
959 /* Initialize buckets. */
960 while (buckets--)
961 INIT_LIST_HEAD(hash->hash + buckets);
962 return 0;
965 static void hash_exit(struct stripe_hash *hash)
967 if (hash->hash) {
968 vfree(hash->hash);
969 hash->hash = NULL;
973 static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
975 return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
978 static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
980 return hash->hash + hash_fn(hash, key);
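/*
 * Worked example (added for clarity; stripe count assumed): for a cache
 * of 1024 stripes, hash_init() allocates roundup_pow_of_two(1024 >> 1)
 * = 512 buckets, so mask = 511, shift = ffs(512) = 10 and prime =
 * hash_primes[10] = 769.  A stripe key then maps to bucket
 * ((key * 769) >> 10) & 511.
 */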
983 /* Insert an entry into a hash. */
984 static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
986 list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
989 /* Lookup an entry in the stripe hash. */
990 static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
992 unsigned look = 0;
993 struct stripe *stripe;
994 struct list_head *bucket = hash_bucket(&sc->hash, key);
996 list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
997 look++;
999 if (stripe->key == key) {
1000 /* REMOVEME: statistics. */
1001 if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
1002 atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);
1003 return stripe;
1007 return NULL;
1010 /* Resize the stripe cache hash on size changes. */
1011 static int sc_hash_resize(struct stripe_cache *sc)
1013 /* Resize indicated? */
1014 if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
1015 int r;
1016 struct stripe_hash hash;
1018 r = hash_init(&hash, atomic_read(&sc->stripes));
1019 if (r)
1020 return r;
1022 if (sc->hash.hash) {
1023 unsigned b = sc->hash.buckets;
1024 struct list_head *pos, *tmp;
1026 /* Walk old buckets and insert into new. */
1027 while (b--) {
1028 list_for_each_safe(pos, tmp, sc->hash.hash + b)
1029 stripe_insert(&hash,
1030 list_entry(pos, struct stripe,
1031 lists[LIST_HASH]));
1036 hash_exit(&sc->hash);
1037 memcpy(&sc->hash, &hash, sizeof(sc->hash));
1038 atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
1041 return 0;
1043 /* End stripe hash functions. */
1045 /* List add, delete, push and pop functions. */
1046 /* Add stripe to flush list. */
1047 #define DEL_LIST(lh) \
1048 if (!list_empty(lh)) \
1049 list_del_init(lh);
1051 /* Delete stripe from hash. */
1052 static void stripe_hash_del(struct stripe *stripe)
1054 DEL_LIST(stripe->lists + LIST_HASH);
1057 /* Return stripe reference count. */
1058 static inline int stripe_ref(struct stripe *stripe)
1060 return atomic_read(&stripe->cnt);
1063 static void stripe_flush_add(struct stripe *stripe)
1065 struct stripe_cache *sc = stripe->sc;
1066 struct list_head *lh = stripe->lists + LIST_FLUSH;
1068 if (!StripeReconstruct(stripe) && list_empty(lh))
1069 list_add_tail(lh, sc->lists + LIST_FLUSH);
1073 * Add stripe to LRU (inactive) list.
1075 * Need lock, because of concurrent access from message interface.
1077 static void stripe_lru_add(struct stripe *stripe)
1079 if (!StripeRecover(stripe)) {
1080 struct list_head *lh = stripe->lists + LIST_LRU;
1082 if (list_empty(lh))
1083 list_add_tail(lh, stripe->sc->lists + LIST_LRU);
1087 #define POP_LIST(list) \
1088 do { \
1089 if (list_empty(sc->lists + (list))) \
1090 stripe = NULL; \
1091 else { \
1092 stripe = list_first_entry(sc->lists + (list), \
1093 struct stripe, \
1094 lists[(list)]); \
1095 list_del_init(stripe->lists + (list)); \
1097 } while (0);
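/*
 * For reference (expansion added for clarity): POP_LIST() deliberately
 * relies on "sc" and "stripe" being defined in the calling scope, so
 * stripe_lru_pop() below effectively expands to:
 *
 *	if (list_empty(sc->lists + LIST_LRU))
 *		stripe = NULL;
 *	else {
 *		stripe = list_first_entry(sc->lists + LIST_LRU,
 *					  struct stripe, lists[LIST_LRU]);
 *		list_del_init(stripe->lists + LIST_LRU);
 *	}
 */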
1099 /* Pop an available stripe off the LRU list. */
1100 static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
1102 struct stripe *stripe;
1104 POP_LIST(LIST_LRU);
1105 return stripe;
1108 /* Pop an available stripe off the io list. */
1109 static struct stripe *stripe_io_pop(struct stripe_cache *sc)
1111 struct stripe *stripe;
1113 POP_LIST(LIST_FLUSH);
1114 return stripe;
1117 /* Push a stripe safely onto the endio list to be handled by do_endios(). */
1118 static void stripe_endio_push(struct stripe *stripe)
1120 unsigned long flags;
1121 struct stripe_cache *sc = stripe->sc;
1122 struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
1123 *sc_list = sc->lists + LIST_ENDIO;
1124 spinlock_t *lock = sc->locks + LOCK_ENDIO;
1126 /* This runs in parallel with do_endios(). */
1127 spin_lock_irqsave(lock, flags);
1128 if (list_empty(stripe_list))
1129 list_add_tail(stripe_list, sc_list);
1130 spin_unlock_irqrestore(lock, flags);
1132 wake_do_raid(RS(sc)); /* Wake myself. */
1135 /* Safely pop a stripe off the endio list. */
1136 static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
1138 struct stripe *stripe;
1139 spinlock_t *lock = sc->locks + LOCK_ENDIO;
1141 /* This runs in parallel with endio(). */
1142 spin_lock_irq(lock);
1143 POP_LIST(LIST_ENDIO)
1144 spin_unlock_irq(lock);
1145 return stripe;
1147 #undef POP_LIST
1150 * Stripe cache locking functions
1152 /* Dummy lock function for single host RAID4+5. */
1153 static void *no_lock(sector_t key, enum dm_lock_type type)
1155 return &no_lock;
1158 /* Dummy unlock function for single host RAID4+5. */
1159 static void no_unlock(void *lock_handle)
1163 /* No locking (for single host RAID 4+5). */
1164 static struct dm_raid45_locking_type locking_none = {
1165 .lock = no_lock,
1166 .unlock = no_unlock,
1169 /* Lock a stripe (for clustering). */
1170 static int
1171 stripe_lock(struct stripe *stripe, int rw, sector_t key)
1173 stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ? DM_RAID45_SHARED : DM_RAID45_EX);
1174 return stripe->lock ? 0 : -EPERM;
1177 /* Unlock a stripe (for clustering). */
1178 static void stripe_unlock(struct stripe *stripe)
1180 RS(stripe->sc)->locking->unlock(stripe->lock);
1181 stripe->lock = NULL;
1184 /* Test io pending on stripe. */
1185 static int stripe_io_ref(struct stripe *stripe)
1187 return atomic_read(&stripe->io.pending);
1190 static void stripe_io_get(struct stripe *stripe)
1192 if (atomic_inc_return(&stripe->io.pending) == 1)
1193 /* REMOVEME: statistics */
1194 atomic_inc(&stripe->sc->active_stripes);
1195 else
1196 BUG_ON(stripe_io_ref(stripe) < 0);
1199 static void stripe_io_put(struct stripe *stripe)
1201 if (atomic_dec_and_test(&stripe->io.pending)) {
1202 if (unlikely(StripeRecover(stripe)))
1203 /* Don't put recovery stripe on endio list. */
1204 wake_do_raid(RS(stripe->sc));
1205 else
1206 /* Add regular stripe to endio list and wake daemon. */
1207 stripe_endio_push(stripe);
1209 /* REMOVEME: statistics */
1210 atomic_dec(&stripe->sc->active_stripes);
1211 } else
1212 BUG_ON(stripe_io_ref(stripe) < 0);
1215 /* Take stripe reference out. */
1216 static int stripe_get(struct stripe *stripe)
1218 int r;
1219 struct list_head *lh = stripe->lists + LIST_LRU;
1221 /* Delete stripe from LRU (inactive) list if on. */
1222 DEL_LIST(lh);
1223 BUG_ON(stripe_ref(stripe) < 0);
1225 /* Lock stripe on first reference */
1226 r = (atomic_inc_return(&stripe->cnt) == 1) ?
1227 stripe_lock(stripe, WRITE, stripe->key) : 0;
1229 return r;
1231 #undef DEL_LIST
1233 /* Return references on a chunk. */
1234 static int chunk_ref(struct stripe_chunk *chunk)
1236 return atomic_read(&chunk->cnt);
1239 /* Take out reference on a chunk. */
1240 static int chunk_get(struct stripe_chunk *chunk)
1242 return atomic_inc_return(&chunk->cnt);
1245 /* Drop reference on a chunk. */
1246 static void chunk_put(struct stripe_chunk *chunk)
1248 BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
1252 * Drop reference on a stripe.
1254 * Move it to list of LRU stripes if zero.
1256 static void stripe_put(struct stripe *stripe)
1258 if (atomic_dec_and_test(&stripe->cnt)) {
1259 BUG_ON(stripe_io_ref(stripe));
1260 stripe_unlock(stripe);
1261 } else
1262 BUG_ON(stripe_ref(stripe) < 0);
1265 /* Helper needed by for_each_io_dev(). */
1266 static void stripe_get_references(struct stripe *stripe, unsigned p)
1270 * Another one to reference the stripe in
1271 * order to protect vs. LRU list moves.
1273 io_get(RS(stripe->sc)); /* Global io references. */
1274 stripe_get(stripe);
1275 stripe_io_get(stripe); /* One for each chunk io. */
1278 /* Helper for endio() to put all taken references. */
1279 static void stripe_put_references(struct stripe *stripe)
1281 stripe_io_put(stripe); /* One for each chunk io. */
1282 stripe_put(stripe);
1283 io_put(RS(stripe->sc));
1287 * Stripe cache functions.
1290 * Invalidate all chunks (i.e. their pages) of a stripe.
1292 * I only keep state for the whole chunk.
1294 static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
1296 chunk->io.flags = 0;
1299 static void
1300 stripe_chunks_invalidate(struct stripe *stripe)
1302 unsigned p = RS(stripe->sc)->set.raid_devs;
1304 while (p--)
1305 stripe_chunk_invalidate(CHUNK(stripe, p));
1308 /* Prepare stripe for (re)use. */
1309 static void stripe_invalidate(struct stripe *stripe)
1311 stripe->io.flags = 0;
1312 stripe->idx.parity = stripe->idx.recover = -1;
1313 stripe_chunks_invalidate(stripe);
1317 * Allow io on all chunks of a stripe.
1318 * If not set, IO will not occur; i.e. it's prohibited.
1320 * Actual IO submission for allowed chunks depends
1321 * on their !uptodate or dirty state.
1323 static void stripe_allow_io(struct stripe *stripe)
1325 unsigned p = RS(stripe->sc)->set.raid_devs;
1327 while (p--)
1328 SetChunkIo(CHUNK(stripe, p));
1331 /* Initialize a stripe. */
1332 static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
1334 unsigned i, p = RS(sc)->set.raid_devs;
1336 /* Work all io chunks. */
1337 while (p--) {
1338 struct stripe_chunk *chunk = CHUNK(stripe, p);
1340 atomic_set(&chunk->cnt, 0);
1341 chunk->stripe = stripe;
1342 i = ARRAY_SIZE(chunk->bl);
1343 while (i--)
1344 bio_list_init(chunk->bl + i);
1347 stripe->sc = sc;
1349 i = ARRAY_SIZE(stripe->lists);
1350 while (i--)
1351 INIT_LIST_HEAD(stripe->lists + i);
1353 stripe->io.size = RS(sc)->set.io_size;
1354 atomic_set(&stripe->cnt, 0);
1355 atomic_set(&stripe->io.pending, 0);
1356 stripe_invalidate(stripe);
1359 /* Number of pages per chunk. */
1360 static inline unsigned chunk_pages(unsigned sectors)
1362 return dm_div_up(sectors, SECTORS_PER_PAGE);
1365 /* Number of pages per stripe. */
1366 static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
1368 return chunk_pages(io_size) * rs->set.raid_devs;
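/*
 * Example (added for clarity): with the default io size of
 * IO_SIZE_DEFAULT = 8 sectors and 4 KiB pages (SECTORS_PER_PAGE = 8),
 * chunk_pages() yields dm_div_up(8, 8) = 1 page per chunk, so
 * stripe_pages() needs one page per RAID device in the set.
 */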
1371 /* Initialize part of page_list (recovery). */
1372 static void stripe_zero_pl_part(struct stripe *stripe, int p,
1373 unsigned start, unsigned count)
1375 unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
1376 /* Get offset into the page_list. */
1377 struct page_list *pl = pl_elem(PL(stripe, p), o);
1379 BUG_ON(!pl);
1380 while (pl && pages--) {
1381 BUG_ON(!pl->page);
1382 memset(page_address(pl->page), 0, PAGE_SIZE);
1383 pl = pl->next;
1387 /* Initialize parity chunk of stripe. */
1388 static void stripe_zero_chunk(struct stripe *stripe, int p)
1390 if (p > -1)
1391 stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
1394 /* Return dynamic stripe structure size. */
1395 static size_t stripe_size(struct raid_set *rs)
1397 return sizeof(struct stripe) +
1398 rs->set.raid_devs * sizeof(struct stripe_chunk);
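/*
 * Note (added for clarity): struct stripe ends in the flexible array
 * member "struct stripe_chunk chunk[0]", so stripe_size() sizes the
 * slab objects to hold one stripe_chunk per RAID device.  For a
 * 5-device set that is sizeof(struct stripe) +
 * 5 * sizeof(struct stripe_chunk) bytes, and CHUNK(stripe, p) indexes
 * into that trailing array.
 */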
1401 /* Allocate a stripe and its memory object. */
1402 /* XXX adjust to cope with stripe cache and recovery stripe caches. */
1403 enum grow { SC_GROW, SC_KEEP };
1404 static struct stripe *stripe_alloc(struct stripe_cache *sc,
1405 struct dm_mem_cache_client *mc,
1406 enum grow grow)
1408 int r;
1409 struct stripe *stripe;
1411 stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
1412 if (stripe) {
1413 /* Grow the dm-mem-cache by one object. */
1414 if (grow == SC_GROW) {
1415 r = dm_mem_cache_grow(mc, 1);
1416 if (r)
1417 goto err_free;
1420 stripe->obj = dm_mem_cache_alloc(mc);
1421 if (IS_ERR(stripe->obj))
1422 goto err_shrink;
1424 stripe_init(sc, stripe);
1427 return stripe;
1429 err_shrink:
1430 if (grow == SC_GROW)
1431 dm_mem_cache_shrink(mc, 1);
1432 err_free:
1433 kmem_cache_free(sc->kc.cache, stripe);
1434 return NULL;
1438 * Free a stripes memory object, shrink the
1439 * memory cache and free the stripe itself.
1441 static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
1443 dm_mem_cache_free(mc, stripe->obj);
1444 dm_mem_cache_shrink(mc, 1);
1445 kmem_cache_free(stripe->sc->kc.cache, stripe);
1448 /* Free the recovery stripe. */
1449 static void stripe_recover_free(struct raid_set *rs)
1451 struct recover *rec = &rs->recover;
1452 struct dm_mem_cache_client *mc;
1454 mc = rec->mem_cache_client;
1455 rec->mem_cache_client = NULL;
1456 if (mc) {
1457 struct stripe *stripe;
1459 while (!list_empty(&rec->stripes)) {
1460 stripe = list_first_entry(&rec->stripes, struct stripe,
1461 lists[LIST_RECOVER]);
1462 list_del(stripe->lists + LIST_RECOVER);
1463 kfree(stripe->recover);
1464 stripe_free(stripe, mc);
1467 dm_mem_cache_client_destroy(mc);
1468 dm_io_client_destroy(rec->dm_io_client);
1469 rec->dm_io_client = NULL;
1473 /* Grow stripe cache. */
1474 static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
1476 int r = 0;
1478 /* Try to allocate this many (additional) stripes. */
1479 while (stripes--) {
1480 struct stripe *stripe =
1481 stripe_alloc(sc, sc->mem_cache_client, grow);
1483 if (likely(stripe)) {
1484 stripe_lru_add(stripe);
1485 atomic_inc(&sc->stripes);
1486 } else {
1487 r = -ENOMEM;
1488 break;
1492 return r ? r : sc_hash_resize(sc);
1495 /* Shrink stripe cache. */
1496 static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
1498 int r = 0;
1500 /* Try to get unused stripe from LRU list. */
1501 while (stripes--) {
1502 struct stripe *stripe;
1504 stripe = stripe_lru_pop(sc);
1505 if (stripe) {
1506 /* An LRU stripe may never have ios pending! */
1507 BUG_ON(stripe_io_ref(stripe));
1508 BUG_ON(stripe_ref(stripe));
1509 atomic_dec(&sc->stripes);
1510 /* Remove from hash if on before deletion. */
1511 stripe_hash_del(stripe);
1512 stripe_free(stripe, sc->mem_cache_client);
1513 } else {
1514 r = -ENOENT;
1515 break;
1519 /* Check if stats are still sane. */
1520 if (atomic_read(&sc->active_stripes_max) >
1521 atomic_read(&sc->stripes))
1522 atomic_set(&sc->active_stripes_max, 0);
1524 if (r)
1525 return r;
1527 return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
1530 /* Create stripe cache and recovery. */
1531 static int sc_init(struct raid_set *rs, unsigned stripes)
1533 unsigned i, r, rstripes;
1534 struct stripe_cache *sc = &rs->sc;
1535 struct stripe *stripe;
1536 struct recover *rec = &rs->recover;
1537 struct mapped_device *md;
1538 struct gendisk *disk;
1541 /* Initialize lists and locks. */
1542 i = ARRAY_SIZE(sc->lists);
1543 while (i--)
1544 INIT_LIST_HEAD(sc->lists + i);
1546 INIT_LIST_HEAD(&rec->stripes);
1548 /* Initialize endio and LRU list locks. */
1549 i = NR_LOCKS;
1550 while (i--)
1551 spin_lock_init(sc->locks + i);
1553 /* Initialize atomic variables. */
1554 atomic_set(&sc->stripes, 0);
1555 atomic_set(&sc->stripes_to_set, 0);
1556 atomic_set(&sc->active_stripes, 0);
1557 atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
1560 * We need a runtime unique # to suffix the kmem cache name
1561 * because we'll have one for each active RAID set.
1563 md = dm_table_get_md(rs->ti->table);
1564 disk = dm_disk(md);
1565 snprintf(sc->kc.name, sizeof(sc->kc.name), "%s-%d.%d", TARGET,
1566 disk->first_minor, atomic_inc_return(&_stripe_sc_nr));
1567 sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
1568 0, 0, NULL);
1569 if (!sc->kc.cache)
1570 return -ENOMEM;
1572 /* Create memory cache client context for RAID stripe cache. */
1573 sc->mem_cache_client =
1574 dm_mem_cache_client_create(stripes, rs->set.raid_devs,
1575 chunk_pages(rs->set.io_size));
1576 if (IS_ERR(sc->mem_cache_client))
1577 return PTR_ERR(sc->mem_cache_client);
1579 /* Create memory cache client context for RAID recovery stripe(s). */
1580 rstripes = rec->recovery_stripes;
1581 rec->mem_cache_client =
1582 dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
1583 chunk_pages(rec->io_size));
1584 if (IS_ERR(rec->mem_cache_client))
1585 return PTR_ERR(rec->mem_cache_client);
1587 /* Create dm-io client context for IO stripes. */
1588 sc->dm_io_client = dm_io_client_create();
1589 if (IS_ERR(sc->dm_io_client))
1590 return PTR_ERR(sc->dm_io_client);
1592 /* FIXME: intermingled with stripe cache initialization. */
1593 /* Create dm-io client context for recovery stripes. */
1594 rec->dm_io_client = dm_io_client_create();
1595 if (IS_ERR(rec->dm_io_client))
1596 return PTR_ERR(rec->dm_io_client);
1598 /* Allocate stripes for set recovery. */
1599 while (rstripes--) {
1600 stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
1601 if (!stripe)
1602 return -ENOMEM;
1604 stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
1605 if (!stripe->recover) {
1606 stripe_free(stripe, rec->mem_cache_client);
1607 return -ENOMEM;
1610 SetStripeRecover(stripe);
1611 stripe->io.size = rec->io_size;
1612 list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
1613 /* Don't add recovery stripes to LRU list! */
1617 * Allocate the stripe objects from the
1618 * cache and add them to the LRU list.
1620 r = sc_grow(sc, stripes, SC_KEEP);
1621 if (!r)
1622 atomic_set(&sc->stripes_last, stripes);
1624 return r;
1627 /* Destroy the stripe cache. */
1628 static void sc_exit(struct stripe_cache *sc)
1630 struct raid_set *rs = RS(sc);
1632 if (sc->kc.cache) {
1633 stripe_recover_free(rs);
1634 BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
1635 kmem_cache_destroy(sc->kc.cache);
1636 sc->kc.cache = NULL;
1638 if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
1639 dm_mem_cache_client_destroy(sc->mem_cache_client);
1641 if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
1642 dm_io_client_destroy(sc->dm_io_client);
1644 hash_exit(&sc->hash);
1649 * Calculate RAID address
1651 * Delivers tuple with the index of the data disk holding the chunk
1652 * in the set, the parity disks index and the start of the stripe
1653 * within the address space of the set (used as the stripe cache hash key).
1655 /* thx MD. */
1656 static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
1657 struct raid_address *addr)
1659 sector_t stripe, tmp;
1662 * chunk_number = sector / chunk_size
1663 * stripe_number = chunk_number / data_devs
1664 * di = stripe % data_devs;
1666 stripe = sector >> rs->set.chunk_shift;
1667 addr->di = sector_div(stripe, rs->set.data_devs);
1669 switch (rs->set.raid_type->level) {
1670 case raid4:
1671 addr->pi = rs->set.pi;
1672 goto check_shift_di;
1673 case raid5:
1674 tmp = stripe;
1675 addr->pi = sector_div(tmp, rs->set.raid_devs);
1677 switch (rs->set.raid_type->algorithm) {
1678 case left_asym: /* Left asymmetric. */
1679 addr->pi = rs->set.data_devs - addr->pi;
1680 case right_asym: /* Right asymmetric. */
1681 check_shift_di:
1682 if (addr->di >= addr->pi)
1683 addr->di++;
1684 break;
1685 case left_sym: /* Left symmetric. */
1686 addr->pi = rs->set.data_devs - addr->pi;
1687 case right_sym: /* Right symmetric. */
1688 addr->di = (addr->pi + addr->di + 1) %
1689 rs->set.raid_devs;
1690 break;
1691 case none: /* Ain't happen: RAID4 algorithm placeholder. */
1692 BUG();
1697 * Start offset of the stripe's chunk on any single device of the RAID
1698 * set, adjusted in case io size differs from chunk size.
1700 addr->key = (stripe << rs->set.chunk_shift) +
1701 (sector & rs->set.io_inv_mask);
1702 return addr;
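/*
 * Worked example (added for clarity; parameters assumed): a 4-device
 * "raid5_ls" set (data_devs = 3), 64-sector chunks (chunk_shift = 6),
 * sector = 1000:
 *
 *	chunk number  = 1000 >> 6 = 15
 *	stripe number = 15 / 3    = 5,   di = 15 % 3 = 0
 *	pi            = 5 % 4     = 1
 *	left symmetric: pi = 3 - 1 = 2,  di = (2 + 0 + 1) % 4 = 3
 *
 * i.e. the data chunk lives on device 3, parity on device 2, and the
 * hash key starts at 5 << 6 = 320 plus the io offset within the chunk
 * (sector & io_inv_mask).
 */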
1706 * Copy data across between stripe pages and bio vectors.
1708 * Pay attention to data alignment in stripe and bio pages.
1710 static void bio_copy_page_list(int rw, struct stripe *stripe,
1711 struct page_list *pl, struct bio *bio)
1713 unsigned i, page_offset;
1714 void *page_addr;
1715 struct raid_set *rs = RS(stripe->sc);
1716 struct bio_vec *bv;
1718 /* Get start page in page list for this sector. */
1719 i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
1720 pl = pl_elem(pl, i);
1721 BUG_ON(!pl);
1722 BUG_ON(!pl->page);
1724 page_addr = page_address(pl->page);
1725 page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
1727 /* Walk all segments and copy data across between bio_vecs and pages. */
1728 bio_for_each_segment(bv, bio, i) {
1729 int len = bv->bv_len, size;
1730 unsigned bio_offset = 0;
1731 void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
1732 redo:
1733 size = (page_offset + len > PAGE_SIZE) ?
1734 PAGE_SIZE - page_offset : len;
1736 if (rw == READ)
1737 memcpy(bio_addr + bio_offset,
1738 page_addr + page_offset, size);
1739 else
1740 memcpy(page_addr + page_offset,
1741 bio_addr + bio_offset, size);
1743 page_offset += size;
1744 if (page_offset == PAGE_SIZE) {
1746 * We reached the end of the chunk page ->
1747 * need to refer to the next one to copy more data.
1749 len -= size;
1750 if (len) {
1751 /* Get next page. */
1752 pl = pl->next;
1753 BUG_ON(!pl);
1754 BUG_ON(!pl->page);
1755 page_addr = page_address(pl->page);
1756 page_offset = 0;
1757 bio_offset += size;
1758 /* REMOVEME: statistics. */
1759 atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
1760 goto redo;
1764 __bio_kunmap_atomic(bio_addr, KM_USER0);
1769 * Xor optimization macros.
1771 /* Xor data pointer declaration and initialization macros. */
1772 #define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
1773 #define DECLARE_3 DECLARE_2, *d2 = data[2]
1774 #define DECLARE_4 DECLARE_3, *d3 = data[3]
1775 #define DECLARE_5 DECLARE_4, *d4 = data[4]
1776 #define DECLARE_6 DECLARE_5, *d5 = data[5]
1777 #define DECLARE_7 DECLARE_6, *d6 = data[6]
1778 #define DECLARE_8 DECLARE_7, *d7 = data[7]
1780 /* Xor unroll macros. */
1781 #define D2(n) d0[n] = d0[n] ^ d1[n]
1782 #define D3(n) D2(n) ^ d2[n]
1783 #define D4(n) D3(n) ^ d3[n]
1784 #define D5(n) D4(n) ^ d4[n]
1785 #define D6(n) D5(n) ^ d5[n]
1786 #define D7(n) D6(n) ^ d6[n]
1787 #define D8(n) D7(n) ^ d7[n]
1789 #define X_2(macro, offset) macro(offset); macro(offset + 1);
1790 #define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
1791 #define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
1792 #define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
1793 #define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
1794 #define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
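/*
 * For reference (expansion added for clarity): the D and X_ macros
 * combine into straight-line unrolled xor code.  E.g. X_2(D3, 0)
 * expands to
 *
 *	d0[0] = d0[0] ^ d1[0] ^ d2[0];
 *	d0[1] = d0[1] ^ d1[1] ^ d2[1];
 *
 * i.e. the first data array accumulates the xor of all chunks, with 2,
 * 4, 8, ... or 64 iterations unrolled per loop pass.
 */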
1796 /* Define a _xor_#chunks_#xors_per_run() function. */
1797 #define _XOR(chunks, xors_per_run) \
1798 static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
1800 unsigned end = XOR_SIZE / sizeof(data[0]), i; \
1801 DECLARE_ ## chunks; \
1803 for (i = 0; i < end; i += xors_per_run) { \
1804 X_ ## xors_per_run(D ## chunks, i); \
1808 /* Define xor functions for 2 - 8 chunks and xors per run. */
1809 #define MAKE_XOR_PER_RUN(xors_per_run) \
1810 _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
1811 _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
1812 _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
1813 _XOR(8, xors_per_run);
1815 MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
1816 MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
1817 MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
1818 MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
1820 #define MAKE_XOR(xors_per_run) \
1821 struct { \
1822 void (*f)(unsigned long **); \
1823 } static xor_funcs ## xors_per_run[] = { \
1824 { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
1825 { NULL }, \
1826 { _xor2_ ## xors_per_run }, \
1827 { _xor3_ ## xors_per_run }, \
1828 { _xor4_ ## xors_per_run }, \
1829 { _xor5_ ## xors_per_run }, \
1830 { _xor6_ ## xors_per_run }, \
1831 { _xor7_ ## xors_per_run }, \
1832 { _xor8_ ## xors_per_run }, \
1833 }; \
1835 static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
1837 /* Call respective function for amount of chunks. */ \
1838 xor_funcs ## xors_per_run[n].f(data); \
1841 /* Define xor_8() - xor_64 functions. */
1842 MAKE_XOR(8)
1843 MAKE_XOR(16)
1844 MAKE_XOR(32)
1845 MAKE_XOR(64)
1847 * END xor optimization macros.
1850 /* Maximum number of chunks, which can be xor'ed in one go. */
1851 #define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
1853 /* xor_blocks wrapper to allow use of that crypto library function. */
1854 static void xor_blocks_wrapper(unsigned n, unsigned long **data)
1856 BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
1857 xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1);
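/*
 * Note (added for clarity): xor_blocks() from <linux/raid/xor.h> takes
 * the number of source blocks, the block length, the destination and
 * an array of source pointers.  The wrapper above therefore passes
 * n - 1 sources, uses data[0] as the in-place destination and data + 1
 * as the source array, matching the (count, data) convention of the
 * generated xor_*() functions.
 */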
1860 struct xor_func {
1861 xor_function_t f;
1862 const char *name;
1863 } static xor_funcs[] = {
1864 { xor_64, "xor_64" },
1865 { xor_32, "xor_32" },
1866 { xor_16, "xor_16" },
1867 { xor_8, "xor_8" },
1868 { xor_blocks_wrapper, "xor_blocks" },
1872 * Check if chunk has to be xored in/out:
1874 * o if writes are queued
1875 * o if writes are merged
1876 * o if stripe is to be reconstructed
1877 * o if recovery stripe
1879 static inline int chunk_must_xor(struct stripe_chunk *chunk)
1881 if (ChunkUptodate(chunk)) {
1882 BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) &&
1883 !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)));
1885 if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) ||
1886 !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)))
1887 return 1;
1889 if (StripeReconstruct(chunk->stripe) ||
1890 StripeRecover(chunk->stripe))
1891 return 1;
1894 return 0;
1898 * Calculate xor parity.
1900 * This indexes into the chunks of a stripe and their pages.
1902 * All chunks will be xored into the indexed (@pi)
1903 * chunk in maximum groups of xor.chunks.
1906 static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
1908 struct raid_set *rs = RS(stripe->sc);
1909 unsigned max_chunks = rs->xor.chunks, n = 1,
1910 o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
1911 p = rs->set.raid_devs;
1912 unsigned long **d = rs->data;
1913 xor_function_t xor_f = rs->xor.f->f;
1915 BUG_ON(sector > stripe->io.size);
1917 /* Address of parity page to xor into. */
1918 d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
1920 while (p--) {
1921 /* Preset pointers to data pages. */
1922 if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
1923 d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
1925 /* If max chunks -> xor. */
1926 if (n == max_chunks) {
1927 mutex_lock(&rs->io.xor_lock);
1928 xor_f(n, d);
1929 mutex_unlock(&rs->io.xor_lock);
1930 n = 1;
1934 /* If chunks -> xor. */
1935 if (n > 1) {
1936 mutex_lock(&rs->io.xor_lock);
1937 xor_f(n, d);
1938 mutex_unlock(&rs->io.xor_lock);
1942 /* Common xor loop through all stripe page lists. */
1943 static void common_xor(struct stripe *stripe, sector_t count,
1944 unsigned off, unsigned pi)
1946 unsigned sector;
1948 BUG_ON(!count);
1949 for (sector = off; sector < count; sector += SECTORS_PER_PAGE)
1950 xor(stripe, pi, sector);
1952 /* Set parity page uptodate and clean. */
1953 chunk_set(CHUNK(stripe, pi), CLEAN);
1954 atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
1958 * Calculate parity sectors on intact stripes.
1960 * Need to calculate raid address for recover stripe, because its
1961 * chunk size differs and is typically larger than io chunk size.
1963 static void parity_xor(struct stripe *stripe)
1965 struct raid_set *rs = RS(stripe->sc);
1966 int size_differs = stripe->io.size != rs->set.io_size;
1967 unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
1968 xor_size = chunk_size > io_size ? io_size : chunk_size;
1969 sector_t off;
1971 /* This can be the recover stripe with a larger io size. */
1972 for (off = 0; off < io_size; off += xor_size) {
1974 * Recover stripe is likely bigger than regular io
1975 * ones and has no precalculated parity disk index ->
1976 * need to calculate RAID address.
1978 if (unlikely(size_differs)) {
1979 struct raid_address addr;
1981 raid_address(rs, (stripe->key + off) *
1982 rs->set.data_devs, &addr);
1983 stripe->idx.parity = addr.pi;
1984 stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
1987 common_xor(stripe, xor_size, off, stripe->idx.parity);
1988 chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
1992 /* Reconstruct missing chunk. */
1993 static void stripe_reconstruct(struct stripe *stripe)
1995 struct raid_set *rs = RS(stripe->sc);
1996 int p = rs->set.raid_devs, pr = stripe->idx.recover;
1998 BUG_ON(pr < 0);
2000 /* Check if all but the chunk to be reconstructed are uptodate. */
2001 while (p--)
2002 BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p)));
2004 /* REMOVEME: statistics. */
2005 atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI :
2006 S_RECONSTRUCT_DEV));
2007 /* Zero chunk to be reconstructed. */
2008 stripe_zero_chunk(stripe, pr);
2009 common_xor(stripe, stripe->io.size, 0, pr);
2013 * Recovery io throttling
2015 /* Conditionally reset io counters. */
2016 static int recover_io_reset(struct raid_set *rs)
2018 unsigned long j = jiffies;
2020 /* Pay attention to jiffies overflows. */
2021 if (j > rs->recover.last_jiffies + HZ ||
2022 j < rs->recover.last_jiffies) {
2023 atomic_set(rs->recover.io_count + IO_WORK, 0);
2024 atomic_set(rs->recover.io_count + IO_RECOVER, 0);
2025 rs->recover.last_jiffies = j;
2026 return 1;
2029 return 0;
2032 /* Count ios. */
2033 static void recover_io_count(struct stripe *stripe)
2035 struct raid_set *rs = RS(stripe->sc);
2037 atomic_inc(rs->recover.io_count +
2038 (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
2041 /* Try getting a stripe either from the hash or from the LRU list. */
2042 static struct stripe *stripe_find(struct raid_set *rs,
2043 struct raid_address *addr)
2045 int r;
2046 struct stripe_cache *sc = &rs->sc;
2047 struct stripe *stripe;
2049 /* Try stripe from hash. */
2050 stripe = stripe_lookup(sc, addr->key);
2051 if (stripe) {
2052 r = stripe_get(stripe);
2053 if (r)
2054 goto get_lock_failed;
2056 atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
2057 } else {
2058 /* Not in hash -> try to get an LRU stripe. */
2059 stripe = stripe_lru_pop(sc);
2060 if (stripe) {
2062 * An LRU stripe must not be referenced
2063 * and must never have ios pending!
2065 BUG_ON(stripe_ref(stripe));
2066 BUG_ON(stripe_io_ref(stripe));
2068 /* Remove from hash before reuse, if it is hashed. */
2069 stripe_hash_del(stripe);
2071 /* Invalidate before reinserting with changed key. */
2072 stripe_invalidate(stripe);
2074 stripe->key = addr->key;
2075 stripe->region = dm_rh_sector_to_region(rs->recover.rh,
2076 addr->key);
2077 stripe->idx.parity = addr->pi;
2078 r = stripe_get(stripe);
2079 if (r)
2080 goto get_lock_failed;
2082 /* Insert stripe into the stripe hash. */
2083 stripe_insert(&sc->hash, stripe);
2084 /* REMOVEME: statistics. */
2085 atomic_inc(rs->stats + S_INSCACHE);
2089 return stripe;
2091 get_lock_failed:
2092 stripe_put(stripe);
2093 return NULL;
2097 * Process end io
2099 * We need to do it here, because we can't do it in interrupt context.
2101 /* End io all bios on a bio list. */
2102 static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
2103 int p, int error)
2105 struct raid_set *rs = RS(stripe->sc);
2106 struct bio *bio;
2107 struct page_list *pl = PL(stripe, p);
2108 struct stripe_chunk *chunk = CHUNK(stripe, p);
2110 /* Update region counters. */
2111 while ((bio = bio_list_pop(bl))) {
2112 if (bio_data_dir(bio) == WRITE)
2113 /* Drop io pending count for any writes. */
2114 dm_rh_dec(rs->recover.rh, stripe->region);
2115 else if (!error)
2116 /* Copy data across. */
2117 bio_copy_page_list(READ, stripe, pl, bio);
2119 bio_endio(bio, error);
2121 /* REMOVEME: statistics. */
2122 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
2123 S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE));
2125 chunk_put(chunk);
2126 stripe_put(stripe);
2127 io_put(rs); /* Wake any suspend waiters on last bio. */
2132 * End io all reads/writes on a stripe copying
2133 * read data across from stripe to bios and
2134 * decrementing region counters for writes.
2136 * Processing of ios depending on state:
2137 * o no chunk error -> endio ok
2138 * o degraded:
2139 * - chunk error and read -> ignore to be requeued
2140 * - chunk error and write -> endio ok
2141 * o dead (more than parity_devs failed) and chunk error -> endio failed
2143 static void stripe_endio(int rw, struct stripe *stripe)
2145 struct raid_set *rs = RS(stripe->sc);
2146 unsigned p = rs->set.raid_devs;
2147 int write = (rw != READ);
2149 while (p--) {
2150 struct stripe_chunk *chunk = CHUNK(stripe, p);
2151 struct bio_list *bl;
2153 BUG_ON(ChunkLocked(chunk));
2155 bl = BL_CHUNK(chunk, rw);
2156 if (bio_list_empty(bl))
2157 continue;
2159 if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
2160 /* RAID set dead. */
2161 if (unlikely(RSDead(rs)))
2162 bio_list_endio(stripe, bl, p, -EIO);
2163 /* RAID set degraded. */
2164 else if (write)
2165 bio_list_endio(stripe, bl, p, 0);
2166 } else {
2167 BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
2168 bio_list_endio(stripe, bl, p, 0);
2173 /* Fail all ios hanging off all bio lists of a stripe. */
2174 static void stripe_fail_io(struct stripe *stripe)
2176 struct raid_set *rs = RS(stripe->sc);
2177 unsigned p = rs->set.raid_devs;
2179 while (p--) {
2180 struct stripe_chunk *chunk = CHUNK(stripe, p);
2181 int i = ARRAY_SIZE(chunk->bl);
2183 /* Fail all bios on all bio lists of the stripe. */
2184 while (i--) {
2185 struct bio_list *bl = chunk->bl + i;
2187 if (!bio_list_empty(bl))
2188 bio_list_endio(stripe, bl, p, -EIO);
2192 /* Put stripe on LRU list. */
2193 BUG_ON(stripe_io_ref(stripe));
2194 BUG_ON(stripe_ref(stripe));
2197 /* Unlock all required chunks. */
2198 static void stripe_chunks_unlock(struct stripe *stripe)
2200 unsigned p = RS(stripe->sc)->set.raid_devs;
2201 struct stripe_chunk *chunk;
2203 while (p--) {
2204 chunk = CHUNK(stripe, p);
2206 if (TestClearChunkUnlock(chunk))
2207 ClearChunkLocked(chunk);
2212 * Queue reads and writes to a stripe by hanging
2213 * their bios off the stripe set's read/write lists.
2215 static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
2216 struct bio_list *reject)
2218 struct raid_address addr;
2219 struct stripe *stripe;
2221 stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
2222 if (stripe) {
2223 int r = 0, rw = bio_data_dir(bio);
2225 /* Distinguish reads and writes. */
2226 bio_list_add(BL(stripe, addr.di, rw), bio);
2228 if (rw == READ)
2229 /* REMOVEME: statistics. */
2230 atomic_inc(rs->stats + S_BIOS_ADDED_READ);
2231 else {
2232 /* Increment pending write count on region. */
2233 dm_rh_inc(rs->recover.rh, stripe->region);
2234 r = 1;
2236 /* REMOVEME: statistics. */
2237 atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
2241 * Put on io (flush) list in case of
2242 * initial bio queued to chunk.
2244 if (chunk_get(CHUNK(stripe, addr.di)) == 1)
2245 stripe_flush_add(stripe);
2247 return r;
2250 /* Got no stripe from cache or failed to lock it -> reject bio. */
2251 bio_list_add(reject, bio);
2252 atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
2253 return 0;
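/*
 * Note: the return value is 1 only when a write got queued (and
 * dm_rh_inc() was called); do_ios() accumulates these returns in its
 * flush counter and flushes the dirty log once per batch before the
 * queued writes are submitted by the later do_flush() pass.
 */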
2257 * Handle all stripes by handing them to the daemon, because we can't
2258 * map their chunk pages to copy the data in interrupt context.
2260 * We don't want to handle them here either, while interrupts are disabled.
2263 /* Read/write endio function for dm-io (interrupt context). */
2264 static void endio(unsigned long error, void *context)
2266 struct stripe_chunk *chunk = context;
2268 if (unlikely(error)) {
2269 chunk_set(chunk, ERROR);
2270 /* REMOVEME: statistics. */
2271 atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
2272 } else
2273 chunk_set(chunk, CLEAN);
2276 * For recovery stripes, we need to clear the locked flag
2277 * here, because those aren't processed in do_endios().
2279 if (unlikely(StripeRecover(chunk->stripe)))
2280 ClearChunkLocked(chunk);
2281 else
2282 SetChunkUnlock(chunk);
2284 /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
2285 stripe_put_references(chunk->stripe);
2288 /* Read/Write a chunk asynchronously. */
2289 static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
2291 struct stripe_cache *sc = stripe->sc;
2292 struct raid_set *rs = RS(sc);
2293 struct dm_mem_cache_object *obj = stripe->obj + p;
2294 struct page_list *pl = obj->pl;
2295 struct stripe_chunk *chunk = CHUNK(stripe, p);
2296 struct raid_dev *dev = rs->dev + p;
2297 struct dm_io_region io = {
2298 .bdev = dev->dev->bdev,
2299 .sector = stripe->key,
2300 .count = stripe->io.size,
2302 struct dm_io_request control = {
2303 .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
2304 .mem = {
2305 .type = DM_IO_PAGE_LIST,
2306 .ptr.pl = pl,
2307 .offset = 0,
2309 .notify = {
2310 .fn = endio,
2311 .context = chunk,
2313 .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
2314 sc->dm_io_client,
2317 BUG_ON(ChunkLocked(chunk));
2318 BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
2319 BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
2322 * Don't rw past end of device, which can happen, because
2323 * typically sectors_per_dev isn't divisible by io_size.
2325 if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2326 io.count = rs->set.sectors_per_dev - io.sector;
2328 BUG_ON(!io.count);
2329 io.sector += dev->start; /* Add <offset>. */
2330 if (RSRecover(rs))
2331 recover_io_count(stripe); /* Recovery io accounting. */
2333 /* REMOVEME: statistics. */
2334 atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
2335 S_DM_IO_READ));
2336 SetChunkLocked(chunk);
2337 SetDevIoQueued(dev);
2338 BUG_ON(dm_io(&control, 1, &io, NULL));
2342 * Write dirty or read not uptodate page lists of a stripe.
2344 static int stripe_chunks_rw(struct stripe *stripe)
2346 int r;
2347 struct raid_set *rs = RS(stripe->sc);
2350 * Increment the pending count on the stripe
2351 * first, so that we don't race in endio().
2353 * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
2355 * o not uptodate
2356 * o dirtied by writes merged
2357 * o dirtied by parity calculations
2359 r = for_each_io_dev(stripe, stripe_get_references);
2360 if (r) {
2361 /* Io needed: chunks are either not uptodate or dirty. */
2362 int max; /* REMOVEME: */
2363 struct stripe_cache *sc = &rs->sc;
2365 /* Submit actual io. */
2366 for_each_io_dev(stripe, stripe_chunk_rw);
2368 /* REMOVEME: statistics */
2369 max = sc_active(sc);
2370 if (atomic_read(&sc->active_stripes_max) < max)
2371 atomic_set(&sc->active_stripes_max, max);
2373 atomic_inc(rs->stats + S_FLUSHS);
2374 /* END REMOVEME: statistics */
2377 return r;
2380 /* Merge in all writes hence dirtying respective chunks. */
2381 static void stripe_merge_writes(struct stripe *stripe)
2383 unsigned p = RS(stripe->sc)->set.raid_devs;
2385 while (p--) {
2386 struct stripe_chunk *chunk = CHUNK(stripe, p);
2387 struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);
2389 if (!bio_list_empty(write)) {
2390 struct bio *bio;
2391 struct page_list *pl = stripe->obj[p].pl;
2394 * We can play with the lists without holding a lock,
2395 * because it is just us accessing them anyway.
2397 bio_list_for_each(bio, write)
2398 bio_copy_page_list(WRITE, stripe, pl, bio);
2400 bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
2401 bio_list_init(write);
2402 chunk_set(chunk, DIRTY);
2407 /* Queue all writes to get merged. */
2408 static int stripe_queue_writes(struct stripe *stripe)
2410 int r = 0;
2411 unsigned p = RS(stripe->sc)->set.raid_devs;
2413 while (p--) {
2414 struct stripe_chunk *chunk = CHUNK(stripe, p);
2415 struct bio_list *write = BL_CHUNK(chunk, WRITE);
2417 if (!bio_list_empty(write)) {
2418 bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
2419 bio_list_init(write);
2420 SetChunkIo(chunk);
2421 r = 1;
2425 return r;
2429 /* Check, if a chunk gets completely overwritten. */
2430 static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
2432 unsigned sectors = 0;
2433 struct bio *bio;
2434 struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);
2436 bio_list_for_each(bio, bl)
2437 sectors += bio_sectors(bio);
2439 BUG_ON(sectors > RS(stripe->sc)->set.io_size);
2440 return sectors == RS(stripe->sc)->set.io_size;
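/*
 * Example (illustrative only): with io_size = 64 sectors, two queued
 * 32-sector writes to the same chunk sum up to exactly io_size, so the
 * chunk counts as completely overwritten and its old contents never
 * need to be read in for the parity calculation.
 */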
2444 * Avoid io on broken/reconstructed drive in order to
2445 * reconstruct data on endio.
2447 * (*1*) We set StripeReconstruct() in here, so that _do_endios()
2448 * will trigger a reconstruct call before resetting it.
2450 static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
2452 struct stripe_chunk *chunk = CHUNK(stripe, pr);
2455 * Allow io on all chunks but the indexed one,
2456 * because we're either degraded, or we prohibit io
2457 * on that one so it can be reconstructed later.
2459 /* Includes ClearChunkIo(), ClearChunkUptodate(). */
2460 stripe_chunk_invalidate(chunk);
2461 stripe->idx.recover = pr;
2462 SetStripeReconstruct(stripe);
2464 /* REMOVEME: statistics. */
2465 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2466 return -EPERM;
2469 /* Chunk locked/uptodate and device failed tests. */
2470 static struct stripe_chunk *
2471 stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
2473 struct raid_set *rs = RS(stripe->sc);
2474 struct stripe_chunk *chunk = CHUNK(stripe, p);
2476 /* Can't access active chunks. */
2477 if (ChunkLocked(chunk)) {
2478 /* REMOVEME: statistics. */
2479 atomic_inc(rs->stats + S_CHUNK_LOCKED);
2480 return NULL;
2483 /* Can't access broken device. */
2484 if (ChunkError(chunk) || DevFailed(rs->dev + p))
2485 return NULL;
2487 /* Can access uptodate chunks. */
2488 if (ChunkUptodate(chunk)) {
2489 (*chunks_uptodate)++;
2490 return NULL;
2493 return chunk;
2497 * Degraded/reconstruction mode.
2499 * Check stripe state to figure which chunks don't need IO.
2501 * Returns 0 for fully operational, -EPERM for degraded/resynchronizing.
2503 static int stripe_check_reconstruct(struct stripe *stripe)
2505 struct raid_set *rs = RS(stripe->sc);
2507 if (RSDead(rs)) {
2508 ClearStripeReconstruct(stripe);
2509 ClearStripeReconstructed(stripe);
2510 stripe_allow_io(stripe);
2511 return 0;
2514 /* Avoid further reconstruction setting, when already set. */
2515 if (StripeReconstruct(stripe)) {
2516 /* REMOVEME: statistics. */
2517 atomic_inc(rs->stats + S_RECONSTRUCT_SET);
2518 return -EBUSY;
2521 /* Initially allow io on all chunks. */
2522 stripe_allow_io(stripe);
2524 /* Return if stripe is already reconstructed. */
2525 if (StripeReconstructed(stripe)) {
2526 atomic_inc(rs->stats + S_RECONSTRUCTED);
2527 return 0;
2531 * Degraded/reconstruction mode (device failed) ->
2532 * avoid io on the failed device.
2534 if (unlikely(RSDegraded(rs))) {
2535 /* REMOVEME: statistics. */
2536 atomic_inc(rs->stats + S_DEGRADED);
2537 /* Allow IO on all devices but the dead one. */
2538 BUG_ON(rs->set.ei < 0);
2539 return stripe_chunk_set_io_flags(stripe, rs->set.ei);
2540 } else {
2541 int sync, pi = dev_for_parity(stripe, &sync);
2544 * Reconstruction mode (ie. a particular (replaced) device or
2545 * some (rotating) parity chunk is being resynchronized) ->
2546 * o make sure all needed chunks are read in
2547 * o cope with 3/4 disk array special case where it
2548 * doesn't make a difference to read in parity
2549 * to xor data in/out
2551 if (RSEnforceParityCreation(rs) || !sync) {
2552 /* REMOVEME: statistics. */
2553 atomic_inc(rs->stats + S_NOSYNC);
2554 /* Allow IO on all devs but the one to reconstruct. */
2555 return stripe_chunk_set_io_flags(stripe, pi);
2559 return 0;
2563 * Check, if stripe is ready to merge writes.
2564 * I.e. if all chunks are present so that bios can be merged.
2566 * We prohibit io on:
2568 * o chunks without bios
2569 * o chunks which get completely written over
2571 static int stripe_merge_possible(struct stripe *stripe, int nosync)
2573 struct raid_set *rs = RS(stripe->sc);
2574 unsigned chunks_overwrite = 0, chunks_prohibited = 0,
2575 chunks_uptodate = 0, p = rs->set.raid_devs;
2577 /* Walk all chunks. */
2578 while (p--) {
2579 struct stripe_chunk *chunk;
2581 /* Prohibit io on broken devices. */
2582 if (DevFailed(rs->dev + p)) {
2583 chunk = CHUNK(stripe, p);
2584 goto prohibit_io;
2587 /* We can't optimize any further if no chunk. */
2588 chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
2589 if (!chunk || nosync)
2590 continue;
2593 * We have a chunk, which is not uptodate.
2595 * If this is not parity and we don't have
2596 * reads queued, we can optimize further.
2598 if (p != stripe->idx.parity &&
2599 bio_list_empty(BL_CHUNK(chunk, READ)) &&
2600 bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
2601 if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
2602 goto prohibit_io;
2603 else if (RSCheckOverwrite(rs) &&
2604 stripe_check_chunk_overwrite(stripe, p))
2605 /* Completely overwritten chunk. */
2606 chunks_overwrite++;
2609 /* Allow io for chunks with bios and overwritten ones. */
2610 SetChunkIo(chunk);
2611 continue;
2613 prohibit_io:
2614 /* No io for broken devices or for chunks w/o bios. */
2615 ClearChunkIo(chunk);
2616 chunks_prohibited++;
2617 /* REMOVEME: statistics. */
2618 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2621 /* All data chunks will get written over. */
2622 if (chunks_overwrite == rs->set.data_devs)
2623 atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
2624 else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
2625 /* We don't have enough chunks to merge. */
2626 atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
2627 return -EPERM;
2631 * If we have all chunks up to date or overwrite them, we
2632 * just zero the parity chunk and let stripe_rw() recreate it.
2634 if (chunks_uptodate == rs->set.raid_devs ||
2635 chunks_overwrite == rs->set.data_devs) {
2636 stripe_zero_chunk(stripe, stripe->idx.parity);
2637 BUG_ON(StripeReconstruct(stripe));
2638 SetStripeReconstruct(stripe); /* Enforce xor in caller. */
2639 } else {
2641 * With less chunks, we xor parity out.
2643 * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
2644 * so that only chunks with queued or merged writes
2645 * are being xored.
2647 parity_xor(stripe);
2651 * We do have enough chunks to merge.
2652 * All chunks are uptodate or get written over.
2654 atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
2655 return 0;
2659 * Avoid reading chunks in case we're fully operational.
2661 * We prohibit io on any chunks without bios but the parity chunk.
2663 static void stripe_avoid_reads(struct stripe *stripe)
2665 struct raid_set *rs = RS(stripe->sc);
2666 unsigned dummy = 0, p = rs->set.raid_devs;
2668 /* Walk all chunks. */
2669 while (p--) {
2670 struct stripe_chunk *chunk =
2671 stripe_chunk_check(stripe, p, &dummy);
2673 if (!chunk)
2674 continue;
2676 /* If parity or any bios pending -> allow io. */
2677 if (chunk_ref(chunk) || p == stripe->idx.parity)
2678 SetChunkIo(chunk);
2679 else {
2680 ClearChunkIo(chunk);
2681 /* REMOVEME: statistics. */
2682 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2688 * Read/write a stripe.
2690 * All stripe read/write activity goes through this function
2691 * unless recovery, which has to call stripe_chunk_rw() directly.
2693 * Make sure we don't try already merged stripes in order
2694 * to avoid data corruption.
2696 * Check the state of the RAID set and if degraded (or
2697 * resynchronizing for reads), read in all other chunks but
2698 * the one on the dead/resynchronizing device in order to be
2699 * able to reconstruct the missing one in _do_endios().
2701 * Can be called on active stripes in order
2702 * to dispatch new io on inactive chunks.
2704 * States to cover:
2705 * o stripe to read and/or write
2706 * o stripe with error to reconstruct
2708 static int stripe_rw(struct stripe *stripe)
2710 int nosync, r;
2711 struct raid_set *rs = RS(stripe->sc);
2714 * Check, if a chunk needs to be reconstructed
2715 * because of a degraded set or a region out of sync.
2717 nosync = stripe_check_reconstruct(stripe);
2718 switch (nosync) {
2719 case -EBUSY:
2720 return 0; /* Wait for stripe reconstruction to finish. */
2721 case -EPERM:
2722 goto io;
2726 * If we don't have merged writes pending, we can schedule
2727 * queued writes to be merged next without corrupting data.
2729 if (!StripeMerged(stripe)) {
2730 r = stripe_queue_writes(stripe);
2731 if (r)
2732 /* Writes got queued -> flag RBW. */
2733 SetStripeRBW(stripe);
2737 * Merge all writes hanging off uptodate/overwritten
2738 * chunks of the stripe.
2740 if (StripeRBW(stripe)) {
2741 r = stripe_merge_possible(stripe, nosync);
2742 if (!r) { /* Merge possible. */
2743 struct stripe_chunk *chunk;
2746 * I rely on valid parity in order
2747 * to xor a fraction of chunks out
2748 * of parity and back in.
2750 stripe_merge_writes(stripe); /* Merge writes in. */
2751 parity_xor(stripe); /* Update parity. */
2752 ClearStripeReconstruct(stripe); /* Reset xor enforce. */
2753 SetStripeMerged(stripe); /* Writes merged. */
2754 ClearStripeRBW(stripe); /* Disable RBW. */
2757 * REMOVEME: sanity check on parity chunk
2758 * states after writes got merged.
2760 chunk = CHUNK(stripe, stripe->idx.parity);
2761 BUG_ON(ChunkLocked(chunk));
2762 BUG_ON(!ChunkUptodate(chunk));
2763 BUG_ON(!ChunkDirty(chunk));
2764 BUG_ON(!ChunkIo(chunk));
2766 } else if (!nosync && !StripeMerged(stripe))
2767 /* Read avoidance if not degraded/resynchronizing/merged. */
2768 stripe_avoid_reads(stripe);
2771 /* Now submit any reads/writes for non-uptodate or dirty chunks. */
2772 r = stripe_chunks_rw(stripe);
2773 if (!r) {
2775 * No io submitted because of chunk io
2776 * prohibited or locked chunks/failed devices
2777 * -> push to end io list for processing.
2779 stripe_endio_push(stripe);
2780 atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
2783 return r;
2787 * Recovery functions
2789 /* Read a stripe off a raid set for recovery. */
2790 static int stripe_recover_read(struct stripe *stripe, int pi)
2792 BUG_ON(stripe_io_ref(stripe));
2794 /* Invalidate all chunks so that they get read in. */
2795 stripe_chunks_invalidate(stripe);
2796 stripe_allow_io(stripe); /* Allow io on all recovery chunks. */
2799 * If we are reconstructing a particular device, we can avoid
2800 * reading the respective chunk in, because we're going to
2801 * reconstruct it anyway.
2803 * We can't do that for resynchronization of rotating parity,
2804 * because the recovery stripe chunk size is typically larger
2805 * than the set's chunk size.
2807 if (pi > -1)
2808 ClearChunkIo(CHUNK(stripe, pi));
2810 return stripe_chunks_rw(stripe);
2813 /* Write a stripe to a raid set for recovery. */
2814 static int stripe_recover_write(struct stripe *stripe, int pi)
2816 BUG_ON(stripe_io_ref(stripe));
2819 * If this is a reconstruct of a particular device, then
2820 * reconstruct the respective chunk, else create parity chunk.
2822 if (pi > -1) {
2823 stripe_zero_chunk(stripe, pi);
2824 common_xor(stripe, stripe->io.size, 0, pi);
2825 chunk_set(CHUNK(stripe, pi), DIRTY);
2826 } else
2827 parity_xor(stripe);
2829 return stripe_chunks_rw(stripe);
2832 /* Read/write a recovery stripe. */
2833 static int stripe_recover_rw(struct stripe *stripe)
2835 int r = 0, sync = 0;
2837 /* Read/write flip-flop. */
2838 if (TestClearStripeRBW(stripe)) {
2839 SetStripeMerged(stripe);
2840 stripe->key = stripe->recover->pos;
2841 r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
2842 BUG_ON(!r);
2843 } else if (TestClearStripeMerged(stripe)) {
2844 r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
2845 BUG_ON(!r);
2848 BUG_ON(sync);
2849 return r;
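/*
 * In other words, each recovery stripe ping-pongs between a read pass
 * (StripeRBW set: pull in all chunks at recover->pos) and a write pass
 * (StripeMerged set: reconstruct or xor parity and write back); the
 * BUG_ON(sync) above guards against recovering a device that
 * dev_for_parity() already reports as in sync.
 */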
2852 /* Recovery bandwidth available? */
2853 static int recover_bandwidth(struct raid_set *rs)
2855 int r, work;
2857 /* On reset or when bios delayed -> allow recovery. */
2858 r = recover_io_reset(rs);
2859 if (r || RSBandwidth(rs))
2860 goto out;
2862 work = atomic_read(rs->recover.io_count + IO_WORK);
2863 if (work) {
2864 /* Pay attention to larger recover stripe size. */
2865 int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
2866 rs->recover.io_size / rs->set.io_size;
2869 * Don't use more than given bandwidth
2870 * of the work io for recovery.
2872 if (recover > work / rs->recover.bandwidth_work) {
2873 /* REMOVEME: statistics. */
2874 atomic_inc(rs->stats + S_NO_BANDWIDTH);
2875 return 0;
2879 out:
2880 atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
2881 return 1;
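/*
 * Worked example (illustrative, using BANDWIDTH_DEFAULT = 10): a 10%
 * setting yields bandwidth_work = 100 / 10 = 10, so recovery io, scaled
 * by recover.io_size / set.io_size to account for the larger recovery
 * stripes, is throttled once it exceeds one tenth of the application
 * (IO_WORK) io counted since the last per-second reset in
 * recover_io_reset().
 */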
2884 /* Try to get a region to recover. */
2885 static int stripe_recover_get_region(struct stripe *stripe)
2887 struct raid_set *rs = RS(stripe->sc);
2888 struct recover *rec = &rs->recover;
2889 struct recover_addr *addr = stripe->recover;
2890 struct dm_dirty_log *dl = rec->dl;
2891 struct dm_rh_client *rh = rec->rh;
2893 BUG_ON(!dl);
2894 BUG_ON(!rh);
2896 /* Report that we already have a region, so it gets finished first during suspension. */
2897 if (addr->reg)
2898 return 1;
2900 if (RSSuspend(rs))
2901 return -EPERM;
2903 if (dl->type->get_sync_count(dl) >= rec->nr_regions)
2904 return -ENOENT;
2906 /* If we don't have enough bandwidth, we don't proceed recovering. */
2907 if (!recover_bandwidth(rs))
2908 return -EAGAIN;
2910 /* Start quiescing a region. */
2911 dm_rh_recovery_prepare(rh);
2912 addr->reg = dm_rh_recovery_start(rh);
2913 if (!addr->reg)
2914 return -EAGAIN;
2916 addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
2917 addr->end = addr->pos + dm_rh_get_region_size(rh);
2920 * Take one global io reference out for the
2921 * whole region, which is going to be released
2922 * when the region is completely done with.
2924 io_get(rs);
2925 return 0;
2928 /* Update region hash state. */
2929 enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
2930 static void recover_rh_update(struct stripe *stripe, enum recover_type success)
2932 struct recover_addr *addr = stripe->recover;
2933 struct raid_set *rs = RS(stripe->sc);
2934 struct recover *rec = &rs->recover;
2936 if (!addr->reg) {
2937 DMERR("%s- Called w/o region", __func__);
2938 return;
2941 dm_rh_recovery_end(addr->reg, success);
2942 if (success)
2943 rec->nr_regions_recovered++;
2945 addr->reg = NULL;
2948 * Completely done with this region ->
2949 * release the 1st io reference.
2951 io_put(rs);
2954 /* Set start of recovery state. */
2955 static void set_start_recovery(struct raid_set *rs)
2957 /* Initialize recovery. */
2958 rs->recover.start_jiffies = jiffies;
2959 rs->recover.end_jiffies = 0;
2962 /* Set end of recovery state. */
2963 static void set_end_recovery(struct raid_set *rs)
2965 ClearRSRecover(rs);
2966 /* Attention: do not reset this any more -> 'i' remains in the status output and userspace might rely on it disappearing! */
2967 rs->set.dev_to_init = -1;
2969 /* Check for jiffies overrun. */
2970 rs->recover.end_jiffies = jiffies;
2971 if (rs->recover.end_jiffies < rs->recover.start_jiffies)
2972 rs->recover.end_jiffies = ~0;
2975 /* Handle recovery on one recovery stripe. */
2976 static int _do_recovery(struct stripe *stripe)
2978 int r;
2979 struct raid_set *rs = RS(stripe->sc);
2980 struct recover_addr *addr = stripe->recover;
2982 /* If recovery is active -> return. */
2983 if (stripe_io_ref(stripe))
2984 return 1;
2986 /* IO error is fatal for recovery -> stop it. */
2987 if (unlikely(StripeError(stripe)))
2988 goto err;
2990 /* Recovery end required. */
2991 if (unlikely(RSDegraded(rs)))
2992 goto err;
2994 /* Get a region to recover. */
2995 r = stripe_recover_get_region(stripe);
2996 switch (r) {
2997 case 0: /* Got a new region: flag initial read before write. */
2998 SetStripeRBW(stripe);
2999 case 1: /* Have a region in the works. */
3000 break;
3001 case -EAGAIN:
3002 /* No bandwidth/quiesced region yet, try later. */
3003 if (!io_ref(rs))
3004 wake_do_raid_delayed(rs, HZ / 4);
3005 case -EPERM:
3006 /* Suspend. */
3007 return 1;
3008 case -ENOENT: /* No more regions to recover. */
3009 schedule_work(&rs->io.ws_do_table_event);
3010 return 0;
3011 default:
3012 BUG();
3015 /* Read/write a recover stripe. */
3016 r = stripe_recover_rw(stripe);
3017 if (r)
3018 /* IO initiated. */
3019 return 1;
3021 /* Read and write finished -> update recovery position within region. */
3022 addr->pos += stripe->io.size;
3024 /* If we're at end of region, update region hash. */
3025 if (addr->pos >= addr->end ||
3026 addr->pos >= rs->set.sectors_per_dev)
3027 recover_rh_update(stripe, REC_SUCCESS);
3028 else
3029 /* Prepare to read next region segment. */
3030 SetStripeRBW(stripe);
3032 /* Schedule myself for another round... */
3033 wake_do_raid(rs);
3034 return 1;
3036 err:
3037 /* FIXME: rather try recovering other regions on error? */
3038 rs_check_degrade(stripe);
3039 recover_rh_update(stripe, REC_FAILURE);
3041 /* Check state of partially recovered array. */
3042 if (RSDegraded(rs) && !RSDead(rs) &&
3043 rs->set.dev_to_init != -1 &&
3044 rs->set.ei != rs->set.dev_to_init) {
3045 /* Broken drive != drive to recover -> FATAL. */
3046 SetRSDead(rs);
3047 DMERR("FATAL: failed device != device to initialize -> "
3048 "RAID set broken");
3051 if (StripeError(stripe) || RSDegraded(rs)) {
3052 char buf[BDEVNAME_SIZE];
3054 DMERR("stopping recovery due to "
3055 "ERROR on /dev/%s, stripe at offset %llu",
3056 bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3057 (unsigned long long) stripe->key);
3061 /* Make sure, that all quiesced regions get released. */
3062 while (addr->reg) {
3063 dm_rh_recovery_end(addr->reg, -EIO);
3064 addr->reg = dm_rh_recovery_start(rs->recover.rh);
3067 return 0;
3070 /* Called by main io daemon to recover regions. */
3071 static int do_recovery(struct raid_set *rs)
3073 if (RSRecover(rs)) {
3074 int r = 0;
3075 struct stripe *stripe;
3077 list_for_each_entry(stripe, &rs->recover.stripes,
3078 lists[LIST_RECOVER])
3079 r += _do_recovery(stripe);
3081 if (r)
3082 return r;
3084 set_end_recovery(rs);
3085 stripe_recover_free(rs);
3088 return 0;
3092 * END recovery functions
3095 /* End io process all stripes handed in by endio() callback. */
3096 static void _do_endios(struct raid_set *rs, struct stripe *stripe,
3097 struct list_head *flush_list)
3099 /* First unlock all required chunks. */
3100 stripe_chunks_unlock(stripe);
3103 * If an io error on a stripe occurred, degrade the RAID set
3104 * and try to endio as many bios as possible. If any bios can't
3105 * be endio processed, requeue the stripe (stripe_ref() != 0).
3107 if (TestClearStripeError(stripe)) {
3109 * FIXME: if read, rewrite the failed chunk after reconstruction
3110 * in order to trigger disk bad sector relocation.
3112 rs_check_degrade(stripe); /* Resets ChunkError(). */
3113 ClearStripeReconstruct(stripe);
3114 ClearStripeReconstructed(stripe);
3117 * FIXME: if write, don't endio writes in flight and don't
3118 * allow for new writes until userspace has updated
3119 * its metadata.
3123 /* Got to reconstruct a missing chunk. */
3124 if (StripeReconstruct(stripe)) {
3126 * (*2*) We use StripeReconstruct() to allow for
3127 * all chunks to be xored into the reconstructed
3128 * one (see chunk_must_xor()).
3130 stripe_reconstruct(stripe);
3133 * (*3*) Now we reset StripeReconstruct() and flag
3134 * StripeReconstructed() to show to stripe_rw(),
3135 * that we have reconstructed a missing chunk.
3137 ClearStripeReconstruct(stripe);
3138 SetStripeReconstructed(stripe);
3140 /* FIXME: reschedule to be written in case of read. */
3141 /* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
3142 chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
3143 stripe_chunks_rw(stripe);
3144 } */
3146 stripe->idx.recover = -1;
3150 * Now that we eventually got a complete stripe, we
3151 * can process the rest of the end ios on reads.
3153 stripe_endio(READ, stripe);
3155 /* End io all merged writes if not prohibited. */
3156 if (!RSProhibitWrites(rs) && StripeMerged(stripe)) {
3157 ClearStripeMerged(stripe);
3158 stripe_endio(WRITE_MERGED, stripe);
3161 /* If RAID set is dead -> fail any ios to dead drives. */
3162 if (RSDead(rs)) {
3163 if (!TestSetRSDeadEndioMessage(rs))
3164 DMERR("RAID set dead: failing ios to dead devices");
3166 stripe_fail_io(stripe);
3170 * We have stripe references still,
3171 * because of read-before-writes or IO errors ->
3172 * got to put on flush list for processing.
3174 if (stripe_ref(stripe)) {
3175 BUG_ON(!list_empty(stripe->lists + LIST_LRU));
3176 list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
3177 atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
3178 } else
3179 stripe_lru_add(stripe);
3182 /* Pop any endio stripes off of the endio list and belabour them. */
3183 static void do_endios(struct raid_set *rs)
3185 struct stripe_cache *sc = &rs->sc;
3186 struct stripe *stripe;
3187 /* IO flush list for sorted requeued stripes. */
3188 struct list_head flush_list;
3190 INIT_LIST_HEAD(&flush_list);
3192 while ((stripe = stripe_endio_pop(sc))) {
3193 /* Avoid endio on stripes with newly io'ed chunks. */
3194 if (!stripe_io_ref(stripe))
3195 _do_endios(rs, stripe, &flush_list);
3199 * Insert any requeued stripes in the proper
3200 * order at the beginning of the io (flush) list.
3202 list_splice(&flush_list, sc->lists + LIST_FLUSH);
3205 /* Flush any stripes on the io list. */
3206 static int do_flush(struct raid_set *rs)
3208 int r = 0;
3209 struct stripe *stripe;
3211 while ((stripe = stripe_io_pop(&rs->sc)))
3212 r += stripe_rw(stripe); /* Read/write stripe. */
3214 return r;
3217 /* Stripe cache resizing. */
3218 static void do_sc_resize(struct raid_set *rs)
3220 unsigned set = atomic_read(&rs->sc.stripes_to_set);
3222 if (set) {
3223 unsigned cur = atomic_read(&rs->sc.stripes);
3224 int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
3225 sc_shrink(&rs->sc, cur - set);
3227 /* Flag end of resizing if ok. */
3228 if (!r)
3229 atomic_set(&rs->sc.stripes_to_set, 0);
3234 * Process all ios
3236 * We do different things with the io depending
3237 * on the state of the region that it is in:
3239 * o reads: hang off stripe cache or postpone if full
3241 * o writes:
3243 * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3244 * In case stripe cache is full or busy, postpone the io.
3246 * RECOVERING: delay the io until recovery of the region completes.
3249 static void do_ios(struct raid_set *rs, struct bio_list *ios)
3251 int r;
3252 unsigned flush = 0, delay = 0;
3253 sector_t sector;
3254 struct dm_rh_client *rh = rs->recover.rh;
3255 struct bio *bio;
3256 struct bio_list reject;
3258 bio_list_init(&reject);
3261 * Classify each io:
3262 * o delay writes to recovering regions (let reads go through)
3263 * o queue io to all other regions
3265 while ((bio = bio_list_pop(ios))) {
3267 * In case we get a barrier bio, push it back onto
3268 * the input queue unless all work queues are empty
3269 * and the stripe cache is inactive.
3271 if (bio->bi_rw & REQ_FLUSH) {
3272 /* REMOVEME: statistics. */
3273 atomic_inc(rs->stats + S_BARRIER);
3274 if (delay ||
3275 !list_empty(rs->sc.lists + LIST_FLUSH) ||
3276 !bio_list_empty(&reject) ||
3277 sc_active(&rs->sc)) {
3278 bio_list_push(ios, bio);
3279 break;
3283 /* If writes prohibited because of failures -> postpone. */
3284 if (RSProhibitWrites(rs) && bio_data_dir(bio) == WRITE) {
3285 bio_list_add(&reject, bio);
3286 continue;
3289 /* Check for recovering regions. */
3290 sector = _sector(rs, bio);
3291 r = region_state(rs, sector, DM_RH_RECOVERING);
3292 if (unlikely(r)) {
3293 delay++;
3294 /* Delay writes to recovering regions. */
3295 dm_rh_delay_by_region(rh, bio,
3296 dm_rh_sector_to_region(rh,
3297 sector));
3298 /* REMOVEME: statistics.*/
3299 atomic_inc(rs->stats + S_DELAYED_BIOS);
3300 atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3302 /* Force bandwidth tests in recovery. */
3303 SetRSBandwidth(rs);
3304 } else {
3306 * Process ios to non-recovering regions by queueing
3307 * them to stripes (does dm_rh_inc() for writes).
3309 flush += stripe_queue_bio(rs, bio, &reject);
3313 if (flush) {
3314 /* FIXME: better error handling. */
3315 r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3316 if (r)
3317 DMERR_LIMIT("dirty log flush");
3320 /* Merge any rejected bios back to the head of the input list. */
3321 bio_list_merge_head(ios, &reject);
3324 /* Send an event in case we're getting too busy. */
3325 static void do_busy_event(struct raid_set *rs)
3327 if (sc_busy(rs)) {
3328 if (!TestSetRSScBusy(rs))
3329 schedule_work(&rs->io.ws_do_table_event);
3330 } else
3331 ClearRSScBusy(rs);
3334 /* Throw an event. */
3335 static void do_table_event(struct work_struct *ws)
3337 struct raid_set *rs = container_of(ws, struct raid_set,
3338 io.ws_do_table_event);
3339 dm_table_event(rs->ti->table);
3343 /*-----------------------------------------------------------------
3344 * RAID daemon
3345 *---------------------------------------------------------------*/
3347 * o belabour all end ios
3348 * o update the region hash states
3349 * o optionally shrink the stripe cache
3350 * o optionally do recovery
3351 * o unplug any component raid devices with queued bios
3352 * o grab the input queue
3353 * o work on all requeued or new ios and perform stripe cache flushes
3354 * o unplug any component raid devices with queued bios
3355 * o check if the stripe cache gets too busy and throw an event if so
3357 static void do_raid(struct work_struct *ws)
3359 int r;
3360 struct raid_set *rs = container_of(ws, struct raid_set,
3361 io.dws_do_raid.work);
3362 struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3365 * We always need to end io, so that ios can get errored in
3366 * case the set failed and the region counters get decremented
3367 * before we update region hash states and go any further.
3369 do_endios(rs);
3370 dm_rh_update_states(rs->recover.rh, 1);
3373 * Now that we've end io'd, which may have put stripes on the LRU list
3374 * to allow for shrinking, we resize the stripe cache if requested.
3376 do_sc_resize(rs);
3378 /* Try to recover regions. */
3379 r = do_recovery(rs);
3381 /* Quickly grab all new ios queued and add them to the work list. */
3382 mutex_lock(&rs->io.in_lock);
3383 bio_list_merge(ios, ios_in);
3384 bio_list_init(ios_in);
3385 mutex_unlock(&rs->io.in_lock);
3387 if (!bio_list_empty(ios))
3388 do_ios(rs, ios); /* Got ios to work into the cache. */
3390 r = do_flush(rs); /* Flush any stripes on io list. */
3392 do_busy_event(rs); /* Check if we got too busy. */
3396 * Callback for region hash to dispatch
3397 * delayed bios queued to recovered regions
3398 * (gets called via dm_rh_update_states()).
3400 static void dispatch_delayed_bios(void *context, struct bio_list *bl)
3402 struct raid_set *rs = context;
3403 struct bio *bio;
3405 /* REMOVEME: statistics; decrement pending delayed bios counter. */
3406 bio_list_for_each(bio, bl)
3407 atomic_dec(rs->stats + S_DELAYED_BIOS);
3409 /* Merge region hash private list to work list. */
3410 bio_list_merge_head(&rs->io.work, bl);
3411 bio_list_init(bl);
3412 ClearRSBandwidth(rs);
3415 /*************************************************************
3416 * Constructor helpers
3417 *************************************************************/
3418 /* Calculate MB/sec. */
3419 static unsigned mbpers(struct raid_set *rs, unsigned io_size)
3421 return to_bytes((rs->xor.speed * rs->set.data_devs *
3422 io_size * HZ / XOR_SPEED_TICKS) >> 10) >> 10;
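/*
 * Worked example (illustrative figures only): a 3-device RAID5 set
 * (2 data devices), io_size = 64 sectors and an xor_speed() result of
 * 1000 xors per tick gives
 *
 *	to_bytes((1000 * 2 * 64 * HZ / XOR_SPEED_TICKS) >> 10) >> 10
 *
 * which works out to 3125 MB/s at HZ = 250.
 */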
3426 * Discover fastest xor algorithm and # of chunks combination.
3428 /* Calculate speed of particular algorithm and # of chunks. */
3429 static unsigned xor_speed(struct stripe *stripe)
3431 int ticks = XOR_SPEED_TICKS;
3432 unsigned p = RS(stripe->sc)->set.raid_devs, r = 0;
3433 unsigned long j;
3435 /* Set uptodate so that common_xor()->xor() will belabour chunks. */
3436 while (p--)
3437 SetChunkUptodate(CHUNK(stripe, p));
3439 /* Wait for next tick. */
3440 for (j = jiffies; j == jiffies; );
3442 /* Do xors for a few ticks. */
3443 while (ticks--) {
3444 unsigned xors = 0;
3446 for (j = jiffies; j == jiffies; ) {
3447 mb();
3448 common_xor(stripe, stripe->io.size, 0, 0);
3449 mb();
3450 xors++;
3451 mb();
3454 if (xors > r)
3455 r = xors;
3458 return r;
3461 /* Define to run the xor optimization over multiple recovery stripes. */
3462 #define DMRAID45_XOR_TEST
3464 /* Optimize xor algorithm for this RAID set. */
3465 static unsigned xor_optimize(struct raid_set *rs)
3467 unsigned chunks_max = 2, speed_max = 0;
3468 struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3469 struct stripe *stripe;
3470 unsigned io_size = 0, speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;
3472 BUG_ON(list_empty(&rs->recover.stripes));
3473 #ifndef DMRAID45_XOR_TEST
3474 stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3475 lists[LIST_RECOVER]);
3476 #endif
3478 /* Try all xor functions. */
3479 while (f-- > xor_funcs) {
3480 unsigned speed;
3482 #ifdef DMRAID45_XOR_TEST
3483 list_for_each_entry(stripe, &rs->recover.stripes,
3484 lists[LIST_RECOVER]) {
3485 io_size = stripe->io.size;
3486 #endif
3488 /* Set actual xor function for common_xor(). */
3489 rs->xor.f = f;
3490 rs->xor.chunks = (f->f == xor_blocks_wrapper ?
3491 (MAX_XOR_BLOCKS + 1) :
3492 XOR_CHUNKS_MAX);
3493 if (rs->xor.chunks > rs->set.raid_devs)
3494 rs->xor.chunks = rs->set.raid_devs;
3496 for ( ; rs->xor.chunks > 1; rs->xor.chunks--) {
3497 speed = xor_speed(stripe);
3499 #ifdef DMRAID45_XOR_TEST
3500 if (f->f == xor_blocks_wrapper) {
3501 if (speed > speed_xor_blocks)
3502 speed_xor_blocks = speed;
3503 } else if (speed > speed_hm)
3504 speed_hm = speed;
3506 if (speed < speed_min)
3507 speed_min = speed;
3508 #endif
3510 if (speed > speed_max) {
3511 speed_max = speed;
3512 chunks_max = rs->xor.chunks;
3513 f_max = f;
3516 #ifdef DMRAID45_XOR_TEST
3518 #endif
3521 /* Memorize optimal parameters. */
3522 rs->xor.f = f_max;
3523 rs->xor.chunks = chunks_max;
3524 #ifdef DMRAID45_XOR_TEST
3525 DMINFO("%s stripes=%u/size=%u min=%u xor_blocks=%u hm=%u max=%u",
3526 speed_max == speed_hm ? "HM" : "NB",
3527 rs->recover.recovery_stripes, io_size, speed_min,
3528 speed_xor_blocks, speed_hm, speed_max);
3529 #endif
3530 return speed_max;
3534 * Allocate a RAID context (a RAID set)
3536 /* Structure for variable RAID parameters. */
3537 struct variable_parms {
3538 int bandwidth;
3539 int bandwidth_parm;
3540 int chunk_size;
3541 int chunk_size_parm;
3542 int io_size;
3543 int io_size_parm;
3544 int stripes;
3545 int stripes_parm;
3546 int recover_io_size;
3547 int recover_io_size_parm;
3548 int raid_parms;
3549 int recovery;
3550 int recovery_stripes;
3551 int recovery_stripes_parm;
3554 static struct raid_set *
3555 context_alloc(struct raid_type *raid_type, struct variable_parms *p,
3556 unsigned raid_devs, sector_t sectors_per_dev,
3557 struct dm_target *ti, unsigned dl_parms, char **argv)
3559 int r;
3560 size_t len;
3561 sector_t region_size, ti_len;
3562 struct raid_set *rs = NULL;
3563 struct dm_dirty_log *dl;
3564 struct recover *rec;
3567 * Create the dirty log
3569 * We need to change length for the dirty log constructor,
3570 * because we want an amount of regions for all stripes derived
3571 * from the single device size, so that we can keep region
3572 * size = 2^^n independent of the number of devices
3574 ti_len = ti->len;
3575 ti->len = sectors_per_dev;
3576 dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
3577 ti->len = ti_len;
3578 if (!dl)
3579 goto bad_dirty_log;
3581 /* Chunk size *must* not be larger than region size. */
3582 region_size = dl->type->get_region_size(dl);
3583 if (p->chunk_size > region_size)
3584 goto bad_chunk_size;
3586 /* Recover io size *must* not be larger than region size either. */
3587 if (p->recover_io_size > region_size)
3588 goto bad_recover_io_size;
3590 /* Size and allocate the RAID set structure. */
3591 len = sizeof(*rs->data) + sizeof(*rs->dev);
3592 if (dm_array_too_big(sizeof(*rs), len, raid_devs))
3593 goto bad_array;
3595 len = sizeof(*rs) + raid_devs * len;
3596 rs = kzalloc(len, GFP_KERNEL);
3597 if (!rs)
3598 goto bad_alloc;
3600 rec = &rs->recover;
3601 atomic_set(&rs->io.in_process, 0);
3602 atomic_set(&rs->io.in_process_max, 0);
3603 rec->io_size = p->recover_io_size;
3605 /* Pointer to data array. */
3606 rs->data = (unsigned long **)
3607 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
3608 rec->dl = dl;
3609 rs->set.raid_devs = raid_devs;
3610 rs->set.data_devs = raid_devs - raid_type->parity_devs;
3611 rs->set.raid_type = raid_type;
3613 rs->set.raid_parms = p->raid_parms;
3614 rs->set.chunk_size_parm = p->chunk_size_parm;
3615 rs->set.io_size_parm = p->io_size_parm;
3616 rs->sc.stripes_parm = p->stripes_parm;
3617 rec->io_size_parm = p->recover_io_size_parm;
3618 rec->bandwidth_parm = p->bandwidth_parm;
3619 rec->recovery = p->recovery;
3620 rec->recovery_stripes = p->recovery_stripes;
3623 * Set chunk and io size and respective shifts
3624 * (used to avoid divisions)
3626 rs->set.chunk_size = p->chunk_size;
3627 rs->set.chunk_shift = ffs(p->chunk_size) - 1;
3629 rs->set.io_size = p->io_size;
3630 rs->set.io_mask = p->io_size - 1;
3631 /* Mask to adjust address key in case io_size != chunk_size. */
3632 rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
3634 rs->set.sectors_per_dev = sectors_per_dev;
3636 rs->set.ei = -1; /* Indicate no failed device. */
3637 atomic_set(&rs->set.failed_devs, 0);
3639 rs->ti = ti;
3641 atomic_set(rec->io_count + IO_WORK, 0);
3642 atomic_set(rec->io_count + IO_RECOVER, 0);
3644 /* Initialize io lock and queues. */
3645 mutex_init(&rs->io.in_lock);
3646 mutex_init(&rs->io.xor_lock);
3647 bio_list_init(&rs->io.in);
3648 bio_list_init(&rs->io.work);
3650 init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
3652 rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
3653 rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
3654 wake_dummy, wake_do_raid, 0, p->recovery_stripes,
3655 dl, region_size, rec->nr_regions);
3656 if (IS_ERR(rec->rh))
3657 goto bad_rh;
3659 /* Initialize stripe cache. */
3660 r = sc_init(rs, p->stripes);
3661 if (r)
3662 goto bad_sc;
3664 /* REMOVEME: statistics. */
3665 stats_reset(rs);
3666 ClearRSDevelStats(rs); /* Disable development statistics. */
3667 return rs;
3669 bad_dirty_log:
3670 TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
3672 bad_chunk_size:
3673 dm_dirty_log_destroy(dl);
3674 TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
3676 bad_recover_io_size:
3677 dm_dirty_log_destroy(dl);
3678 TI_ERR_RET("Recover stripe io size larger than region size",
3679 ERR_PTR(-EINVAL));
3681 bad_array:
3682 dm_dirty_log_destroy(dl);
3683 TI_ERR_RET("Arry too big", ERR_PTR(-EINVAL));
3685 bad_alloc:
3686 dm_dirty_log_destroy(dl);
3687 TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
3689 bad_rh:
3690 dm_dirty_log_destroy(dl);
3691 ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
3692 goto free_rs;
3694 bad_sc:
3695 dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
3696 sc_exit(&rs->sc);
3697 ti->error = DM_MSG_PREFIX "Error creating stripe cache";
3698 free_rs:
3699 kfree(rs);
3700 return ERR_PTR(-ENOMEM);
3703 /* Free a RAID context (a RAID set). */
3704 static void context_free(struct raid_set *rs, unsigned p)
3706 while (p--)
3707 dm_put_device(rs->ti, rs->dev[p].dev);
3709 sc_exit(&rs->sc);
3710 dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
3711 kfree(rs);
3714 /* Create work queue and initialize delayed work. */
3715 static int rs_workqueue_init(struct raid_set *rs)
3717 struct dm_target *ti = rs->ti;
3719 rs->io.wq = create_singlethread_workqueue(DAEMON);
3720 if (!rs->io.wq)
3721 TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
3723 INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
3724 INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
3725 return 0;
3728 /* Return pointer to raid_type structure for raid name. */
3729 static struct raid_type *get_raid_type(char *name)
3731 struct raid_type *r = ARRAY_END(raid_types);
3733 while (r-- > raid_types) {
3734 if (!strcmp(r->name, name))
3735 return r;
3738 return NULL;
3741 /* FIXME: factor out to dm core. */
3742 static int multiple(sector_t a, sector_t b, sector_t *n)
3744 sector_t r = a;
3746 sector_div(r, b);
3747 *n = r;
3748 return a == r * b;
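/*
 * Example (illustrative only): multiple(1000, 8, &n) stores 125 in n and
 * returns 1 because 1000 == 125 * 8, whereas multiple(1000, 16, &n)
 * stores 62 and returns 0; this is how the constructor rejects target or
 * device lengths that don't divide evenly.
 */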
3751 /* Log RAID set information to kernel log. */
3752 static void rs_log(struct raid_set *rs, unsigned io_size)
3754 unsigned p;
3755 char buf[BDEVNAME_SIZE];
3757 for (p = 0; p < rs->set.raid_devs; p++)
3758 DMINFO("/dev/%s is raid disk %u%s",
3759 bdevname(rs->dev[p].dev->bdev, buf), p,
3760 (p == rs->set.pi) ? " (parity)" : "");
3762 DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
3763 "algorithm \"%s\", %u chunks with %uMB/s\n"
3764 "%s set with net %u/%u devices",
3765 rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
3766 atomic_read(&rs->sc.stripes),
3767 rs->xor.f->name, rs->xor.chunks, mbpers(rs, io_size),
3768 rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
3771 /* Get all devices and offsets. */
3772 static int dev_parms(struct raid_set *rs, char **argv, int *p)
3774 struct dm_target *ti = rs->ti;
3776 DMINFO("rs->set.sectors_per_dev=%llu", (unsigned long long) rs->set.sectors_per_dev);
3777 for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
3778 int r;
3779 unsigned long long tmp;
3780 struct raid_dev *dev = rs->dev + *p;
3782 /* Get offset and device. */
3783 if (sscanf(argv[1], "%llu", &tmp) != 1 ||
3784 tmp > rs->set.sectors_per_dev)
3785 TI_ERR("Invalid RAID device offset parameter");
3787 dev->start = tmp;
3788 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
3789 &dev->dev);
3790 if (r)
3791 TI_ERR_RET("RAID device lookup failure", r);
3793 r = raid_dev_lookup(rs, dev);
3794 if (r != -ENODEV && r < *p) {
3795 (*p)++; /* Ensure dm_put_device() on actual device. */
3796 TI_ERR_RET("Duplicate RAID device", -ENXIO);
3800 return 0;
3803 /* Set recovery bandwidth. */
3804 static void
3805 recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
3807 rs->recover.bandwidth = bandwidth;
3808 rs->recover.bandwidth_work = 100 / bandwidth;
3811 /* Handle variable number of RAID parameters. */
3812 static int get_raid_variable_parms(struct dm_target *ti, char **argv,
3813 struct variable_parms *vp)
3815 int p, value;
3816 struct {
3817 int action; /* -1: skip, 0: no power2 check, 1: power2 check */
3818 char *errmsg;
3819 int min, max;
3820 int *var, *var2, *var3;
3821 } argctr[] = {
3822 { 1,
3823 "Invalid chunk size; must be -1 or 2^^n and <= 16384",
3824 IO_SIZE_MIN, CHUNK_SIZE_MAX,
3825 &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
3826 { 0,
3827 "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
3828 STRIPES_MIN, STRIPES_MAX,
3829 &vp->stripes_parm, &vp->stripes, NULL },
3830 { 1,
3831 "Invalid io size; must -1 or >= 8, 2^^n and less equal "
3832 "min(BIO_MAX_SECTORS/2, chunk size)",
3833 IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
3834 &vp->io_size_parm, &vp->io_size, NULL },
3835 { 1,
3836 "Invalid recovery io size; must be -1 or "
3837 "2^^n and less equal BIO_MAX_SECTORS/2",
3838 RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
3839 &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
3840 { 0,
3841 "Invalid recovery bandwidth percentage; "
3842 "must be -1 or > 0 and <= 100",
3843 BANDWIDTH_MIN, BANDWIDTH_MAX,
3844 &vp->bandwidth_parm, &vp->bandwidth, NULL },
3845 /* Handle sync argument separately in loop. */
3846 { -1,
3847 "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
3848 { 0,
3849 "Invalid number of recovery stripes;"
3850 "must be -1, > 0 and <= 64",
3851 RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
3852 &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
3853 }, *varp;
3855 /* Fetch # of variable raid parameters. */
3856 if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
3857 !range_ok(vp->raid_parms, 0, 7))
3858 TI_ERR("Bad variable raid parameters number");
3860 /* Preset variable RAID parameters. */
3861 vp->chunk_size = CHUNK_SIZE_DEFAULT;
3862 vp->io_size = IO_SIZE_DEFAULT;
3863 vp->stripes = STRIPES_DEFAULT;
3864 vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
3865 vp->bandwidth = BANDWIDTH_DEFAULT;
3866 vp->recovery = 1;
3867 vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
3869 /* Walk the array of argument constraints for all given ones. */
3870 for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
3871 BUG_ON(varp >= ARRAY_END(argctr));
3873 /* Special case for "[no]sync" string argument. */
3874 if (varp->action < 0) {
3875 if (!strcmp(*argv, "sync"))
3877 else if (!strcmp(*argv, "nosync"))
3878 vp->recovery = 0;
3879 else
3880 TI_ERR(varp->errmsg);
3882 argv++;
3883 continue;
3887 * Special case for io_size depending
3888 * on previously set chunk size.
3890 if (p == 2)
3891 varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
3893 if (sscanf(*(argv++), "%d", &value) != 1 ||
3894 (value != -1 &&
3895 ((varp->action && !is_power_of_2(value)) ||
3896 !range_ok(value, varp->min, varp->max))))
3897 TI_ERR(varp->errmsg);
3899 *varp->var = value;
3900 if (value != -1) {
3901 if (varp->var2)
3902 *varp->var2 = value;
3903 if (varp->var3)
3904 *varp->var3 = value;
3908 return 0;
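/*
 * Example argument run (illustrative only, not from a real table):
 * "7 64 128 8 256 10 nosync 4" requests chunk_size 64, 128 cache
 * stripes, io_size 8, recover_io_size 256, 10% recovery bandwidth, no
 * initial resynchronization and 4 recovery stripes; any numeric value
 * may be passed as -1 to keep its default.
 */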
3911 /* Parse optional locking parameters. */
3912 static int get_raid_locking_parms(struct dm_target *ti, char **argv,
3913 int *locking_parms,
3914 struct dm_raid45_locking_type **locking_type)
3916 if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
3917 char *lckstr = argv[1];
3918 size_t lcksz = strlen(lckstr);
3920 if (!strnicmp(lckstr, "none", lcksz)) {
3921 *locking_type = &locking_none;
3922 *locking_parms = 2;
3923 } else if (!strnicmp(lckstr, "cluster", lcksz)) {
3924 DMERR("locking type \"%s\" not yet implemented",
3925 lckstr);
3926 return -EINVAL;
3927 } else {
3928 DMERR("unknown locking type \"%s\"", lckstr);
3929 return -EINVAL;
3933 *locking_parms = 0;
3934 *locking_type = &locking_none;
3935 return 0;
3938 /* Set backing device read ahead properties of RAID set. */
3939 static void rs_set_read_ahead(struct raid_set *rs,
3940 unsigned sectors, unsigned stripes)
3942 unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
3943 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3944 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3946 /* Set read-ahead for the RAID set and the component devices. */
3947 if (ra_pages) {
3948 unsigned p = rs->set.raid_devs;
3950 bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
3952 while (p--) {
3953 struct request_queue *q =
3954 bdev_get_queue(rs->dev[p].dev->bdev);
3956 q->backing_dev_info.ra_pages = ra_pages;
3961 /* Set congested function. */
3962 static void rs_set_congested_fn(struct raid_set *rs)
3964 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3965 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3967 /* Set congested function and data. */
3968 bdi->congested_fn = rs_congested;
3969 bdi->congested_data = rs;
3973 * Construct a RAID4/5 mapping:
3975 * log_type #log_params <log_params> \
3976 * raid_type [#parity_dev] #raid_variable_params <raid_params> \
3977 * [locking "none"/"cluster"]
3978 * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
3980 * log_type = "core"/"disk",
3981 * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
3982 * log_params = [dirty_log_path] region_size [[no]sync])
3984 * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
3986 * #parity_dev = N if raid_type = "raid4"
3987 * o N = -1: pick default = last device
3988 * o N >= 0 and < #raid_devs: parity device index
3990 * #raid_variable_params = 0-7; raid_params (-1 = default):
3991 * [chunk_size [#stripes [io_size [recover_io_size \
3992 * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
3993 * o chunk_size (unit to calculate drive addresses; must be 2^^n, >= 8
3994 * and <= CHUNK_SIZE_MAX)
3995 * o #stripes is number of stripes allocated to stripe cache
3996 * (must be >= STRIPES_MIN and <= STRIPES_MAX)
3997 * o io_size (io unit size per device in sectors; must be 2^^n and >= 8)
3998 * o recover_io_size (io unit size per device for recovery in sectors;
3999 * must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
4000 * o %recovery_bandwidth is the maximum amount spent for recovery during
4001 * application io (1-100%)
4002 * o recovery switch = [sync|nosync]
4003 * o #recovery_stripes is the number of recovery stripes used for
4004 * parallel recovery of the RAID set
4005 * If raid_variable_params = 0, defaults will be used.
4006 * Any raid_variable_param can be set to -1 to apply a default
4008 * #raid_devs = N (N >= 3)
4010 * #dev_to_initialize = N
4011 * -1: initialize parity on all devices
4012 * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
4013 * of a failed device's content after replacement
4015 * <dev_path> = device_path (eg, /dev/sdd1)
4016 * <offset> = begin at offset on <dev_path>
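/*
 * Illustrative dmsetup table line (device paths and sizes are made up
 * for the example):
 *
 *	0 2097152 raid45 core 2 8192 nosync raid5_la 0 3 -1 \
 *		/dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0
 *
 * i.e. a core dirty log with an 8192-sector region size and no initial
 * resync, a left-asymmetric RAID5 across three devices with all variable
 * parameters left at their defaults (0) and no particular device forced
 * to initialize (-1).
 */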
4019 #define MIN_PARMS 13
4020 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
4022 int dev_to_init, dl_parms, i, locking_parms,
4023 parity_parm, pi = -1, r, raid_devs;
4024 sector_t tmp, sectors_per_dev;
4025 struct dm_raid45_locking_type *locking;
4026 struct raid_set *rs;
4027 struct raid_type *raid_type;
4028 struct variable_parms parms;
4030 /* Ensure minimum number of parameters. */
4031 if (argc < MIN_PARMS)
4032 TI_ERR("Not enough parameters");
4034 /* Fetch # of dirty log parameters. */
4035 if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
4036 !range_ok(dl_parms, 1, 4711)) /* ;-) */
4037 TI_ERR("Bad dirty log parameters number");
4039 /* Check raid_type. */
4040 raid_type = get_raid_type(argv[dl_parms + 2]);
4041 if (!raid_type)
4042 TI_ERR("Bad raid type");
4044 /* In case of RAID4, parity drive is selectable. */
4045 parity_parm = !!(raid_type->level == raid4);
4047 /* Handle variable number of RAID parameters. */
4048 r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
4049 &parms);
4050 if (r)
4051 return r;
4053 /* Handle any locking parameters. */
4054 r = get_raid_locking_parms(ti,
4055 argv + dl_parms + parity_parm +
4056 parms.raid_parms + 4,
4057 &locking_parms, &locking);
4058 if (r)
4059 return r;
4061 /* # of raid devices. */
4062 i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
4063 if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
4064 raid_devs < raid_type->minimal_devs)
4065 TI_ERR("Invalid number of raid devices");
4067 /* In case of RAID4, check parity drive index is in limits. */
4068 if (raid_type->level == raid4) {
4069 /* Fetch index of parity device. */
4070 if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
4071 (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
4072 TI_ERR("Invalid RAID4 parity device index");
4076 * Index of device to initialize starts at 0
4078 * o -1 -> don't initialize a selected device;
4079 * initialize parity conforming to algorithm
4080 * o 0..raid_devs-1 -> initialize respective device
4081 * (used for reconstruction of a replaced device)
4083 if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
4084 locking_parms + 5], "%d", &dev_to_init) != 1 ||
4085 !range_ok(dev_to_init, -1, raid_devs - 1))
4086 TI_ERR("Invalid number for raid device to initialize");
4088 /* Check # of raid device arguments. */
4089 if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
4090 2 * raid_devs)
4091 TI_ERR("Wrong number of raid device/offset arguments");
4094 * Check that the table length is divisible
4095 * without remainder by (raid_devs - parity_devs)
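 * (purely illustrative numbers: a 4-device raid5 set has one parity
 * device, so a target length of 6291456 sectors gives
 * sectors_per_dev = 6291456 / 3 = 2097152 with no remainder)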
4097 if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4098 &sectors_per_dev))
4099 TI_ERR("Target length not divisible by number of data devices");
4102 * Check that the device size is
4103 * divisible without remainder by the chunk size
4105 if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
4106 TI_ERR("Device length not divisible by chunk_size");
4108 /****************************************************************
4109 * Now that we checked the constructor arguments ->
4110 * let's allocate the RAID set
4111 ****************************************************************/
4112 rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
4113 ti, dl_parms, argv);
4114 if (IS_ERR(rs))
4115 return PTR_ERR(rs);
4118 rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
4119 rs->set.pi = rs->set.pi_parm = pi;
4121 /* Set RAID4 parity drive index. */
4122 if (raid_type->level == raid4)
4123 rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4125 recover_set_bandwidth(rs, parms.bandwidth);
4127 /* Use locking type to lock stripe access. */
4128 rs->locking = locking;
4130 /* Get the device/offset tuples. */
4131 argv += dl_parms + 6 + parity_parm + parms.raid_parms;
4132 r = dev_parms(rs, argv, &i);
4133 if (r)
4134 goto err;
4136 /* Set backing device information (e.g. read ahead). */
4137 rs_set_read_ahead(rs, 2 * rs->set.chunk_size /* sectors per device */,
4138 2 /* # of stripes */);
4139 rs_set_congested_fn(rs); /* Set congested function. */
4140 SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4141 rs->xor.speed = xor_optimize(rs); /* Select best xor algorithm. */
4143 /* Set for recovery of any nosync regions. */
4144 if (parms.recovery)
4145 SetRSRecover(rs);
4146 else {
4148 * Need to free recovery stripe(s) here in case
4149 * of nosync, because xor_optimize uses one.
4151 set_start_recovery(rs);
4152 set_end_recovery(rs);
4153 stripe_recover_free(rs);
4157 * Enforce parity chunk creation for small numbers of
4158 * array members, where xoring parity out and back in
4159 * doesn't gain us the performance it does with larger
4160 * array member counts.
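 * (assuming minimal_devs is the documented minimum of 3 for raid4/raid5,
 * this covers sets of 3 or 4 members; larger sets skip the enforcement)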
4162 if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
4163 SetRSEnforceParityCreation(rs);
4166 * Make sure that dm core only hands maximum io size
4167 * length down and pays attention to io boundaries.
4169 ti->split_io = rs->set.io_size;
4170 ti->private = rs;
4172 /* Initialize work queue to handle this RAID set's io. */
4173 r = rs_workqueue_init(rs);
4174 if (r)
4175 goto err;
4177 rs_log(rs, rs->recover.io_size); /* Log information about RAID set. */
4178 return 0;
4180 err:
4181 context_free(rs, i);
4182 return r;
4186 * Destruct a raid mapping
4188 static void raid_dtr(struct dm_target *ti)
4190 struct raid_set *rs = ti->private;
4192 destroy_workqueue(rs->io.wq);
4193 context_free(rs, rs->set.raid_devs);
4196 /* Raid mapping function. */
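/*
 * Bios are not remapped inline: read-ahead is rejected outright and
 * everything else is queued to the set's input list for the daemon to
 * process, hence the DM_MAPIO_SUBMITTED return.
 */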
4197 static int raid_map(struct dm_target *ti, struct bio *bio,
4198 union map_info *map_context)
4200 /* Don't waste stripe cache capacity on read-ahead io. */
4201 if (bio_rw(bio) == READA)
4202 return -EIO;
4203 else {
4204 struct raid_set *rs = ti->private;
4207 * Take an io reference which we wait on to drop
4208 * to zero on device suspension/destruction.
4210 io_get(rs);
4211 bio->bi_sector -= ti->begin; /* Remap sector. */
4213 /* Queue io to RAID set. */
4214 mutex_lock(&rs->io.in_lock);
4215 bio_list_add(&rs->io.in, bio);
4216 mutex_unlock(&rs->io.in_lock);
4218 /* Wake daemon to process input list. */
4219 wake_do_raid(rs);
4221 /* REMOVEME: statistics. */
4222 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
4223 S_BIOS_READ : S_BIOS_WRITE));
4224 return DM_MAPIO_SUBMITTED; /* Handle later. */
4228 /* Device suspend. */
4229 static void raid_presuspend(struct dm_target *ti)
4231 struct raid_set *rs = ti->private;
4232 struct dm_dirty_log *dl = rs->recover.dl;
4234 SetRSSuspend(rs);
4236 if (RSRecover(rs))
4237 dm_rh_stop_recovery(rs->recover.rh);
4239 cancel_delayed_work(&rs->io.dws_do_raid);
4240 flush_workqueue(rs->io.wq);
4241 wait_ios(rs); /* Wait for completion of all ios being processed. */
4243 if (dl->type->presuspend && dl->type->presuspend(dl))
4244 /* FIXME: need better error handling. */
4245 DMWARN("log presuspend failed");
4248 static void raid_postsuspend(struct dm_target *ti)
4250 struct raid_set *rs = ti->private;
4251 struct dm_dirty_log *dl = rs->recover.dl;
4253 if (dl->type->postsuspend && dl->type->postsuspend(dl))
4254 /* FIXME: need better error handling. */
4255 DMWARN("log postsuspend failed");
4259 /* Device resume. */
4260 static void raid_resume(struct dm_target *ti)
4262 struct raid_set *rs = ti->private;
4263 struct recover *rec = &rs->recover;
4264 struct dm_dirty_log *dl = rec->dl;
4266 DMINFO("%s...", __func__);
4267 if (dl->type->resume && dl->type->resume(dl))
4268 /* Resume dirty log. */
4269 /* FIXME: need better error handling. */
4270 DMWARN("log resume failed");
4272 rec->nr_regions_to_recover =
4273 rec->nr_regions - dl->type->get_sync_count(dl);
4275 /* Restart any unfinished recovery. */
4276 if (RSRecover(rs)) {
4277 set_start_recovery(rs);
4278 dm_rh_start_recovery(rec->rh);
4281 ClearRSSuspend(rs);
4284 /* Return stripe cache size. */
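/*
 * This is a rough estimate, in sectors, of the memory bound up in the
 * stripe cache: bookkeeping and page payload scaled by the number of
 * cached stripes, with the recovery stripes' payload counted only while
 * recovery is still in flight (end_jiffies == 0).
 */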
4285 static unsigned sc_size(struct raid_set *rs)
4287 return to_sector(atomic_read(&rs->sc.stripes) *
4288 (sizeof(struct stripe) +
4289 (sizeof(struct stripe_chunk) +
4290 (sizeof(struct page_list) +
4291 to_bytes(rs->set.io_size) *
4292 rs->set.raid_devs)) +
4293 (rs->recover.end_jiffies ?
4294 0 : rs->recover.recovery_stripes *
4295 to_bytes(rs->set.raid_devs * rs->recover.io_size))));
4298 /* REMOVEME: status output for development. */
4299 static void raid_devel_stats(struct dm_target *ti, char *result,
4300 unsigned *size, unsigned maxlen)
4302 unsigned sz = *size;
4303 unsigned long j;
4304 char buf[BDEVNAME_SIZE], *p;
4305 struct stats_map *sm;
4306 struct raid_set *rs = ti->private;
4307 struct recover *rec = &rs->recover;
4308 struct timespec ts;
4310 DMEMIT("%s %s=%u bw=%u\n",
4311 version, rs->xor.f->name, rs->xor.chunks, rs->recover.bandwidth);
4312 DMEMIT("act_ios=%d ", io_ref(rs));
4313 DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
4314 DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
4315 DMEMIT("act_stripes_max=%d\n",
4316 atomic_read(&rs->sc.active_stripes_max));
4318 for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
4319 DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4321 DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
4322 DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
4323 atomic_read(&rs->sc.stripes), rs->set.io_size,
4324 rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
4325 sc_size(rs));
4327 j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4328 rec->start_jiffies;
4329 jiffies_to_timespec(j, &ts);
4330 sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4331 p = strchr(buf, '.');
4332 p[3] = 0;
4334 DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
4335 (unsigned long long) rec->nr_regions_recovered,
4336 (unsigned long long) rec->nr_regions_to_recover,
4337 (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4339 *size = sz;
4342 static int raid_status(struct dm_target *ti, status_type_t type,
4343 char *result, unsigned maxlen)
4345 unsigned p, sz = 0;
4346 char buf[BDEVNAME_SIZE];
4347 struct raid_set *rs = ti->private;
4348 struct dm_dirty_log *dl = rs->recover.dl;
4349 int raid_parms[] = {
4350 rs->set.chunk_size_parm,
4351 rs->sc.stripes_parm,
4352 rs->set.io_size_parm,
4353 rs->recover.io_size_parm,
4354 rs->recover.bandwidth_parm,
4356 rs->recover.recovery_stripes,
4359 switch (type) {
4360 case STATUSTYPE_INFO:
4361 /* REMOVEME: statistics. */
4362 if (RSDevelStats(rs))
4363 raid_devel_stats(ti, result, &sz, maxlen);
4365 DMEMIT("%u ", rs->set.raid_devs);
4367 for (p = 0; p < rs->set.raid_devs; p++)
4368 DMEMIT("%s ",
4369 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
4371 DMEMIT("2 ");
4372 for (p = 0; p < rs->set.raid_devs; p++) {
4373 DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
4375 if (p == rs->set.pi)
4376 DMEMIT("p");
4378 if (p == rs->set.dev_to_init)
4379 DMEMIT("i");
4382 DMEMIT(" %llu/%llu ",
4383 (unsigned long long) dl->type->get_sync_count(dl),
4384 (unsigned long long) rs->recover.nr_regions);
4386 sz += dl->type->status(dl, type, result+sz, maxlen-sz);
4387 break;
4388 case STATUSTYPE_TABLE:
4389 sz = rs->recover.dl->type->status(rs->recover.dl, type,
4390 result, maxlen);
4391 DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);
4393 for (p = 0; p < rs->set.raid_parms; p++) {
4394 if (raid_parms[p] > -2)
4395 DMEMIT("%d ", raid_parms[p]);
4396 else
4397 DMEMIT("%s ", rs->recover.recovery ?
4398 "sync" : "nosync");
4401 DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4403 for (p = 0; p < rs->set.raid_devs; p++)
4404 DMEMIT("%s %llu ",
4405 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
4406 (unsigned long long) rs->dev[p].start);
4409 return 0;
4413 * Message interface
4415 /* Turn a delta into an absolute value. */
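/*
 * Illustrative calls with invented values:
 *
 *   _absolute("set", 10, 50)   -> 50       (use the given value as is)
 *   _absolute("grow", 10, 5)   -> 15       (current 10 grown by 5)
 *   _absolute("shrink", 10, 3) ->  7       (current 10 shrunk by 3)
 *   _absolute("bogus", 10, 3)  -> -EINVAL
 */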
4416 static int _absolute(char *action, int act, int r)
4418 size_t len = strlen(action);
4420 if (len < 2)
4421 len = 2;
4423 /* Make delta absolute. */
4424 if (!strncmp("set", action, len))
	; /* "set": keep the given value as is. */
4426 else if (!strncmp("grow", action, len))
4427 r += act;
4428 else if (!strncmp("shrink", action, len))
4429 r = act - r;
4430 else
4431 r = -EINVAL;
4433 return r;
4436 /* Change recovery io bandwidth. */
4437 static int bandwidth_change(struct raid_set *rs, int argc, char **argv,
4438 enum raid_set_flags flag)
4440 int act = rs->recover.bandwidth, bandwidth;
4442 if (argc != 2)
4443 return -EINVAL;
4445 if (sscanf(argv[1], "%d", &bandwidth) == 1 &&
4446 range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4447 /* Make delta bandwidth absolute. */
4448 bandwidth = _absolute(argv[0], act, bandwidth);
4450 /* Check range. */
4451 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4452 recover_set_bandwidth(rs, bandwidth);
4453 return 0;
4457 return -EINVAL;
4460 /* Set/reset development feature flags. */
4461 static int devel_flags(struct raid_set *rs, int argc, char **argv,
4462 enum raid_set_flags flag)
4464 size_t len;
4466 if (argc != 1)
4467 return -EINVAL;
4469 len = strlen(argv[0]);
4470 if (len < 2)
4471 len = 2;
4473 if (!strncmp(argv[0], "on", len))
4474 return test_and_set_bit(flag, &rs->io.flags) ? -EPERM : 0;
4475 else if (!strncmp(argv[0], "off", len))
4476 return test_and_clear_bit(flag, &rs->io.flags) ? 0 : -EPERM;
4477 else if (!strncmp(argv[0], "reset", len)) {
4478 if (flag == RS_DEVEL_STATS) {
4479 if (test_bit(flag, &rs->io.flags)) {
4480 stats_reset(rs);
4481 return 0;
4482 } else
4483 return -EPERM;
4484 } else {
4485 set_bit(flag, &rs->io.flags);
4486 return 0;
4490 return -EINVAL;
4493 /* Resize the stripe cache. */
4494 static int sc_resize(struct raid_set *rs, int argc, char **argv,
4495 enum raid_set_flags flag)
4497 int act, stripes;
4499 if (argc != 2)
4500 return -EINVAL;
4502 /* Deny permission in case the daemon is still resizing! */
4503 if (atomic_read(&rs->sc.stripes_to_set))
4504 return -EPERM;
4506 if (sscanf(argv[1], "%d", &stripes) == 1 &&
4507 stripes > 0) {
4508 act = atomic_read(&rs->sc.stripes);
4510 /* Make delta stripes absolute. */
4511 stripes = _absolute(argv[0], act, stripes);
4514 * Check range and that the # of stripes changes.
4515 * We leave the resizing to the worker.
4517 if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
4518 stripes != atomic_read(&rs->sc.stripes)) {
4519 atomic_set(&rs->sc.stripes_to_set, stripes);
4520 wake_do_raid(rs);
4521 return 0;
4525 return -EINVAL;
4528 /* Change xor algorithm and number of chunks. */
4529 static int xor_set(struct raid_set *rs, int argc, char **argv,
4530 enum raid_set_flags flag)
4532 if (argc == 2) {
4533 int chunks;
4534 char *algorithm = argv[0];
4535 struct xor_func *f = ARRAY_END(xor_funcs);
4537 if (sscanf(argv[1], "%d", &chunks) == 1 &&
4538 range_ok(chunks, 2, XOR_CHUNKS_MAX) &&
4539 chunks <= rs->set.raid_devs) {
4540 while (f-- > xor_funcs) {
4541 if (!strcmp(algorithm, f->name)) {
4542 unsigned io_size = 0;
4543 struct stripe *stripe = stripe_alloc(&rs->sc, rs->sc.mem_cache_client, SC_GROW);
4545 DMINFO("xor: %s", f->name);
4546 if (f->f == xor_blocks_wrapper &&
4547 chunks > MAX_XOR_BLOCKS + 1) {
4548 DMERR("chunks > MAX_XOR_BLOCKS"
4549 " + 1");
4550 break;
4553 mutex_lock(&rs->io.xor_lock);
4554 rs->xor.f = f;
4555 rs->xor.chunks = chunks;
4556 rs->xor.speed = 0;
4557 mutex_unlock(&rs->io.xor_lock);
4559 if (stripe) {
4560 rs->xor.speed = xor_speed(stripe);
4561 io_size = stripe->io.size;
4562 stripe_free(stripe, rs->sc.mem_cache_client);
4565 rs_log(rs, io_size);
4566 return 0;
4572 return -EINVAL;
4576 * Allow writes again after they were prohibited because of a device failure.
4578 * This needs to be called once userspace has updated the metadata state
4579 * based on an event thrown during device failure processing.
4581 static int allow_writes(struct raid_set *rs, int argc, char **argv,
4582 enum raid_set_flags flag)
4584 if (TestClearRSProhibitWrites(rs)) {
4585 DMINFO("%s waking", __func__);
4586 wake_do_raid(rs);
4587 return 0;
4590 return -EPERM;
4593 /* Parse the RAID message. */
4595 * 'all[ow_writes]'
4596 * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
4597 * 'o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
4598 * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
4599 * 'str[ipe_cache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
4600 * 'xor algorithm #chunks' # e.g. 'xor xor_8 5'
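 *
 * Hypothetical dmsetup invocations against a mapping named "r5" (the
 * name is invented for illustration):
 *
 *   dmsetup message r5 0 bandwidth set 25
 *   dmsetup message r5 0 stripe_cache grow 256
 *   dmsetup message r5 0 statistics reset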
4603 static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
4605 if (argc) {
4606 size_t len = strlen(argv[0]);
4607 struct raid_set *rs = ti->private;
4608 struct {
4609 const char *name;
4610 int (*f) (struct raid_set *rs, int argc, char **argv,
4611 enum raid_set_flags flag);
4612 enum raid_set_flags flag;
4613 } msg_descr[] = {
4614 { "allow_writes", allow_writes, 0 },
4615 { "bandwidth", bandwidth_change, 0 },
4616 { "overwrite", devel_flags, RS_CHECK_OVERWRITE },
4617 { "statistics", devel_flags, RS_DEVEL_STATS },
4618 { "stripe_cache", sc_resize, 0 },
4619 { "xor", xor_set, 0 },
4620 }, *m = ARRAY_END(msg_descr);
4622 if (len < 3)
4623 len = 3;
4625 while (m-- > msg_descr) {
4626 if (!strncmp(argv[0], m->name, len))
4627 return m->f(rs, argc - 1, argv + 1, m->flag);
4632 return -EINVAL;
4635 * END message interface
4638 /* Provide io hints. */
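/*
 * With an invented geometry of a 64-sector chunk and 3 data devices,
 * the minimum io hint is one chunk (64) and the optimal io hint is one
 * full data stripe, chunk_size * data_devs = 64 * 3 = 192.
 */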
4639 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
4641 struct raid_set *rs = ti->private;
4643 blk_limits_io_min(limits, rs->set.chunk_size);
4644 blk_limits_io_opt(limits, rs->set.chunk_size * rs->set.data_devs);
4647 static struct target_type raid_target = {
4648 .name = "raid45",
4649 .version = {1, 0, 0},
4650 .module = THIS_MODULE,
4651 .ctr = raid_ctr,
4652 .dtr = raid_dtr,
4653 .map = raid_map,
4654 .presuspend = raid_presuspend,
4655 .postsuspend = raid_postsuspend,
4656 .resume = raid_resume,
4657 .status = raid_status,
4658 .message = raid_message,
4659 .io_hints = raid_io_hints,
4662 static void init_exit(const char *bad_msg, const char *good_msg, int r)
4664 if (r)
4665 DMERR("Failed to %sregister target [%d]", bad_msg, r);
4666 else
4667 DMINFO("%s %s", good_msg, version);
4670 static int __init dm_raid_init(void)
4672 int r = dm_register_target(&raid_target);
4674 init_exit("", "initialized", r);
4675 return r;
4678 static void __exit dm_raid_exit(void)
4680 dm_unregister_target(&raid_target);
4681 init_exit("un", "exit", 0);
4684 /* Module hooks. */
4685 module_init(dm_raid_init);
4686 module_exit(dm_raid_exit);
4688 MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
4689 MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
4690 MODULE_LICENSE("GPL");
4691 MODULE_ALIAS("dm-raid4");
4692 MODULE_ALIAS("dm-raid5");