2 * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
4 * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
6 * This file is released under the GPL.
9 * Linux 2.6 Device Mapper RAID4 and RAID5 target.
11 * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
14 * Supports the following ATARAID vendor solutions (and SNIA DDF):
16 * Adaptec HostRAID ASR
25 * Silicon Image Medley
28 * via the dmraid application.
33 * o RAID4 with dedicated and selectable parity device
34 * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
35 * o recovery of out of sync device for initial
36 * RAID set creation or after dead drive replacement
37 * o run time optimization of xor algorithm used to calculate parity
41 * o the raid address calculation algorithm
42 * o the base of the biovec <-> page list copier.
45 * Uses region hash to keep track of how many writes are in flight to
46 * regions in order to use dirty log to keep state of regions to recover:
48 * o clean regions (those which are synchronized
49 * and don't have write io in flight)
50 * o dirty regions (those with write io in flight)
53 * On startup, any dirty regions are migrated to the
54 * 'nosync' state and are subject to recovery by the daemon.
56 * See raid_ctr() for table definition.
58 * ANALYZEME: recovery bandwidth
61 static const char *version
= "v0.2597k";
64 #include "dm-memcache.h"
65 #include "dm-raid45.h"
67 #include <linux/kernel.h>
68 #include <linux/vmalloc.h>
69 #include <linux/raid/xor.h>
70 #include <linux/slab.h>
71 #include <linux/module.h>
73 #include <linux/bio.h>
74 #include <linux/dm-io.h>
75 #include <linux/dm-dirty-log.h>
76 #include <linux/dm-region-hash.h>
80 * Configurable parameters
83 /* Minimum/maximum and default # of selectable stripes. */
85 #define STRIPES_MAX 16384
86 #define STRIPES_DEFAULT 80
88 /* Maximum and default chunk size in sectors if not set in constructor. */
89 #define CHUNK_SIZE_MIN 8
90 #define CHUNK_SIZE_MAX 16384
91 #define CHUNK_SIZE_DEFAULT 64
93 /* Default io size in sectors if not set in constructor. */
94 #define IO_SIZE_MIN CHUNK_SIZE_MIN
95 #define IO_SIZE_DEFAULT IO_SIZE_MIN
97 /* Recover io size default in sectors. */
98 #define RECOVER_IO_SIZE_MIN 64
99 #define RECOVER_IO_SIZE_DEFAULT 256
101 /* Default, minimum and maximum percentage of recover io bandwidth. */
102 #define BANDWIDTH_DEFAULT 10
103 #define BANDWIDTH_MIN 1
104 #define BANDWIDTH_MAX 100
106 /* # of parallel recovered regions */
107 #define RECOVERY_STRIPES_MIN 1
108 #define RECOVERY_STRIPES_MAX 64
109 #define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN
111 * END Configurable parameters
114 #define TARGET "dm-raid45"
115 #define DAEMON "kraid45d"
116 #define DM_MSG_PREFIX TARGET
118 #define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
120 /* Amount/size for __xor(). */
121 #define XOR_SIZE PAGE_SIZE
123 /* Ticks to run xor_speed() test for. */
124 #define XOR_SPEED_TICKS 5
126 /* Check value in range. */
127 #define range_ok(i, min, max) (i >= min && i <= max)
129 /* Structure access macros. */
130 /* Derive raid_set from stripe_cache pointer. */
131 #define RS(x) container_of(x, struct raid_set, sc)
133 /* Page reference. */
134 #define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
136 /* Stripe chunk reference. */
137 #define CHUNK(stripe, p) ((stripe)->chunk + p)
139 /* Bio list reference. */
140 #define BL(stripe, p, rw) (stripe->chunk[p].bl + rw)
141 #define BL_CHUNK(chunk, rw) (chunk->bl + rw)
143 /* Page list reference. */
144 #define PL(stripe, p) (stripe->obj[p].pl)
145 /* END: structure access macros. */
147 /* Factor out to dm-bio-list.h */
148 static inline void bio_list_push(struct bio_list
*bl
, struct bio
*bio
)
150 bio
->bi_next
= bl
->head
;
157 /* Factor out to dm.h */
158 #define TI_ERR_RET(str, ret) \
159 do { ti->error = str; return ret; } while (0);
160 #define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
162 /* Macro to define access IO flags access inline functions. */
163 #define BITOPS(name, what, var, flag) \
164 static inline int TestClear ## name ## what(struct var *v) \
165 { return test_and_clear_bit(flag, &v->io.flags); } \
166 static inline int TestSet ## name ## what(struct var *v) \
167 { return test_and_set_bit(flag, &v->io.flags); } \
168 static inline void Clear ## name ## what(struct var *v) \
169 { clear_bit(flag, &v->io.flags); } \
170 static inline void Set ## name ## what(struct var *v) \
171 { set_bit(flag, &v->io.flags); } \
172 static inline int name ## what(struct var *v) \
173 { return test_bit(flag, &v->io.flags); }
175 /*-----------------------------------------------------------------
178 * Cache for all reads and writes to raid sets (operational or degraded)
180 * We need to run all data to and from a RAID set through this cache,
181 * because parity chunks need to get calculated from data chunks
182 * or, in the degraded/resynchronization case, missing chunks need
183 * to be reconstructed using the other chunks of the stripe.
184 *---------------------------------------------------------------*/
185 /* Unique kmem cache name suffix # counter. */
186 static atomic_t _stripe_sc_nr
= ATOMIC_INIT(-1); /* kmem cache # counter. */
188 /* A chunk within a stripe (holds bios hanging off). */
189 /* IO status flags for chunks of a stripe. */
191 CHUNK_DIRTY
, /* Pages of chunk dirty; need writing. */
192 CHUNK_ERROR
, /* IO error on any chunk page. */
193 CHUNK_IO
, /* Allow/prohibit IO on chunk pages. */
194 CHUNK_LOCKED
, /* Chunk pages locked during IO. */
195 CHUNK_MUST_IO
, /* Chunk must io. */
196 CHUNK_UNLOCK
, /* Enforce chunk unlock. */
197 CHUNK_UPTODATE
, /* Chunk pages are uptodate. */
201 WRITE_QUEUED
= WRITE
+ 1,
203 NR_BL_TYPES
, /* Must be last one! */
205 struct stripe_chunk
{
206 atomic_t cnt
; /* Reference count. */
207 struct stripe
*stripe
; /* Backpointer to stripe for endio(). */
208 /* Bio lists for reads, writes, and writes merged. */
209 struct bio_list bl
[NR_BL_TYPES
];
211 unsigned long flags
; /* IO status flags. */
215 /* Define chunk bit operations. */
216 BITOPS(Chunk
, Dirty
, stripe_chunk
, CHUNK_DIRTY
)
217 BITOPS(Chunk
, Error
, stripe_chunk
, CHUNK_ERROR
)
218 BITOPS(Chunk
, Io
, stripe_chunk
, CHUNK_IO
)
219 BITOPS(Chunk
, Locked
, stripe_chunk
, CHUNK_LOCKED
)
220 BITOPS(Chunk
, MustIo
, stripe_chunk
, CHUNK_MUST_IO
)
221 BITOPS(Chunk
, Unlock
, stripe_chunk
, CHUNK_UNLOCK
)
222 BITOPS(Chunk
, Uptodate
, stripe_chunk
, CHUNK_UPTODATE
)
225 * Stripe linked list indexes. Keep order, because the stripe
226 * and the stripe cache rely on the first 3!
229 LIST_FLUSH
, /* Stripes to flush for io. */
230 LIST_ENDIO
, /* Stripes to endio. */
231 LIST_LRU
, /* Least recently used stripes. */
232 SC_NR_LISTS
, /* # of lists in stripe cache. */
233 LIST_HASH
= SC_NR_LISTS
, /* Hashed stripes. */
234 LIST_RECOVER
= LIST_HASH
, /* For recovery type stripes only. */
235 STRIPE_NR_LISTS
,/* To size array in struct stripe. */
238 /* Adressing region recovery. */
239 struct recover_addr
{
240 struct dm_region
*reg
; /* Actual region to recover. */
241 sector_t pos
; /* Position within region to recover. */
242 sector_t end
; /* End of region to recover. */
245 /* A stripe: the io object to handle all reads and writes to a RAID set. */
247 atomic_t cnt
; /* Reference count. */
248 struct stripe_cache
*sc
; /* Backpointer to stripe cache. */
252 * o io list to flush io
254 * o LRU list to put stripes w/o reference count on
255 * o stripe cache hash
257 struct list_head lists
[STRIPE_NR_LISTS
];
259 sector_t key
; /* Hash key. */
260 region_t region
; /* Region stripe is mapped to. */
263 unsigned long flags
; /* Stripe state flags (see below). */
266 * Pending ios in flight:
268 * used to control move of stripe to endio list
272 /* Sectors to read and write for multi page stripe sets. */
276 /* Address region recovery. */
277 struct recover_addr
*recover
;
279 /* Lock on stripe (Future: for clustering). */
283 unsigned short parity
; /* Parity chunk index. */
284 short recover
; /* Recovery chunk index. */
288 * This stripe's memory cache object (dm-mem-cache);
289 * i.e. the io chunk pages.
291 struct dm_mem_cache_object
*obj
;
293 /* Array of stripe sets (dynamically allocated). */
294 struct stripe_chunk chunk
[0];
297 /* States stripes can be in (flags field). */
299 STRIPE_ERROR
, /* io error on stripe. */
300 STRIPE_MERGED
, /* Writes got merged to be written. */
301 STRIPE_RBW
, /* Read-before-write stripe. */
302 STRIPE_RECONSTRUCT
, /* Reconstruct of a missing chunk required. */
303 STRIPE_RECONSTRUCTED
, /* Reconstructed of a missing chunk. */
304 STRIPE_RECOVER
, /* Stripe used for RAID set recovery. */
307 /* Define stripe bit operations. */
308 BITOPS(Stripe
, Error
, stripe
, STRIPE_ERROR
)
309 BITOPS(Stripe
, Merged
, stripe
, STRIPE_MERGED
)
310 BITOPS(Stripe
, RBW
, stripe
, STRIPE_RBW
)
311 BITOPS(Stripe
, Reconstruct
, stripe
, STRIPE_RECONSTRUCT
)
312 BITOPS(Stripe
, Reconstructed
, stripe
, STRIPE_RECONSTRUCTED
)
313 BITOPS(Stripe
, Recover
, stripe
, STRIPE_RECOVER
)
317 struct list_head
*hash
;
325 LOCK_ENDIO
, /* Protect endio list. */
326 NR_LOCKS
, /* To size array in struct stripe_cache. */
329 /* A stripe cache. */
330 struct stripe_cache
{
332 struct stripe_hash hash
;
334 spinlock_t locks
[NR_LOCKS
]; /* Locks to protect lists. */
336 /* Stripes with io to flush, stripes to endio and LRU lists. */
337 struct list_head lists
[SC_NR_LISTS
];
339 /* Slab cache to allocate stripes from. */
341 struct kmem_cache
*cache
; /* Cache itself. */
342 char name
[32]; /* Unique name. */
345 struct dm_io_client
*dm_io_client
; /* dm-io client resource context. */
347 /* dm-mem-cache client resource context. */
348 struct dm_mem_cache_client
*mem_cache_client
;
350 int stripes_parm
; /* # stripes parameter from constructor. */
351 atomic_t stripes
; /* actual # of stripes in cache. */
352 atomic_t stripes_to_set
; /* # of stripes to resize cache to. */
353 atomic_t stripes_last
; /* last # of stripes in cache. */
354 atomic_t active_stripes
; /* actual # of active stripes in cache. */
357 atomic_t active_stripes_max
; /* actual # of active stripes in cache. */
360 /* Flag specs for raid_dev */ ;
361 enum raid_dev_flags
{
362 DEV_FAILED
, /* Device failed. */
363 DEV_IO_QUEUED
, /* Io got queued to device. */
366 /* The raid device in a set. */
369 sector_t start
; /* Offset to map to. */
370 struct { /* Using struct to be able to BITOPS(). */
371 unsigned long flags
; /* raid_dev_flags. */
375 BITOPS(Dev
, Failed
, raid_dev
, DEV_FAILED
)
376 BITOPS(Dev
, IoQueued
, raid_dev
, DEV_IO_QUEUED
)
378 /* Flags spec for raid_set. */
379 enum raid_set_flags
{
380 RS_CHECK_OVERWRITE
, /* Check for chunk overwrites. */
381 RS_DEAD
, /* RAID set inoperational. */
382 RS_DEAD_ENDIO_MESSAGE
, /* RAID set dead endio one-off message. */
383 RS_DEGRADED
, /* Io errors on RAID device. */
384 RS_DEVEL_STATS
, /* REMOVEME: display status information. */
385 RS_ENFORCE_PARITY_CREATION
,/* Enforce parity creation. */
386 RS_PROHIBIT_WRITES
, /* Prohibit writes on device failure. */
387 RS_RECOVER
, /* Do recovery. */
388 RS_RECOVERY_BANDWIDTH
, /* Allow recovery bandwidth (delayed bios). */
389 RS_SC_BUSY
, /* Stripe cache busy -> send an event. */
390 RS_SUSPEND
, /* Suspend RAID set. */
393 /* REMOVEME: devel stats counters. */
431 S_NR_STATS
, /* # of stats counters. Must be last! */
434 /* Status type -> string mappings. */
436 const enum stats_types type
;
440 static struct stats_map stats_map
[] = {
441 { S_BIOS_READ
, "r=" },
442 { S_BIOS_ADDED_READ
, "/" },
443 { S_BIOS_ENDIO_READ
, "/" },
444 { S_BIOS_WRITE
, " w=" },
445 { S_BIOS_ADDED_WRITE
, "/" },
446 { S_BIOS_ENDIO_WRITE
, "/" },
447 { S_DM_IO_READ
, " rc=" },
448 { S_DM_IO_WRITE
, " wc=" },
449 { S_BANDWIDTH
, "\nbw=" },
450 { S_NO_BANDWIDTH
, " no_bw=" },
451 { S_BARRIER
, "\nbarrier=" },
452 { S_BIO_COPY_PL_NEXT
, "\nbio_cp_next=" },
453 { S_CAN_MERGE
, "\nmerge=" },
454 { S_CANT_MERGE
, "/no_merge=" },
455 { S_CHUNK_LOCKED
, "\nchunk_locked=" },
456 { S_CONGESTED
, "\ncgst=" },
457 { S_NOT_CONGESTED
, "/not_cgst=" },
458 { S_DEGRADED
, "\ndegraded=" },
459 { S_DELAYED_BIOS
, "\ndel_bios=" },
460 { S_SUM_DELAYED_BIOS
, "/sum_del_bios=" },
461 { S_FLUSHS
, "\nflushs=" },
462 { S_HITS_1ST
, "\nhits_1st=" },
463 { S_IOS_POST
, " ios_post=" },
464 { S_INSCACHE
, " inscache=" },
465 { S_MAX_LOOKUP
, " maxlookup=" },
466 { S_NO_RW
, "\nno_rw=" },
467 { S_NOSYNC
, " nosync=" },
468 { S_OVERWRITE
, " ovr=" },
469 { S_PROHIBITCHUNKIO
, " prhbt_io=" },
470 { S_RECONSTRUCT_EI
, "\nrec_ei=" },
471 { S_RECONSTRUCT_DEV
, " rec_dev=" },
472 { S_RECONSTRUCT_SET
, " rec_set=" },
473 { S_RECONSTRUCTED
, " rec=" },
474 { S_REQUEUE
, " requeue=" },
475 { S_STRIPE_ERROR
, " stripe_err=" },
476 { S_XORS
, " xors=" },
482 #define dm_rh_client dm_region_hash
483 enum count_type
{ IO_WORK
= 0, IO_RECOVER
, IO_NR_COUNT
};
484 typedef void (*xor_function_t
)(unsigned count
, unsigned long **data
);
486 struct dm_target
*ti
; /* Target pointer. */
489 unsigned long flags
; /* State flags. */
490 struct mutex in_lock
; /* Protects central input list below. */
491 struct mutex xor_lock
; /* Protects xor algorithm set. */
492 struct bio_list in
; /* Pending ios (central input list). */
493 struct bio_list work
; /* ios work set. */
494 wait_queue_head_t suspendq
; /* suspend synchronization. */
495 atomic_t in_process
; /* counter of queued bios (suspendq). */
496 atomic_t in_process_max
;/* counter of queued bios max. */
499 struct workqueue_struct
*wq
;
500 struct delayed_work dws_do_raid
; /* For main worker. */
501 struct work_struct ws_do_table_event
; /* For event worker. */
504 /* Stripe locking abstraction. */
505 struct dm_raid45_locking_type
*locking
;
507 struct stripe_cache sc
; /* Stripe cache for this set. */
509 /* Xor optimization. */
516 /* Recovery parameters. */
518 struct dm_dirty_log
*dl
; /* Dirty log. */
519 struct dm_rh_client
*rh
; /* Region hash. */
521 struct dm_io_client
*dm_io_client
; /* recovery dm-io client. */
522 /* dm-mem-cache client resource context for recovery stripes. */
523 struct dm_mem_cache_client
*mem_cache_client
;
525 struct list_head stripes
; /* List of recovery stripes. */
528 region_t nr_regions_to_recover
;
529 region_t nr_regions_recovered
;
530 unsigned long start_jiffies
;
531 unsigned long end_jiffies
;
533 unsigned bandwidth
; /* Recovery bandwidth [%]. */
534 unsigned bandwidth_work
; /* Recovery bandwidth [factor]. */
535 unsigned bandwidth_parm
; /* " constructor parm. */
536 unsigned io_size
; /* recovery io size <= region size. */
537 unsigned io_size_parm
; /* recovery io size ctr parameter. */
538 unsigned recovery
; /* Recovery allowed/prohibited. */
539 unsigned recovery_stripes
; /* # of parallel recovery stripes. */
541 /* recovery io throttling. */
542 atomic_t io_count
[IO_NR_COUNT
]; /* counter recover/regular io.*/
543 unsigned long last_jiffies
;
546 /* RAID set parameters. */
548 struct raid_type
*raid_type
; /* RAID type (eg, RAID4). */
549 unsigned raid_parms
; /* # variable raid parameters. */
551 unsigned chunk_size
; /* Sectors per chunk. */
552 unsigned chunk_size_parm
;
553 unsigned chunk_shift
; /* rsector chunk size shift. */
555 unsigned io_size
; /* Sectors per io. */
556 unsigned io_size_parm
;
557 unsigned io_mask
; /* Mask for bio_copy_page_list(). */
558 unsigned io_inv_mask
; /* Mask for raid_address(). */
560 sector_t sectors_per_dev
; /* Sectors per device. */
562 atomic_t failed_devs
; /* Amount of devices failed. */
564 /* Index of device to initialize. */
566 int dev_to_init_parm
;
568 /* Raid devices dynamically allocated. */
569 unsigned raid_devs
; /* # of RAID devices below. */
570 unsigned data_devs
; /* # of RAID data devices. */
572 int ei
; /* index of failed RAID device. */
574 /* Index of dedicated parity device (i.e. RAID4). */
576 int pi_parm
; /* constructor parm for status output. */
579 /* REMOVEME: devel stats counters. */
580 atomic_t stats
[S_NR_STATS
];
582 /* Dynamically allocated temporary pointers for xor(). */
583 unsigned long **data
;
585 /* Dynamically allocated RAID devices. Alignment? */
586 struct raid_dev dev
[0];
589 /* Define RAID set bit operations. */
590 BITOPS(RS
, Bandwidth
, raid_set
, RS_RECOVERY_BANDWIDTH
)
591 BITOPS(RS
, CheckOverwrite
, raid_set
, RS_CHECK_OVERWRITE
)
592 BITOPS(RS
, Dead
, raid_set
, RS_DEAD
)
593 BITOPS(RS
, DeadEndioMessage
, raid_set
, RS_DEAD_ENDIO_MESSAGE
)
594 BITOPS(RS
, Degraded
, raid_set
, RS_DEGRADED
)
595 BITOPS(RS
, DevelStats
, raid_set
, RS_DEVEL_STATS
)
596 BITOPS(RS
, EnforceParityCreation
, raid_set
, RS_ENFORCE_PARITY_CREATION
)
597 BITOPS(RS
, ProhibitWrites
, raid_set
, RS_PROHIBIT_WRITES
)
598 BITOPS(RS
, Recover
, raid_set
, RS_RECOVER
)
599 BITOPS(RS
, ScBusy
, raid_set
, RS_SC_BUSY
)
600 BITOPS(RS
, Suspend
, raid_set
, RS_SUSPEND
)
603 /*-----------------------------------------------------------------
604 * Raid-4/5 set structures.
605 *---------------------------------------------------------------*/
606 /* RAID level definitions. */
612 /* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
613 enum raid_algorithm
{
622 const char *name
; /* RAID algorithm. */
623 const char *descr
; /* Descriptor text for logging. */
624 const unsigned parity_devs
; /* # of parity devices. */
625 const unsigned minimal_devs
; /* minimal # of devices in set. */
626 const enum raid_level level
; /* RAID level. */
627 const enum raid_algorithm algorithm
; /* RAID algorithm. */
630 /* Supported raid types and properties. */
631 static struct raid_type raid_types
[] = {
632 {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4
, none
},
633 {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5
, left_asym
},
634 {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5
, right_asym
},
635 {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5
, left_sym
},
636 {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5
, right_sym
},
639 /* Address as calculated by raid_address(). */
640 struct raid_address
{
641 sector_t key
; /* Hash key (address of stripe % chunk_size). */
642 unsigned di
, pi
; /* Data and parity disks index. */
645 /* REMOVEME: reset statistics counters. */
646 static void stats_reset(struct raid_set
*rs
)
648 unsigned s
= S_NR_STATS
;
651 atomic_set(rs
->stats
+ s
, 0);
654 /*----------------------------------------------------------------
655 * RAID set management routines.
656 *--------------------------------------------------------------*/
658 * Begin small helper functions.
660 /* No need to be called from region hash indirectly at dm_rh_dec(). */
661 static void wake_dummy(void *context
) {}
663 /* Return # of io reference. */
664 static int io_ref(struct raid_set
*rs
)
666 return atomic_read(&rs
->io
.in_process
);
669 /* Get an io reference. */
670 static void io_get(struct raid_set
*rs
)
672 int p
= atomic_inc_return(&rs
->io
.in_process
);
674 if (p
> atomic_read(&rs
->io
.in_process_max
))
675 atomic_set(&rs
->io
.in_process_max
, p
); /* REMOVEME: max. */
678 /* Put the io reference and conditionally wake io waiters. */
679 static void io_put(struct raid_set
*rs
)
681 /* Intel: rebuild data corrupter? */
682 if (atomic_dec_and_test(&rs
->io
.in_process
))
683 wake_up(&rs
->io
.suspendq
);
685 BUG_ON(io_ref(rs
) < 0);
688 /* Wait until all io has been processed. */
689 static void wait_ios(struct raid_set
*rs
)
691 wait_event(rs
->io
.suspendq
, !io_ref(rs
));
694 /* Queue (optionally delayed) io work. */
695 static void wake_do_raid_delayed(struct raid_set
*rs
, unsigned long delay
)
697 queue_delayed_work(rs
->io
.wq
, &rs
->io
.dws_do_raid
, delay
);
700 /* Queue io work immediately (called from region hash too). */
701 static void wake_do_raid(void *context
)
703 struct raid_set
*rs
= context
;
705 queue_work(rs
->io
.wq
, &rs
->io
.dws_do_raid
.work
);
708 /* Calculate device sector offset. */
709 static sector_t
_sector(struct raid_set
*rs
, struct bio
*bio
)
711 sector_t sector
= bio
->bi_sector
;
713 sector_div(sector
, rs
->set
.data_devs
);
717 /* Return # of active stripes in stripe cache. */
718 static int sc_active(struct stripe_cache
*sc
)
720 return atomic_read(&sc
->active_stripes
);
723 /* Stripe cache busy indicator. */
724 static int sc_busy(struct raid_set
*rs
)
726 return sc_active(&rs
->sc
) >
727 atomic_read(&rs
->sc
.stripes
) - (STRIPES_MIN
/ 2);
730 /* Set chunks states. */
731 enum chunk_dirty_type
{ CLEAN
, DIRTY
, ERROR
};
732 static void chunk_set(struct stripe_chunk
*chunk
, enum chunk_dirty_type type
)
736 ClearChunkDirty(chunk
);
739 SetChunkDirty(chunk
);
742 SetChunkError(chunk
);
743 SetStripeError(chunk
->stripe
);
749 SetChunkUptodate(chunk
);
751 ClearChunkError(chunk
);
754 /* Return region state for a sector. */
755 static int region_state(struct raid_set
*rs
, sector_t sector
,
756 enum dm_rh_region_states state
)
758 struct dm_rh_client
*rh
= rs
->recover
.rh
;
759 region_t region
= dm_rh_sector_to_region(rh
, sector
);
761 return !!(dm_rh_get_state(rh
, region
, 1) & state
);
765 * Return true in case a chunk should be read/written
767 * Conditions to read/write:
768 * o chunk not uptodate
771 * Conditios to avoid io:
772 * o io already ongoing on chunk
773 * o io explitely prohibited
775 static int chunk_io(struct stripe_chunk
*chunk
)
777 /* 2nd run optimization (flag set below on first run). */
778 if (TestClearChunkMustIo(chunk
))
781 /* Avoid io if prohibited or a locked chunk. */
782 if (!ChunkIo(chunk
) || ChunkLocked(chunk
))
785 if (!ChunkUptodate(chunk
) || ChunkDirty(chunk
)) {
786 SetChunkMustIo(chunk
); /* 2nd run optimization. */
793 /* Call a function on each chunk needing io unless device failed. */
794 static unsigned for_each_io_dev(struct stripe
*stripe
,
795 void (*f_io
)(struct stripe
*stripe
, unsigned p
))
797 struct raid_set
*rs
= RS(stripe
->sc
);
800 for (p
= 0; p
< rs
->set
.raid_devs
; p
++) {
801 if (chunk_io(CHUNK(stripe
, p
)) && !DevFailed(rs
->dev
+ p
)) {
811 * Index of device to calculate parity on.
813 * Either the parity device index *or* the selected
814 * device to init after a spare replacement.
816 static int dev_for_parity(struct stripe
*stripe
, int *sync
)
818 struct raid_set
*rs
= RS(stripe
->sc
);
819 int r
= region_state(rs
, stripe
->key
, DM_RH_NOSYNC
| DM_RH_RECOVERING
);
823 /* Reconstruct a particular device ?. */
824 if (r
&& rs
->set
.dev_to_init
> -1)
825 return rs
->set
.dev_to_init
;
826 else if (rs
->set
.raid_type
->level
== raid4
)
828 else if (!StripeRecover(stripe
))
829 return stripe
->idx
.parity
;
834 /* RAID set congested function. */
835 static int rs_congested(void *congested_data
, int bdi_bits
)
839 struct raid_set
*rs
= congested_data
;
841 if (sc_busy(rs
) || RSSuspend(rs
) || RSProhibitWrites(rs
))
843 else for (r
= 0, p
= rs
->set
.raid_devs
; !r
&& p
--; ) {
844 /* If any of our component devices are overloaded. */
845 struct request_queue
*q
= bdev_get_queue(rs
->dev
[p
].dev
->bdev
);
847 r
|= bdi_congested(&q
->backing_dev_info
, bdi_bits
);
850 /* REMOVEME: statistics. */
851 atomic_inc(rs
->stats
+ (r
? S_CONGESTED
: S_NOT_CONGESTED
));
855 /* RAID device degrade check. */
856 static void rs_check_degrade_dev(struct raid_set
*rs
,
857 struct stripe
*stripe
, unsigned p
)
859 if (TestSetDevFailed(rs
->dev
+ p
))
862 /* Through an event in case of member device errors. */
863 if ((atomic_inc_return(&rs
->set
.failed_devs
) >
864 rs
->set
.raid_type
->parity_devs
) &&
865 !TestSetRSDead(rs
)) {
866 /* Display RAID set dead message once. */
868 char buf
[BDEVNAME_SIZE
];
870 DMERR("FATAL: too many devices failed -> RAID set broken");
871 for (p
= 0; p
< rs
->set
.raid_devs
; p
++) {
872 if (DevFailed(rs
->dev
+ p
))
873 DMERR("device /dev/%s failed",
874 bdevname(rs
->dev
[p
].dev
->bdev
, buf
));
878 /* Only log the first member error. */
879 if (!TestSetRSDegraded(rs
)) {
880 char buf
[BDEVNAME_SIZE
];
882 /* Store index for recovery. */
884 DMERR("CRITICAL: %sio error on device /dev/%s "
885 "in region=%llu; DEGRADING RAID set\n",
886 stripe
? "" : "FAKED ",
887 bdevname(rs
->dev
[p
].dev
->bdev
, buf
),
888 (unsigned long long) (stripe
? stripe
->key
: 0));
889 DMERR("further device error messages suppressed");
892 /* Prohibit further writes to allow for userpace to update metadata. */
893 SetRSProhibitWrites(rs
);
894 schedule_work(&rs
->io
.ws_do_table_event
);
897 /* RAID set degrade check. */
898 static void rs_check_degrade(struct stripe
*stripe
)
900 struct raid_set
*rs
= RS(stripe
->sc
);
901 unsigned p
= rs
->set
.raid_devs
;
904 if (ChunkError(CHUNK(stripe
, p
)))
905 rs_check_degrade_dev(rs
, stripe
, p
);
909 /* Lookup a RAID device by name or by major:minor number. */
910 static int raid_dev_lookup(struct raid_set
*rs
, struct raid_dev
*dev_lookup
)
913 struct raid_dev
*dev
;
916 * Must be an incremental loop, because the device array
917 * can have empty slots still on calls from raid_ctr()
919 for (dev
= rs
->dev
, p
= 0;
920 dev
->dev
&& p
< rs
->set
.raid_devs
;
922 if (dev_lookup
->dev
->bdev
->bd_dev
== dev
->dev
->bdev
->bd_dev
)
929 * End small helper functions.
933 * Stripe hash functions
935 /* Initialize/destroy stripe hash. */
936 static int hash_init(struct stripe_hash
*hash
, unsigned stripes
)
938 unsigned buckets
= roundup_pow_of_two(stripes
>> 1);
939 static unsigned hash_primes
[] = {
940 /* Table of primes for hash_fn/table size optimization. */
941 1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
942 1543, 3079, 6151, 12289, 24593, 49157, 98317,
945 /* Allocate stripe hash buckets. */
946 hash
->hash
= vmalloc(buckets
* sizeof(*hash
->hash
));
950 hash
->buckets
= buckets
;
951 hash
->mask
= buckets
- 1;
952 hash
->shift
= ffs(buckets
);
953 if (hash
->shift
> ARRAY_SIZE(hash_primes
))
954 hash
->shift
= ARRAY_SIZE(hash_primes
) - 1;
956 BUG_ON(hash
->shift
< 2);
957 hash
->prime
= hash_primes
[hash
->shift
];
959 /* Initialize buckets. */
961 INIT_LIST_HEAD(hash
->hash
+ buckets
);
965 static void hash_exit(struct stripe_hash
*hash
)
973 static unsigned hash_fn(struct stripe_hash
*hash
, sector_t key
)
975 return (unsigned) (((key
* hash
->prime
) >> hash
->shift
) & hash
->mask
);
978 static struct list_head
*hash_bucket(struct stripe_hash
*hash
, sector_t key
)
980 return hash
->hash
+ hash_fn(hash
, key
);
983 /* Insert an entry into a hash. */
984 static void stripe_insert(struct stripe_hash
*hash
, struct stripe
*stripe
)
986 list_add(stripe
->lists
+ LIST_HASH
, hash_bucket(hash
, stripe
->key
));
989 /* Lookup an entry in the stripe hash. */
990 static struct stripe
*stripe_lookup(struct stripe_cache
*sc
, sector_t key
)
993 struct stripe
*stripe
;
994 struct list_head
*bucket
= hash_bucket(&sc
->hash
, key
);
996 list_for_each_entry(stripe
, bucket
, lists
[LIST_HASH
]) {
999 if (stripe
->key
== key
) {
1000 /* REMOVEME: statisics. */
1001 if (look
> atomic_read(RS(sc
)->stats
+ S_MAX_LOOKUP
))
1002 atomic_set(RS(sc
)->stats
+ S_MAX_LOOKUP
, look
);
1010 /* Resize the stripe cache hash on size changes. */
1011 static int sc_hash_resize(struct stripe_cache
*sc
)
1013 /* Resize indicated ? */
1014 if (atomic_read(&sc
->stripes
) != atomic_read(&sc
->stripes_last
)) {
1016 struct stripe_hash hash
;
1018 r
= hash_init(&hash
, atomic_read(&sc
->stripes
));
1022 if (sc
->hash
.hash
) {
1023 unsigned b
= sc
->hash
.buckets
;
1024 struct list_head
*pos
, *tmp
;
1026 /* Walk old buckets and insert into new. */
1028 list_for_each_safe(pos
, tmp
, sc
->hash
.hash
+ b
)
1029 stripe_insert(&hash
,
1030 list_entry(pos
, struct stripe
,
1036 hash_exit(&sc
->hash
);
1037 memcpy(&sc
->hash
, &hash
, sizeof(sc
->hash
));
1038 atomic_set(&sc
->stripes_last
, atomic_read(&sc
->stripes
));
1043 /* End hash stripe hash function. */
1045 /* List add, delete, push and pop functions. */
1046 /* Add stripe to flush list. */
1047 #define DEL_LIST(lh) \
1048 if (!list_empty(lh)) \
1051 /* Delete stripe from hash. */
1052 static void stripe_hash_del(struct stripe
*stripe
)
1054 DEL_LIST(stripe
->lists
+ LIST_HASH
);
1057 /* Return stripe reference count. */
1058 static inline int stripe_ref(struct stripe
*stripe
)
1060 return atomic_read(&stripe
->cnt
);
1063 static void stripe_flush_add(struct stripe
*stripe
)
1065 struct stripe_cache
*sc
= stripe
->sc
;
1066 struct list_head
*lh
= stripe
->lists
+ LIST_FLUSH
;
1068 if (!StripeReconstruct(stripe
) && list_empty(lh
))
1069 list_add_tail(lh
, sc
->lists
+ LIST_FLUSH
);
1073 * Add stripe to LRU (inactive) list.
1075 * Need lock, because of concurrent access from message interface.
1077 static void stripe_lru_add(struct stripe
*stripe
)
1079 if (!StripeRecover(stripe
)) {
1080 struct list_head
*lh
= stripe
->lists
+ LIST_LRU
;
1083 list_add_tail(lh
, stripe
->sc
->lists
+ LIST_LRU
);
1087 #define POP_LIST(list) \
1089 if (list_empty(sc->lists + (list))) \
1092 stripe = list_first_entry(sc->lists + (list), \
1095 list_del_init(stripe->lists + (list)); \
1099 /* Pop an available stripe off the LRU list. */
1100 static struct stripe
*stripe_lru_pop(struct stripe_cache
*sc
)
1102 struct stripe
*stripe
;
1108 /* Pop an available stripe off the io list. */
1109 static struct stripe
*stripe_io_pop(struct stripe_cache
*sc
)
1111 struct stripe
*stripe
;
1113 POP_LIST(LIST_FLUSH
);
1117 /* Push a stripe safely onto the endio list to be handled by do_endios(). */
1118 static void stripe_endio_push(struct stripe
*stripe
)
1120 unsigned long flags
;
1121 struct stripe_cache
*sc
= stripe
->sc
;
1122 struct list_head
*stripe_list
= stripe
->lists
+ LIST_ENDIO
,
1123 *sc_list
= sc
->lists
+ LIST_ENDIO
;
1124 spinlock_t
*lock
= sc
->locks
+ LOCK_ENDIO
;
1126 /* This runs in parallel with do_endios(). */
1127 spin_lock_irqsave(lock
, flags
);
1128 if (list_empty(stripe_list
))
1129 list_add_tail(stripe_list
, sc_list
);
1130 spin_unlock_irqrestore(lock
, flags
);
1132 wake_do_raid(RS(sc
)); /* Wake myself. */
1135 /* Pop a stripe off safely off the endio list. */
1136 static struct stripe
*stripe_endio_pop(struct stripe_cache
*sc
)
1138 struct stripe
*stripe
;
1139 spinlock_t
*lock
= sc
->locks
+ LOCK_ENDIO
;
1141 /* This runs in parallel with endio(). */
1142 spin_lock_irq(lock
);
1143 POP_LIST(LIST_ENDIO
)
1144 spin_unlock_irq(lock
);
1150 * Stripe cache locking functions
1152 /* Dummy lock function for single host RAID4+5. */
1153 static void *no_lock(sector_t key
, enum dm_lock_type type
)
1158 /* Dummy unlock function for single host RAID4+5. */
1159 static void no_unlock(void *lock_handle
)
1163 /* No locking (for single host RAID 4+5). */
1164 static struct dm_raid45_locking_type locking_none
= {
1166 .unlock
= no_unlock
,
1169 /* Lock a stripe (for clustering). */
1171 stripe_lock(struct stripe
*stripe
, int rw
, sector_t key
)
1173 stripe
->lock
= RS(stripe
->sc
)->locking
->lock(key
, rw
== READ
? DM_RAID45_SHARED
: DM_RAID45_EX
);
1174 return stripe
->lock
? 0 : -EPERM
;
1177 /* Unlock a stripe (for clustering). */
1178 static void stripe_unlock(struct stripe
*stripe
)
1180 RS(stripe
->sc
)->locking
->unlock(stripe
->lock
);
1181 stripe
->lock
= NULL
;
1184 /* Test io pending on stripe. */
1185 static int stripe_io_ref(struct stripe
*stripe
)
1187 return atomic_read(&stripe
->io
.pending
);
1190 static void stripe_io_get(struct stripe
*stripe
)
1192 if (atomic_inc_return(&stripe
->io
.pending
) == 1)
1193 /* REMOVEME: statistics */
1194 atomic_inc(&stripe
->sc
->active_stripes
);
1196 BUG_ON(stripe_io_ref(stripe
) < 0);
1199 static void stripe_io_put(struct stripe
*stripe
)
1201 if (atomic_dec_and_test(&stripe
->io
.pending
)) {
1202 if (unlikely(StripeRecover(stripe
)))
1203 /* Don't put recovery stripe on endio list. */
1204 wake_do_raid(RS(stripe
->sc
));
1206 /* Add regular stripe to endio list and wake daemon. */
1207 stripe_endio_push(stripe
);
1209 /* REMOVEME: statistics */
1210 atomic_dec(&stripe
->sc
->active_stripes
);
1212 BUG_ON(stripe_io_ref(stripe
) < 0);
1215 /* Take stripe reference out. */
1216 static int stripe_get(struct stripe
*stripe
)
1219 struct list_head
*lh
= stripe
->lists
+ LIST_LRU
;
1221 /* Delete stripe from LRU (inactive) list if on. */
1223 BUG_ON(stripe_ref(stripe
) < 0);
1225 /* Lock stripe on first reference */
1226 r
= (atomic_inc_return(&stripe
->cnt
) == 1) ?
1227 stripe_lock(stripe
, WRITE
, stripe
->key
) : 0;
1233 /* Return references on a chunk. */
1234 static int chunk_ref(struct stripe_chunk
*chunk
)
1236 return atomic_read(&chunk
->cnt
);
1239 /* Take out reference on a chunk. */
1240 static int chunk_get(struct stripe_chunk
*chunk
)
1242 return atomic_inc_return(&chunk
->cnt
);
1245 /* Drop reference on a chunk. */
1246 static void chunk_put(struct stripe_chunk
*chunk
)
1248 BUG_ON(atomic_dec_return(&chunk
->cnt
) < 0);
1252 * Drop reference on a stripe.
1254 * Move it to list of LRU stripes if zero.
1256 static void stripe_put(struct stripe
*stripe
)
1258 if (atomic_dec_and_test(&stripe
->cnt
)) {
1259 BUG_ON(stripe_io_ref(stripe
));
1260 stripe_unlock(stripe
);
1262 BUG_ON(stripe_ref(stripe
) < 0);
1265 /* Helper needed by for_each_io_dev(). */
1266 static void stripe_get_references(struct stripe
*stripe
, unsigned p
)
1270 * Another one to reference the stripe in
1271 * order to protect vs. LRU list moves.
1273 io_get(RS(stripe
->sc
)); /* Global io references. */
1275 stripe_io_get(stripe
); /* One for each chunk io. */
1278 /* Helper for endio() to put all take references. */
1279 static void stripe_put_references(struct stripe
*stripe
)
1281 stripe_io_put(stripe
); /* One for each chunk io. */
1283 io_put(RS(stripe
->sc
));
1287 * Stripe cache functions.
1290 * Invalidate all chunks (i.e. their pages) of a stripe.
1292 * I only keep state for the whole chunk.
1294 static inline void stripe_chunk_invalidate(struct stripe_chunk
*chunk
)
1296 chunk
->io
.flags
= 0;
1300 stripe_chunks_invalidate(struct stripe
*stripe
)
1302 unsigned p
= RS(stripe
->sc
)->set
.raid_devs
;
1305 stripe_chunk_invalidate(CHUNK(stripe
, p
));
1308 /* Prepare stripe for (re)use. */
1309 static void stripe_invalidate(struct stripe
*stripe
)
1311 stripe
->io
.flags
= 0;
1312 stripe
->idx
.parity
= stripe
->idx
.recover
= -1;
1313 stripe_chunks_invalidate(stripe
);
1317 * Allow io on all chunks of a stripe.
1318 * If not set, IO will not occur; i.e. it's prohibited.
1320 * Actual IO submission for allowed chunks depends
1321 * on their !uptodate or dirty state.
1323 static void stripe_allow_io(struct stripe
*stripe
)
1325 unsigned p
= RS(stripe
->sc
)->set
.raid_devs
;
1328 SetChunkIo(CHUNK(stripe
, p
));
1331 /* Initialize a stripe. */
1332 static void stripe_init(struct stripe_cache
*sc
, struct stripe
*stripe
)
1334 unsigned i
, p
= RS(sc
)->set
.raid_devs
;
1336 /* Work all io chunks. */
1338 struct stripe_chunk
*chunk
= CHUNK(stripe
, p
);
1340 atomic_set(&chunk
->cnt
, 0);
1341 chunk
->stripe
= stripe
;
1342 i
= ARRAY_SIZE(chunk
->bl
);
1344 bio_list_init(chunk
->bl
+ i
);
1349 i
= ARRAY_SIZE(stripe
->lists
);
1351 INIT_LIST_HEAD(stripe
->lists
+ i
);
1353 stripe
->io
.size
= RS(sc
)->set
.io_size
;
1354 atomic_set(&stripe
->cnt
, 0);
1355 atomic_set(&stripe
->io
.pending
, 0);
1356 stripe_invalidate(stripe
);
1359 /* Number of pages per chunk. */
1360 static inline unsigned chunk_pages(unsigned sectors
)
1362 return dm_div_up(sectors
, SECTORS_PER_PAGE
);
1365 /* Number of pages per stripe. */
1366 static inline unsigned stripe_pages(struct raid_set
*rs
, unsigned io_size
)
1368 return chunk_pages(io_size
) * rs
->set
.raid_devs
;
1371 /* Initialize part of page_list (recovery). */
1372 static void stripe_zero_pl_part(struct stripe
*stripe
, int p
,
1373 unsigned start
, unsigned count
)
1375 unsigned o
= start
/ SECTORS_PER_PAGE
, pages
= chunk_pages(count
);
1376 /* Get offset into the page_list. */
1377 struct page_list
*pl
= pl_elem(PL(stripe
, p
), o
);
1380 while (pl
&& pages
--) {
1382 memset(page_address(pl
->page
), 0, PAGE_SIZE
);
1387 /* Initialize parity chunk of stripe. */
1388 static void stripe_zero_chunk(struct stripe
*stripe
, int p
)
1391 stripe_zero_pl_part(stripe
, p
, 0, stripe
->io
.size
);
1394 /* Return dynamic stripe structure size. */
1395 static size_t stripe_size(struct raid_set
*rs
)
1397 return sizeof(struct stripe
) +
1398 rs
->set
.raid_devs
* sizeof(struct stripe_chunk
);
1401 /* Allocate a stripe and its memory object. */
1402 /* XXX adjust to cope with stripe cache and recovery stripe caches. */
1403 enum grow
{ SC_GROW
, SC_KEEP
};
1404 static struct stripe
*stripe_alloc(struct stripe_cache
*sc
,
1405 struct dm_mem_cache_client
*mc
,
1409 struct stripe
*stripe
;
1411 stripe
= kmem_cache_zalloc(sc
->kc
.cache
, GFP_KERNEL
);
1413 /* Grow the dm-mem-cache by one object. */
1414 if (grow
== SC_GROW
) {
1415 r
= dm_mem_cache_grow(mc
, 1);
1420 stripe
->obj
= dm_mem_cache_alloc(mc
);
1421 if (IS_ERR(stripe
->obj
))
1424 stripe_init(sc
, stripe
);
1430 if (grow
== SC_GROW
)
1431 dm_mem_cache_shrink(mc
, 1);
1433 kmem_cache_free(sc
->kc
.cache
, stripe
);
1438 * Free a stripes memory object, shrink the
1439 * memory cache and free the stripe itself.
1441 static void stripe_free(struct stripe
*stripe
, struct dm_mem_cache_client
*mc
)
1443 dm_mem_cache_free(mc
, stripe
->obj
);
1444 dm_mem_cache_shrink(mc
, 1);
1445 kmem_cache_free(stripe
->sc
->kc
.cache
, stripe
);
1448 /* Free the recovery stripe. */
1449 static void stripe_recover_free(struct raid_set
*rs
)
1451 struct recover
*rec
= &rs
->recover
;
1452 struct dm_mem_cache_client
*mc
;
1454 mc
= rec
->mem_cache_client
;
1455 rec
->mem_cache_client
= NULL
;
1457 struct stripe
*stripe
;
1459 while (!list_empty(&rec
->stripes
)) {
1460 stripe
= list_first_entry(&rec
->stripes
, struct stripe
,
1461 lists
[LIST_RECOVER
]);
1462 list_del(stripe
->lists
+ LIST_RECOVER
);
1463 kfree(stripe
->recover
);
1464 stripe_free(stripe
, mc
);
1467 dm_mem_cache_client_destroy(mc
);
1468 dm_io_client_destroy(rec
->dm_io_client
);
1469 rec
->dm_io_client
= NULL
;
1473 /* Grow stripe cache. */
1474 static int sc_grow(struct stripe_cache
*sc
, unsigned stripes
, enum grow grow
)
1478 /* Try to allocate this many (additional) stripes. */
1480 struct stripe
*stripe
=
1481 stripe_alloc(sc
, sc
->mem_cache_client
, grow
);
1483 if (likely(stripe
)) {
1484 stripe_lru_add(stripe
);
1485 atomic_inc(&sc
->stripes
);
1492 return r
? r
: sc_hash_resize(sc
);
1495 /* Shrink stripe cache. */
1496 static int sc_shrink(struct stripe_cache
*sc
, unsigned stripes
)
1500 /* Try to get unused stripe from LRU list. */
1502 struct stripe
*stripe
;
1504 stripe
= stripe_lru_pop(sc
);
1506 /* An LRU stripe may never have ios pending! */
1507 BUG_ON(stripe_io_ref(stripe
));
1508 BUG_ON(stripe_ref(stripe
));
1509 atomic_dec(&sc
->stripes
);
1510 /* Remove from hash if on before deletion. */
1511 stripe_hash_del(stripe
);
1512 stripe_free(stripe
, sc
->mem_cache_client
);
1519 /* Check if stats are still sane. */
1520 if (atomic_read(&sc
->active_stripes_max
) >
1521 atomic_read(&sc
->stripes
))
1522 atomic_set(&sc
->active_stripes_max
, 0);
1527 return atomic_read(&sc
->stripes
) ? sc_hash_resize(sc
) : 0;
1530 /* Create stripe cache and recovery. */
1531 static int sc_init(struct raid_set
*rs
, unsigned stripes
)
1533 unsigned i
, r
, rstripes
;
1534 struct stripe_cache
*sc
= &rs
->sc
;
1535 struct stripe
*stripe
;
1536 struct recover
*rec
= &rs
->recover
;
1537 struct mapped_device
*md
;
1538 struct gendisk
*disk
;
1541 /* Initialize lists and locks. */
1542 i
= ARRAY_SIZE(sc
->lists
);
1544 INIT_LIST_HEAD(sc
->lists
+ i
);
1546 INIT_LIST_HEAD(&rec
->stripes
);
1548 /* Initialize endio and LRU list locks. */
1551 spin_lock_init(sc
->locks
+ i
);
1553 /* Initialize atomic variables. */
1554 atomic_set(&sc
->stripes
, 0);
1555 atomic_set(&sc
->stripes_to_set
, 0);
1556 atomic_set(&sc
->active_stripes
, 0);
1557 atomic_set(&sc
->active_stripes_max
, 0); /* REMOVEME: statistics. */
1560 * We need a runtime unique # to suffix the kmem cache name
1561 * because we'll have one for each active RAID set.
1563 md
= dm_table_get_md(rs
->ti
->table
);
1565 snprintf(sc
->kc
.name
, sizeof(sc
->kc
.name
), "%s-%d.%d", TARGET
,
1566 disk
->first_minor
, atomic_inc_return(&_stripe_sc_nr
));
1567 sc
->kc
.cache
= kmem_cache_create(sc
->kc
.name
, stripe_size(rs
),
1572 /* Create memory cache client context for RAID stripe cache. */
1573 sc
->mem_cache_client
=
1574 dm_mem_cache_client_create(stripes
, rs
->set
.raid_devs
,
1575 chunk_pages(rs
->set
.io_size
));
1576 if (IS_ERR(sc
->mem_cache_client
))
1577 return PTR_ERR(sc
->mem_cache_client
);
1579 /* Create memory cache client context for RAID recovery stripe(s). */
1580 rstripes
= rec
->recovery_stripes
;
1581 rec
->mem_cache_client
=
1582 dm_mem_cache_client_create(rstripes
, rs
->set
.raid_devs
,
1583 chunk_pages(rec
->io_size
));
1584 if (IS_ERR(rec
->mem_cache_client
))
1585 return PTR_ERR(rec
->mem_cache_client
);
1587 /* Create dm-io client context for IO stripes. */
1588 sc
->dm_io_client
= dm_io_client_create();
1589 if (IS_ERR(sc
->dm_io_client
))
1590 return PTR_ERR(sc
->dm_io_client
);
1592 /* FIXME: intermingeled with stripe cache initialization. */
1593 /* Create dm-io client context for recovery stripes. */
1594 rec
->dm_io_client
= dm_io_client_create();
1595 if (IS_ERR(rec
->dm_io_client
))
1596 return PTR_ERR(rec
->dm_io_client
);
1598 /* Allocate stripes for set recovery. */
1599 while (rstripes
--) {
1600 stripe
= stripe_alloc(sc
, rec
->mem_cache_client
, SC_KEEP
);
1604 stripe
->recover
= kzalloc(sizeof(*stripe
->recover
), GFP_KERNEL
);
1605 if (!stripe
->recover
) {
1606 stripe_free(stripe
, rec
->mem_cache_client
);
1610 SetStripeRecover(stripe
);
1611 stripe
->io
.size
= rec
->io_size
;
1612 list_add_tail(stripe
->lists
+ LIST_RECOVER
, &rec
->stripes
);
1613 /* Don't add recovery stripes to LRU list! */
1617 * Allocate the stripe objetcs from the
1618 * cache and add them to the LRU list.
1620 r
= sc_grow(sc
, stripes
, SC_KEEP
);
1622 atomic_set(&sc
->stripes_last
, stripes
);
1627 /* Destroy the stripe cache. */
1628 static void sc_exit(struct stripe_cache
*sc
)
1630 struct raid_set
*rs
= RS(sc
);
1633 stripe_recover_free(rs
);
1634 BUG_ON(sc_shrink(sc
, atomic_read(&sc
->stripes
)));
1635 kmem_cache_destroy(sc
->kc
.cache
);
1636 sc
->kc
.cache
= NULL
;
1638 if (sc
->mem_cache_client
&& !IS_ERR(sc
->mem_cache_client
))
1639 dm_mem_cache_client_destroy(sc
->mem_cache_client
);
1641 if (sc
->dm_io_client
&& !IS_ERR(sc
->dm_io_client
))
1642 dm_io_client_destroy(sc
->dm_io_client
);
1644 hash_exit(&sc
->hash
);
1649 * Calculate RAID address
1651 * Delivers tuple with the index of the data disk holding the chunk
1652 * in the set, the parity disks index and the start of the stripe
1653 * within the address space of the set (used as the stripe cache hash key).
1656 static struct raid_address
*raid_address(struct raid_set
*rs
, sector_t sector
,
1657 struct raid_address
*addr
)
1659 sector_t stripe
, tmp
;
1662 * chunk_number = sector / chunk_size
1663 * stripe_number = chunk_number / data_devs
1664 * di = stripe % data_devs;
1666 stripe
= sector
>> rs
->set
.chunk_shift
;
1667 addr
->di
= sector_div(stripe
, rs
->set
.data_devs
);
1669 switch (rs
->set
.raid_type
->level
) {
1671 addr
->pi
= rs
->set
.pi
;
1672 goto check_shift_di
;
1675 addr
->pi
= sector_div(tmp
, rs
->set
.raid_devs
);
1677 switch (rs
->set
.raid_type
->algorithm
) {
1678 case left_asym
: /* Left asymmetric. */
1679 addr
->pi
= rs
->set
.data_devs
- addr
->pi
;
1680 case right_asym
: /* Right asymmetric. */
1682 if (addr
->di
>= addr
->pi
)
1685 case left_sym
: /* Left symmetric. */
1686 addr
->pi
= rs
->set
.data_devs
- addr
->pi
;
1687 case right_sym
: /* Right symmetric. */
1688 addr
->di
= (addr
->pi
+ addr
->di
+ 1) %
1691 case none
: /* Ain't happen: RAID4 algorithm placeholder. */
1697 * Start offset of the stripes chunk on any single device of the RAID
1698 * set, adjusted in case io size differs from chunk size.
1700 addr
->key
= (stripe
<< rs
->set
.chunk_shift
) +
1701 (sector
& rs
->set
.io_inv_mask
);
1706 * Copy data across between stripe pages and bio vectors.
1708 * Pay attention to data alignment in stripe and bio pages.
1710 static void bio_copy_page_list(int rw
, struct stripe
*stripe
,
1711 struct page_list
*pl
, struct bio
*bio
)
1713 unsigned i
, page_offset
;
1715 struct raid_set
*rs
= RS(stripe
->sc
);
1718 /* Get start page in page list for this sector. */
1719 i
= (bio
->bi_sector
& rs
->set
.io_mask
) / SECTORS_PER_PAGE
;
1720 pl
= pl_elem(pl
, i
);
1724 page_addr
= page_address(pl
->page
);
1725 page_offset
= to_bytes(bio
->bi_sector
& (SECTORS_PER_PAGE
- 1));
1727 /* Walk all segments and copy data across between bio_vecs and pages. */
1728 bio_for_each_segment(bv
, bio
, i
) {
1729 int len
= bv
->bv_len
, size
;
1730 unsigned bio_offset
= 0;
1731 void *bio_addr
= __bio_kmap_atomic(bio
, i
, KM_USER0
);
1733 size
= (page_offset
+ len
> PAGE_SIZE
) ?
1734 PAGE_SIZE
- page_offset
: len
;
1737 memcpy(bio_addr
+ bio_offset
,
1738 page_addr
+ page_offset
, size
);
1740 memcpy(page_addr
+ page_offset
,
1741 bio_addr
+ bio_offset
, size
);
1743 page_offset
+= size
;
1744 if (page_offset
== PAGE_SIZE
) {
1746 * We reached the end of the chunk page ->
1747 * need to refer to the next one to copy more data.
1751 /* Get next page. */
1755 page_addr
= page_address(pl
->page
);
1758 /* REMOVEME: statistics. */
1759 atomic_inc(rs
->stats
+ S_BIO_COPY_PL_NEXT
);
1764 __bio_kunmap_atomic(bio_addr
, KM_USER0
);
1769 * Xor optimization macros.
1771 /* Xor data pointer declaration and initialization macros. */
1772 #define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
1773 #define DECLARE_3 DECLARE_2, *d2 = data[2]
1774 #define DECLARE_4 DECLARE_3, *d3 = data[3]
1775 #define DECLARE_5 DECLARE_4, *d4 = data[4]
1776 #define DECLARE_6 DECLARE_5, *d5 = data[5]
1777 #define DECLARE_7 DECLARE_6, *d6 = data[6]
1778 #define DECLARE_8 DECLARE_7, *d7 = data[7]
1780 /* Xor unrole macros. */
1781 #define D2(n) d0[n] = d0[n] ^ d1[n]
1782 #define D3(n) D2(n) ^ d2[n]
1783 #define D4(n) D3(n) ^ d3[n]
1784 #define D5(n) D4(n) ^ d4[n]
1785 #define D6(n) D5(n) ^ d5[n]
1786 #define D7(n) D6(n) ^ d6[n]
1787 #define D8(n) D7(n) ^ d7[n]
1789 #define X_2(macro, offset) macro(offset); macro(offset + 1);
1790 #define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
1791 #define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
1792 #define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
1793 #define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
1794 #define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
1796 /* Define a _xor_#chunks_#xors_per_run() function. */
1797 #define _XOR(chunks, xors_per_run) \
1798 static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
1800 unsigned end = XOR_SIZE / sizeof(data[0]), i; \
1801 DECLARE_ ## chunks; \
1803 for (i = 0; i < end; i += xors_per_run) { \
1804 X_ ## xors_per_run(D ## chunks, i); \
1808 /* Define xor functions for 2 - 8 chunks and xors per run. */
1809 #define MAKE_XOR_PER_RUN(xors_per_run) \
1810 _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
1811 _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
1812 _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
1813 _XOR(8, xors_per_run);
1815 MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
1816 MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
1817 MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
1818 MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
1820 #define MAKE_XOR(xors_per_run) \
1822 void (*f)(unsigned long **); \
1823 } static xor_funcs ## xors_per_run[] = { \
1824 { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
1826 { _xor2_ ## xors_per_run }, \
1827 { _xor3_ ## xors_per_run }, \
1828 { _xor4_ ## xors_per_run }, \
1829 { _xor5_ ## xors_per_run }, \
1830 { _xor6_ ## xors_per_run }, \
1831 { _xor7_ ## xors_per_run }, \
1832 { _xor8_ ## xors_per_run }, \
1835 static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
1837 /* Call respective function for amount of chunks. */ \
1838 xor_funcs ## xors_per_run[n].f(data); \
1841 /* Define xor_8() - xor_64 functions. */
1847 * END xor optimization macros.
1850 /* Maximum number of chunks, which can be xor'ed in one go. */
1851 #define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
1853 /* xor_blocks wrapper to allow for using that crypto library function. */
1854 static void xor_blocks_wrapper(unsigned n
, unsigned long **data
)
1856 BUG_ON(n
< 2 || n
> MAX_XOR_BLOCKS
+ 1);
1857 xor_blocks(n
- 1, XOR_SIZE
, (void *) data
[0], (void **) data
+ 1);
1863 } static xor_funcs
[] = {
1864 { xor_64
, "xor_64" },
1865 { xor_32
, "xor_32" },
1866 { xor_16
, "xor_16" },
1868 { xor_blocks_wrapper
, "xor_blocks" },
1872 * Check, if chunk has to be xored in/out:
1874 * o if writes are queued
1875 * o if writes are merged
1876 * o if stripe is to be reconstructed
1877 * o if recovery stripe
1879 static inline int chunk_must_xor(struct stripe_chunk
*chunk
)
1881 if (ChunkUptodate(chunk
)) {
1882 BUG_ON(!bio_list_empty(BL_CHUNK(chunk
, WRITE_QUEUED
)) &&
1883 !bio_list_empty(BL_CHUNK(chunk
, WRITE_MERGED
)));
1885 if (!bio_list_empty(BL_CHUNK(chunk
, WRITE_QUEUED
)) ||
1886 !bio_list_empty(BL_CHUNK(chunk
, WRITE_MERGED
)))
1889 if (StripeReconstruct(chunk
->stripe
) ||
1890 StripeRecover(chunk
->stripe
))
1900 * This indexes into the chunks of a stripe and their pages.
1902 * All chunks will be xored into the indexed (@pi)
1903 * chunk in maximum groups of xor.chunks.
1906 static void xor(struct stripe
*stripe
, unsigned pi
, unsigned sector
)
1908 struct raid_set
*rs
= RS(stripe
->sc
);
1909 unsigned max_chunks
= rs
->xor.chunks
, n
= 1,
1910 o
= sector
/ SECTORS_PER_PAGE
, /* Offset into the page_list. */
1911 p
= rs
->set
.raid_devs
;
1912 unsigned long **d
= rs
->data
;
1913 xor_function_t xor_f
= rs
->xor.f
->f
;
1915 BUG_ON(sector
> stripe
->io
.size
);
1917 /* Address of parity page to xor into. */
1918 d
[0] = page_address(pl_elem(PL(stripe
, pi
), o
)->page
);
1921 /* Preset pointers to data pages. */
1922 if (p
!= pi
&& chunk_must_xor(CHUNK(stripe
, p
)))
1923 d
[n
++] = page_address(pl_elem(PL(stripe
, p
), o
)->page
);
1925 /* If max chunks -> xor. */
1926 if (n
== max_chunks
) {
1927 mutex_lock(&rs
->io
.xor_lock
);
1929 mutex_unlock(&rs
->io
.xor_lock
);
1934 /* If chunks -> xor. */
1936 mutex_lock(&rs
->io
.xor_lock
);
1938 mutex_unlock(&rs
->io
.xor_lock
);
1942 /* Common xor loop through all stripe page lists. */
1943 static void common_xor(struct stripe
*stripe
, sector_t count
,
1944 unsigned off
, unsigned pi
)
1949 for (sector
= off
; sector
< count
; sector
+= SECTORS_PER_PAGE
)
1950 xor(stripe
, pi
, sector
);
1952 /* Set parity page uptodate and clean. */
1953 chunk_set(CHUNK(stripe
, pi
), CLEAN
);
1954 atomic_inc(RS(stripe
->sc
)->stats
+ S_XORS
); /* REMOVEME: statistics. */
1958 * Calculate parity sectors on intact stripes.
1960 * Need to calculate raid address for recover stripe, because its
1961 * chunk sizes differs and is typically larger than io chunk size.
1963 static void parity_xor(struct stripe
*stripe
)
1965 struct raid_set
*rs
= RS(stripe
->sc
);
1966 int size_differs
= stripe
->io
.size
!= rs
->set
.io_size
;
1967 unsigned chunk_size
= rs
->set
.chunk_size
, io_size
= stripe
->io
.size
,
1968 xor_size
= chunk_size
> io_size
? io_size
: chunk_size
;
1971 /* This can be the recover stripe with a larger io size. */
1972 for (off
= 0; off
< io_size
; off
+= xor_size
) {
1974 * Recover stripe is likely bigger than regular io
1975 * ones and has no precalculated parity disk index ->
1976 * need to calculate RAID address.
1978 if (unlikely(size_differs
)) {
1979 struct raid_address addr
;
1981 raid_address(rs
, (stripe
->key
+ off
) *
1982 rs
->set
.data_devs
, &addr
);
1983 stripe
->idx
.parity
= addr
.pi
;
1984 stripe_zero_pl_part(stripe
, addr
.pi
, off
, xor_size
);
1987 common_xor(stripe
, xor_size
, off
, stripe
->idx
.parity
);
1988 chunk_set(CHUNK(stripe
, stripe
->idx
.parity
), DIRTY
);
1992 /* Reconstruct missing chunk. */
1993 static void stripe_reconstruct(struct stripe
*stripe
)
1995 struct raid_set
*rs
= RS(stripe
->sc
);
1996 int p
= rs
->set
.raid_devs
, pr
= stripe
->idx
.recover
;
2000 /* Check if all but the chunk to be reconstructed are uptodate. */
2002 BUG_ON(p
!= pr
&& !ChunkUptodate(CHUNK(stripe
, p
)));
2004 /* REMOVEME: statistics. */
2005 atomic_inc(rs
->stats
+ (RSDegraded(rs
) ? S_RECONSTRUCT_EI
:
2006 S_RECONSTRUCT_DEV
));
2007 /* Zero chunk to be reconstructed. */
2008 stripe_zero_chunk(stripe
, pr
);
2009 common_xor(stripe
, stripe
->io
.size
, 0, pr
);
2013 * Recovery io throttling
2015 /* Conditionally reset io counters. */
2016 static int recover_io_reset(struct raid_set
*rs
)
2018 unsigned long j
= jiffies
;
2020 /* Pay attention to jiffies overflows. */
2021 if (j
> rs
->recover
.last_jiffies
+ HZ
||
2022 j
< rs
->recover
.last_jiffies
) {
2023 atomic_set(rs
->recover
.io_count
+ IO_WORK
, 0);
2024 atomic_set(rs
->recover
.io_count
+ IO_RECOVER
, 0);
2025 rs
->recover
.last_jiffies
= j
;
2033 static void recover_io_count(struct stripe
*stripe
)
2035 struct raid_set
*rs
= RS(stripe
->sc
);
2037 atomic_inc(rs
->recover
.io_count
+
2038 (StripeRecover(stripe
) ? IO_RECOVER
: IO_WORK
));
2041 /* Try getting a stripe either from the hash or from the LRU list. */
2042 static struct stripe
*stripe_find(struct raid_set
*rs
,
2043 struct raid_address
*addr
)
2046 struct stripe_cache
*sc
= &rs
->sc
;
2047 struct stripe
*stripe
;
2049 /* Try stripe from hash. */
2050 stripe
= stripe_lookup(sc
, addr
->key
);
2052 r
= stripe_get(stripe
);
2054 goto get_lock_failed
;
2056 atomic_inc(rs
->stats
+ S_HITS_1ST
); /* REMOVEME: statistics. */
2058 /* Not in hash -> try to get an LRU stripe. */
2059 stripe
= stripe_lru_pop(sc
);
2062 * An LRU stripe may not be referenced
2063 * and may never have ios pending!
2065 BUG_ON(stripe_ref(stripe
));
2066 BUG_ON(stripe_io_ref(stripe
));
2068 /* Remove from hash if on before reuse. */
2069 stripe_hash_del(stripe
);
2071 /* Invalidate before reinserting with changed key. */
2072 stripe_invalidate(stripe
);
2074 stripe
->key
= addr
->key
;
2075 stripe
->region
= dm_rh_sector_to_region(rs
->recover
.rh
,
2077 stripe
->idx
.parity
= addr
->pi
;
2078 r
= stripe_get(stripe
);
2080 goto get_lock_failed
;
2082 /* Insert stripe into the stripe hash. */
2083 stripe_insert(&sc
->hash
, stripe
);
2084 /* REMOVEME: statistics. */
2085 atomic_inc(rs
->stats
+ S_INSCACHE
);
2099 * I need to do it here because I can't in interrupt
2101 /* End io all bios on a bio list. */
2102 static void bio_list_endio(struct stripe
*stripe
, struct bio_list
*bl
,
2105 struct raid_set
*rs
= RS(stripe
->sc
);
2107 struct page_list
*pl
= PL(stripe
, p
);
2108 struct stripe_chunk
*chunk
= CHUNK(stripe
, p
);
2110 /* Update region counters. */
2111 while ((bio
= bio_list_pop(bl
))) {
2112 if (bio_data_dir(bio
) == WRITE
)
2113 /* Drop io pending count for any writes. */
2114 dm_rh_dec(rs
->recover
.rh
, stripe
->region
);
2116 /* Copy data accross. */
2117 bio_copy_page_list(READ
, stripe
, pl
, bio
);
2119 bio_endio(bio
, error
);
2121 /* REMOVEME: statistics. */
2122 atomic_inc(rs
->stats
+ (bio_data_dir(bio
) == READ
?
2123 S_BIOS_ENDIO_READ
: S_BIOS_ENDIO_WRITE
));
2127 io_put(rs
); /* Wake any suspend waiters on last bio. */
2132 * End io all reads/writes on a stripe copying
2133 * read data accross from stripe to bios and
2134 * decrementing region counters for writes.
2136 * Processing of ios depeding on state:
2137 * o no chunk error -> endio ok
2139 * - chunk error and read -> ignore to be requeued
2140 * - chunk error and write -> endio ok
2141 * o dead (more than parity_devs failed) and chunk_error-> endio failed
2143 static void stripe_endio(int rw
, struct stripe
*stripe
)
2145 struct raid_set
*rs
= RS(stripe
->sc
);
2146 unsigned p
= rs
->set
.raid_devs
;
2147 int write
= (rw
!= READ
);
2150 struct stripe_chunk
*chunk
= CHUNK(stripe
, p
);
2151 struct bio_list
*bl
;
2153 BUG_ON(ChunkLocked(chunk
));
2155 bl
= BL_CHUNK(chunk
, rw
);
2156 if (bio_list_empty(bl
))
2159 if (unlikely(ChunkError(chunk
) || !ChunkUptodate(chunk
))) {
2160 /* RAID set dead. */
2161 if (unlikely(RSDead(rs
)))
2162 bio_list_endio(stripe
, bl
, p
, -EIO
);
2163 /* RAID set degraded. */
2165 bio_list_endio(stripe
, bl
, p
, 0);
2167 BUG_ON(!RSDegraded(rs
) && ChunkDirty(chunk
));
2168 bio_list_endio(stripe
, bl
, p
, 0);
/* Fail all ios hanging off all bio lists of a stripe. */
static void stripe_fail_io(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p = rs->set.raid_devs;

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		int i = ARRAY_SIZE(chunk->bl);

		/* Fail all bios on all bio lists of the stripe. */
		while (i--) {
			struct bio_list *bl = chunk->bl + i;

			if (!bio_list_empty(bl))
				bio_list_endio(stripe, bl, p, -EIO);
		}
	}

	/* Put stripe on LRU list. */
	BUG_ON(stripe_io_ref(stripe));
	BUG_ON(stripe_ref(stripe));
}

/* Unlock all required chunks. */
static void stripe_chunks_unlock(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;
	struct stripe_chunk *chunk;

	while (p--) {
		chunk = CHUNK(stripe, p);

		if (TestClearChunkUnlock(chunk))
			ClearChunkLocked(chunk);
	}
}
/*
 * Queue reads and writes to a stripe by hanging
 * their bios off the stripe's read/write lists.
 */
static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
			    struct bio_list *reject)
{
	struct raid_address addr;
	struct stripe *stripe;

	stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
	if (stripe) {
		int r = 0, rw = bio_data_dir(bio);

		/* Distinguish reads and writes. */
		bio_list_add(BL(stripe, addr.di, rw), bio);

		if (rw == READ)
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_READ);
		else {
			/* Increment pending write count on region. */
			dm_rh_inc(rs->recover.rh, stripe->region);
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
		}

		/*
		 * Put on io (flush) list in case of
		 * initial bio queued to chunk.
		 */
		if (chunk_get(CHUNK(stripe, addr.di)) == 1)
			stripe_flush_add(stripe);

		return r;
	}

	/* Got no stripe from cache or failed to lock it -> reject bio. */
	bio_list_add(reject, bio);
	atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
	return 0;
}
/*
 * Handle all stripes by handing them to the daemon, because we can't
 * map their chunk pages to copy the data in interrupt context.
 *
 * We don't want to handle them here either, while interrupts are disabled.
 */

/* Read/write endio function for dm-io (interrupt context). */
static void endio(unsigned long error, void *context)
{
	struct stripe_chunk *chunk = context;

	if (unlikely(error)) {
		chunk_set(chunk, ERROR);
		/* REMOVEME: statistics. */
		atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
	} else
		chunk_set(chunk, CLEAN);

	/*
	 * For recovery stripes, I need to reset the locked flag
	 * here, because those aren't processed in do_endios().
	 */
	if (unlikely(StripeRecover(chunk->stripe)))
		ClearChunkLocked(chunk);
	else
		SetChunkUnlock(chunk);

	/* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
	stripe_put_references(chunk->stripe);
}
/* Read/Write a chunk asynchronously. */
static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
{
	struct stripe_cache *sc = stripe->sc;
	struct raid_set *rs = RS(sc);
	struct dm_mem_cache_object *obj = stripe->obj + p;
	struct page_list *pl = obj->pl;
	struct stripe_chunk *chunk = CHUNK(stripe, p);
	struct raid_dev *dev = rs->dev + p;
	struct dm_io_region io = {
		.bdev = dev->dev->bdev,
		.sector = stripe->key,
		.count = stripe->io.size,
	};
	struct dm_io_request control = {
		.bi_rw = ChunkDirty(chunk) ? WRITE : READ,
		.mem = {
			.type = DM_IO_PAGE_LIST,
			.ptr.pl = pl,
			.offset = 0,
		},
		.notify = {
			.fn = endio,
			.context = chunk,
		},
		.client = StripeRecover(stripe) ? rs->recover.dm_io_client :
						  sc->dm_io_client,
	};

	BUG_ON(ChunkLocked(chunk));
	BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
	BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));

	/*
	 * Don't rw past end of device, which can happen, because
	 * typically sectors_per_dev isn't divisible by io_size.
	 */
	if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
		io.count = rs->set.sectors_per_dev - io.sector;

	io.sector += dev->start;	/* Add <offset>. */
	recover_io_count(stripe);	/* Recovery io accounting. */

	/* REMOVEME: statistics. */
	atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
						    S_DM_IO_READ));
	SetChunkLocked(chunk);
	SetDevIoQueued(dev);
	BUG_ON(dm_io(&control, 1, &io, NULL));
}
/*
 * Write dirty or read not uptodate page lists of a stripe.
 */
static int stripe_chunks_rw(struct stripe *stripe)
{
	int r;
	struct raid_set *rs = RS(stripe->sc);

	/*
	 * Increment the pending count on the stripe
	 * first, so that we don't race in endio().
	 *
	 * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
	 *
	 * o not uptodate
	 * o dirtied by writes merged
	 * o dirtied by parity calculations
	 */
	r = for_each_io_dev(stripe, stripe_get_references);
	if (r) {
		/* Io needed: chunks are either not uptodate or dirty. */
		int max;	/* REMOVEME: */
		struct stripe_cache *sc = &rs->sc;

		/* Submit actual io. */
		for_each_io_dev(stripe, stripe_chunk_rw);

		/* REMOVEME: statistics */
		max = sc_active(sc);
		if (atomic_read(&sc->active_stripes_max) < max)
			atomic_set(&sc->active_stripes_max, max);

		atomic_inc(rs->stats + S_FLUSHS);
		/* END REMOVEME: statistics */
	}

	return r;
}
/* Merge in all writes hence dirtying respective chunks. */
static void stripe_merge_writes(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);

		if (!bio_list_empty(write)) {
			struct bio *bio;
			struct page_list *pl = stripe->obj[p].pl;

			/*
			 * We can play with the lists without holding a lock,
			 * because it is just us accessing them anyway.
			 */
			bio_list_for_each(bio, write)
				bio_copy_page_list(WRITE, stripe, pl, bio);

			bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
			bio_list_init(write);
			chunk_set(chunk, DIRTY);
		}
	}
}
/* Queue all writes to get merged. */
static int stripe_queue_writes(struct stripe *stripe)
{
	int r = 0;
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		struct bio_list *write = BL_CHUNK(chunk, WRITE);

		if (!bio_list_empty(write)) {
			bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
			bio_list_init(write);
			r = 1;
		}
	}

	return r;
}
/* Check, if a chunk gets completely overwritten. */
static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
{
	unsigned sectors = 0;
	struct bio *bio;
	struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);

	bio_list_for_each(bio, bl)
		sectors += bio_sectors(bio);

	BUG_ON(sectors > RS(stripe->sc)->set.io_size);
	return sectors == RS(stripe->sc)->set.io_size;
}
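/*
 * Example (illustrative numbers only, not taken from a real run): with
 * io_size = 64 sectors and two queued writes of 32 sectors each covering
 * the chunk, sectors sums to 64 and the chunk counts as completely
 * overwritten, so it does not have to be read in before merging writes.
 */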
/*
 * Avoid io on broken/reconstructed drive in order to
 * reconstruct data on endio.
 *
 * (*1*) We set StripeReconstruct() in here, so that _do_endios()
 *	 will trigger a reconstruct call before resetting it.
 */
static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
{
	struct stripe_chunk *chunk = CHUNK(stripe, pr);

	/*
	 * Allow io on all chunks but the indexed one,
	 * because we're either degraded or prohibit it
	 * on the one for later reconstruction.
	 */
	/* Includes ClearChunkIo(), ClearChunkUptodate(). */
	stripe_chunk_invalidate(chunk);
	stripe->idx.recover = pr;
	SetStripeReconstruct(stripe);

	/* REMOVEME: statistics. */
	atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
	return -EPERM;
}
/* Chunk locked/uptodate and device failed tests. */
static struct stripe_chunk *
stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
{
	struct raid_set *rs = RS(stripe->sc);
	struct stripe_chunk *chunk = CHUNK(stripe, p);

	/* Can't access active chunks. */
	if (ChunkLocked(chunk)) {
		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + S_CHUNK_LOCKED);
		return NULL;
	}

	/* Can't access broken device. */
	if (ChunkError(chunk) || DevFailed(rs->dev + p))
		return NULL;

	/* Can access uptodate chunks. */
	if (ChunkUptodate(chunk)) {
		(*chunks_uptodate)++;
		return NULL;
	}

	return chunk;
}
/*
 * Degraded/reconstruction mode.
 *
 * Check stripe state to figure which chunks don't need IO.
 *
 * Returns 0 for fully operational, -EPERM for degraded/resynchronizing.
 */
static int stripe_check_reconstruct(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);

	if (RSDead(rs)) {
		ClearStripeReconstruct(stripe);
		ClearStripeReconstructed(stripe);
		stripe_allow_io(stripe);
		return 0;
	}

	/* Avoid further reconstruction setting, when already set. */
	if (StripeReconstruct(stripe)) {
		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + S_RECONSTRUCT_SET);
		return -EBUSY;
	}

	/* Initially allow io on all chunks. */
	stripe_allow_io(stripe);

	/* Return if stripe is already reconstructed. */
	if (StripeReconstructed(stripe)) {
		atomic_inc(rs->stats + S_RECONSTRUCTED);
		return 0;
	}

	/*
	 * Degraded/reconstruction mode (device failed) ->
	 * avoid io on the failed device.
	 */
	if (unlikely(RSDegraded(rs))) {
		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + S_DEGRADED);
		/* Allow IO on all devices but the dead one. */
		BUG_ON(rs->set.ei < 0);
		return stripe_chunk_set_io_flags(stripe, rs->set.ei);
	} else {
		int sync, pi = dev_for_parity(stripe, &sync);

		/*
		 * Reconstruction mode (ie. a particular (replaced) device or
		 * some (rotating) parity chunk is being resynchronized) ->
		 *   o make sure all needed chunks are read in
		 *   o cope with 3/4 disk array special case where it
		 *     doesn't make a difference to read in parity
		 *     to xor data in/out
		 */
		if (RSEnforceParityCreation(rs) || !sync) {
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_NOSYNC);
			/* Allow IO on all devs but the one to reconstruct. */
			return stripe_chunk_set_io_flags(stripe, pi);
		}
	}

	return 0;
}
/*
 * Check, if stripe is ready to merge writes.
 * I.e. if all chunks are present to allow to merge bios.
 *
 * We prohibit io on:
 *
 * o chunks without bios
 * o chunks which get completely written over
 */
static int stripe_merge_possible(struct stripe *stripe, int nosync)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned chunks_overwrite = 0, chunks_prohibited = 0,
		 chunks_uptodate = 0, p = rs->set.raid_devs;

	/* Walk all chunks. */
	while (p--) {
		struct stripe_chunk *chunk;

		/* Prohibit io on broken devices. */
		if (DevFailed(rs->dev + p)) {
			chunk = CHUNK(stripe, p);
			goto prohibit_io;
		}

		/* We can't optimize any further if no chunk. */
		chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
		if (!chunk || nosync)
			continue;

		/*
		 * We have a chunk, which is not uptodate.
		 *
		 * If this is not parity and we don't have
		 * reads queued, we can optimize further.
		 */
		if (p != stripe->idx.parity &&
		    bio_list_empty(BL_CHUNK(chunk, READ)) &&
		    bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
			if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
				goto prohibit_io;
			else if (RSCheckOverwrite(rs) &&
				 stripe_check_chunk_overwrite(stripe, p))
				/* Completely overwritten chunk. */
				chunks_overwrite++;
		}

		/* Allow io for chunks with bios and overwritten ones. */
		SetChunkIo(chunk);
		continue;

prohibit_io:
		/* No io for broken devices or for chunks w/o bios. */
		ClearChunkIo(chunk);
		chunks_prohibited++;
		/* REMOVEME: statistics. */
		atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
	}

	/* All data chunks will get written over. */
	if (chunks_overwrite == rs->set.data_devs)
		atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
	else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
		/* We don't have enough chunks to merge. */
		atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
		return -EPERM;
	}

	/*
	 * If we have all chunks up to date or overwrite them, we
	 * just zero the parity chunk and let stripe_rw() recreate it.
	 */
	if (chunks_uptodate == rs->set.raid_devs ||
	    chunks_overwrite == rs->set.data_devs) {
		stripe_zero_chunk(stripe, stripe->idx.parity);
		BUG_ON(StripeReconstruct(stripe));
		SetStripeReconstruct(stripe);	/* Enforce xor in caller. */
	} else {
		/*
		 * With less chunks, we xor parity out.
		 *
		 * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
		 *	 so that only chunks with queued or merged writes
		 *	 are being xored.
		 */
		parity_xor(stripe);
	}

	/*
	 * We do have enough chunks to merge.
	 * All chunks are uptodate or get written over.
	 */
	atomic_inc(rs->stats + S_CAN_MERGE);	/* REMOVEME: statistics. */
	return 0;
}
/*
 * Avoid reading chunks in case we're fully operational.
 *
 * We prohibit io on any chunks without bios but the parity chunk.
 */
static void stripe_avoid_reads(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned dummy = 0, p = rs->set.raid_devs;

	/* Walk all chunks. */
	while (p--) {
		struct stripe_chunk *chunk =
			stripe_chunk_check(stripe, p, &dummy);

		if (!chunk)
			continue;

		/* If parity or any bios pending -> allow io. */
		if (chunk_ref(chunk) || p == stripe->idx.parity)
			continue;

		ClearChunkIo(chunk);
		/* REMOVEME: statistics. */
		atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
	}
}
/*
 * Read/write a stripe.
 *
 * All stripe read/write activity goes through this function
 * unless recovery, which has to call stripe_chunk_rw() directly.
 *
 * Make sure we don't try already merged stripes in order
 * to avoid data corruption.
 *
 * Check the state of the RAID set and if degraded (or
 * resynchronizing for reads), read in all other chunks but
 * the one on the dead/resynchronizing device in order to be
 * able to reconstruct the missing one in _do_endios().
 *
 * Can be called on active stripes in order
 * to dispatch new io on inactive chunks.
 *
 * States to cover:
 *   o stripe to read and/or write
 *   o stripe with error to reconstruct
 */
static int stripe_rw(struct stripe *stripe)
{
	int nosync, r;
	struct raid_set *rs = RS(stripe->sc);

	/*
	 * Check, if a chunk needs to be reconstructed
	 * because of a degraded set or a region out of sync.
	 */
	nosync = stripe_check_reconstruct(stripe);
	switch (nosync) {
	case -EBUSY:
		return 0; /* Wait for stripe reconstruction to finish. */
	case -EPERM:
		goto io;
	}

	/*
	 * If we don't have merged writes pending, we can schedule
	 * queued writes to be merged next without corrupting data.
	 */
	if (!StripeMerged(stripe)) {
		r = stripe_queue_writes(stripe);
		if (r)
			/* Writes got queued -> flag RBW. */
			SetStripeRBW(stripe);
	}

	/*
	 * Merge all writes hanging off uptodate/overwritten
	 * chunks of the stripe.
	 */
	if (StripeRBW(stripe)) {
		r = stripe_merge_possible(stripe, nosync);
		if (!r) { /* Merge possible. */
			struct stripe_chunk *chunk;

			/*
			 * I rely on valid parity in order
			 * to xor a fraction of chunks out
			 * of parity and back in.
			 */
			stripe_merge_writes(stripe); /* Merge writes in. */
			parity_xor(stripe);	     /* Update parity. */
			ClearStripeReconstruct(stripe);	/* Reset xor enforce. */
			SetStripeMerged(stripe);	/* Writes merged. */
			ClearStripeRBW(stripe);		/* Disable RBW. */

			/*
			 * REMOVEME: sanity check on parity chunk
			 *	     states after writes got merged.
			 */
			chunk = CHUNK(stripe, stripe->idx.parity);
			BUG_ON(ChunkLocked(chunk));
			BUG_ON(!ChunkUptodate(chunk));
			BUG_ON(!ChunkDirty(chunk));
			BUG_ON(!ChunkIo(chunk));
		}
	} else if (!nosync && !StripeMerged(stripe))
		/* Read avoidance if not degraded/resynchronizing/merged. */
		stripe_avoid_reads(stripe);

io:
	/* Now submit any reads/writes for non-uptodate or dirty chunks. */
	r = stripe_chunks_rw(stripe);
	if (!r) {
		/*
		 * No io submitted because of chunk io
		 * prohibited or locked chunks/failed devices
		 * -> push to end io list for processing.
		 */
		stripe_endio_push(stripe);
		atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
	}

	return r;
}
/*
 * Recovery functions
 */

/* Read a stripe off a raid set for recovery. */
static int stripe_recover_read(struct stripe *stripe, int pi)
{
	BUG_ON(stripe_io_ref(stripe));

	/* Invalidate all chunks so that they get read in. */
	stripe_chunks_invalidate(stripe);
	stripe_allow_io(stripe); /* Allow io on all recovery chunks. */

	/*
	 * If we are reconstructing a particular device, we can avoid
	 * reading the respective chunk in, because we're going to
	 * reconstruct it anyway.
	 *
	 * We can't do that for resynchronization of rotating parity,
	 * because the recovery stripe chunk size is typically larger
	 * than the sets chunk size.
	 */
	if (pi > -1)
		ClearChunkIo(CHUNK(stripe, pi));

	return stripe_chunks_rw(stripe);
}

/* Write a stripe to a raid set for recovery. */
static int stripe_recover_write(struct stripe *stripe, int pi)
{
	BUG_ON(stripe_io_ref(stripe));

	/*
	 * If this is a reconstruct of a particular device, then
	 * reconstruct the respective chunk, else create parity chunk.
	 */
	if (pi > -1) {
		stripe_zero_chunk(stripe, pi);
		common_xor(stripe, stripe->io.size, 0, pi);
		chunk_set(CHUNK(stripe, pi), DIRTY);
	} else
		parity_xor(stripe);

	return stripe_chunks_rw(stripe);
}

/* Read/write a recovery stripe. */
static int stripe_recover_rw(struct stripe *stripe)
{
	int r = 0, sync = 0;

	/* Read/write flip-flop. */
	if (TestClearStripeRBW(stripe)) {
		SetStripeMerged(stripe);
		stripe->key = stripe->recover->pos;
		r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
	} else if (TestClearStripeMerged(stripe))
		r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));

	return r;
}
/* Recover bandwidth available ? */
static int recover_bandwidth(struct raid_set *rs)
{
	int r, work;

	/* On reset or when bios delayed -> allow recovery. */
	r = recover_io_reset(rs);
	if (r || RSBandwidth(rs))
		goto out;

	work = atomic_read(rs->recover.io_count + IO_WORK);
	if (work) {
		/* Pay attention to larger recover stripe size. */
		int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
			      rs->recover.io_size / rs->set.io_size;

		/*
		 * Don't use more than given bandwidth
		 * of the work io for recovery.
		 */
		if (recover > work / rs->recover.bandwidth_work) {
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_NO_BANDWIDTH);
			return 0;
		}
	}

out:
	atomic_inc(rs->stats + S_BANDWIDTH);	/* REMOVEME: statistics. */
	return 1;
}
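/*
 * Illustrative example (assumed counter values, not from a real run): with
 * bandwidth = 10%, recover_set_bandwidth() below stores bandwidth_work = 10.
 * If 200 application (IO_WORK) ios were counted and the scaled recovery
 * count is 25, then 25 > 200 / 10 = 20, so this round of recovery io is
 * skipped until more application io completes or the counters get reset.
 */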
/* Try to get a region to recover. */
static int stripe_recover_get_region(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	struct recover *rec = &rs->recover;
	struct recover_addr *addr = stripe->recover;
	struct dm_dirty_log *dl = rec->dl;
	struct dm_rh_client *rh = rec->rh;

	BUG_ON(!dl);
	BUG_ON(!rh);

	/* Return, that we have region first to finish it during suspension. */
	if (addr->reg)
		return 1;

	if (dl->type->get_sync_count(dl) >= rec->nr_regions)
		return -ENOENT;

	/* If we don't have enough bandwidth, we don't proceed recovering. */
	if (!recover_bandwidth(rs))
		return -EAGAIN;

	/* Start quiescing a region. */
	dm_rh_recovery_prepare(rh);
	addr->reg = dm_rh_recovery_start(rh);
	if (!addr->reg)
		return -EAGAIN;

	addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
	addr->end = addr->pos + dm_rh_get_region_size(rh);

	/*
	 * Take one global io reference out for the
	 * whole region, which is going to be released
	 * when the region is completely done with.
	 */
	io_get(rs);
	return 0;
}
2929 enum recover_type
{ REC_FAILURE
= 0, REC_SUCCESS
= 1 };
2930 static void recover_rh_update(struct stripe
*stripe
, enum recover_type success
)
2932 struct recover_addr
*addr
= stripe
->recover
;
2933 struct raid_set
*rs
= RS(stripe
->sc
);
2934 struct recover
*rec
= &rs
->recover
;
2937 DMERR("%s- Called w/o region", __func__
);
2941 dm_rh_recovery_end(addr
->reg
, success
);
2943 rec
->nr_regions_recovered
++;
2948 * Completely done with this region ->
2949 * release the 1st io reference.
/* Set start of recovery state. */
static void set_start_recovery(struct raid_set *rs)
{
	/* Initialize recovery. */
	rs->recover.start_jiffies = jiffies;
	rs->recover.end_jiffies = 0;
}

/* Set end of recovery state. */
static void set_end_recovery(struct raid_set *rs)
{
	/*
	 * Caution: do not reset this any more -> 'i' stays in the status
	 * output and userspace could rely on it disappearing!
	 */
	rs->set.dev_to_init = -1;

	/* Check for jiffies overrun. */
	rs->recover.end_jiffies = jiffies;
	if (rs->recover.end_jiffies < rs->recover.start_jiffies)
		rs->recover.end_jiffies = ~0;
}
/* Handle recovery on one recovery stripe. */
static int _do_recovery(struct stripe *stripe)
{
	int r;
	struct raid_set *rs = RS(stripe->sc);
	struct recover_addr *addr = stripe->recover;

	/* If recovery is active -> return. */
	if (stripe_io_ref(stripe))
		return 1;

	/* IO error is fatal for recovery -> stop it. */
	if (unlikely(StripeError(stripe)))
		goto err;

	/* Recovery end required. */
	if (unlikely(RSDegraded(rs)))
		goto err;

	/* Get a region to recover. */
	r = stripe_recover_get_region(stripe);
	switch (r) {
	case 0: /* Got a new region: flag initial read before write. */
		SetStripeRBW(stripe);
	case 1: /* Have a region in the works. */
		break;
	case -EAGAIN:
		/* No bandwidth/quiesced region yet, try later. */
		wake_do_raid_delayed(rs, HZ / 4);
		return 1;
	case -ENOENT:	/* No more regions to recover. */
		schedule_work(&rs->io.ws_do_table_event);
		return 0;
	default:
		BUG();
	}

	/* Read/write a recover stripe. */
	r = stripe_recover_rw(stripe);
	if (r)
		/* IO initiated. */
		return 1;

	/* Read and write finished -> update recovery position within region. */
	addr->pos += stripe->io.size;

	/* If we're at end of region, update region hash. */
	if (addr->pos >= addr->end ||
	    addr->pos >= rs->set.sectors_per_dev)
		recover_rh_update(stripe, REC_SUCCESS);
	else
		/* Prepare to read next region segment. */
		SetStripeRBW(stripe);

	/* Schedule myself for another round... */
	wake_do_raid(rs);
	return 1;

err:
	/* FIXME: rather try recovering other regions on error? */
	rs_check_degrade(stripe);
	recover_rh_update(stripe, REC_FAILURE);

	/* Check state of partially recovered array. */
	if (RSDegraded(rs) && !RSDead(rs) &&
	    rs->set.dev_to_init != -1 &&
	    rs->set.ei != rs->set.dev_to_init) {
		/* Broken drive != drive to recover -> FATAL. */
		DMERR("FATAL: failed device != device to initialize");
	}

	if (StripeError(stripe) || RSDegraded(rs)) {
		char buf[BDEVNAME_SIZE];

		DMERR("stopping recovery due to "
		      "ERROR on /dev/%s, stripe at offset %llu",
		      bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
		      (unsigned long long) stripe->key);
	}

	/* Make sure, that all quiesced regions get released. */
	while (addr->reg) {
		dm_rh_recovery_end(addr->reg, -EIO);
		addr->reg = dm_rh_recovery_start(rs->recover.rh);
	}

	return 0;
}
/* Called by main io daemon to recover regions. */
static int do_recovery(struct raid_set *rs)
{
	if (RSRecover(rs)) {
		int r = 0;
		struct stripe *stripe;

		list_for_each_entry(stripe, &rs->recover.stripes,
				    lists[LIST_RECOVER])
			r += _do_recovery(stripe);

		if (r)
			return r;

		set_end_recovery(rs);
		stripe_recover_free(rs);
	}

	return 0;
}

/*
 * END recovery functions
 */
/* End io process all stripes handed in by endio() callback. */
static void _do_endios(struct raid_set *rs, struct stripe *stripe,
		       struct list_head *flush_list)
{
	/* First unlock all required chunks. */
	stripe_chunks_unlock(stripe);

	/*
	 * If an io error on a stripe occurred, degrade the RAID set
	 * and try to endio as many bios as possible. If any bios can't
	 * be endio processed, requeue the stripe (stripe_ref() != 0).
	 */
	if (TestClearStripeError(stripe)) {
		/*
		 * FIXME: if read, rewrite the failed chunk after reconstruction
		 *	  in order to trigger disk bad sector relocation.
		 */
		rs_check_degrade(stripe); /* Resets ChunkError(). */
		ClearStripeReconstruct(stripe);
		ClearStripeReconstructed(stripe);

		/*
		 * FIXME: if write, don't endio writes in flight and don't
		 *	  allow for new writes until userspace has updated
		 *	  the state of the set.
		 */
	}

	/* Got to reconstruct a missing chunk. */
	if (StripeReconstruct(stripe)) {
		/*
		 * (*2*) We use StripeReconstruct() to allow for
		 *	 all chunks to be xored into the reconstructed
		 *	 one (see chunk_must_xor()).
		 */
		stripe_reconstruct(stripe);

		/*
		 * (*3*) Now we reset StripeReconstruct() and flag
		 *	 StripeReconstructed() to show to stripe_rw(),
		 *	 that we have reconstructed a missing chunk.
		 */
		ClearStripeReconstruct(stripe);
		SetStripeReconstructed(stripe);

		/* FIXME: reschedule to be written in case of read. */
		/* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
			chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
			stripe_chunks_rw(stripe);
		} */

		stripe->idx.recover = -1;
	}

	/*
	 * Now that we eventually got a complete stripe, we
	 * can process the rest of the end ios on reads.
	 */
	stripe_endio(READ, stripe);

	/* End io all merged writes if not prohibited. */
	if (!RSProhibitWrites(rs) && StripeMerged(stripe)) {
		ClearStripeMerged(stripe);
		stripe_endio(WRITE_MERGED, stripe);
	}

	/* If RAID set is dead -> fail any ios to dead drives. */
	if (RSDead(rs)) {
		if (!TestSetRSDeadEndioMessage(rs))
			DMERR("RAID set dead: failing ios to dead devices");

		stripe_fail_io(stripe);
	}

	/*
	 * We have stripe references still,
	 * because of read before writes or IO errors ->
	 * got to put on flush list for processing.
	 */
	if (stripe_ref(stripe)) {
		BUG_ON(!list_empty(stripe->lists + LIST_LRU));
		list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
		atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
	} else
		stripe_lru_add(stripe);
}
/* Pop any endio stripes off of the endio list and belabour them. */
static void do_endios(struct raid_set *rs)
{
	struct stripe_cache *sc = &rs->sc;
	struct stripe *stripe;
	/* IO flush list for sorted requeued stripes. */
	struct list_head flush_list;

	INIT_LIST_HEAD(&flush_list);

	while ((stripe = stripe_endio_pop(sc))) {
		/* Avoid endio on stripes with newly io'ed chunks. */
		if (!stripe_io_ref(stripe))
			_do_endios(rs, stripe, &flush_list);
	}

	/*
	 * Insert any requeued stripes in the proper
	 * order at the beginning of the io (flush) list.
	 */
	list_splice(&flush_list, sc->lists + LIST_FLUSH);
}
/* Flush any stripes on the io list. */
static int do_flush(struct raid_set *rs)
{
	int r = 0;
	struct stripe *stripe;

	while ((stripe = stripe_io_pop(&rs->sc)))
		r += stripe_rw(stripe); /* Read/write stripe. */

	return r;
}
/* Stripe cache resizing. */
static void do_sc_resize(struct raid_set *rs)
{
	unsigned set = atomic_read(&rs->sc.stripes_to_set);

	if (set) {
		unsigned cur = atomic_read(&rs->sc.stripes);
		int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
				      sc_shrink(&rs->sc, cur - set);

		/* Flag end of resizing if ok. */
		if (!r)
			atomic_set(&rs->sc.stripes_to_set, 0);
	}
}
/*
 * We do different things with the io depending
 * on the state of the region that it is in:
 *
 * o reads: hang off stripe cache or postpone if full
 *
 * o writes:
 *   CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
 *			 In case stripe cache is full or busy, postpone the io.
 *
 *   RECOVERING:	 delay the io until recovery of the region completes.
 */
static void do_ios(struct raid_set *rs, struct bio_list *ios)
{
	int r;
	unsigned flush = 0, delay = 0;
	sector_t sector;
	struct dm_rh_client *rh = rs->recover.rh;
	struct bio *bio;
	struct bio_list reject;

	bio_list_init(&reject);

	/*
	 * Classify each io:
	 *
	 * o delay writes to recovering regions (let reads go through)
	 * o queue io to all other regions
	 */
	while ((bio = bio_list_pop(ios))) {
		/*
		 * In case we get a barrier bio, push it back onto
		 * the input queue unless all work queues are empty
		 * and the stripe cache is inactive.
		 */
		if (bio->bi_rw & REQ_FLUSH) {
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BARRIER);
			if (!list_empty(rs->sc.lists + LIST_FLUSH) ||
			    !bio_list_empty(&reject) ||
			    sc_active(&rs->sc)) {
				bio_list_push(ios, bio);
				break;
			}
		}

		/* If writes prohibited because of failures -> postpone. */
		if (RSProhibitWrites(rs) && bio_data_dir(bio) == WRITE) {
			bio_list_add(&reject, bio);
			continue;
		}

		/* Check for recovering regions. */
		sector = _sector(rs, bio);
		r = region_state(rs, sector, DM_RH_RECOVERING);
		if (unlikely(r && bio_data_dir(bio) == WRITE)) {
			delay++;
			/* Wait writing to recovering regions. */
			dm_rh_delay_by_region(rh, bio,
					      dm_rh_sector_to_region(rh,
								     sector));
			/* REMOVEME: statistics.*/
			atomic_inc(rs->stats + S_DELAYED_BIOS);
			atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);

			/* Force bandwidth tests in recovery. */
			SetRSBandwidth(rs);
		} else {
			/*
			 * Process ios to non-recovering regions by queueing
			 * them to stripes (does dm_rh_inc() for writes).
			 */
			flush += stripe_queue_bio(rs, bio, &reject);
		}
	}

	if (flush) {
		/* FIXME: better error handling. */
		r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
		if (r)
			DMERR_LIMIT("dirty log flush");
	}

	/* Merge any rejected bios back to the head of the input list. */
	bio_list_merge_head(ios, &reject);
}
/* Send an event in case we're getting too busy. */
static void do_busy_event(struct raid_set *rs)
{
	if (!TestSetRSScBusy(rs))
		schedule_work(&rs->io.ws_do_table_event);
}

/* Throw an event. */
static void do_table_event(struct work_struct *ws)
{
	struct raid_set *rs = container_of(ws, struct raid_set,
					   io.ws_do_table_event);
	dm_table_event(rs->ti->table);
}
/*-----------------------------------------------------------------
 * RAID daemon
 *---------------------------------------------------------------*/
/*
 * o belabour all end ios
 * o update the region hash states
 * o optionally shrink the stripe cache
 * o optionally do recovery
 * o unplug any component raid devices with queued bios
 * o grab the input queue
 * o work on all requeued or new ios and perform stripe cache flushs
 * o unplug any component raid devices with queued bios
 * o check, if the stripe cache gets too busy and throw an event if so
 */
static void do_raid(struct work_struct *ws)
{
	int r;
	struct raid_set *rs = container_of(ws, struct raid_set,
					   io.dws_do_raid.work);
	struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;

	/*
	 * We always need to end io, so that ios can get errored in
	 * case the set failed and the region counters get decremented
	 * before we update region hash states and go any further.
	 */
	do_endios(rs);
	dm_rh_update_states(rs->recover.rh, 1);

	/*
	 * Now that we've end io'd, which may have put stripes on the LRU list
	 * to allow for shrinking, we resize the stripe cache if requested.
	 */
	do_sc_resize(rs);

	/* Try to recover regions. */
	r = do_recovery(rs);

	/* Quickly grab all new ios queued and add them to the work list. */
	mutex_lock(&rs->io.in_lock);
	bio_list_merge(ios, ios_in);
	bio_list_init(ios_in);
	mutex_unlock(&rs->io.in_lock);

	if (!bio_list_empty(ios))
		do_ios(rs, ios); /* Got ios to work into the cache. */

	r = do_flush(rs);	/* Flush any stripes on io list. */

	do_busy_event(rs);	/* Check if we got too busy. */
}

/*
 * Callback for region hash to dispatch
 * delayed bios queued to recovered regions
 * (gets called via dm_rh_update_states()).
 */
static void dispatch_delayed_bios(void *context, struct bio_list *bl)
{
	struct raid_set *rs = context;
	struct bio *bio;

	/* REMOVEME: statistics; decrement pending delayed bios counter. */
	bio_list_for_each(bio, bl)
		atomic_dec(rs->stats + S_DELAYED_BIOS);

	/* Merge region hash private list to work list. */
	bio_list_merge_head(&rs->io.work, bl);
	bio_list_init(bl);

	ClearRSBandwidth(rs);
}
/*************************************************************
 * Constructor helpers
 *************************************************************/
/* Calculate MB/sec. */
static unsigned mbpers(struct raid_set *rs, unsigned io_size)
{
	return to_bytes((rs->xor.speed * rs->set.data_devs *
			 io_size * HZ / XOR_SPEED_TICKS) >> 10) >> 10;
}
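/*
 * Illustrative example (assumed numbers): xor.speed appears to count the
 * io_size sized xor runs completed within XOR_SPEED_TICKS ticks (see
 * xor_speed() below). With xor.speed = 800, data_devs = 3, io_size = 64
 * sectors, HZ = 250 and XOR_SPEED_TICKS = 5 this works out to
 * 800 * 3 * 64 * 250 / 5 = 7,680,000 sectors/s, i.e. roughly 3750 MB/s
 * reported by rs_log().
 */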
/*
 * Discover fastest xor algorithm and # of chunks combination.
 */
/* Calculate speed of particular algorithm and # of chunks. */
static unsigned xor_speed(struct stripe *stripe)
{
	int ticks = XOR_SPEED_TICKS;
	unsigned long j;
	unsigned p = RS(stripe->sc)->set.raid_devs, r = 0;

	/* Set uptodate so that common_xor()->xor() will belabour chunks. */
	while (p--)
		SetChunkUptodate(CHUNK(stripe, p));

	/* Wait for next tick. */
	for (j = jiffies; j == jiffies; )
		;

	/* Do xors for a few ticks. */
	while (ticks--) {
		for (j = jiffies; j == jiffies; ) {
			common_xor(stripe, stripe->io.size, 0, 0);
			r++;
		}
	}

	return r;
}
/* Define for xor multi recovery stripe optimization runs. */
#define DMRAID45_XOR_TEST

/* Optimize xor algorithm for this RAID set. */
static unsigned xor_optimize(struct raid_set *rs)
{
	unsigned chunks_max = 2, speed_max = 0;
	struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
	struct stripe *stripe;
	unsigned io_size = 0, speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;

	BUG_ON(list_empty(&rs->recover.stripes));
#ifndef DMRAID45_XOR_TEST
	stripe = list_first_entry(&rs->recover.stripes, struct stripe,
				  lists[LIST_RECOVER]);
#endif

	/* Try all xor functions. */
	while (f-- > xor_funcs) {
		unsigned speed;

#ifdef DMRAID45_XOR_TEST
		list_for_each_entry(stripe, &rs->recover.stripes,
				    lists[LIST_RECOVER]) {
			io_size = stripe->io.size;
#endif

			/* Set actual xor function for common_xor(). */
			rs->xor.f = f;
			rs->xor.chunks = (f->f == xor_blocks_wrapper ?
					  (MAX_XOR_BLOCKS + 1) :
					  XOR_CHUNKS_MAX);
			if (rs->xor.chunks > rs->set.raid_devs)
				rs->xor.chunks = rs->set.raid_devs;

			for ( ; rs->xor.chunks > 1; rs->xor.chunks--) {
				speed = xor_speed(stripe);

#ifdef DMRAID45_XOR_TEST
				if (f->f == xor_blocks_wrapper) {
					if (speed > speed_xor_blocks)
						speed_xor_blocks = speed;
				} else if (speed > speed_hm)
					speed_hm = speed;

				if (speed < speed_min)
					speed_min = speed;
#endif

				if (speed > speed_max) {
					speed_max = speed;
					chunks_max = rs->xor.chunks;
					f_max = f;
				}
			}
#ifdef DMRAID45_XOR_TEST
		}
#endif
	}

	/* Memorize optimal parameters. */
	rs->xor.f = f_max;
	rs->xor.chunks = chunks_max;
#ifdef DMRAID45_XOR_TEST
	DMINFO("%s stripes=%u/size=%u min=%u xor_blocks=%u hm=%u max=%u",
	       speed_max == speed_hm ? "HM" : "NB",
	       rs->recover.recovery_stripes, io_size, speed_min,
	       speed_xor_blocks, speed_hm, speed_max);
#endif
	return speed_max;
}
/*
 * Allocate a RAID context (a RAID set)
 */
/* Structure for variable RAID parameters. */
struct variable_parms {
	int bandwidth;
	int bandwidth_parm;
	int chunk_size;
	int chunk_size_parm;
	int io_size;
	int io_size_parm;
	int stripes;
	int stripes_parm;
	int recover_io_size;
	int recover_io_size_parm;
	int raid_parms;
	int recovery;
	int recovery_stripes;
	int recovery_stripes_parm;
};
static struct raid_set *
context_alloc(struct raid_type *raid_type, struct variable_parms *p,
	      unsigned raid_devs, sector_t sectors_per_dev,
	      struct dm_target *ti, unsigned dl_parms, char **argv)
{
	int r;
	size_t len;
	sector_t region_size, ti_len;
	struct raid_set *rs = NULL;
	struct dm_dirty_log *dl;
	struct recover *rec;

	/*
	 * Create the dirty log
	 *
	 * We need to change length for the dirty log constructor,
	 * because we want an amount of regions for all stripes derived
	 * from the single device size, so that we can keep region
	 * size = 2^^n independent of the number of devices
	 */
	ti_len = ti->len;
	ti->len = sectors_per_dev;
	dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
	ti->len = ti_len;
	if (!dl)
		goto bad_dirty_log;

	/* Chunk size *must* be smaller than region size. */
	region_size = dl->type->get_region_size(dl);
	if (p->chunk_size > region_size)
		goto bad_chunk_size;

	/* Recover io size *must* be smaller than region size as well. */
	if (p->recover_io_size > region_size)
		goto bad_recover_io_size;

	/* Size and allocate the RAID set structure. */
	len = sizeof(*rs->data) + sizeof(*rs->dev);
	if (dm_array_too_big(sizeof(*rs), len, raid_devs))
		goto bad_array;

	len = sizeof(*rs) + raid_devs * len;
	rs = kzalloc(len, GFP_KERNEL);
	if (!rs)
		goto bad_alloc;

	rec = &rs->recover;
	atomic_set(&rs->io.in_process, 0);
	atomic_set(&rs->io.in_process_max, 0);
	rec->io_size = p->recover_io_size;

	/* Pointer to data array. */
	rs->data = (unsigned long **)
		   ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
	rec->dl = dl;
	rs->set.raid_devs = raid_devs;
	rs->set.data_devs = raid_devs - raid_type->parity_devs;
	rs->set.raid_type = raid_type;

	rs->set.raid_parms = p->raid_parms;
	rs->set.chunk_size_parm = p->chunk_size_parm;
	rs->set.io_size_parm = p->io_size_parm;
	rs->sc.stripes_parm = p->stripes_parm;
	rec->io_size_parm = p->recover_io_size_parm;
	rec->bandwidth_parm = p->bandwidth_parm;
	rec->recovery = p->recovery;
	rec->recovery_stripes = p->recovery_stripes;

	/*
	 * Set chunk and io size and respective shifts
	 * (used to avoid divisions)
	 */
	rs->set.chunk_size = p->chunk_size;
	rs->set.chunk_shift = ffs(p->chunk_size) - 1;

	rs->set.io_size = p->io_size;
	rs->set.io_mask = p->io_size - 1;
	/* Mask to adjust address key in case io_size != chunk_size. */
	rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;

	rs->set.sectors_per_dev = sectors_per_dev;

	rs->set.ei = -1;	/* Indicate no failed device. */
	atomic_set(&rs->set.failed_devs, 0);

	rs->ti = ti;

	atomic_set(rec->io_count + IO_WORK, 0);
	atomic_set(rec->io_count + IO_RECOVER, 0);

	/* Initialize io lock and queues. */
	mutex_init(&rs->io.in_lock);
	mutex_init(&rs->io.xor_lock);
	bio_list_init(&rs->io.in);
	bio_list_init(&rs->io.work);

	init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */

	rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
	rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
			wake_dummy, wake_do_raid, 0, p->recovery_stripes,
			dl, region_size, rec->nr_regions);
	if (IS_ERR(rec->rh))
		goto bad_rh;

	/* Initialize stripe cache. */
	r = sc_init(rs, p->stripes);
	if (r)
		goto bad_sc;

	/* REMOVEME: statistics. */
	ClearRSDevelStats(rs);	/* Disable development status. */
	return rs;

bad_dirty_log:
	TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));

bad_chunk_size:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));

bad_recover_io_size:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Recover stripe io size larger than region size",
		   ERR_PTR(-EINVAL));

bad_array:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));

bad_alloc:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));

bad_rh:
	dm_dirty_log_destroy(dl);
	ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
	return ERR_PTR(-ENOMEM);

bad_sc:
	dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
	ti->error = DM_MSG_PREFIX "Error creating stripe cache";
	return ERR_PTR(-ENOMEM);
}
/* Free a RAID context (a RAID set). */
static void context_free(struct raid_set *rs, unsigned p)
{
	while (p--)
		dm_put_device(rs->ti, rs->dev[p].dev);

	dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
	kfree(rs);
}
/* Create work queue and initialize delayed work. */
static int rs_workqueue_init(struct raid_set *rs)
{
	struct dm_target *ti = rs->ti;

	rs->io.wq = create_singlethread_workqueue(DAEMON);
	if (!rs->io.wq)
		TI_ERR_RET("failed to create " DAEMON, -ENOMEM);

	INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
	INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
	return 0;
}

/* Return pointer to raid_type structure for raid name. */
static struct raid_type *get_raid_type(char *name)
{
	struct raid_type *r = ARRAY_END(raid_types);

	while (r-- > raid_types) {
		if (!strcmp(r->name, name))
			return r;
	}

	return NULL;
}

/* FIXME: factor out to dm core. */
static int multiple(sector_t a, sector_t b, sector_t *n)
{
	sector_t q = a;
	int rem = sector_div(q, b);

	*n = q;
	return !rem;
}
/* Log RAID set information to kernel log. */
static void rs_log(struct raid_set *rs, unsigned io_size)
{
	unsigned p;
	char buf[BDEVNAME_SIZE];

	for (p = 0; p < rs->set.raid_devs; p++)
		DMINFO("/dev/%s is raid disk %u%s",
		       bdevname(rs->dev[p].dev->bdev, buf), p,
		       (p == rs->set.pi) ? " (parity)" : "");

	DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
	       "algorithm \"%s\", %u chunks with %uMB/s\n"
	       "%s set with net %u/%u devices",
	       rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
	       atomic_read(&rs->sc.stripes),
	       rs->xor.f->name, rs->xor.chunks, mbpers(rs, io_size),
	       rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
}
/* Get all devices and offsets. */
static int dev_parms(struct raid_set *rs, char **argv, int *p)
{
	int r;
	struct dm_target *ti = rs->ti;

	DMINFO("rs->set.sectors_per_dev=%llu",
	       (unsigned long long) rs->set.sectors_per_dev);

	for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
		unsigned long long tmp;
		struct raid_dev *dev = rs->dev + *p;

		/* Get offset and device. */
		if (sscanf(argv[1], "%llu", &tmp) != 1 ||
		    tmp > rs->set.sectors_per_dev)
			TI_ERR("Invalid RAID device offset parameter");

		dev->start = tmp;
		r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
				  &dev->dev);
		if (r)
			TI_ERR_RET("RAID device lookup failure", r);

		r = raid_dev_lookup(rs, dev);
		if (r != -ENODEV && r < *p) {
			(*p)++; /* Ensure dm_put_device() on actual device. */
			TI_ERR_RET("Duplicate RAID device", -ENXIO);
		}
	}

	return 0;
}
/* Set recovery bandwidth. */
static void
recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
{
	rs->recover.bandwidth = bandwidth;
	rs->recover.bandwidth_work = 100 / bandwidth;
}
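/*
 * Example (illustrative): bandwidth = 25 gives bandwidth_work = 4, so
 * recover_bandwidth() lets recovery io proceed as long as the scaled
 * recovery io count stays at or below one quarter of the application
 * io count.
 */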
/* Handle variable number of RAID parameters. */
static int get_raid_variable_parms(struct dm_target *ti, char **argv,
				   struct variable_parms *vp)
{
	int p, value;
	struct {
		int action; /* -1: skip, 0: no power2 check, 1: power2 check */
		const char *errmsg;
		int min, max;
		int *var, *var2, *var3;
	} argctr[] = {
		{ 1,
		  "Invalid chunk size; must be -1 or 2^^n and <= 16384",
		  IO_SIZE_MIN, CHUNK_SIZE_MAX,
		  &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
		{ 0,
		  "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
		  STRIPES_MIN, STRIPES_MAX,
		  &vp->stripes_parm, &vp->stripes, NULL },
		{ 1,
		  "Invalid io size; must be -1 or >= 8, 2^^n and less equal "
		  "min(BIO_MAX_SECTORS/2, chunk size)",
		  IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
		  &vp->io_size_parm, &vp->io_size, NULL },
		{ 1,
		  "Invalid recovery io size; must be -1 or "
		  "2^^n and less equal BIO_MAX_SECTORS/2",
		  RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
		  &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
		{ 0,
		  "Invalid recovery bandwidth percentage; "
		  "must be -1 or > 0 and <= 100",
		  BANDWIDTH_MIN, BANDWIDTH_MAX,
		  &vp->bandwidth_parm, &vp->bandwidth, NULL },
		/* Handle sync argument separately in loop. */
		{ -1,
		  "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
		{ 0,
		  "Invalid number of recovery stripes;"
		  "must be -1, > 0 and <= 64",
		  RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
		  &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
	}, *varp;

	/* Fetch # of variable raid parameters. */
	if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
	    !range_ok(vp->raid_parms, 0, 7))
		TI_ERR("Bad variable raid parameters number");

	/* Preset variable RAID parameters. */
	vp->chunk_size = CHUNK_SIZE_DEFAULT;
	vp->io_size = IO_SIZE_DEFAULT;
	vp->stripes = STRIPES_DEFAULT;
	vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
	vp->bandwidth = BANDWIDTH_DEFAULT;
	vp->recovery = 1;
	vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;

	/* Walk the array of argument constraints for all given ones. */
	for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
		BUG_ON(varp >= ARRAY_END(argctr));

		/* Special case for "[no]sync" string argument. */
		if (varp->action < 0) {
			if (!strcmp(*argv, "sync"))
				vp->recovery = 1;
			else if (!strcmp(*argv, "nosync"))
				vp->recovery = 0;
			else
				TI_ERR(varp->errmsg);

			argv++;
			continue;
		}

		/*
		 * Special case for io_size depending
		 * on previously set chunk size.
		 */
		if (!varp->max)
			varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);

		if (sscanf(*(argv++), "%d", &value) != 1 ||
		    (value != -1 &&
		     ((varp->action && !is_power_of_2(value)) ||
		      !range_ok(value, varp->min, varp->max))))
			TI_ERR(varp->errmsg);

		*varp->var = value;
		if (value != -1) {
			*varp->var2 = value;
			if (varp->var3)
				*varp->var3 = value;
		}
	}

	return 0;
}
/* Parse optional locking parameters. */
static int get_raid_locking_parms(struct dm_target *ti, char **argv,
				  int *locking_parms,
				  struct dm_raid45_locking_type **locking_type)
{
	if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
		char *lckstr = argv[1];
		size_t lcksz = strlen(lckstr);

		if (!strnicmp(lckstr, "none", lcksz)) {
			*locking_type = &locking_none;
			*locking_parms = 2;
		} else if (!strnicmp(lckstr, "cluster", lcksz)) {
			DMERR("locking type \"%s\" not yet implemented",
			      lckstr);
			return -EINVAL;
		} else {
			DMERR("unknown locking type \"%s\"", lckstr);
			return -EINVAL;
		}
	} else {
		*locking_parms = 0;
		*locking_type = &locking_none;
	}

	return 0;
}
/* Set backing device read ahead properties of RAID set. */
static void rs_set_read_ahead(struct raid_set *rs,
			      unsigned sectors, unsigned stripes)
{
	unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
	struct mapped_device *md = dm_table_get_md(rs->ti->table);
	struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;

	/* Set read-ahead for the RAID set and the component devices. */
	if (ra_pages) {
		unsigned p = rs->set.raid_devs;

		bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;

		while (p--) {
			struct request_queue *q =
				bdev_get_queue(rs->dev[p].dev->bdev);

			q->backing_dev_info.ra_pages = ra_pages;
		}
	}
}
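/*
 * Example (illustrative): raid_ctr() below calls this with
 * sectors = 2 * chunk_size and stripes = 2. For a 64 sector chunk and
 * 4 KiB pages that is ra_pages = dm_div_up(128, 8) = 16 pages per
 * component device, and 2 * 16 * data_devs pages for the set itself.
 */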
/* Set congested function. */
static void rs_set_congested_fn(struct raid_set *rs)
{
	struct mapped_device *md = dm_table_get_md(rs->ti->table);
	struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;

	/* Set congested function and data. */
	bdi->congested_fn = rs_congested;
	bdi->congested_data = rs;
}
/*
 * Construct a RAID4/5 mapping:
 *
 * log_type #log_params <log_params> \
 * raid_type [#parity_dev] #raid_variable_params <raid_params> \
 * [locking "none"/"cluster"]
 * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
 *
 * log_type = "core"/"disk",
 * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
 * log_params = [dirty_log_path] region_size [[no]sync])
 *
 * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
 *
 * #parity_dev = N if raid_type = "raid4"
 * o N = -1: pick default = last device
 * o N >= 0 and < #raid_devs: parity device index
 *
 * #raid_variable_params = 0-7; raid_params (-1 = default):
 *   [chunk_size [#stripes [io_size [recover_io_size \
 *    [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
 *
 * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
 *   and <= CHUNK_SIZE_MAX)
 * o #stripes is number of stripes allocated to stripe cache
 *   (must be > 1 and < STRIPES_MAX)
 * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
 * o recover_io_size (io unit size per device for recovery in sectors;
 *   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
 * o %recovery_bandwidth is the maximum amount spent for recovery during
 *   application io (1-100%)
 * o recovery switch = [sync|nosync]
 * o #recovery_stripes is the number of recovery stripes used for
 *   parallel recovery of the RAID set
 *
 * If raid_variable_params = 0, defaults will be used.
 * Any raid_variable_param can be set to -1 to apply a default
 *
 * #raid_devs = N (N >= 3)
 *
 * #dev_to_initialize = N
 * -1: initialize parity on all devices
 * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
 * of a failed devices content after replacement
 *
 * <dev_path> = device_path (eg, /dev/sdd1)
 * <offset> = begin at offset on <dev_path>
 */
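/*
 * Example table line following the layout above (illustrative values only;
 * the device names and sizes are assumptions, not from a real setup):
 *
 *   0 2097152 raid45 core 2 8192 nosync raid5_la 7 64 128 8 256 10 nosync 1 \
 *     3 -1 /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0
 *
 * i.e. a 1 GiB raid5_la mapping over three devices with a core dirty log
 * (8192 sector regions), all seven variable parameters given explicitly and
 * no device selected for forced initialization (-1).
 */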
#define MIN_PARMS	13
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int dev_to_init, dl_parms, i, locking_parms,
	    parity_parm, pi = -1, r, raid_devs;
	sector_t tmp, sectors_per_dev;
	struct dm_raid45_locking_type *locking;
	struct raid_set *rs;
	struct raid_type *raid_type;
	struct variable_parms parms;

	/* Ensure minimum number of parameters. */
	if (argc < MIN_PARMS)
		TI_ERR("Not enough parameters");

	/* Fetch # of dirty log parameters. */
	if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
	    !range_ok(dl_parms, 1, 4711)) /* ;-) */
		TI_ERR("Bad dirty log parameters number");

	/* Check raid_type. */
	raid_type = get_raid_type(argv[dl_parms + 2]);
	if (!raid_type)
		TI_ERR("Bad raid type");

	/* In case of RAID4, parity drive is selectable. */
	parity_parm = !!(raid_type->level == raid4);

	/* Handle variable number of RAID parameters. */
	r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
				    &parms);
	if (r)
		return r;

	/* Handle any locking parameters. */
	r = get_raid_locking_parms(ti,
				   argv + dl_parms + parity_parm +
				   parms.raid_parms + 4,
				   &locking_parms, &locking);
	if (r)
		return r;

	/* # of raid devices. */
	i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
	if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
	    raid_devs < raid_type->minimal_devs)
		TI_ERR("Invalid number of raid devices");

	/* In case of RAID4, check parity drive index is in limits. */
	if (raid_type->level == raid4) {
		/* Fetch index of parity device. */
		if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
		    (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
			TI_ERR("Invalid RAID4 parity device index");
	}

	/*
	 * Index of device to initialize starts at 0
	 *
	 * o -1 -> don't initialize a selected device;
	 *	   initialize parity conforming to algorithm
	 * o 0..raid_devs-1 -> initialize respective device
	 *   (used for reconstruction of a replaced device)
	 */
	if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
	    locking_parms + 5], "%d", &dev_to_init) != 1 ||
	    !range_ok(dev_to_init, -1, raid_devs - 1))
		TI_ERR("Invalid number for raid device to initialize");

	/* Check # of raid device arguments. */
	if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
	    raid_devs * 2 + locking_parms)
		TI_ERR("Wrong number of raid device/offset arguments");

	/*
	 * Check that the table length is divisible
	 * w/o rest by (raid_devs - parity_devs)
	 */
	if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
		      &sectors_per_dev))
		TI_ERR("Target length not divisible by number of data devices");

	/*
	 * Check that the device size is
	 * divisible w/o rest by chunk size
	 */
	if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
		TI_ERR("Device length not divisible by chunk_size");

	/****************************************************************
	 * Now that we checked the constructor arguments ->
	 * let's allocate the RAID set
	 ****************************************************************/
	rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
			   ti, dl_parms, argv);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
	rs->set.pi = rs->set.pi_parm = pi;

	/* Set RAID4 parity drive index. */
	if (raid_type->level == raid4)
		rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;

	recover_set_bandwidth(rs, parms.bandwidth);

	/* Use locking type to lock stripe access. */
	rs->locking = locking;

	/* Get the device/offset tupels. */
	argv += dl_parms + 6 + parity_parm + parms.raid_parms;
	r = dev_parms(rs, argv, &i);
	if (r)
		goto err;

	/* Set backing device information (eg. read ahead). */
	rs_set_read_ahead(rs, 2 * rs->set.chunk_size /* sectors per device */,
			  2 /* # of stripes */);
	rs_set_congested_fn(rs);	/* Set congested function. */
	SetRSCheckOverwrite(rs);	/* Allow chunk overwrite checks. */
	rs->xor.speed = xor_optimize(rs); /* Select best xor algorithm. */

	/* Set for recovery of any nosync regions. */
	if (parms.recovery)
		SetRSRecover(rs);
	else {
		/*
		 * Need to free recovery stripe(s) here in case
		 * of nosync, because xor_optimize uses one.
		 */
		set_start_recovery(rs);
		set_end_recovery(rs);
		stripe_recover_free(rs);
	}

	/*
	 * Enable parity chunk creation enforcement for
	 * small numbers of array members where it doesn't
	 * gain us performance to xor parity out and back in as
	 * with larger array member numbers.
	 */
	if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
		SetRSEnforceParityCreation(rs);

	/*
	 * Make sure that dm core only hands maximum io size
	 * length down and pays attention to io boundaries.
	 */
	ti->split_io = rs->set.io_size;
	ti->private = rs;

	/* Initialize work queue to handle this RAID set's io. */
	r = rs_workqueue_init(rs);
	if (r)
		goto err;

	rs_log(rs, rs->recover.io_size); /* Log information about RAID set. */
	return 0;

err:
	context_free(rs, i);
	return r;
}
/*
 * Destruct a raid mapping
 */
static void raid_dtr(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	destroy_workqueue(rs->io.wq);
	context_free(rs, rs->set.raid_devs);
}
/* Raid mapping function. */
static int raid_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	/* I don't want to waste stripe cache capacity. */
	if (bio_rw(bio) == READA)
		return -EIO;
	else {
		struct raid_set *rs = ti->private;

		/*
		 * Get io reference to be waiting for to drop
		 * to zero on device suspension/destruction.
		 */
		io_get(rs);
		bio->bi_sector -= ti->begin;	/* Remap sector. */

		/* Queue io to RAID set. */
		mutex_lock(&rs->io.in_lock);
		bio_list_add(&rs->io.in, bio);
		mutex_unlock(&rs->io.in_lock);

		/* Wake daemon to process input list. */
		wake_do_raid(rs);

		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
					S_BIOS_READ : S_BIOS_WRITE));
		return DM_MAPIO_SUBMITTED;	/* Handle later. */
	}
}
/* Device suspend. */
static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct dm_dirty_log *dl = rs->recover.dl;

	dm_rh_stop_recovery(rs->recover.rh);

	cancel_delayed_work(&rs->io.dws_do_raid);
	flush_workqueue(rs->io.wq);
	wait_ios(rs);	/* Wait for completion of all ios being processed. */

	if (dl->type->presuspend && dl->type->presuspend(dl))
		/* FIXME: need better error handling. */
		DMWARN("log presuspend failed");
}

static void raid_postsuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct dm_dirty_log *dl = rs->recover.dl;

	if (dl->type->postsuspend && dl->type->postsuspend(dl))
		/* FIXME: need better error handling. */
		DMWARN("log postsuspend failed");
}
/* Device resume. */
static void raid_resume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct recover *rec = &rs->recover;
	struct dm_dirty_log *dl = rec->dl;

	DMINFO("%s...", __func__);
	if (dl->type->resume && dl->type->resume(dl))
		/* Resume dirty log. */
		/* FIXME: need better error handling. */
		DMWARN("log resume failed");

	rec->nr_regions_to_recover =
		rec->nr_regions - dl->type->get_sync_count(dl);

	/* Restart any unfinished recovery. */
	if (RSRecover(rs)) {
		set_start_recovery(rs);
		dm_rh_start_recovery(rec->rh);
	}
}
/* Return stripe cache size. */
static unsigned sc_size(struct raid_set *rs)
{
	return to_sector(atomic_read(&rs->sc.stripes) *
			 (sizeof(struct stripe) +
			  (sizeof(struct stripe_chunk) +
			   (sizeof(struct page_list) +
			    to_bytes(rs->set.io_size) *
			    rs->set.raid_devs)) +
			  (rs->recover.end_jiffies ?
			   0 : rs->recover.recovery_stripes *
			   to_bytes(rs->set.raid_devs * rs->recover.io_size))));
}
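/*
 * Reading the expression inside out (illustrative, not a byte-exact
 * accounting): per cached stripe it sums the stripe and chunk structures
 * plus one io_size sized page list per member device, and, while recovery
 * has not finished (end_jiffies == 0), it adds the extra recovery stripes
 * of raid_devs * recover.io_size sectors each; the total is converted from
 * bytes back to sectors for the status output.
 */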
/* REMOVEME: status output for development. */
static void raid_devel_stats(struct dm_target *ti, char *result,
			     unsigned *size, unsigned maxlen)
{
	unsigned sz = *size;
	unsigned long j;
	char buf[BDEVNAME_SIZE], *p;
	struct stats_map *sm;
	struct raid_set *rs = ti->private;
	struct recover *rec = &rs->recover;
	struct timespec ts;

	DMEMIT("%s %s=%u bw=%u\n",
	       version, rs->xor.f->name, rs->xor.chunks, rs->recover.bandwidth);
	DMEMIT("act_ios=%d ", io_ref(rs));
	DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
	DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
	DMEMIT("act_stripes_max=%d\n",
	       atomic_read(&rs->sc.active_stripes_max));

	for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
		DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));

	DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
	DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
	       atomic_read(&rs->sc.stripes), rs->set.io_size,
	       rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
	       sc_size(rs));

	j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
	    rec->start_jiffies;
	jiffies_to_timespec(j, &ts);
	sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
	p = strchr(buf, '.');
	p[3] = '\0';	/* Truncate sub-millisecond digits. */

	DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
	       (unsigned long long) rec->nr_regions_recovered,
	       (unsigned long long) rec->nr_regions_to_recover,
	       (unsigned long long) rec->nr_regions, rec->bandwidth, buf);

	*size = sz;
}
static int raid_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	unsigned p, sz = 0;
	char buf[BDEVNAME_SIZE];
	struct raid_set *rs = ti->private;
	struct dm_dirty_log *dl = rs->recover.dl;
	int raid_parms[] = {
		rs->set.chunk_size_parm,
		rs->sc.stripes_parm,
		rs->set.io_size_parm,
		rs->recover.io_size_parm,
		rs->recover.bandwidth_parm,
		-2,
		rs->recover.recovery_stripes,
	};

	switch (type) {
	case STATUSTYPE_INFO:
		/* REMOVEME: statistics. */
		if (RSDevelStats(rs))
			raid_devel_stats(ti, result, &sz, maxlen);

		DMEMIT("%u ", rs->set.raid_devs);

		for (p = 0; p < rs->set.raid_devs; p++)
			DMEMIT("%s ",
			       format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));

		for (p = 0; p < rs->set.raid_devs; p++) {
			DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');

			if (p == rs->set.pi)
				DMEMIT("p");

			if (p == rs->set.dev_to_init)
				DMEMIT("i");
		}

		DMEMIT(" %llu/%llu ",
		       (unsigned long long) dl->type->get_sync_count(dl),
		       (unsigned long long) rs->recover.nr_regions);

		sz += dl->type->status(dl, type, result+sz, maxlen-sz);
		break;
	case STATUSTYPE_TABLE:
		sz = rs->recover.dl->type->status(rs->recover.dl, type,
						  result, maxlen);
		DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);

		for (p = 0; p < rs->set.raid_parms; p++) {
			if (raid_parms[p] > -2)
				DMEMIT("%d ", raid_parms[p]);
			else
				DMEMIT("%s ", rs->recover.recovery ?
					      "sync" : "nosync");
		}

		DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);

		for (p = 0; p < rs->set.raid_devs; p++)
			DMEMIT("%s %llu ",
			       format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
			       (unsigned long long) rs->dev[p].start);
	}

	return 0;
}

/* Turn a delta into an absolute value. */
static int _absolute(char *action, int act, int r)
{
	size_t len = strlen(action);

	/* Make delta absolute. */
	if (!strncmp("set", action, len))
		;
	else if (!strncmp("grow", action, len))
		r += act;
	else if (!strncmp("shrink", action, len))
		r = act - r;
	else
		r = -EINVAL;

	return r;
}

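/*
 * Worked examples (illustrative): with a current value of act = 10,
 * "set 25" yields 25, "grow 5" yields 10 + 5 = 15 and "shrink 3" yields
 * 10 - 3 = 7; any other action string maps to -EINVAL.
 */
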
/* Change recovery io bandwidth. */
static int bandwidth_change(struct raid_set *rs, int argc, char **argv,
			    enum raid_set_flags flag)
{
	int act = rs->recover.bandwidth, bandwidth;

	if (argc != 2)
		return -EINVAL;

	if (sscanf(argv[1], "%d", &bandwidth) == 1 &&
	    range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
		/* Make delta bandwidth absolute. */
		bandwidth = _absolute(argv[0], act, bandwidth);

		if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
			recover_set_bandwidth(rs, bandwidth);
			return 0;
		}
	}

	return -EINVAL;
}

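/*
 * Example messages (illustrative): "bandwidth set 20" (or the abbreviated
 * "ba se 20") sets the recovery bandwidth to 20 percent, while
 * "bandwidth grow 5" raises the current percentage by 5; both are subject
 * to the BANDWIDTH_MIN..BANDWIDTH_MAX range checks above.
 */
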
/* Set/reset development feature flags. */
static int devel_flags(struct raid_set *rs, int argc, char **argv,
		       enum raid_set_flags flag)
{
	size_t len;

	if (argc != 1)
		return -EINVAL;

	len = strlen(argv[0]);

	if (!strncmp(argv[0], "on", len))
		return test_and_set_bit(flag, &rs->io.flags) ? -EPERM : 0;
	else if (!strncmp(argv[0], "off", len))
		return test_and_clear_bit(flag, &rs->io.flags) ? 0 : -EPERM;
	else if (!strncmp(argv[0], "reset", len)) {
		if (flag == RS_DEVEL_STATS) {
			if (test_bit(flag, &rs->io.flags)) {
				stats_reset(rs);
				return 0;
			} else
				return -EPERM;
		} else {
			set_bit(flag, &rs->io.flags);
			return 0;
		}
	}

	return -EINVAL;
}

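/*
 * Example messages (illustrative): "statistics on" enables the development
 * statistics emitted with STATUSTYPE_INFO, "statistics off" disables them
 * and "statistics reset" clears the counters; "overwrite" drives the
 * RS_CHECK_OVERWRITE flag through the same handler.
 */
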
/* Resize the stripe cache. */
static int sc_resize(struct raid_set *rs, int argc, char **argv,
		     enum raid_set_flags flag)
{
	int act, stripes;

	if (argc != 2)
		return -EINVAL;

	/* Deny permission while the daemon is still resizing. */
	if (atomic_read(&rs->sc.stripes_to_set))
		return -EPERM;

	if (sscanf(argv[1], "%d", &stripes) == 1 &&
	    stripes > 0) {
		act = atomic_read(&rs->sc.stripes);

		/* Make delta stripes absolute. */
		stripes = _absolute(argv[0], act, stripes);

		/*
		 * Check range and that the # of stripes changes.
		 * We leave the resizing to the worker.
		 */
		if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
		    stripes != atomic_read(&rs->sc.stripes)) {
			atomic_set(&rs->sc.stripes_to_set, stripes);
			wake_do_raid(rs);
			return 0;
		}
	}

	return -EINVAL;
}

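/*
 * Example messages (illustrative): "stripe_cache set 1024" requests an
 * absolute cache size of 1024 stripes and "stripe_cache grow 256" adds 256
 * to the current count; only stripes_to_set is updated here, the daemon
 * carries out the actual resize asynchronously.
 */
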
/* Change xor algorithm and number of chunks. */
static int xor_set(struct raid_set *rs, int argc, char **argv,
		   enum raid_set_flags flag)
{
	int chunks;
	char *algorithm = argv[0];
	struct xor_func *f = ARRAY_END(xor_funcs);

	if (sscanf(argv[1], "%d", &chunks) == 1 &&
	    range_ok(chunks, 2, XOR_CHUNKS_MAX) &&
	    chunks <= rs->set.raid_devs) {
		while (f-- > xor_funcs) {
			if (!strcmp(algorithm, f->name)) {
				unsigned io_size = 0;
				struct stripe *stripe =
					stripe_alloc(&rs->sc,
						     rs->sc.mem_cache_client,
						     SC_GROW);

				DMINFO("xor: %s", f->name);
				if (f->f == xor_blocks_wrapper &&
				    chunks > MAX_XOR_BLOCKS + 1) {
					DMERR("chunks > MAX_XOR_BLOCKS"
					      " + 1");
					return -EINVAL;
				}

				mutex_lock(&rs->io.xor_lock);
				rs->xor.f = f;
				rs->xor.chunks = chunks;
				rs->xor.speed = 0;
				mutex_unlock(&rs->io.xor_lock);

				if (stripe) {
					rs->xor.speed = xor_speed(stripe);
					io_size = stripe->io.size;
					stripe_free(stripe,
						    rs->sc.mem_cache_client);
				}

				rs_log(rs, io_size);
				return 0;
			}
		}
	}

	return -EINVAL;
}

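/*
 * Example message (illustrative): "xor xor_8 5" selects the xor_8 function
 * with 5 chunks per operation, provided the set has at least 5 devices;
 * the speed of the new setting is re-measured on a scratch stripe before
 * being logged via rs_log().
 */
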
/*
 * Allow writes after they have been prohibited because of a device failure.
 *
 * This needs to be called after userspace has updated the metadata state
 * based on an event thrown during device failure processing.
 */
static int allow_writes(struct raid_set *rs, int argc, char **argv,
			enum raid_set_flags flag)
{
	if (TestClearRSProhibitWrites(rs)) {
		DMINFO("%s waking", __func__);
		wake_do_raid(rs);
		return 0;
	}

	return -EPERM;
}

/*
 * Parse the RAID message.
 *
 * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'	# e.g. 'ba se 50'
 * 'o[verwrite] {on,of[f],r[eset]}'		# e.g. 'o of'
 * 'sta[tistics] {on,of[f],r[eset]}'		# e.g. 'stat of'
 * 'str[ipecache] {se[t],g[row],sh[rink]} #'	# e.g. 'stripe set 1024'
 * 'xor algorithm #chunks'			# e.g. 'xor xor_8 5'
 */
static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
{
	size_t len = strlen(argv[0]);
	struct raid_set *rs = ti->private;
	struct {
		const char *name;
		int (*f) (struct raid_set *rs, int argc, char **argv,
			  enum raid_set_flags flag);
		enum raid_set_flags flag;
	} msg_descr[] = {
		{ "allow_writes", allow_writes, 0 },
		{ "bandwidth", bandwidth_change, 0 },
		{ "overwrite", devel_flags, RS_CHECK_OVERWRITE },
		{ "statistics", devel_flags, RS_DEVEL_STATS },
		{ "stripe_cache", sc_resize, 0 },
		{ "xor", xor_set, 0 },
	}, *m = ARRAY_END(msg_descr);

	while (m-- > msg_descr) {
		if (!strncmp(argv[0], m->name, len))
			return m->f(rs, argc - 1, argv + 1, m->flag);
	}

	return -EINVAL;
}

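/*
 * Dispatch note: message names are matched by prefix against msg_descr[],
 * so e.g. "stat of" reaches devel_flags() with RS_DEVEL_STATS and the
 * remaining arguments ("of") are handed on via argc - 1 / argv + 1.
 */
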
/*
 * END message interface
 */

/* Provide io hints. */
static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct raid_set *rs = ti->private;

	blk_limits_io_min(limits, rs->set.chunk_size);
	blk_limits_io_opt(limits, rs->set.chunk_size * rs->set.data_devs);
}

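/*
 * Illustrative numbers: with the default chunk size of 64 sectors and a
 * 4 device RAID5 set (3 data devices), the minimum io hint corresponds to
 * one chunk and the optimal io size to a full stripe of 3 chunks, so that
 * stripe-aligned writes can avoid read-modify-write cycles.
 */
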
static struct target_type raid_target = {
	.name = "raid45",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
	.map = raid_map,
	.presuspend = raid_presuspend,
	.postsuspend = raid_postsuspend,
	.resume = raid_resume,
	.status = raid_status,
	.message = raid_message,
	.io_hints = raid_io_hints,
};

static void init_exit(const char *bad_msg, const char *good_msg, int r)
{
	if (r)
		DMERR("Failed to %sregister target [%d]", bad_msg, r);
	else
		DMINFO("%s %s", good_msg, version);
}

static int __init dm_raid_init(void)
{
	int r = dm_register_target(&raid_target);

	init_exit("", "initialized", r);
	return r;
}

static void __exit dm_raid_exit(void)
{
	dm_unregister_target(&raid_target);
	init_exit("un", "exit", 0);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);

MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");