2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
6 * RAID-5 management functions.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
13 * You should have received a copy of the GNU General Public License
14 * (for example /usr/src/linux/COPYING); if not, write to the Free
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 #include <linux/config.h>
20 #include <linux/module.h>
21 #include <linux/slab.h>
22 #include <linux/raid/raid5.h>
23 #include <linux/highmem.h>
24 #include <linux/bitops.h>
25 #include <asm/atomic.h>
/* Stripe cache sizing constant; the active-stripe count is compared against
 * 3/4 of this value when deciding whether to wake stripe waiters. */
31 #define NR_STRIPES 256
/* A stripe unit covers exactly one page per device. */
32 #define STRIPE_SIZE PAGE_SIZE
/* log2 of the number of 512-byte (1<<9) sectors in one stripe unit. */
33 #define STRIPE_SHIFT (PAGE_SHIFT - 9)
/* Number of 512-byte sectors in one stripe unit. */
34 #define STRIPE_SECTORS (STRIPE_SIZE>>9)
/* preread_active_stripes is compared against this before waking the md thread
 * or activating delayed stripes. */
35 #define IO_THRESHOLD 1
37 #define HASH_PAGES_ORDER 0
/* Number of stripe_head pointers that fit in the hash table pages.
 * NOTE(review): HASH_PAGES is not defined in the visible part of the file. */
38 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
/* Used as a bit mask below, so NR_HASH is presumably a power of two - confirm. */
39 #define HASH_MASK (NR_HASH - 1)
/* Hash bucket (an lvalue) for the stripe containing sector 'sect'. */
41 #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
/*
 * r5_next_bio - successor of 'bio' on a stripe+device list.
 * @bio:  current bio (evaluated more than once; pass a side-effect-free
 *        expression)
 * @sect: first sector of the current stripe+device
 *
 * Yields (bio)->bi_next while 'bio' ends before the end of this stripe unit
 * (sect + STRIPE_SECTORS), and NULL once 'bio' reaches or crosses that
 * boundary.  'sect' is fully parenthesized so that an expression argument
 * (e.g. a conditional) is not re-associated with the surrounding '<' and '+'.
 */
#define r5_next_bio(bio, sect) \
	(((bio)->bi_sector + ((bio)->bi_size >> 9) < (sect) + STRIPE_SECTORS) ? \
	 (bio)->bi_next : NULL)
54 * The following can be used to debug the driver
57 #define RAID5_PARANOIA 1
58 #if RAID5_PARANOIA && defined(CONFIG_SMP)
59 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
61 # define CHECK_DEVLOCK()
64 #define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x)))
70 static void print_raid5_conf (raid5_conf_t
*conf
);
72 static inline void __release_stripe(raid5_conf_t
*conf
, struct stripe_head
*sh
)
74 if (atomic_dec_and_test(&sh
->count
)) {
75 if (!list_empty(&sh
->lru
))
77 if (atomic_read(&conf
->active_stripes
)==0)
79 if (test_bit(STRIPE_HANDLE
, &sh
->state
)) {
80 if (test_bit(STRIPE_DELAYED
, &sh
->state
))
81 list_add_tail(&sh
->lru
, &conf
->delayed_list
);
83 list_add_tail(&sh
->lru
, &conf
->handle_list
);
84 md_wakeup_thread(conf
->mddev
->thread
);
86 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
)) {
87 atomic_dec(&conf
->preread_active_stripes
);
88 if (atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
)
89 md_wakeup_thread(conf
->mddev
->thread
);
91 list_add_tail(&sh
->lru
, &conf
->inactive_list
);
92 atomic_dec(&conf
->active_stripes
);
93 if (!conf
->inactive_blocked
||
94 atomic_read(&conf
->active_stripes
) < (NR_STRIPES
*3/4))
95 wake_up(&conf
->wait_for_stripe
);
99 static void release_stripe(struct stripe_head
*sh
)
101 raid5_conf_t
*conf
= sh
->raid_conf
;
104 spin_lock_irqsave(&conf
->device_lock
, flags
);
105 __release_stripe(conf
, sh
);
106 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
109 static void remove_hash(struct stripe_head
*sh
)
111 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh
->sector
);
113 if (sh
->hash_pprev
) {
115 sh
->hash_next
->hash_pprev
= sh
->hash_pprev
;
116 *sh
->hash_pprev
= sh
->hash_next
;
117 sh
->hash_pprev
= NULL
;
121 static __inline__
void insert_hash(raid5_conf_t
*conf
, struct stripe_head
*sh
)
123 struct stripe_head
**shp
= &stripe_hash(conf
, sh
->sector
);
125 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh
->sector
);
128 if ((sh
->hash_next
= *shp
) != NULL
)
129 (*shp
)->hash_pprev
= &sh
->hash_next
;
131 sh
->hash_pprev
= shp
;
135 /* find an idle stripe, make sure it is unhashed, and return it. */
136 static struct stripe_head
*get_free_stripe(raid5_conf_t
*conf
)
138 struct stripe_head
*sh
= NULL
;
139 struct list_head
*first
;
142 if (list_empty(&conf
->inactive_list
))
144 first
= conf
->inactive_list
.next
;
145 sh
= list_entry(first
, struct stripe_head
, lru
);
146 list_del_init(first
);
148 atomic_inc(&conf
->active_stripes
);
153 static void shrink_buffers(struct stripe_head
*sh
, int num
)
158 for (i
=0; i
<num
; i
++) {
162 sh
->dev
[i
].page
= NULL
;
163 page_cache_release(p
);
167 static int grow_buffers(struct stripe_head
*sh
, int num
)
171 for (i
=0; i
<num
; i
++) {
174 if (!(page
= alloc_page(GFP_KERNEL
))) {
177 sh
->dev
[i
].page
= page
;
182 static void raid5_build_block (struct stripe_head
*sh
, int i
);
184 static inline void init_stripe(struct stripe_head
*sh
, sector_t sector
, int pd_idx
)
186 raid5_conf_t
*conf
= sh
->raid_conf
;
187 int disks
= conf
->raid_disks
, i
;
189 if (atomic_read(&sh
->count
) != 0)
191 if (test_bit(STRIPE_HANDLE
, &sh
->state
))
195 PRINTK("init_stripe called, stripe %llu\n",
196 (unsigned long long)sh
->sector
);
204 for (i
=disks
; i
--; ) {
205 struct r5dev
*dev
= &sh
->dev
[i
];
207 if (dev
->toread
|| dev
->towrite
|| dev
->written
||
208 test_bit(R5_LOCKED
, &dev
->flags
)) {
209 printk("sector=%llx i=%d %p %p %p %d\n",
210 (unsigned long long)sh
->sector
, i
, dev
->toread
,
211 dev
->towrite
, dev
->written
,
212 test_bit(R5_LOCKED
, &dev
->flags
));
216 raid5_build_block(sh
, i
);
218 insert_hash(conf
, sh
);
221 static struct stripe_head
*__find_stripe(raid5_conf_t
*conf
, sector_t sector
)
223 struct stripe_head
*sh
;
226 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector
);
227 for (sh
= stripe_hash(conf
, sector
); sh
; sh
= sh
->hash_next
)
228 if (sh
->sector
== sector
)
230 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector
);
234 static void unplug_slaves(mddev_t
*mddev
);
235 static void raid5_unplug_device(request_queue_t
*q
);
237 static struct stripe_head
*get_active_stripe(raid5_conf_t
*conf
, sector_t sector
,
238 int pd_idx
, int noblock
)
240 struct stripe_head
*sh
;
242 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector
);
244 spin_lock_irq(&conf
->device_lock
);
247 sh
= __find_stripe(conf
, sector
);
249 if (!conf
->inactive_blocked
)
250 sh
= get_free_stripe(conf
);
251 if (noblock
&& sh
== NULL
)
254 conf
->inactive_blocked
= 1;
255 wait_event_lock_irq(conf
->wait_for_stripe
,
256 !list_empty(&conf
->inactive_list
) &&
257 (atomic_read(&conf
->active_stripes
) < (NR_STRIPES
*3/4)
258 || !conf
->inactive_blocked
),
260 unplug_slaves(conf
->mddev
);
262 conf
->inactive_blocked
= 0;
264 init_stripe(sh
, sector
, pd_idx
);
266 if (atomic_read(&sh
->count
)) {
267 if (!list_empty(&sh
->lru
))
270 if (!test_bit(STRIPE_HANDLE
, &sh
->state
))
271 atomic_inc(&conf
->active_stripes
);
272 if (list_empty(&sh
->lru
))
274 list_del_init(&sh
->lru
);
277 } while (sh
== NULL
);
280 atomic_inc(&sh
->count
);
282 spin_unlock_irq(&conf
->device_lock
);
286 static int grow_stripes(raid5_conf_t
*conf
, int num
)
288 struct stripe_head
*sh
;
290 int devs
= conf
->raid_disks
;
292 sprintf(conf
->cache_name
, "raid5/%s", mdname(conf
->mddev
));
294 sc
= kmem_cache_create(conf
->cache_name
,
295 sizeof(struct stripe_head
)+(devs
-1)*sizeof(struct r5dev
),
299 conf
->slab_cache
= sc
;
301 sh
= kmem_cache_alloc(sc
, GFP_KERNEL
);
304 memset(sh
, 0, sizeof(*sh
) + (devs
-1)*sizeof(struct r5dev
));
305 sh
->raid_conf
= conf
;
306 spin_lock_init(&sh
->lock
);
308 if (grow_buffers(sh
, conf
->raid_disks
)) {
309 shrink_buffers(sh
, conf
->raid_disks
);
310 kmem_cache_free(sc
, sh
);
313 /* we just created an active stripe so... */
314 atomic_set(&sh
->count
, 1);
315 atomic_inc(&conf
->active_stripes
);
316 INIT_LIST_HEAD(&sh
->lru
);
322 static void shrink_stripes(raid5_conf_t
*conf
)
324 struct stripe_head
*sh
;
327 spin_lock_irq(&conf
->device_lock
);
328 sh
= get_free_stripe(conf
);
329 spin_unlock_irq(&conf
->device_lock
);
332 if (atomic_read(&sh
->count
))
334 shrink_buffers(sh
, conf
->raid_disks
);
335 kmem_cache_free(conf
->slab_cache
, sh
);
336 atomic_dec(&conf
->active_stripes
);
338 kmem_cache_destroy(conf
->slab_cache
);
339 conf
->slab_cache
= NULL
;
342 static int raid5_end_read_request (struct bio
* bi
, unsigned int bytes_done
,
345 struct stripe_head
*sh
= bi
->bi_private
;
346 raid5_conf_t
*conf
= sh
->raid_conf
;
347 int disks
= conf
->raid_disks
, i
;
348 int uptodate
= test_bit(BIO_UPTODATE
, &bi
->bi_flags
);
353 for (i
=0 ; i
<disks
; i
++)
354 if (bi
== &sh
->dev
[i
].req
)
357 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
358 (unsigned long long)sh
->sector
, i
, atomic_read(&sh
->count
),
369 spin_lock_irqsave(&conf
->device_lock
, flags
);
370 /* we can return a buffer if we bypassed the cache or
371 * if the top buffer is not in highmem. If there are
372 * multiple buffers, leave the extra work to
375 buffer
= sh
->bh_read
[i
];
377 (!PageHighMem(buffer
->b_page
)
378 || buffer
->b_page
== bh
->b_page
)
380 sh
->bh_read
[i
] = buffer
->b_reqnext
;
381 buffer
->b_reqnext
= NULL
;
384 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
385 if (sh
->bh_page
[i
]==bh
->b_page
)
386 set_buffer_uptodate(bh
);
388 if (buffer
->b_page
!= bh
->b_page
)
389 memcpy(buffer
->b_data
, bh
->b_data
, bh
->b_size
);
390 buffer
->b_end_io(buffer
, 1);
393 set_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
);
396 md_error(conf
->mddev
, conf
->disks
[i
].rdev
);
397 clear_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
);
399 rdev_dec_pending(conf
->disks
[i
].rdev
, conf
->mddev
);
401 /* must restore b_page before unlocking buffer... */
402 if (sh
->bh_page
[i
] != bh
->b_page
) {
403 bh
->b_page
= sh
->bh_page
[i
];
404 bh
->b_data
= page_address(bh
->b_page
);
405 clear_buffer_uptodate(bh
);
408 clear_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
409 set_bit(STRIPE_HANDLE
, &sh
->state
);
414 static int raid5_end_write_request (struct bio
*bi
, unsigned int bytes_done
,
417 struct stripe_head
*sh
= bi
->bi_private
;
418 raid5_conf_t
*conf
= sh
->raid_conf
;
419 int disks
= conf
->raid_disks
, i
;
421 int uptodate
= test_bit(BIO_UPTODATE
, &bi
->bi_flags
);
426 for (i
=0 ; i
<disks
; i
++)
427 if (bi
== &sh
->dev
[i
].req
)
430 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
431 (unsigned long long)sh
->sector
, i
, atomic_read(&sh
->count
),
438 spin_lock_irqsave(&conf
->device_lock
, flags
);
440 md_error(conf
->mddev
, conf
->disks
[i
].rdev
);
442 rdev_dec_pending(conf
->disks
[i
].rdev
, conf
->mddev
);
444 clear_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
445 set_bit(STRIPE_HANDLE
, &sh
->state
);
446 __release_stripe(conf
, sh
);
447 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
452 static sector_t
compute_blocknr(struct stripe_head
*sh
, int i
);
454 static void raid5_build_block (struct stripe_head
*sh
, int i
)
456 struct r5dev
*dev
= &sh
->dev
[i
];
459 dev
->req
.bi_io_vec
= &dev
->vec
;
461 dev
->req
.bi_max_vecs
++;
462 dev
->vec
.bv_page
= dev
->page
;
463 dev
->vec
.bv_len
= STRIPE_SIZE
;
464 dev
->vec
.bv_offset
= 0;
466 dev
->req
.bi_sector
= sh
->sector
;
467 dev
->req
.bi_private
= sh
;
471 dev
->sector
= compute_blocknr(sh
, i
);
474 static void error(mddev_t
*mddev
, mdk_rdev_t
*rdev
)
476 char b
[BDEVNAME_SIZE
];
477 raid5_conf_t
*conf
= (raid5_conf_t
*) mddev
->private;
478 PRINTK("raid5: error called\n");
483 conf
->working_disks
--;
485 conf
->failed_disks
++;
488 * if recovery was running, make sure it aborts.
490 set_bit(MD_RECOVERY_ERR
, &mddev
->recovery
);
494 "raid5: Disk failure on %s, disabling device."
495 " Operation continuing on %d devices\n",
496 bdevname(rdev
->bdev
,b
), conf
->working_disks
);
501 * Input: a 'big' sector number,
502 * Output: index of the data and parity disk, and the sector # in them.
504 static sector_t
raid5_compute_sector(sector_t r_sector
, unsigned int raid_disks
,
505 unsigned int data_disks
, unsigned int * dd_idx
,
506 unsigned int * pd_idx
, raid5_conf_t
*conf
)
509 unsigned long chunk_number
;
510 unsigned int chunk_offset
;
512 int sectors_per_chunk
= conf
->chunk_size
>> 9;
514 /* First compute the information on this sector */
517 * Compute the chunk number and the sector offset inside the chunk
519 chunk_offset
= sector_div(r_sector
, sectors_per_chunk
);
520 chunk_number
= r_sector
;
521 BUG_ON(r_sector
!= chunk_number
);
524 * Compute the stripe number
526 stripe
= chunk_number
/ data_disks
;
529 * Compute the data disk and parity disk indexes inside the stripe
531 *dd_idx
= chunk_number
% data_disks
;
534 * Select the parity disk based on the user selected algorithm.
536 if (conf
->level
== 4)
537 *pd_idx
= data_disks
;
538 else switch (conf
->algorithm
) {
539 case ALGORITHM_LEFT_ASYMMETRIC
:
540 *pd_idx
= data_disks
- stripe
% raid_disks
;
541 if (*dd_idx
>= *pd_idx
)
544 case ALGORITHM_RIGHT_ASYMMETRIC
:
545 *pd_idx
= stripe
% raid_disks
;
546 if (*dd_idx
>= *pd_idx
)
549 case ALGORITHM_LEFT_SYMMETRIC
:
550 *pd_idx
= data_disks
- stripe
% raid_disks
;
551 *dd_idx
= (*pd_idx
+ 1 + *dd_idx
) % raid_disks
;
553 case ALGORITHM_RIGHT_SYMMETRIC
:
554 *pd_idx
= stripe
% raid_disks
;
555 *dd_idx
= (*pd_idx
+ 1 + *dd_idx
) % raid_disks
;
558 printk("raid5: unsupported algorithm %d\n",
563 * Finally, compute the new sector number
565 new_sector
= (sector_t
)stripe
* sectors_per_chunk
+ chunk_offset
;
570 static sector_t
compute_blocknr(struct stripe_head
*sh
, int i
)
572 raid5_conf_t
*conf
= sh
->raid_conf
;
573 int raid_disks
= conf
->raid_disks
, data_disks
= raid_disks
- 1;
574 sector_t new_sector
= sh
->sector
, check
;
575 int sectors_per_chunk
= conf
->chunk_size
>> 9;
578 int chunk_number
, dummy1
, dummy2
, dd_idx
= i
;
581 chunk_offset
= sector_div(new_sector
, sectors_per_chunk
);
583 BUG_ON(new_sector
!= stripe
);
586 switch (conf
->algorithm
) {
587 case ALGORITHM_LEFT_ASYMMETRIC
:
588 case ALGORITHM_RIGHT_ASYMMETRIC
:
592 case ALGORITHM_LEFT_SYMMETRIC
:
593 case ALGORITHM_RIGHT_SYMMETRIC
:
596 i
-= (sh
->pd_idx
+ 1);
599 printk("raid5: unsupported algorithm %d\n",
603 chunk_number
= stripe
* data_disks
+ i
;
604 r_sector
= (sector_t
)chunk_number
* sectors_per_chunk
+ chunk_offset
;
606 check
= raid5_compute_sector (r_sector
, raid_disks
, data_disks
, &dummy1
, &dummy2
, conf
);
607 if (check
!= sh
->sector
|| dummy1
!= dd_idx
|| dummy2
!= sh
->pd_idx
) {
608 printk("compute_blocknr: map not correct\n");
617 * Copy data between a page in the stripe cache, and a bio.
618 * There are no alignment or size guarantees between the page or the
619 * bio except that there is some overlap.
620 * All iovecs in the bio must be considered.
622 static void copy_data(int frombio
, struct bio
*bio
,
626 char *pa
= page_address(page
);
631 if (bio
->bi_sector
>= sector
)
632 page_offset
= (signed)(bio
->bi_sector
- sector
) * 512;
634 page_offset
= (signed)(sector
- bio
->bi_sector
) * -512;
635 bio_for_each_segment(bvl
, bio
, i
) {
636 int len
= bio_iovec_idx(bio
,i
)->bv_len
;
640 if (page_offset
< 0) {
641 b_offset
= -page_offset
;
642 page_offset
+= b_offset
;
646 if (len
> 0 && page_offset
+ len
> STRIPE_SIZE
)
647 clen
= STRIPE_SIZE
- page_offset
;
651 char *ba
= __bio_kmap_atomic(bio
, i
, KM_USER0
);
653 memcpy(pa
+page_offset
, ba
+b_offset
, clen
);
655 memcpy(ba
+b_offset
, pa
+page_offset
, clen
);
656 __bio_kunmap_atomic(ba
, KM_USER0
);
658 if (clen
< len
) /* hit end of page */
664 #define check_xor() do { \
665 if (count == MAX_XOR_BLOCKS) { \
666 xor_block(count, STRIPE_SIZE, ptr); \
672 static void compute_block(struct stripe_head
*sh
, int dd_idx
)
674 raid5_conf_t
*conf
= sh
->raid_conf
;
675 int i
, count
, disks
= conf
->raid_disks
;
676 void *ptr
[MAX_XOR_BLOCKS
], *p
;
678 PRINTK("compute_block, stripe %llu, idx %d\n",
679 (unsigned long long)sh
->sector
, dd_idx
);
681 ptr
[0] = page_address(sh
->dev
[dd_idx
].page
);
682 memset(ptr
[0], 0, STRIPE_SIZE
);
684 for (i
= disks
; i
--; ) {
687 p
= page_address(sh
->dev
[i
].page
);
688 if (test_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
))
691 printk("compute_block() %d, stripe %llu, %d"
692 " not present\n", dd_idx
,
693 (unsigned long long)sh
->sector
, i
);
698 xor_block(count
, STRIPE_SIZE
, ptr
);
699 set_bit(R5_UPTODATE
, &sh
->dev
[dd_idx
].flags
);
702 static void compute_parity(struct stripe_head
*sh
, int method
)
704 raid5_conf_t
*conf
= sh
->raid_conf
;
705 int i
, pd_idx
= sh
->pd_idx
, disks
= conf
->raid_disks
, count
;
706 void *ptr
[MAX_XOR_BLOCKS
];
709 PRINTK("compute_parity, stripe %llu, method %d\n",
710 (unsigned long long)sh
->sector
, method
);
713 ptr
[0] = page_address(sh
->dev
[pd_idx
].page
);
715 case READ_MODIFY_WRITE
:
716 if (!test_bit(R5_UPTODATE
, &sh
->dev
[pd_idx
].flags
))
718 for (i
=disks
; i
-- ;) {
721 if (sh
->dev
[i
].towrite
&&
722 test_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
)) {
723 ptr
[count
++] = page_address(sh
->dev
[i
].page
);
724 chosen
= sh
->dev
[i
].towrite
;
725 sh
->dev
[i
].towrite
= NULL
;
727 if (test_and_clear_bit(R5_Overlap
, &sh
->dev
[i
].flags
))
728 wake_up(&conf
->wait_for_overlap
);
730 if (sh
->dev
[i
].written
) BUG();
731 sh
->dev
[i
].written
= chosen
;
736 case RECONSTRUCT_WRITE
:
737 memset(ptr
[0], 0, STRIPE_SIZE
);
738 for (i
= disks
; i
-- ;)
739 if (i
!=pd_idx
&& sh
->dev
[i
].towrite
) {
740 chosen
= sh
->dev
[i
].towrite
;
741 sh
->dev
[i
].towrite
= NULL
;
743 if (test_and_clear_bit(R5_Overlap
, &sh
->dev
[i
].flags
))
744 wake_up(&conf
->wait_for_overlap
);
746 if (sh
->dev
[i
].written
) BUG();
747 sh
->dev
[i
].written
= chosen
;
754 xor_block(count
, STRIPE_SIZE
, ptr
);
758 for (i
= disks
; i
--;)
759 if (sh
->dev
[i
].written
) {
760 sector_t sector
= sh
->dev
[i
].sector
;
761 struct bio
*wbi
= sh
->dev
[i
].written
;
762 while (wbi
&& wbi
->bi_sector
< sector
+ STRIPE_SECTORS
) {
763 copy_data(1, wbi
, sh
->dev
[i
].page
, sector
);
764 wbi
= r5_next_bio(wbi
, sector
);
767 set_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
768 set_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
);
772 case RECONSTRUCT_WRITE
:
776 ptr
[count
++] = page_address(sh
->dev
[i
].page
);
780 case READ_MODIFY_WRITE
:
781 for (i
= disks
; i
--;)
782 if (sh
->dev
[i
].written
) {
783 ptr
[count
++] = page_address(sh
->dev
[i
].page
);
788 xor_block(count
, STRIPE_SIZE
, ptr
);
790 if (method
!= CHECK_PARITY
) {
791 set_bit(R5_UPTODATE
, &sh
->dev
[pd_idx
].flags
);
792 set_bit(R5_LOCKED
, &sh
->dev
[pd_idx
].flags
);
794 clear_bit(R5_UPTODATE
, &sh
->dev
[pd_idx
].flags
);
798 * Each stripe/dev can have one or more bios attached.
799 * toread/towrite point to the first in a chain.
800 * The bi_next chain must be in order.
802 static int add_stripe_bio(struct stripe_head
*sh
, struct bio
*bi
, int dd_idx
, int forwrite
)
805 raid5_conf_t
*conf
= sh
->raid_conf
;
807 PRINTK("adding bh b#%llu to stripe s#%llu\n",
808 (unsigned long long)bi
->bi_sector
,
809 (unsigned long long)sh
->sector
);
812 spin_lock(&sh
->lock
);
813 spin_lock_irq(&conf
->device_lock
);
815 bip
= &sh
->dev
[dd_idx
].towrite
;
817 bip
= &sh
->dev
[dd_idx
].toread
;
818 while (*bip
&& (*bip
)->bi_sector
< bi
->bi_sector
) {
819 if ((*bip
)->bi_sector
+ ((*bip
)->bi_size
>> 9) > bi
->bi_sector
)
821 bip
= & (*bip
)->bi_next
;
823 if (*bip
&& (*bip
)->bi_sector
< bi
->bi_sector
+ ((bi
->bi_size
)>>9))
826 if (*bip
&& bi
->bi_next
&& (*bip
) != bi
->bi_next
)
831 bi
->bi_phys_segments
++;
832 spin_unlock_irq(&conf
->device_lock
);
833 spin_unlock(&sh
->lock
);
835 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
836 (unsigned long long)bi
->bi_sector
,
837 (unsigned long long)sh
->sector
, dd_idx
);
840 /* check if page is covered */
841 sector_t sector
= sh
->dev
[dd_idx
].sector
;
842 for (bi
=sh
->dev
[dd_idx
].towrite
;
843 sector
< sh
->dev
[dd_idx
].sector
+ STRIPE_SECTORS
&&
844 bi
&& bi
->bi_sector
<= sector
;
845 bi
= r5_next_bio(bi
, sh
->dev
[dd_idx
].sector
)) {
846 if (bi
->bi_sector
+ (bi
->bi_size
>>9) >= sector
)
847 sector
= bi
->bi_sector
+ (bi
->bi_size
>>9);
849 if (sector
>= sh
->dev
[dd_idx
].sector
+ STRIPE_SECTORS
)
850 set_bit(R5_OVERWRITE
, &sh
->dev
[dd_idx
].flags
);
855 set_bit(R5_Overlap
, &sh
->dev
[dd_idx
].flags
);
856 spin_unlock_irq(&conf
->device_lock
);
857 spin_unlock(&sh
->lock
);
863 * handle_stripe - do things to a stripe.
865 * We lock the stripe and then examine the state of various bits
866 * to see what needs to be done.
868 * return some read request which now have data
869 * return some write requests which are safely on disc
870 * schedule a read on some buffers
871 * schedule a write of some buffers
872 * return confirmation of parity correctness
874 * Parity calculations are done inside the stripe lock
875 * buffers are taken off read_list or write_list, and bh_cache buffers
876 * get BH_Lock set before the stripe lock is released.
880 static void handle_stripe(struct stripe_head
*sh
)
882 raid5_conf_t
*conf
= sh
->raid_conf
;
883 int disks
= conf
->raid_disks
;
884 struct bio
*return_bi
= NULL
;
888 int locked
=0, uptodate
=0, to_read
=0, to_write
=0, failed
=0, written
=0;
889 int non_overwrite
= 0;
893 PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
894 (unsigned long long)sh
->sector
, atomic_read(&sh
->count
),
897 spin_lock(&sh
->lock
);
898 clear_bit(STRIPE_HANDLE
, &sh
->state
);
899 clear_bit(STRIPE_DELAYED
, &sh
->state
);
901 syncing
= test_bit(STRIPE_SYNCING
, &sh
->state
);
902 /* Now to look around and see what can be done */
904 for (i
=disks
; i
--; ) {
907 clear_bit(R5_Insync
, &dev
->flags
);
908 clear_bit(R5_Syncio
, &dev
->flags
);
910 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
911 i
, dev
->flags
, dev
->toread
, dev
->towrite
, dev
->written
);
912 /* maybe we can reply to a read */
913 if (test_bit(R5_UPTODATE
, &dev
->flags
) && dev
->toread
) {
914 struct bio
*rbi
, *rbi2
;
915 PRINTK("Return read for disc %d\n", i
);
916 spin_lock_irq(&conf
->device_lock
);
919 if (test_and_clear_bit(R5_Overlap
, &dev
->flags
))
920 wake_up(&conf
->wait_for_overlap
);
921 spin_unlock_irq(&conf
->device_lock
);
922 while (rbi
&& rbi
->bi_sector
< dev
->sector
+ STRIPE_SECTORS
) {
923 copy_data(0, rbi
, dev
->page
, dev
->sector
);
924 rbi2
= r5_next_bio(rbi
, dev
->sector
);
925 spin_lock_irq(&conf
->device_lock
);
926 if (--rbi
->bi_phys_segments
== 0) {
927 rbi
->bi_next
= return_bi
;
930 spin_unlock_irq(&conf
->device_lock
);
935 /* now count some things */
936 if (test_bit(R5_LOCKED
, &dev
->flags
)) locked
++;
937 if (test_bit(R5_UPTODATE
, &dev
->flags
)) uptodate
++;
940 if (dev
->toread
) to_read
++;
943 if (!test_bit(R5_OVERWRITE
, &dev
->flags
))
946 if (dev
->written
) written
++;
947 rdev
= conf
->disks
[i
].rdev
; /* FIXME, should I be looking rdev */
948 if (!rdev
|| !rdev
->in_sync
) {
952 set_bit(R5_Insync
, &dev
->flags
);
954 PRINTK("locked=%d uptodate=%d to_read=%d"
955 " to_write=%d failed=%d failed_num=%d\n",
956 locked
, uptodate
, to_read
, to_write
, failed
, failed_num
);
957 /* check if the array has lost two devices and, if so, some requests might
960 if (failed
> 1 && to_read
+to_write
+written
) {
961 spin_lock_irq(&conf
->device_lock
);
962 for (i
=disks
; i
--; ) {
963 /* fail all writes first */
964 bi
= sh
->dev
[i
].towrite
;
965 sh
->dev
[i
].towrite
= NULL
;
968 if (test_and_clear_bit(R5_Overlap
, &sh
->dev
[i
].flags
))
969 wake_up(&conf
->wait_for_overlap
);
971 while (bi
&& bi
->bi_sector
< sh
->dev
[i
].sector
+ STRIPE_SECTORS
){
972 struct bio
*nextbi
= r5_next_bio(bi
, sh
->dev
[i
].sector
);
973 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
974 if (--bi
->bi_phys_segments
== 0) {
975 md_write_end(conf
->mddev
);
976 bi
->bi_next
= return_bi
;
981 /* and fail all 'written' */
982 bi
= sh
->dev
[i
].written
;
983 sh
->dev
[i
].written
= NULL
;
984 while (bi
&& bi
->bi_sector
< sh
->dev
[i
].sector
+ STRIPE_SECTORS
) {
985 struct bio
*bi2
= r5_next_bio(bi
, sh
->dev
[i
].sector
);
986 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
987 if (--bi
->bi_phys_segments
== 0) {
988 md_write_end(conf
->mddev
);
989 bi
->bi_next
= return_bi
;
995 /* fail any reads if this device is non-operational */
996 if (!test_bit(R5_Insync
, &sh
->dev
[i
].flags
)) {
997 bi
= sh
->dev
[i
].toread
;
998 sh
->dev
[i
].toread
= NULL
;
999 if (test_and_clear_bit(R5_Overlap
, &sh
->dev
[i
].flags
))
1000 wake_up(&conf
->wait_for_overlap
);
1002 while (bi
&& bi
->bi_sector
< sh
->dev
[i
].sector
+ STRIPE_SECTORS
){
1003 struct bio
*nextbi
= r5_next_bio(bi
, sh
->dev
[i
].sector
);
1004 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
1005 if (--bi
->bi_phys_segments
== 0) {
1006 bi
->bi_next
= return_bi
;
1013 spin_unlock_irq(&conf
->device_lock
);
1015 if (failed
> 1 && syncing
) {
1016 md_done_sync(conf
->mddev
, STRIPE_SECTORS
,0);
1017 clear_bit(STRIPE_SYNCING
, &sh
->state
);
1021 /* might be able to return some write requests if the parity block
1022 * is safe, or on a failed drive
1024 dev
= &sh
->dev
[sh
->pd_idx
];
1026 ( (test_bit(R5_Insync
, &dev
->flags
) && !test_bit(R5_LOCKED
, &dev
->flags
) &&
1027 test_bit(R5_UPTODATE
, &dev
->flags
))
1028 || (failed
== 1 && failed_num
== sh
->pd_idx
))
1030 /* any written block on an uptodate or failed drive can be returned.
1031 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
1032 * never LOCKED, so we don't need to test 'failed' directly.
1034 for (i
=disks
; i
--; )
1035 if (sh
->dev
[i
].written
) {
1037 if (!test_bit(R5_LOCKED
, &dev
->flags
) &&
1038 test_bit(R5_UPTODATE
, &dev
->flags
) ) {
1039 /* We can return any write requests */
1040 struct bio
*wbi
, *wbi2
;
1041 PRINTK("Return write for disc %d\n", i
);
1042 spin_lock_irq(&conf
->device_lock
);
1044 dev
->written
= NULL
;
1045 while (wbi
&& wbi
->bi_sector
< dev
->sector
+ STRIPE_SECTORS
) {
1046 wbi2
= r5_next_bio(wbi
, dev
->sector
);
1047 if (--wbi
->bi_phys_segments
== 0) {
1048 md_write_end(conf
->mddev
);
1049 wbi
->bi_next
= return_bi
;
1054 spin_unlock_irq(&conf
->device_lock
);
1059 /* Now we might consider reading some blocks, either to check/generate
1060 * parity, or to satisfy requests
1061 * or to load a block that is being partially written.
1063 if (to_read
|| non_overwrite
|| (syncing
&& (uptodate
< disks
))) {
1064 for (i
=disks
; i
--;) {
1066 if (!test_bit(R5_LOCKED
, &dev
->flags
) && !test_bit(R5_UPTODATE
, &dev
->flags
) &&
1068 (dev
->towrite
&& !test_bit(R5_OVERWRITE
, &dev
->flags
)) ||
1070 (failed
&& (sh
->dev
[failed_num
].toread
||
1071 (sh
->dev
[failed_num
].towrite
&& !test_bit(R5_OVERWRITE
, &sh
->dev
[failed_num
].flags
))))
1074 /* we would like to get this block, possibly
1075 * by computing it, but we might not be able to
1077 if (uptodate
== disks
-1) {
1078 PRINTK("Computing block %d\n", i
);
1079 compute_block(sh
, i
);
1081 } else if (test_bit(R5_Insync
, &dev
->flags
)) {
1082 set_bit(R5_LOCKED
, &dev
->flags
);
1083 set_bit(R5_Wantread
, &dev
->flags
);
1085 /* if I am just reading this block and we don't have
1086 a failed drive, or any pending writes then sidestep the cache */
1087 if (sh
->bh_read
[i
] && !sh
->bh_read
[i
]->b_reqnext
&&
1088 ! syncing
&& !failed
&& !to_write
) {
1089 sh
->bh_cache
[i
]->b_page
= sh
->bh_read
[i
]->b_page
;
1090 sh
->bh_cache
[i
]->b_data
= sh
->bh_read
[i
]->b_data
;
1094 PRINTK("Reading block %d (sync=%d)\n",
1097 md_sync_acct(conf
->disks
[i
].rdev
->bdev
,
1102 set_bit(STRIPE_HANDLE
, &sh
->state
);
1105 /* now to consider writing and what else, if anything should be read */
1108 for (i
=disks
; i
--;) {
1109 /* would I have to read this buffer for read_modify_write */
1111 if ((dev
->towrite
|| i
== sh
->pd_idx
) &&
1112 (!test_bit(R5_LOCKED
, &dev
->flags
)
1114 || sh
->bh_page
[i
]!=bh
->b_page
1117 !test_bit(R5_UPTODATE
, &dev
->flags
)) {
1118 if (test_bit(R5_Insync
, &dev
->flags
)
1119 /* && !(!mddev->insync && i == sh->pd_idx) */
1122 else rmw
+= 2*disks
; /* cannot read it */
1124 /* Would I have to read this buffer for reconstruct_write */
1125 if (!test_bit(R5_OVERWRITE
, &dev
->flags
) && i
!= sh
->pd_idx
&&
1126 (!test_bit(R5_LOCKED
, &dev
->flags
)
1128 || sh
->bh_page
[i
] != bh
->b_page
1131 !test_bit(R5_UPTODATE
, &dev
->flags
)) {
1132 if (test_bit(R5_Insync
, &dev
->flags
)) rcw
++;
1133 else rcw
+= 2*disks
;
1136 PRINTK("for sector %llu, rmw=%d rcw=%d\n",
1137 (unsigned long long)sh
->sector
, rmw
, rcw
);
1138 set_bit(STRIPE_HANDLE
, &sh
->state
);
1139 if (rmw
< rcw
&& rmw
> 0)
1140 /* prefer read-modify-write, but need to get some data */
1141 for (i
=disks
; i
--;) {
1143 if ((dev
->towrite
|| i
== sh
->pd_idx
) &&
1144 !test_bit(R5_LOCKED
, &dev
->flags
) && !test_bit(R5_UPTODATE
, &dev
->flags
) &&
1145 test_bit(R5_Insync
, &dev
->flags
)) {
1146 if (test_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
))
1148 PRINTK("Read_old block %d for r-m-w\n", i
);
1149 set_bit(R5_LOCKED
, &dev
->flags
);
1150 set_bit(R5_Wantread
, &dev
->flags
);
1153 set_bit(STRIPE_DELAYED
, &sh
->state
);
1154 set_bit(STRIPE_HANDLE
, &sh
->state
);
1158 if (rcw
<= rmw
&& rcw
> 0)
1159 /* want reconstruct write, but need to get some data */
1160 for (i
=disks
; i
--;) {
1162 if (!test_bit(R5_OVERWRITE
, &dev
->flags
) && i
!= sh
->pd_idx
&&
1163 !test_bit(R5_LOCKED
, &dev
->flags
) && !test_bit(R5_UPTODATE
, &dev
->flags
) &&
1164 test_bit(R5_Insync
, &dev
->flags
)) {
1165 if (test_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
))
1167 PRINTK("Read_old block %d for Reconstruct\n", i
);
1168 set_bit(R5_LOCKED
, &dev
->flags
);
1169 set_bit(R5_Wantread
, &dev
->flags
);
1172 set_bit(STRIPE_DELAYED
, &sh
->state
);
1173 set_bit(STRIPE_HANDLE
, &sh
->state
);
1177 /* now if nothing is locked, and if we have enough data, we can start a write request */
1178 if (locked
== 0 && (rcw
== 0 ||rmw
== 0)) {
1179 PRINTK("Computing parity...\n");
1180 compute_parity(sh
, rcw
==0 ? RECONSTRUCT_WRITE
: READ_MODIFY_WRITE
);
1181 /* now every locked buffer is ready to be written */
1183 if (test_bit(R5_LOCKED
, &sh
->dev
[i
].flags
)) {
1184 PRINTK("Writing block %d\n", i
);
1186 set_bit(R5_Wantwrite
, &sh
->dev
[i
].flags
);
1187 if (!test_bit(R5_Insync
, &sh
->dev
[i
].flags
)
1188 || (i
==sh
->pd_idx
&& failed
== 0))
1189 set_bit(STRIPE_INSYNC
, &sh
->state
);
1191 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
)) {
1192 atomic_dec(&conf
->preread_active_stripes
);
1193 if (atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
)
1194 md_wakeup_thread(conf
->mddev
->thread
);
1199 /* maybe we need to check and possibly fix the parity for this stripe
1200 * Any reads will already have been scheduled, so we just see if enough data
1203 if (syncing
&& locked
== 0 &&
1204 !test_bit(STRIPE_INSYNC
, &sh
->state
) && failed
<= 1) {
1205 set_bit(STRIPE_HANDLE
, &sh
->state
);
1208 if (uptodate
!= disks
)
1210 compute_parity(sh
, CHECK_PARITY
);
1212 pagea
= page_address(sh
->dev
[sh
->pd_idx
].page
);
1213 if ((*(u32
*)pagea
) == 0 &&
1214 !memcmp(pagea
, pagea
+4, STRIPE_SIZE
-4)) {
1215 /* parity is correct (on disc, not in buffer any more) */
1216 set_bit(STRIPE_INSYNC
, &sh
->state
);
1219 if (!test_bit(STRIPE_INSYNC
, &sh
->state
)) {
1221 failed_num
= sh
->pd_idx
;
1222 /* should be able to compute the missing block and write it to spare */
1223 if (!test_bit(R5_UPTODATE
, &sh
->dev
[failed_num
].flags
)) {
1224 if (uptodate
+1 != disks
)
1226 compute_block(sh
, failed_num
);
1229 if (uptodate
!= disks
)
1231 dev
= &sh
->dev
[failed_num
];
1232 set_bit(R5_LOCKED
, &dev
->flags
);
1233 set_bit(R5_Wantwrite
, &dev
->flags
);
1235 set_bit(STRIPE_INSYNC
, &sh
->state
);
1236 set_bit(R5_Syncio
, &dev
->flags
);
1239 if (syncing
&& locked
== 0 && test_bit(STRIPE_INSYNC
, &sh
->state
)) {
1240 md_done_sync(conf
->mddev
, STRIPE_SECTORS
,1);
1241 clear_bit(STRIPE_SYNCING
, &sh
->state
);
1244 spin_unlock(&sh
->lock
);
1246 while ((bi
=return_bi
)) {
1247 int bytes
= bi
->bi_size
;
1249 return_bi
= bi
->bi_next
;
1252 bi
->bi_end_io(bi
, bytes
, 0);
1254 for (i
=disks
; i
-- ;) {
1258 if (test_and_clear_bit(R5_Wantwrite
, &sh
->dev
[i
].flags
))
1260 else if (test_and_clear_bit(R5_Wantread
, &sh
->dev
[i
].flags
))
1265 bi
= &sh
->dev
[i
].req
;
1269 bi
->bi_end_io
= raid5_end_write_request
;
1271 bi
->bi_end_io
= raid5_end_read_request
;
1274 rdev
= conf
->disks
[i
].rdev
;
1275 if (rdev
&& rdev
->faulty
)
1278 atomic_inc(&rdev
->nr_pending
);
1282 if (test_bit(R5_Syncio
, &sh
->dev
[i
].flags
))
1283 md_sync_acct(rdev
->bdev
, STRIPE_SECTORS
);
1285 bi
->bi_bdev
= rdev
->bdev
;
1286 PRINTK("for %llu schedule op %ld on disc %d\n",
1287 (unsigned long long)sh
->sector
, bi
->bi_rw
, i
);
1288 atomic_inc(&sh
->count
);
1289 bi
->bi_sector
= sh
->sector
+ rdev
->data_offset
;
1290 bi
->bi_flags
= 1 << BIO_UPTODATE
;
1292 bi
->bi_max_vecs
= 1;
1294 bi
->bi_io_vec
= &sh
->dev
[i
].vec
;
1295 bi
->bi_io_vec
[0].bv_len
= STRIPE_SIZE
;
1296 bi
->bi_io_vec
[0].bv_offset
= 0;
1297 bi
->bi_size
= STRIPE_SIZE
;
1299 generic_make_request(bi
);
1301 PRINTK("skip op %ld on disc %d for sector %llu\n",
1302 bi
->bi_rw
, i
, (unsigned long long)sh
->sector
);
1303 clear_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
1304 set_bit(STRIPE_HANDLE
, &sh
->state
);
1309 static inline void raid5_activate_delayed(raid5_conf_t
*conf
)
1311 if (atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
) {
1312 while (!list_empty(&conf
->delayed_list
)) {
1313 struct list_head
*l
= conf
->delayed_list
.next
;
1314 struct stripe_head
*sh
;
1315 sh
= list_entry(l
, struct stripe_head
, lru
);
1317 clear_bit(STRIPE_DELAYED
, &sh
->state
);
1318 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
))
1319 atomic_inc(&conf
->preread_active_stripes
);
1320 list_add_tail(&sh
->lru
, &conf
->handle_list
);
1325 static void unplug_slaves(mddev_t
*mddev
)
1327 raid5_conf_t
*conf
= mddev_to_conf(mddev
);
1331 for (i
=0; i
<mddev
->raid_disks
; i
++) {
1332 mdk_rdev_t
*rdev
= conf
->disks
[i
].rdev
;
1333 if (rdev
&& !rdev
->faulty
&& atomic_read(&rdev
->nr_pending
)) {
1334 request_queue_t
*r_queue
= bdev_get_queue(rdev
->bdev
);
1336 atomic_inc(&rdev
->nr_pending
);
1339 if (r_queue
->unplug_fn
)
1340 r_queue
->unplug_fn(r_queue
);
1342 rdev_dec_pending(rdev
, mddev
);
1349 static void raid5_unplug_device(request_queue_t
*q
)
1351 mddev_t
*mddev
= q
->queuedata
;
1352 raid5_conf_t
*conf
= mddev_to_conf(mddev
);
1353 unsigned long flags
;
1355 spin_lock_irqsave(&conf
->device_lock
, flags
);
1357 if (blk_remove_plug(q
))
1358 raid5_activate_delayed(conf
);
1359 md_wakeup_thread(mddev
->thread
);
1361 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
1363 unplug_slaves(mddev
);
1366 static int raid5_issue_flush(request_queue_t
*q
, struct gendisk
*disk
,
1367 sector_t
*error_sector
)
1369 mddev_t
*mddev
= q
->queuedata
;
1370 raid5_conf_t
*conf
= mddev_to_conf(mddev
);
1374 for (i
=0; i
<mddev
->raid_disks
&& ret
== 0; i
++) {
1375 mdk_rdev_t
*rdev
= conf
->disks
[i
].rdev
;
1376 if (rdev
&& !rdev
->faulty
) {
1377 struct block_device
*bdev
= rdev
->bdev
;
1378 request_queue_t
*r_queue
= bdev_get_queue(bdev
);
1380 if (!r_queue
->issue_flush_fn
)
1383 atomic_inc(&rdev
->nr_pending
);
1385 ret
= r_queue
->issue_flush_fn(r_queue
, bdev
->bd_disk
,
1387 rdev_dec_pending(rdev
, mddev
);
1396 static inline void raid5_plug_device(raid5_conf_t
*conf
)
1398 spin_lock_irq(&conf
->device_lock
);
1399 blk_plug_device(conf
->mddev
->queue
);
1400 spin_unlock_irq(&conf
->device_lock
);
1403 static int make_request (request_queue_t
*q
, struct bio
* bi
)
1405 mddev_t
*mddev
= q
->queuedata
;
1406 raid5_conf_t
*conf
= mddev_to_conf(mddev
);
1407 const unsigned int raid_disks
= conf
->raid_disks
;
1408 const unsigned int data_disks
= raid_disks
- 1;
1409 unsigned int dd_idx
, pd_idx
;
1410 sector_t new_sector
;
1411 sector_t logical_sector
, last_sector
;
1412 struct stripe_head
*sh
;
1414 md_write_start(mddev
, bi
);
1416 if (bio_data_dir(bi
)==WRITE
) {
1417 disk_stat_inc(mddev
->gendisk
, writes
);
1418 disk_stat_add(mddev
->gendisk
, write_sectors
, bio_sectors(bi
));
1420 disk_stat_inc(mddev
->gendisk
, reads
);
1421 disk_stat_add(mddev
->gendisk
, read_sectors
, bio_sectors(bi
));
1424 logical_sector
= bi
->bi_sector
& ~((sector_t
)STRIPE_SECTORS
-1);
1425 last_sector
= bi
->bi_sector
+ (bi
->bi_size
>>9);
1427 bi
->bi_phys_segments
= 1; /* over-loaded to count active stripes */
1429 for (;logical_sector
< last_sector
; logical_sector
+= STRIPE_SECTORS
) {
1432 new_sector
= raid5_compute_sector(logical_sector
,
1433 raid_disks
, data_disks
, &dd_idx
, &pd_idx
, conf
);
1435 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1436 (unsigned long long)new_sector
,
1437 (unsigned long long)logical_sector
);
1440 prepare_to_wait(&conf
->wait_for_overlap
, &w
, TASK_UNINTERRUPTIBLE
);
1441 sh
= get_active_stripe(conf
, new_sector
, pd_idx
, (bi
->bi_rw
&RWA_MASK
));
1443 if (!add_stripe_bio(sh
, bi
, dd_idx
, (bi
->bi_rw
&RW_MASK
))) {
1444 /* Add failed due to overlap. Flush everything
1447 raid5_unplug_device(mddev
->queue
);
1452 finish_wait(&conf
->wait_for_overlap
, &w
);
1453 raid5_plug_device(conf
);
1458 /* cannot get stripe for read-ahead, just give-up */
1459 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
1460 finish_wait(&conf
->wait_for_overlap
, &w
);
1465 spin_lock_irq(&conf
->device_lock
);
1466 if (--bi
->bi_phys_segments
== 0) {
1467 int bytes
= bi
->bi_size
;
1469 if ( bio_data_dir(bi
) == WRITE
)
1470 md_write_end(mddev
);
1472 bi
->bi_end_io(bi
, bytes
, 0);
1474 spin_unlock_irq(&conf
->device_lock
);
1478 /* FIXME go_faster isn't used */
1479 static sector_t
sync_request(mddev_t
*mddev
, sector_t sector_nr
, int *skipped
, int go_faster
)
1481 raid5_conf_t
*conf
= (raid5_conf_t
*) mddev
->private;
1482 struct stripe_head
*sh
;
1483 int sectors_per_chunk
= conf
->chunk_size
>> 9;
1485 unsigned long stripe
;
1488 sector_t first_sector
;
1489 int raid_disks
= conf
->raid_disks
;
1490 int data_disks
= raid_disks
-1;
1492 if (sector_nr
>= mddev
->size
<<1) {
1493 /* just being told to finish up .. nothing much to do */
1494 unplug_slaves(mddev
);
1497 /* if there is 1 or more failed drives and we are trying
1498 * to resync, then assert that we are finished, because there is
1499 * nothing we can do.
1501 if (mddev
->degraded
>= 1 && test_bit(MD_RECOVERY_SYNC
, &mddev
->recovery
)) {
1502 sector_t rv
= (mddev
->size
<< 1) - sector_nr
;
1508 chunk_offset
= sector_div(x
, sectors_per_chunk
);
1510 BUG_ON(x
!= stripe
);
1512 first_sector
= raid5_compute_sector((sector_t
)stripe
*data_disks
*sectors_per_chunk
1513 + chunk_offset
, raid_disks
, data_disks
, &dd_idx
, &pd_idx
, conf
);
1514 sh
= get_active_stripe(conf
, sector_nr
, pd_idx
, 1);
1516 sh
= get_active_stripe(conf
, sector_nr
, pd_idx
, 0);
1517 /* make sure we don't swamp the stripe cache if someone else
1518 * is trying to get access
1520 set_current_state(TASK_UNINTERRUPTIBLE
);
1521 schedule_timeout(1);
1523 spin_lock(&sh
->lock
);
1524 set_bit(STRIPE_SYNCING
, &sh
->state
);
1525 clear_bit(STRIPE_INSYNC
, &sh
->state
);
1526 spin_unlock(&sh
->lock
);
1531 return STRIPE_SECTORS
;
1535 * This is our raid5 kernel thread.
1537 * We scan the hash table for stripes which can be handled now.
1538 * During the scan, completed stripes are saved for us by the interrupt
1539 * handler, so that they will not have to wait for our next wakeup.
1541 static void raid5d (mddev_t
*mddev
)
1543 struct stripe_head
*sh
;
1544 raid5_conf_t
*conf
= mddev_to_conf(mddev
);
1547 PRINTK("+++ raid5d active\n");
1549 md_check_recovery(mddev
);
1552 spin_lock_irq(&conf
->device_lock
);
1554 struct list_head
*first
;
1556 if (list_empty(&conf
->handle_list
) &&
1557 atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
&&
1558 !blk_queue_plugged(mddev
->queue
) &&
1559 !list_empty(&conf
->delayed_list
))
1560 raid5_activate_delayed(conf
);
1562 if (list_empty(&conf
->handle_list
))
1565 first
= conf
->handle_list
.next
;
1566 sh
= list_entry(first
, struct stripe_head
, lru
);
1568 list_del_init(first
);
1569 atomic_inc(&sh
->count
);
1570 if (atomic_read(&sh
->count
)!= 1)
1572 spin_unlock_irq(&conf
->device_lock
);
1578 spin_lock_irq(&conf
->device_lock
);
1580 PRINTK("%d stripes handled\n", handled
);
1582 spin_unlock_irq(&conf
->device_lock
);
1584 unplug_slaves(mddev
);
1586 PRINTK("--- raid5d inactive\n");
1589 static int run (mddev_t
*mddev
)
1592 int raid_disk
, memory
;
1594 struct disk_info
*disk
;
1595 struct list_head
*tmp
;
1597 if (mddev
->level
!= 5 && mddev
->level
!= 4) {
1598 printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev
), mddev
->level
);
1602 mddev
->private = kmalloc (sizeof (raid5_conf_t
)
1603 + mddev
->raid_disks
* sizeof(struct disk_info
),
1605 if ((conf
= mddev
->private) == NULL
)
1607 memset (conf
, 0, sizeof (*conf
) + mddev
->raid_disks
* sizeof(struct disk_info
) );
1608 conf
->mddev
= mddev
;
1610 if ((conf
->stripe_hashtbl
= (struct stripe_head
**) __get_free_pages(GFP_ATOMIC
, HASH_PAGES_ORDER
)) == NULL
)
1612 memset(conf
->stripe_hashtbl
, 0, HASH_PAGES
* PAGE_SIZE
);
1614 spin_lock_init(&conf
->device_lock
);
1615 init_waitqueue_head(&conf
->wait_for_stripe
);
1616 init_waitqueue_head(&conf
->wait_for_overlap
);
1617 INIT_LIST_HEAD(&conf
->handle_list
);
1618 INIT_LIST_HEAD(&conf
->delayed_list
);
1619 INIT_LIST_HEAD(&conf
->inactive_list
);
1620 atomic_set(&conf
->active_stripes
, 0);
1621 atomic_set(&conf
->preread_active_stripes
, 0);
1623 PRINTK("raid5: run(%s) called.\n", mdname(mddev
));
1625 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1626 raid_disk
= rdev
->raid_disk
;
1627 if (raid_disk
>= mddev
->raid_disks
1630 disk
= conf
->disks
+ raid_disk
;
1634 if (rdev
->in_sync
) {
1635 char b
[BDEVNAME_SIZE
];
1636 printk(KERN_INFO
"raid5: device %s operational as raid"
1637 " disk %d\n", bdevname(rdev
->bdev
,b
),
1639 conf
->working_disks
++;
1643 conf
->raid_disks
= mddev
->raid_disks
;
1645 * 0 for a fully functional array, 1 for a degraded array.
1647 mddev
->degraded
= conf
->failed_disks
= conf
->raid_disks
- conf
->working_disks
;
1648 conf
->mddev
= mddev
;
1649 conf
->chunk_size
= mddev
->chunk_size
;
1650 conf
->level
= mddev
->level
;
1651 conf
->algorithm
= mddev
->layout
;
1652 conf
->max_nr_stripes
= NR_STRIPES
;
1654 /* device size must be a multiple of chunk size */
1655 mddev
->size
&= ~(mddev
->chunk_size
/1024 -1);
1656 mddev
->resync_max_sectors
= mddev
->size
<< 1;
1658 if (!conf
->chunk_size
|| conf
->chunk_size
% 4) {
1659 printk(KERN_ERR
"raid5: invalid chunk size %d for %s\n",
1660 conf
->chunk_size
, mdname(mddev
));
1663 if (conf
->algorithm
> ALGORITHM_RIGHT_SYMMETRIC
) {
1665 "raid5: unsupported parity algorithm %d for %s\n",
1666 conf
->algorithm
, mdname(mddev
));
1669 if (mddev
->degraded
> 1) {
1670 printk(KERN_ERR
"raid5: not enough operational devices for %s"
1671 " (%d/%d failed)\n",
1672 mdname(mddev
), conf
->failed_disks
, conf
->raid_disks
);
1676 if (mddev
->degraded
== 1 &&
1677 mddev
->recovery_cp
!= MaxSector
) {
1679 "raid5: cannot start dirty degraded array for %s\n",
1685 mddev
->thread
= md_register_thread(raid5d
, mddev
, "%s_raid5");
1686 if (!mddev
->thread
) {
1688 "raid5: couldn't allocate thread for %s\n",
1693 memory
= conf
->max_nr_stripes
* (sizeof(struct stripe_head
) +
1694 conf
->raid_disks
* ((sizeof(struct bio
) + PAGE_SIZE
))) / 1024;
1695 if (grow_stripes(conf
, conf
->max_nr_stripes
)) {
1697 "raid5: couldn't allocate %dkB for buffers\n", memory
);
1698 shrink_stripes(conf
);
1699 md_unregister_thread(mddev
->thread
);
1702 printk(KERN_INFO
"raid5: allocated %dkB for %s\n",
1703 memory
, mdname(mddev
));
1705 if (mddev
->degraded
== 0)
1706 printk("raid5: raid level %d set %s active with %d out of %d"
1707 " devices, algorithm %d\n", conf
->level
, mdname(mddev
),
1708 mddev
->raid_disks
-mddev
->degraded
, mddev
->raid_disks
,
1711 printk(KERN_ALERT
"raid5: raid level %d set %s active with %d"
1712 " out of %d devices, algorithm %d\n", conf
->level
,
1713 mdname(mddev
), mddev
->raid_disks
- mddev
->degraded
,
1714 mddev
->raid_disks
, conf
->algorithm
);
1716 print_raid5_conf(conf
);
1718 /* read-ahead size must cover two whole stripes, which is
1719 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
1722 int stripe
= (mddev
->raid_disks
-1) * mddev
->chunk_size
1724 if (mddev
->queue
->backing_dev_info
.ra_pages
< 2 * stripe
)
1725 mddev
->queue
->backing_dev_info
.ra_pages
= 2 * stripe
;
1728 /* Ok, everything is just fine now */
1730 mddev
->queue
->unplug_fn
= raid5_unplug_device
;
1731 mddev
->queue
->issue_flush_fn
= raid5_issue_flush
;
1733 mddev
->array_size
= mddev
->size
* (mddev
->raid_disks
- 1);
1737 print_raid5_conf(conf
);
1738 if (conf
->stripe_hashtbl
)
1739 free_pages((unsigned long) conf
->stripe_hashtbl
,
1743 mddev
->private = NULL
;
1744 printk(KERN_ALERT
"raid5: failed to run raid set %s\n", mdname(mddev
));
1750 static int stop (mddev_t
*mddev
)
1752 raid5_conf_t
*conf
= (raid5_conf_t
*) mddev
->private;
1754 md_unregister_thread(mddev
->thread
);
1755 mddev
->thread
= NULL
;
1756 shrink_stripes(conf
);
1757 free_pages((unsigned long) conf
->stripe_hashtbl
, HASH_PAGES_ORDER
);
1758 blk_sync_queue(mddev
->queue
); /* the unplug fn references 'conf'*/
1760 mddev
->private = NULL
;
/*
 * NOTE(review): the RAID5_DEBUG guard, the declaration of 'i' and the final
 * newline printk were absent from the listing this block was recovered from
 * and have been reinstated.  Confirm against the tree.
 */
#if RAID5_DEBUG
/*
 * print_sh - dump one stripe head to the kernel log.
 * @sh: stripe to describe.
 *
 * Prints the stripe's sector, parity index, state word and reference
 * count, followed by the page pointer and flags word of every device slot
 * (sh->raid_conf->raid_disks of them).
 */
static void print_sh (struct stripe_head *sh)
{
	int i;

	printk("sh %llu, pd_idx %d, state %ld.\n",
		(unsigned long long)sh->sector, sh->pd_idx, sh->state);
	printk("sh %llu, count %d.\n",
		(unsigned long long)sh->sector, atomic_read(&sh->count));
	printk("sh %llu, ", (unsigned long long)sh->sector);
	for (i = 0; i < sh->raid_conf->raid_disks; i++) {
		printk("(cache%d: %p %ld) ",
			i, sh->dev[i].page, sh->dev[i].flags);
	}
	printk("\n");
}
#endif
/*
 * NOTE(review): the RAID5_DEBUG guard, the declaration of 'i', the
 * 'continue' and the print_sh() call were absent from the listing this
 * block was recovered from and have been reinstated.  Confirm against the
 * tree.
 */
#if RAID5_DEBUG
/*
 * printall - dump every hashed stripe that belongs to @conf.
 * @conf: array configuration whose stripe hash table is walked.
 *
 * Walks all NR_HASH buckets and each bucket's hash_next chain with
 * conf->device_lock held (interrupts disabled); stripes whose raid_conf is
 * not @conf are skipped.
 */
static void printall (raid5_conf_t *conf)
{
	struct stripe_head *sh;
	int i;

	spin_lock_irq(&conf->device_lock);
	for (i = 0; i < NR_HASH; i++) {
		sh = conf->stripe_hashtbl[i];
		for (; sh; sh = sh->hash_next) {
			if (sh->raid_conf != conf)
				continue;
			print_sh(sh);
		}
	}
	spin_unlock_irq(&conf->device_lock);
}
#endif
1799 static void status (struct seq_file
*seq
, mddev_t
*mddev
)
1801 raid5_conf_t
*conf
= (raid5_conf_t
*) mddev
->private;
1804 seq_printf (seq
, " level %d, %dk chunk, algorithm %d", mddev
->level
, mddev
->chunk_size
>> 10, mddev
->layout
);
1805 seq_printf (seq
, " [%d/%d] [", conf
->raid_disks
, conf
->working_disks
);
1806 for (i
= 0; i
< conf
->raid_disks
; i
++)
1807 seq_printf (seq
, "%s",
1808 conf
->disks
[i
].rdev
&&
1809 conf
->disks
[i
].rdev
->in_sync
? "U" : "_");
1810 seq_printf (seq
, "]");
1813 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
1818 static void print_raid5_conf (raid5_conf_t
*conf
)
1821 struct disk_info
*tmp
;
1823 printk("RAID5 conf printout:\n");
1825 printk("(conf==NULL)\n");
1828 printk(" --- rd:%d wd:%d fd:%d\n", conf
->raid_disks
,
1829 conf
->working_disks
, conf
->failed_disks
);
1831 for (i
= 0; i
< conf
->raid_disks
; i
++) {
1832 char b
[BDEVNAME_SIZE
];
1833 tmp
= conf
->disks
+ i
;
1835 printk(" disk %d, o:%d, dev:%s\n",
1836 i
, !tmp
->rdev
->faulty
,
1837 bdevname(tmp
->rdev
->bdev
,b
));
1841 static int raid5_spare_active(mddev_t
*mddev
)
1844 raid5_conf_t
*conf
= mddev
->private;
1845 struct disk_info
*tmp
;
1847 for (i
= 0; i
< conf
->raid_disks
; i
++) {
1848 tmp
= conf
->disks
+ i
;
1850 && !tmp
->rdev
->faulty
1851 && !tmp
->rdev
->in_sync
) {
1853 conf
->failed_disks
--;
1854 conf
->working_disks
++;
1855 tmp
->rdev
->in_sync
= 1;
1858 print_raid5_conf(conf
);
1862 static int raid5_remove_disk(mddev_t
*mddev
, int number
)
1864 raid5_conf_t
*conf
= mddev
->private;
1867 struct disk_info
*p
= conf
->disks
+ number
;
1869 print_raid5_conf(conf
);
1872 if (rdev
->in_sync
||
1873 atomic_read(&rdev
->nr_pending
)) {
1879 if (atomic_read(&rdev
->nr_pending
)) {
1880 /* lost the race, try later */
1887 print_raid5_conf(conf
);
1891 static int raid5_add_disk(mddev_t
*mddev
, mdk_rdev_t
*rdev
)
1893 raid5_conf_t
*conf
= mddev
->private;
1896 struct disk_info
*p
;
1898 if (mddev
->degraded
> 1)
1899 /* no point adding a device */
1905 for (disk
=0; disk
< mddev
->raid_disks
; disk
++)
1906 if ((p
=conf
->disks
+ disk
)->rdev
== NULL
) {
1908 rdev
->raid_disk
= disk
;
1913 print_raid5_conf(conf
);
1917 static int raid5_resize(mddev_t
*mddev
, sector_t sectors
)
1919 /* no resync is happening, and there is enough space
1920 * on all devices, so we can resize.
1921 * We need to make sure resync covers any new space.
1922 * If the array is shrinking we should possibly wait until
1923 * any io in the removed space completes, but it hardly seems
1926 sectors
&= ~((sector_t
)mddev
->chunk_size
/512 - 1);
1927 mddev
->array_size
= (sectors
* (mddev
->raid_disks
-1))>>1;
1928 set_capacity(mddev
->gendisk
, mddev
->array_size
<< 1);
1930 if (sectors
/2 > mddev
->size
&& mddev
->recovery_cp
== MaxSector
) {
1931 mddev
->recovery_cp
= mddev
->size
<< 1;
1932 set_bit(MD_RECOVERY_NEEDED
, &mddev
->recovery
);
1934 mddev
->size
= sectors
/2;
1935 mddev
->resync_max_sectors
= sectors
;
1939 static mdk_personality_t raid5_personality
=
1942 .owner
= THIS_MODULE
,
1943 .make_request
= make_request
,
1947 .error_handler
= error
,
1948 .hot_add_disk
= raid5_add_disk
,
1949 .hot_remove_disk
= raid5_remove_disk
,
1950 .spare_active
= raid5_spare_active
,
1951 .sync_request
= sync_request
,
1952 .resize
= raid5_resize
,
1955 static int __init
raid5_init (void)
1957 return register_md_personality (RAID5
, &raid5_personality
);
1960 static void raid5_exit (void)
1962 unregister_md_personality (RAID5
);
1965 module_init(raid5_init
);
1966 module_exit(raid5_exit
);
1967 MODULE_LICENSE("GPL");
1968 MODULE_ALIAS("md-personality-4"); /* RAID5 */