1 /* $NetBSD: rf_raid5.c,v 1.18 2006/10/12 01:31:52 christos Exp $ */
3 * Copyright (c) 1995 Carnegie-Mellon University.
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 * Carnegie Mellon requests users of this software to return to
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
29 /******************************************************************************
31 * rf_raid5.c -- implements RAID Level 5
33 *****************************************************************************/
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_raid5.c,v 1.18 2006/10/12 01:31:52 christos Exp $");
38 #include <dev/raidframe/raidframevar.h>
43 #include "rf_dagffrd.h"
44 #include "rf_dagffwr.h"
45 #include "rf_dagdegrd.h"
46 #include "rf_dagdegwr.h"
47 #include "rf_dagutils.h"
48 #include "rf_general.h"
52 typedef struct RF_Raid5ConfigInfo_s
{
53 RF_RowCol_t
**stripeIdentifier
; /* filled in at config time and used
54 * by IdentifyStripe */
55 } RF_Raid5ConfigInfo_t
;
/*
 * rf_ConfigureRAID5 -- set up the RAID level 5 layout for raidPtr.
 *
 * Visible steps: allocate the layout-private RF_Raid5ConfigInfo_t and store
 * it in layoutPtr->layoutSpecificInfo, build the stripeIdentifier table,
 * then fill in the remaining layout parameters and raidPtr->totalSectors.
 *
 * NOTE(review): this listing has dropped lines.  The return type, the end
 * of the parameter list (it stops at a trailing comma), the braces and the
 * statement controlled by the NULL check are not visible here.
 */
58 rf_ConfigureRAID5(RF_ShutdownList_t
**listp
, RF_Raid_t
*raidPtr
,
61 RF_RaidLayout_t
*layoutPtr
= &raidPtr
->Layout
;
62 RF_Raid5ConfigInfo_t
*info
;
63 RF_RowCol_t i
, j
, startdisk
;
65 /* create a RAID level 5 configuration structure */
/* raidPtr->cleanupList is handed to the allocator; presumably the memory
 * is tracked there for later release -- confirm against RF_MallocAndAdd. */
66 RF_MallocAndAdd(info
, sizeof(RF_Raid5ConfigInfo_t
), (RF_Raid5ConfigInfo_t
*), raidPtr
->cleanupList
);
/* publish the private state through the layout; rf_IdentifyStripeRAID5
 * reads it back from layoutSpecificInfo */
69 layoutPtr
->layoutSpecificInfo
= (void *) info
;
71 /* the stripe identifier must identify the disks in each stripe, IN
72 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
/* the table is requested as numCol x numCol */
73 info
->stripeIdentifier
= rf_make_2d_array(raidPtr
->numCol
, raidPtr
->numCol
, raidPtr
->cleanupList
);
74 if (info
->stripeIdentifier
== NULL
)
/* NOTE(review): the statement controlled by this NULL check is not
 * visible in this listing. */
/* Row i holds the column numbers (startdisk + j) % numCol for
 * j = 0..numCol-1, i.e. the column indices rotated to begin at startdisk.
 * NOTE(review): no assignment to startdisk precedes this loop in the
 * visible lines -- confirm it is initialised before use. */
77 for (i
= 0; i
< raidPtr
->numCol
; i
++) {
78 for (j
= 0; j
< raidPtr
->numCol
; j
++) {
79 info
->stripeIdentifier
[i
][j
] = (startdisk
+ j
) % raidPtr
->numCol
;
/* Step startdisk back by one, wrapping from 0 to numCol - 1 (the < 0 test
 * relies on RF_RowCol_t being signed).  NOTE(review): the inner loop's
 * closing brace is not visible; presumably this runs once per row i --
 * confirm. */
81 if ((--startdisk
) < 0)
82 startdisk
= raidPtr
->numCol
- 1;
85 /* fill in the remaining layout parameters */
/* one column per stripe is parity (numParityCol = 1), so the data column
 * count is numCol - 1; numStripe and dataStripeUnitsPerDisk both copy
 * stripeUnitsPerDisk */
86 layoutPtr
->numStripe
= layoutPtr
->stripeUnitsPerDisk
;
87 layoutPtr
->numDataCol
= raidPtr
->numCol
- 1;
88 layoutPtr
->dataSectorsPerStripe
= layoutPtr
->numDataCol
* layoutPtr
->sectorsPerStripeUnit
;
89 layoutPtr
->numParityCol
= 1;
90 layoutPtr
->dataStripeUnitsPerDisk
= layoutPtr
->stripeUnitsPerDisk
;
/* totalSectors = stripe units per disk * data columns * sectors per
 * stripe unit; the parity column is excluded because numDataCol is used */
92 raidPtr
->totalSectors
= layoutPtr
->stripeUnitsPerDisk
* layoutPtr
->numDataCol
* layoutPtr
->sectorsPerStripeUnit
;
/*
 * rf_GetDefaultNumFloatingReconBuffersRAID5 -- takes raidPtr (RF_Raid_t *).
 * NOTE(review): only the declarator survives in this listing; the return
 * type and the body are not visible here.
 */
98 rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t
*raidPtr
)
/*
 * rf_GetDefaultHeadSepLimitRAID5 -- takes raidPtr (RF_Raid_t *).
 * NOTE(review): only the declarator survives in this listing; the return
 * type and the body are not visible here.
 */
104 rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t
*raidPtr
)
/*
 * rf_ShutdownRAID5 -- takes raidPtr (RF_Raid_t *).  The guard below limits
 * it to builds where neither __NetBSD__ nor _KERNEL is defined.
 * NOTE(review): the return type, the body and the matching #endif are not
 * visible in this listing.
 */
108 #if !defined(__NetBSD__) && !defined(_KERNEL)
109 /* not currently used */
111 rf_ShutdownRAID5(RF_Raid_t
*raidPtr
)
/*
 * rf_MapSectorRAID5 -- translate the RAID address raidSector into a column
 * (*col) and a sector on that column (*diskSector).
 *
 *   SUID        = raidSector / sectorsPerStripeUnit  (stripe unit index)
 *   *col        = SUID % numCol
 *   *diskSector = (SUID / numDataCol) * sectorsPerStripeUnit
 *                 + raidSector % sectorsPerStripeUnit
 *
 * NOTE(review): the parameter list stops at a trailing comma and the
 * return type, braces and return statement are not visible in this
 * listing.
 */
118 rf_MapSectorRAID5(RF_Raid_t
*raidPtr
, RF_RaidAddr_t raidSector
,
119 RF_RowCol_t
*col
, RF_SectorNum_t
*diskSector
,
122 RF_StripeNum_t SUID
= raidSector
/ raidPtr
->Layout
.sectorsPerStripeUnit
;
/* the column is taken modulo numCol (all columns), not numDataCol */
123 *col
= (SUID
% raidPtr
->numCol
);
/* start of the stripe on each disk, plus the offset inside the stripe unit */
124 *diskSector
= (SUID
/ (raidPtr
->Layout
.numDataCol
)) * raidPtr
->Layout
.sectorsPerStripeUnit
+
125 (raidSector
% raidPtr
->Layout
.sectorsPerStripeUnit
);
/*
 * rf_MapParityRAID5 -- compute the column (*col) and on-disk sector
 * (*diskSector) used for the parity of the stripe containing raidSector.
 *
 *   SUID        = raidSector / sectorsPerStripeUnit
 *   *col        = numDataCol - ((SUID / numDataCol) % numCol)
 *   *diskSector = (SUID / numDataCol) * sectorsPerStripeUnit
 *                 + raidSector % sectorsPerStripeUnit
 *
 * NOTE(review): the parameter list stops at a trailing comma and the
 * return type, braces and return statement are not visible in this
 * listing.
 */
129 rf_MapParityRAID5(RF_Raid_t
*raidPtr
, RF_RaidAddr_t raidSector
,
130 RF_RowCol_t
*col
, RF_SectorNum_t
*diskSector
,
133 RF_StripeNum_t SUID
= raidSector
/ raidPtr
->Layout
.sectorsPerStripeUnit
;
/* % binds tighter than binary -, so the stripe index (SUID / numDataCol)
 * is reduced modulo numCol first and then subtracted from numDataCol:
 * stripe 0 maps to column numDataCol, and each following stripe moves one
 * column lower until the index wraps */
135 *col
= raidPtr
->Layout
.numDataCol
- (SUID
/ raidPtr
->Layout
.numDataCol
) % raidPtr
->numCol
;
/* same sector formula as rf_MapSectorRAID5 */
136 *diskSector
= (SUID
/ (raidPtr
->Layout
.numDataCol
)) * raidPtr
->Layout
.sectorsPerStripeUnit
+
137 (raidSector
% raidPtr
->Layout
.sectorsPerStripeUnit
);
/*
 * rf_IdentifyStripeRAID5 -- point *diskids at the list of columns for the
 * stripe containing addr.
 *
 * The stripe ID comes from rf_RaidAddressToStripeID; the result is row
 * (stripeID % numCol) of the stripeIdentifier table kept in the layout's
 * layoutSpecificInfo.  The caller receives a pointer into that table, not
 * a copy.
 *
 * NOTE(review): the return type and braces are not visible in this
 * listing.
 */
141 rf_IdentifyStripeRAID5(RF_Raid_t
*raidPtr
, RF_RaidAddr_t addr
,
142 RF_RowCol_t
**diskids
)
144 RF_StripeNum_t stripeID
= rf_RaidAddressToStripeID(&raidPtr
->Layout
, addr
);
/* layoutSpecificInfo is the RF_Raid5ConfigInfo_t stored by rf_ConfigureRAID5 */
145 RF_Raid5ConfigInfo_t
*info
= (RF_Raid5ConfigInfo_t
*) raidPtr
->Layout
.layoutSpecificInfo
;
147 *diskids
= info
->stripeIdentifier
[stripeID
% raidPtr
->numCol
];
/*
 * rf_MapSIDToPSIDRAID5 -- takes a layout, a stripe ID and two output
 * pointers (psID, which_ru).
 * NOTE(review): only the declarator survives in this listing; the return
 * type and the body are not visible here.
 */
151 rf_MapSIDToPSIDRAID5(RF_RaidLayout_t
*layoutPtr
,
152 RF_StripeNum_t stripeID
,
153 RF_StripeNum_t
*psID
, RF_ReconUnitNum_t
*which_ru
)
/*
 * rf_RaidFiveDagSelect -- choose the DAG creation routine for one access
 * and store it through createFunc.
 *
 * Summary of the visible logic:
 *  - More than one failed unit in asmap, or raidPtr->numFailures > 1: the
 *    "Multiple disks failed" message is logged (the statements that follow
 *    it are not visible in this listing).
 *  - Exactly one failed unit: per the existing comment, if the unit has
 *    already been reconstructed the failed PDA is redirected to spare
 *    space and both failure counts in asmap are reset to 0.
 *  - Reads select the fault-free or the RAID5 degraded read DAG; writes
 *    select among the small, large, nonredundant and degraded write DAGs.
 *
 * NOTE(review): this listing has dropped lines -- the return type, braces,
 * several else / #else / #endif lines, some comment terminators and the
 * declarations of fcol and prior_recon are not visible.  Confirm details
 * against the complete file before relying on these notes.
 */
158 /* select an algorithm for performing an access. Returns two pointers,
159 * one to a function that will return information about the DAG, and
160 * another to a function that will create the dag.
163 rf_RaidFiveDagSelect(RF_Raid_t
*raidPtr
, RF_IoType_t type
,
164 RF_AccessStripeMap_t
*asmap
,
165 RF_VoidFuncPtr
*createFunc
)
167 RF_RaidLayout_t
*layoutPtr
= &(raidPtr
->Layout
);
168 RF_PhysDiskAddr_t
*failedPDA
= NULL
;
170 RF_RowStatus_t rstat
;
173 RF_ASSERT(RF_IO_IS_R_OR_W(type
));
175 if ((asmap
->numDataFailed
+ asmap
->numParityFailed
> 1) ||
176 (raidPtr
->numFailures
> 1)){
179 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
185 if (asmap
->numDataFailed
+ asmap
->numParityFailed
== 1) {
187 /* if under recon & already reconstructed, redirect
188 * the access to the spare drive and eliminate the
189 * failure indication */
/* take the first failed PDA and save its column in fcol before any
 * redirection overwrites failedPDA->col */
190 failedPDA
= asmap
->failedPDAs
[0];
191 fcol
= failedPDA
->col
;
192 rstat
= raidPtr
->status
;
/* prior_recon: status is rf_rs_reconfigured, or status is
 * rf_rs_reconstructing and rf_CheckRUReconstructed() returns nonzero for
 * failedPDA->startSector */
193 prior_recon
= (rstat
== rf_rs_reconfigured
) || (
194 (rstat
== rf_rs_reconstructing
) ?
195 rf_CheckRUReconstructed(raidPtr
->reconControl
->reconMap
, failedPDA
->startSector
) : 0
/* oc / oo keep the pre-redirection column and start sector for the debug
 * printf further down */
198 #if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0
199 RF_RowCol_t oc
= failedPDA
->col
;
200 RF_SectorNum_t oo
= failedPDA
->startSector
;
/* with RF_DISTRIBUTE_SPARE set in layoutPtr->map->flags, the failed
 * address is re-mapped through the layout's own MapParity / MapSector
 * with RF_REMAP */
202 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
203 if (layoutPtr
->map
->flags
& RF_DISTRIBUTE_SPARE
) { /* redirect to dist
206 if (failedPDA
== asmap
->parityInfo
) {
208 /* parity has failed */
209 (layoutPtr
->map
->MapParity
) (raidPtr
, failedPDA
->raidAddress
,
210 &failedPDA
->col
, &failedPDA
->startSector
, RF_REMAP
);
212 if (asmap
->parityInfo
->next
) { /* redir 2nd component,
214 RF_PhysDiskAddr_t
*p
= asmap
->parityInfo
->next
;
215 RF_SectorNum_t SUoffs
= p
->startSector
% layoutPtr
->sectorsPerStripeUnit
;
216 p
->col
= failedPDA
->col
;
217 p
->startSector
= rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr
, failedPDA
->startSector
) +
220 * really a RAID address */
223 if (asmap
->parityInfo
->next
&& failedPDA
== asmap
->parityInfo
->next
) {
224 RF_ASSERT(0); /* should not ever
228 /* data has failed */
229 (layoutPtr
->map
->MapSector
) (raidPtr
, failedPDA
->raidAddress
,
230 &failedPDA
->col
, &failedPDA
->startSector
, RF_REMAP
);
236 /* redirect to dedicated spare space */
/* the spare column is looked up with fcol, the column saved before
 * redirection */
238 failedPDA
->col
= raidPtr
->Disks
[fcol
].spareCol
;
240 /* the parity may have two distinct
241 * components, both of which may need
242 * to be redirected */
243 if (asmap
->parityInfo
->next
) {
244 if (failedPDA
== asmap
->parityInfo
) {
245 failedPDA
->next
->col
= failedPDA
->col
;
247 if (failedPDA
== asmap
->parityInfo
->next
) { /* paranoid: should
249 asmap
->parityInfo
->col
= failedPDA
->col
;
252 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
255 RF_ASSERT(failedPDA
->col
!= -1);
257 #if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0
258 if (rf_dagDebug
|| rf_mapDebug
) {
259 printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n",
260 raidPtr
->raidid
, type
, oc
,
261 (long) oo
, failedPDA
->col
,
262 (long) failedPDA
->startSector
);
265 asmap
->numDataFailed
= asmap
->numParityFailed
= 0;
268 /* all dags begin/end with block/unblock node therefore, hdrSucc &
269 * termAnt counts should always be 1 also, these counts should not be
270 * visible outside dag creation routines - manipulating the counts
271 * here should be removed */
/* Reads: the fault-free read DAG when no data unit is marked failed,
 * otherwise the RAID5 degraded-read DAG.  NOTE(review): the else between
 * the two assignments is not visible in this listing. */
272 if (type
== RF_IO_TYPE_READ
) {
273 if (asmap
->numDataFailed
== 0)
274 *createFunc
= (RF_VoidFuncPtr
) rf_CreateFaultFreeReadDAG
;
276 *createFunc
= (RF_VoidFuncPtr
) rf_CreateRaidFiveDegradedReadDAG
;
280 /* if mirroring, always use large writes. If the access
281 * requires two distinct parity updates, always do a small
282 * write. If the stripe contains a failure but the access
283 * does not, do a small write. The first conditional
284 * (numStripeUnitsAccessed <= numDataCol/2) uses a
285 * less-than-or-equal rather than just a less-than because
286 * when G is 3 or 4, numDataCol/2 is 1, and I want
287 * single-stripe-unit updates to use just one disk. */
/* Writes with no failure recorded in asmap: a small write is chosen when
 * rf_suppressLocksAndLargeWrites is set, when at most numDataCol / 2
 * stripe units are touched (and numDataCol != 1), when parity has a
 * second component, or when rf_CheckStripeForFailures() is nonzero.
 * NOTE(review): the else that leads to the large-write assignment is not
 * visible in this listing. */
288 if ((asmap
->numDataFailed
+ asmap
->numParityFailed
) == 0) {
289 if (rf_suppressLocksAndLargeWrites
||
290 (((asmap
->numStripeUnitsAccessed
<= (layoutPtr
->numDataCol
/ 2)) && (layoutPtr
->numDataCol
!= 1)) ||
291 (asmap
->parityInfo
->next
!= NULL
) || rf_CheckStripeForFailures(raidPtr
, asmap
))) {
292 *createFunc
= (RF_VoidFuncPtr
) rf_CreateSmallWriteDAG
;
294 *createFunc
= (RF_VoidFuncPtr
) rf_CreateLargeWriteDAG
;
/* Remaining write cases (presumably the else branch of the no-failure
 * test above -- confirm): a failed parity unit selects the nonredundant
 * write DAG. */
296 if (asmap
->numParityFailed
== 1)
297 *createFunc
= (RF_VoidFuncPtr
) rf_CreateNonRedundantWriteDAG
;
/* NOTE(review): listing lines 300-301, between this condition and the
 * assignment below, are not visible; confirm which outcome of the test
 * selects rf_CreateDegradedWriteDAG. */
299 if (asmap
->numStripeUnitsAccessed
!= 1 && (failedPDA
== NULL
|| failedPDA
->numSector
!= layoutPtr
->sectorsPerStripeUnit
))
302 *createFunc
= (RF_VoidFuncPtr
) rf_CreateDegradedWriteDAG
;