1 /* $NetBSD: rf_evenodd_dagfuncs.c,v 1.19 2008/11/18 14:29:55 ad Exp $ */
3 * Copyright (c) 1995 Carnegie-Mellon University.
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 * Carnegie Mellon requests users of this software to return to
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
30 * Code for RAID-EVENODD architecture.
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_evenodd_dagfuncs.c,v 1.19 2008/11/18 14:29:55 ad Exp $");
39 #include "opt_raid_diagnostic.h"
42 #if RF_INCLUDE_EVENODD > 0
44 #include <dev/raidframe/raidframevar.h>
48 #include "rf_dagffrd.h"
49 #include "rf_dagffwr.h"
50 #include "rf_dagdegrd.h"
51 #include "rf_dagdegwr.h"
52 #include "rf_dagutils.h"
53 #include "rf_dagfuncs.h"
54 #include "rf_etimer.h"
55 #include "rf_general.h"
56 #include "rf_parityscan.h"
57 #include "rf_evenodd.h"
58 #include "rf_evenodd_dagfuncs.h"
60 /* These redundant functions are for small write */
61 RF_RedFuncs_t rf_EOSmallWritePFuncs
= {rf_RegularXorFunc
, "Regular Old-New P", rf_SimpleXorFunc
, "Simple Old-New P"};
62 RF_RedFuncs_t rf_EOSmallWriteEFuncs
= {rf_RegularONEFunc
, "Regular Old-New E", rf_SimpleONEFunc
, "Regular Old-New E"};
63 /* These redundant functions are for degraded read */
64 RF_RedFuncs_t rf_eoPRecoveryFuncs
= {rf_RecoveryXorFunc
, "Recovery Xr", rf_RecoveryXorFunc
, "Recovery Xr"};
65 RF_RedFuncs_t rf_eoERecoveryFuncs
= {rf_RecoveryEFunc
, "Recovery E Func", rf_RecoveryEFunc
, "Recovery E Func"};
/**********************************************************************************************
 * The following encoding node functions are used in EO_000_CreateLargeWriteDAG.
 **********************************************************************************************/
70 rf_RegularPEFunc(RF_DagNode_t
*node
)
72 rf_RegularESubroutine(node
, node
->results
[1]);
73 rf_RegularXorFunc(node
);/* does the wakeup here! */
75 return (0); /* XXX This was missing... GO */
/************************************************************************************************
 * For EO_001_CreateSmallWriteDAG, there are (i) RegularONEFunc() and (ii) SimpleONEFunc() to
 * be used. The former is used when the write accesses at least the sectors of a full stripe unit.
 * The latter is used when the write accesses two stripe units but with total sectors
 * less than sectors per SU. In this case, the accesses of parity and 'E' are shown as disconnected
 * areas in their stripe unit, and the parity write and 'E' write are both divided into two distinct
 * writes (four in total). This simple old-new write and regular old-new write happen as in RAID-5.
 ************************************************************************************************/
/*
 * Algorithm:
 *   1. Store the difference of old data and new data in the Rod buffer.
 *   2. Then encode this buffer into the buffer which already has the old 'E' information
 *      inside it; the result can be shown to be the new 'E' information.
 *   3. Xor the Wnd buffer into the difference buffer to recover the original old data.
 * Here we have another alternative: allocate a temporary buffer for storing the difference of
 * old data and new data, then encode the temp buffer into the old 'E' buffer to form the new 'E'.
 * That approach takes the same time as the previous one, and needs more memory.
 */
99 rf_RegularONEFunc(RF_DagNode_t
*node
)
101 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->params
[node
->numParams
- 1].p
;
102 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & raidPtr
->Layout
;
103 int EpdaIndex
= (node
->numParams
- 1) / 2 - 1; /* the parameter of node
106 int i
, k
, retcode
= 0;
107 int suoffset
, length
;
109 char *srcbuf
, *destbuf
;
110 RF_AccTraceEntry_t
*tracerec
= node
->dagHdr
->tracerec
;
112 RF_PhysDiskAddr_t
*pda
;
113 #ifdef RAID_DIAGNOSTIC
114 RF_PhysDiskAddr_t
*EPDA
=
115 (RF_PhysDiskAddr_t
*) node
->params
[EpdaIndex
].p
;
116 int ESUOffset
= rf_StripeUnitOffset(layoutPtr
, EPDA
->startSector
);
117 #endif /* RAID_DIAGNOSTIC */
119 RF_ASSERT(EPDA
->type
== RF_PDA_TYPE_Q
);
120 RF_ASSERT(ESUOffset
== 0);
122 RF_ETIMER_START(timer
);
124 /* Xor the Wnd buffer into Rod buffer, the difference of old data and
125 * new data is stored in Rod buffer */
126 for (k
= 0; k
< EpdaIndex
; k
+= 2) {
127 length
= rf_RaidAddressToByte(raidPtr
, ((RF_PhysDiskAddr_t
*) node
->params
[k
].p
)->numSector
);
128 retcode
= rf_bxor(node
->params
[k
+ EpdaIndex
+ 3].p
, node
->params
[k
+ 1].p
, length
);
130 /* Start to encoding the buffer storing the difference of old data and
131 * new data into 'E' buffer */
132 for (i
= 0; i
< EpdaIndex
; i
+= 2)
133 if (node
->params
[i
+ 1].p
!= node
->results
[0]) { /* results[0] is buf ptr
135 pda
= (RF_PhysDiskAddr_t
*) node
->params
[i
].p
;
136 srcbuf
= (char *) node
->params
[i
+ 1].p
;
137 scol
= rf_EUCol(layoutPtr
, pda
->raidAddress
);
138 suoffset
= rf_StripeUnitOffset(layoutPtr
, pda
->startSector
);
139 destbuf
= ((char *) node
->results
[0]) + rf_RaidAddressToByte(raidPtr
, suoffset
);
140 rf_e_encToBuf(raidPtr
, scol
, srcbuf
, RF_EO_MATRIX_DIM
- 2, destbuf
, pda
->numSector
);
142 /* Recover the original old data to be used by parity encoding
143 * function in XorNode */
144 for (k
= 0; k
< EpdaIndex
; k
+= 2) {
145 length
= rf_RaidAddressToByte(raidPtr
, ((RF_PhysDiskAddr_t
*) node
->params
[k
].p
)->numSector
);
146 retcode
= rf_bxor(node
->params
[k
+ EpdaIndex
+ 3].p
, node
->params
[k
+ 1].p
, length
);
148 RF_ETIMER_STOP(timer
);
149 RF_ETIMER_EVAL(timer
);
150 tracerec
->q_us
+= RF_ETIMER_VAL_US(timer
);
151 rf_GenericWakeupFunc(node
, 0);
153 return (0); /* XXX this was missing.. GO */
158 rf_SimpleONEFunc(RF_DagNode_t
*node
)
160 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->params
[node
->numParams
- 1].p
;
161 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & raidPtr
->Layout
;
162 RF_PhysDiskAddr_t
*pda
= (RF_PhysDiskAddr_t
*) node
->params
[0].p
;
164 char *srcbuf
, *destbuf
;
165 RF_AccTraceEntry_t
*tracerec
= node
->dagHdr
->tracerec
;
170 RF_ASSERT(((RF_PhysDiskAddr_t
*) node
->params
[2].p
)->type
== RF_PDA_TYPE_Q
);
171 if (node
->dagHdr
->status
== rf_enable
) {
172 RF_ETIMER_START(timer
);
173 length
= rf_RaidAddressToByte(raidPtr
, ((RF_PhysDiskAddr_t
*) node
->params
[4].p
)->numSector
); /* this is a pda of
175 /* bxor to buffer of readDataNodes */
176 retcode
= rf_bxor(node
->params
[5].p
, node
->params
[1].p
, length
);
177 /* find out the corresponding colume in encoding matrix for
178 * write colume to be encoded into redundant disk 'E' */
179 scol
= rf_EUCol(layoutPtr
, pda
->raidAddress
);
180 srcbuf
= node
->params
[1].p
;
181 destbuf
= node
->params
[3].p
;
182 /* Start encoding process */
183 rf_e_encToBuf(raidPtr
, scol
, srcbuf
, RF_EO_MATRIX_DIM
- 2, destbuf
, pda
->numSector
);
184 rf_bxor(node
->params
[5].p
, node
->params
[1].p
, length
);
185 RF_ETIMER_STOP(timer
);
186 RF_ETIMER_EVAL(timer
);
187 tracerec
->q_us
+= RF_ETIMER_VAL_US(timer
);
190 return (rf_GenericWakeupFunc(node
, retcode
)); /* call wake func
191 * explicitly since no
192 * I/O in this node */
196 /****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write ********/
198 rf_RegularESubroutine(RF_DagNode_t
*node
, char *ebuf
)
200 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->params
[node
->numParams
- 1].p
;
201 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & raidPtr
->Layout
;
202 RF_PhysDiskAddr_t
*pda
;
205 char *srcbuf
, *destbuf
;
206 RF_AccTraceEntry_t
*tracerec
= node
->dagHdr
->tracerec
;
209 RF_ETIMER_START(timer
);
210 for (i
= 0; i
< node
->numParams
- 2; i
+= 2) {
211 RF_ASSERT(node
->params
[i
+ 1].p
!= ebuf
);
212 pda
= (RF_PhysDiskAddr_t
*) node
->params
[i
].p
;
213 suoffset
= rf_StripeUnitOffset(layoutPtr
, pda
->startSector
);
214 scol
= rf_EUCol(layoutPtr
, pda
->raidAddress
);
215 srcbuf
= (char *) node
->params
[i
+ 1].p
;
216 destbuf
= ebuf
+ rf_RaidAddressToByte(raidPtr
, suoffset
);
217 rf_e_encToBuf(raidPtr
, scol
, srcbuf
, RF_EO_MATRIX_DIM
- 2, destbuf
, pda
->numSector
);
219 RF_ETIMER_STOP(timer
);
220 RF_ETIMER_EVAL(timer
);
221 tracerec
->xor_us
+= RF_ETIMER_VAL_US(timer
);
225 /*******************************************************************************************
226 * Used in EO_001_CreateLargeWriteDAG
227 ******************************************************************************************/
229 rf_RegularEFunc(RF_DagNode_t
*node
)
231 rf_RegularESubroutine(node
, node
->results
[0]);
232 rf_GenericWakeupFunc(node
, 0);
234 return (0); /* XXX this was missing?.. GO */
/*******************************************************************************************
 * This degraded function allows only two cases:
 *  1. when the write accesses the full failed stripe unit, then the access can be more than
 *     one stripe unit.
 *  2. when the write accesses only part of the failed SU, we assume accesses of more than
 *     one stripe unit are not allowed, so that the write can be dealt with like a
 *     large write.
 * The following function is based on these assumptions. So except in the second case,
 * it looks the same as a large write encoding function. But this is not exactly the
 * normal way of doing a degraded write, since RAIDframe has to break accesses
 * other than the above two into smaller accesses. We may have to change
 * DegrESubroutine in the future.
 *******************************************************************************************/
251 rf_DegrESubroutine(RF_DagNode_t
*node
, char *ebuf
)
253 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->params
[node
->numParams
- 1].p
;
254 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & raidPtr
->Layout
;
255 RF_PhysDiskAddr_t
*failedPDA
= (RF_PhysDiskAddr_t
*) node
->params
[node
->numParams
- 2].p
;
256 RF_PhysDiskAddr_t
*pda
;
257 int i
, suoffset
, failedSUOffset
= rf_StripeUnitOffset(layoutPtr
, failedPDA
->startSector
);
259 char *srcbuf
, *destbuf
;
260 RF_AccTraceEntry_t
*tracerec
= node
->dagHdr
->tracerec
;
263 RF_ETIMER_START(timer
);
264 for (i
= 0; i
< node
->numParams
- 2; i
+= 2) {
265 RF_ASSERT(node
->params
[i
+ 1].p
!= ebuf
);
266 pda
= (RF_PhysDiskAddr_t
*) node
->params
[i
].p
;
267 suoffset
= rf_StripeUnitOffset(layoutPtr
, pda
->startSector
);
268 scol
= rf_EUCol(layoutPtr
, pda
->raidAddress
);
269 srcbuf
= (char *) node
->params
[i
+ 1].p
;
270 destbuf
= ebuf
+ rf_RaidAddressToByte(raidPtr
, suoffset
- failedSUOffset
);
271 rf_e_encToBuf(raidPtr
, scol
, srcbuf
, RF_EO_MATRIX_DIM
- 2, destbuf
, pda
->numSector
);
274 RF_ETIMER_STOP(timer
);
275 RF_ETIMER_EVAL(timer
);
276 tracerec
->q_us
+= RF_ETIMER_VAL_US(timer
);
280 /**************************************************************************************
281 * This function is used in case where one data disk failed and both redundant disks
282 * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk
283 * failed in the stripe but not accessed at this time, then we should, instead, use
284 * the rf_EOWriteDoubleRecoveryFunc().
285 **************************************************************************************/
287 rf_Degraded_100_EOFunc(RF_DagNode_t
*node
)
289 rf_DegrESubroutine(node
, node
->results
[1]);
290 rf_RecoveryXorFunc(node
); /* does the wakeup here! */
292 return (0); /* XXX this was missing... SHould these be
293 * void functions??? GO */
296 /**************************************************************************************
297 * This function is to encode one sector in one of the data disks to the E disk.
298 * However, in evenodd this function can also be used as decoding function to recover
299 * data from dead disk in the case of parity failure and a single data failure.
300 **************************************************************************************/
303 RF_RowCol_t srcLogicCol
,
305 RF_RowCol_t destLogicCol
,
309 int S_index
; /* index of the EU in the src col which need
310 * be Xored into all EUs in a dest sector */
311 int numRowInEncMatix
= (RF_EO_MATRIX_DIM
) - 1;
312 RF_RowCol_t j
, indexInDest
, /* row index of an encoding unit in
313 * the destination colume of encoding
315 indexInSrc
; /* row index of an encoding unit in the source
316 * colume used for recovery */
317 int bytesPerEU
= bytesPerSector
/ numRowInEncMatix
;
319 #if RF_EO_MATRIX_DIM > 17
320 int shortsPerEU
= bytesPerEU
/ sizeof(short);
321 short *destShortBuf
, *srcShortBuf1
, *srcShortBuf2
;
323 #elif RF_EO_MATRIX_DIM == 17
324 int longsPerEU
= bytesPerEU
/ sizeof(long);
325 long *destLongBuf
, *srcLongBuf1
, *srcLongBuf2
;
329 #if RF_EO_MATRIX_DIM > 17
330 RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
331 RF_ASSERT(bytesPerEU
% sizeof(short) == 0);
332 #elif RF_EO_MATRIX_DIM == 17
333 RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
334 RF_ASSERT(bytesPerEU
% sizeof(long) == 0);
337 S_index
= rf_EO_Mod((RF_EO_MATRIX_DIM
- 1 + destLogicCol
- srcLogicCol
), RF_EO_MATRIX_DIM
);
338 #if RF_EO_MATRIX_DIM > 17
339 srcShortBuf1
= (short *) (srcSecbuf
+ S_index
* bytesPerEU
);
340 #elif RF_EO_MATRIX_DIM == 17
341 srcLongBuf1
= (long *) (srcSecbuf
+ S_index
* bytesPerEU
);
344 for (indexInDest
= 0; indexInDest
< numRowInEncMatix
; indexInDest
++) {
345 indexInSrc
= rf_EO_Mod((indexInDest
+ destLogicCol
- srcLogicCol
), RF_EO_MATRIX_DIM
);
347 #if RF_EO_MATRIX_DIM > 17
348 destShortBuf
= (short *) (destSecbuf
+ indexInDest
* bytesPerEU
);
349 srcShortBuf2
= (short *) (srcSecbuf
+ indexInSrc
* bytesPerEU
);
350 for (j
= 0; j
< shortsPerEU
; j
++) {
351 temp1
= destShortBuf
[j
] ^ srcShortBuf1
[j
];
352 /* note: S_index won't be at the end row for any src
354 if (indexInSrc
!= RF_EO_MATRIX_DIM
- 1)
355 destShortBuf
[j
] = (srcShortBuf2
[j
]) ^ temp1
;
356 /* if indexInSrc is at the end row, ie.
357 * RF_EO_MATRIX_DIM -1, then all elements are zero! */
359 destShortBuf
[j
] = temp1
;
362 #elif RF_EO_MATRIX_DIM == 17
363 destLongBuf
= (long *) (destSecbuf
+ indexInDest
* bytesPerEU
);
364 srcLongBuf2
= (long *) (srcSecbuf
+ indexInSrc
* bytesPerEU
);
365 for (j
= 0; j
< longsPerEU
; j
++) {
366 temp1
= destLongBuf
[j
] ^ srcLongBuf1
[j
];
367 if (indexInSrc
!= RF_EO_MATRIX_DIM
- 1)
368 destLongBuf
[j
] = (srcLongBuf2
[j
]) ^ temp1
;
370 destLongBuf
[j
] = temp1
;
379 RF_RowCol_t srcLogicCol
,
381 RF_RowCol_t destLogicCol
,
385 int i
, bytesPerSector
= rf_RaidAddressToByte(raidPtr
, 1);
387 for (i
= 0; i
< numSector
; i
++) {
388 rf_e_EncOneSect(srcLogicCol
, srcbuf
, destLogicCol
, destbuf
, bytesPerSector
);
389 srcbuf
+= bytesPerSector
;
390 destbuf
+= bytesPerSector
;
393 /**************************************************************************************
394 * when parity die and one data die, We use second redundant information, 'E',
395 * to recover the data in dead disk. This function is used in the recovery node of
396 * for EO_110_CreateReadDAG
397 **************************************************************************************/
399 rf_RecoveryEFunc(RF_DagNode_t
*node
)
401 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->params
[node
->numParams
- 1].p
;
402 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & raidPtr
->Layout
;
403 RF_PhysDiskAddr_t
*failedPDA
= (RF_PhysDiskAddr_t
*) node
->params
[node
->numParams
- 2].p
;
404 RF_RowCol_t scol
, /* source logical column */
405 fcol
= rf_EUCol(layoutPtr
, failedPDA
->raidAddress
); /* logical column of
408 RF_PhysDiskAddr_t
*pda
;
409 int suoffset
, failedSUOffset
= rf_StripeUnitOffset(layoutPtr
, failedPDA
->startSector
);
410 char *srcbuf
, *destbuf
;
411 RF_AccTraceEntry_t
*tracerec
= node
->dagHdr
->tracerec
;
414 memset((char *) node
->results
[0], 0,
415 rf_RaidAddressToByte(raidPtr
, failedPDA
->numSector
));
416 if (node
->dagHdr
->status
== rf_enable
) {
417 RF_ETIMER_START(timer
);
418 for (i
= 0; i
< node
->numParams
- 2; i
+= 2)
419 if (node
->params
[i
+ 1].p
!= node
->results
[0]) {
420 pda
= (RF_PhysDiskAddr_t
*) node
->params
[i
].p
;
421 if (i
== node
->numParams
- 4)
422 scol
= RF_EO_MATRIX_DIM
- 2; /* the colume of
425 scol
= rf_EUCol(layoutPtr
, pda
->raidAddress
);
426 srcbuf
= (char *) node
->params
[i
+ 1].p
;
427 suoffset
= rf_StripeUnitOffset(layoutPtr
, pda
->startSector
);
428 destbuf
= ((char *) node
->results
[0]) + rf_RaidAddressToByte(raidPtr
, suoffset
- failedSUOffset
);
429 rf_e_encToBuf(raidPtr
, scol
, srcbuf
, fcol
, destbuf
, pda
->numSector
);
431 RF_ETIMER_STOP(timer
);
432 RF_ETIMER_EVAL(timer
);
433 tracerec
->xor_us
+= RF_ETIMER_VAL_US(timer
);
435 return (rf_GenericWakeupFunc(node
, 0)); /* node execute successfully */
/**************************************************************************************
 * This function is used in the case where one data disk and the parity disk have failed
 * (in EO_110_CreateWriteDAG).
 **************************************************************************************/
442 rf_EO_DegradedWriteEFunc(RF_DagNode_t
* node
)
444 rf_DegrESubroutine(node
, node
->results
[0]);
445 rf_GenericWakeupFunc(node
, 0);
447 return (0); /* XXX Yet another one!! GO */
453 /**************************************************************************************
454 * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
455 **************************************************************************************/
466 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & (raidPtr
->Layout
);
467 int i
, j
, k
, f1
, f2
, row
;
468 int rrdrow
, erow
, count
= 0;
469 int bytesPerSector
= rf_RaidAddressToByte(raidPtr
, 1);
470 int numRowInEncMatix
= (RF_EO_MATRIX_DIM
) - 1;
472 int pcol
= (RF_EO_MATRIX_DIM
) - 1;
474 int ecol
= (RF_EO_MATRIX_DIM
) - 2;
475 int bytesPerEU
= bytesPerSector
/ numRowInEncMatix
;
476 int numDataCol
= layoutPtr
->numDataCol
;
477 #if RF_EO_MATRIX_DIM > 17
478 int shortsPerEU
= bytesPerEU
/ sizeof(short);
479 short *rrdbuf_current
, *pbuf_current
, *ebuf_current
;
480 short *dest_smaller
, *dest_smaller_current
, *dest_larger
, *dest_larger_current
;
484 RF_ASSERT(bytesPerEU
% sizeof(short) == 0);
485 RF_Malloc(P
, bytesPerEU
, (short *));
486 RF_Malloc(temp
, bytesPerEU
, (short *));
487 #elif RF_EO_MATRIX_DIM == 17
488 int longsPerEU
= bytesPerEU
/ sizeof(long);
489 long *rrdbuf_current
, *pbuf_current
, *ebuf_current
;
490 long *dest_smaller
, *dest_smaller_current
, *dest_larger
, *dest_larger_current
;
494 RF_ASSERT(bytesPerEU
% sizeof(long) == 0);
495 RF_Malloc(P
, bytesPerEU
, (long *));
496 RF_Malloc(temp
, bytesPerEU
, (long *));
498 RF_ASSERT(*((long *) dest
[0]) == 0);
499 RF_ASSERT(*((long *) dest
[1]) == 0);
500 memset((char *) P
, 0, bytesPerEU
);
501 memset((char *) temp
, 0, bytesPerEU
);
503 /* calculate the 'P' parameter, which, not parity, is the Xor of all
504 * elements in the last two column, ie. 'E' and 'parity' colume, see
505 * the Ref. paper by Blaum, et al 1993 */
506 for (i
= 0; i
< numRowInEncMatix
; i
++)
507 for (k
= 0; k
< longsPerEU
; k
++) {
508 #if RF_EO_MATRIX_DIM > 17
509 ebuf_current
= ((short *) ebuf
) + i
* shortsPerEU
+ k
;
510 pbuf_current
= ((short *) pbuf
) + i
* shortsPerEU
+ k
;
511 #elif RF_EO_MATRIX_DIM == 17
512 ebuf_current
= ((long *) ebuf
) + i
* longsPerEU
+ k
;
513 pbuf_current
= ((long *) pbuf
) + i
* longsPerEU
+ k
;
515 P
[k
] ^= *ebuf_current
;
516 P
[k
] ^= *pbuf_current
;
518 RF_ASSERT(fcol
[0] != fcol
[1]);
519 if (fcol
[0] < fcol
[1]) {
520 #if RF_EO_MATRIX_DIM > 17
521 dest_smaller
= (short *) (dest
[0]);
522 dest_larger
= (short *) (dest
[1]);
523 #elif RF_EO_MATRIX_DIM == 17
524 dest_smaller
= (long *) (dest
[0]);
525 dest_larger
= (long *) (dest
[1]);
530 #if RF_EO_MATRIX_DIM > 17
531 dest_smaller
= (short *) (dest
[1]);
532 dest_larger
= (short *) (dest
[0]);
533 #elif RF_EO_MATRIX_DIM == 17
534 dest_smaller
= (long *) (dest
[1]);
535 dest_larger
= (long *) (dest
[0]);
540 row
= (RF_EO_MATRIX_DIM
) - 1;
541 while ((row
= rf_EO_Mod((row
+ f1
- f2
), RF_EO_MATRIX_DIM
)) != ((RF_EO_MATRIX_DIM
) - 1)) {
542 #if RF_EO_MATRIX_DIM > 17
543 dest_larger_current
= dest_larger
+ row
* shortsPerEU
;
544 dest_smaller_current
= dest_smaller
+ row
* shortsPerEU
;
545 #elif RF_EO_MATRIX_DIM == 17
546 dest_larger_current
= dest_larger
+ row
* longsPerEU
;
547 dest_smaller_current
= dest_smaller
+ row
* longsPerEU
;
549 /** Do the diagonal recovery. Initially, temp[k] = (failed 1),
550 which is the failed data in the colume which has smaller col index. **/
551 /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
552 for (j
= 0; j
< numDataCol
; j
++) {
553 if (j
== f1
|| j
== f2
)
555 rrdrow
= rf_EO_Mod((row
+ f2
- j
), RF_EO_MATRIX_DIM
);
556 if (rrdrow
!= (RF_EO_MATRIX_DIM
) - 1) {
557 #if RF_EO_MATRIX_DIM > 17
558 rrdbuf_current
= (short *) (rrdbuf
[j
]) + rrdrow
* shortsPerEU
;
559 for (k
= 0; k
< shortsPerEU
; k
++)
560 temp
[k
] ^= *(rrdbuf_current
+ k
);
561 #elif RF_EO_MATRIX_DIM == 17
562 rrdbuf_current
= (long *) (rrdbuf
[j
]) + rrdrow
* longsPerEU
;
563 for (k
= 0; k
< longsPerEU
; k
++)
564 temp
[k
] ^= *(rrdbuf_current
+ k
);
568 /* step 2: ^E(erow,m-2), If erow is at the buttom row, don't
569 * Xor into it E(erow,m-2) = (principle diagonal) ^ (failed
570 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal
571 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle
572 * diagonal) ^ (failed 2) */
574 erow
= rf_EO_Mod((row
+ f2
- ecol
), (RF_EO_MATRIX_DIM
));
575 if (erow
!= (RF_EO_MATRIX_DIM
) - 1) {
576 #if RF_EO_MATRIX_DIM > 17
577 ebuf_current
= (short *) ebuf
+ shortsPerEU
* erow
;
578 for (k
= 0; k
< shortsPerEU
; k
++)
579 temp
[k
] ^= *(ebuf_current
+ k
);
580 #elif RF_EO_MATRIX_DIM == 17
581 ebuf_current
= (long *) ebuf
+ longsPerEU
* erow
;
582 for (k
= 0; k
< longsPerEU
; k
++)
583 temp
[k
] ^= *(ebuf_current
+ k
);
586 /* step 3: ^P to obtain the failed data (failed 2). P can be
587 * proved to be actually (principle diagonal) After this
588 * step, temp[k] = (failed 2), the failed data to be recovered */
589 #if RF_EO_MATRIX_DIM > 17
590 for (k
= 0; k
< shortsPerEU
; k
++)
592 /* Put the data to the destination buffer */
593 for (k
= 0; k
< shortsPerEU
; k
++)
594 dest_larger_current
[k
] = temp
[k
];
595 #elif RF_EO_MATRIX_DIM == 17
596 for (k
= 0; k
< longsPerEU
; k
++)
598 /* Put the data to the destination buffer */
599 for (k
= 0; k
< longsPerEU
; k
++)
600 dest_larger_current
[k
] = temp
[k
];
603 /** THE FOLLOWING DO THE HORIZONTAL XOR **/
604 /* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data
606 for (j
= 0; j
< numDataCol
; j
++) {
607 if (j
== f1
|| j
== f2
)
609 #if RF_EO_MATRIX_DIM > 17
610 rrdbuf_current
= (short *) (rrdbuf
[j
]) + row
* shortsPerEU
;
611 for (k
= 0; k
< shortsPerEU
; k
++)
612 temp
[k
] ^= *(rrdbuf_current
+ k
);
613 #elif RF_EO_MATRIX_DIM == 17
614 rrdbuf_current
= (long *) (rrdbuf
[j
]) + row
* longsPerEU
;
615 for (k
= 0; k
< longsPerEU
; k
++)
616 temp
[k
] ^= *(rrdbuf_current
+ k
);
619 /* step 2: ^A(row,m-1) */
620 /* step 3: Put the data to the destination buffer */
621 #if RF_EO_MATRIX_DIM > 17
622 pbuf_current
= (short *) pbuf
+ shortsPerEU
* row
;
623 for (k
= 0; k
< shortsPerEU
; k
++)
624 temp
[k
] ^= *(pbuf_current
+ k
);
625 for (k
= 0; k
< shortsPerEU
; k
++)
626 dest_smaller_current
[k
] = temp
[k
];
627 #elif RF_EO_MATRIX_DIM == 17
628 pbuf_current
= (long *) pbuf
+ longsPerEU
* row
;
629 for (k
= 0; k
< longsPerEU
; k
++)
630 temp
[k
] ^= *(pbuf_current
+ k
);
631 for (k
= 0; k
< longsPerEU
; k
++)
632 dest_smaller_current
[k
] = temp
[k
];
636 /* Check if all Encoding Unit in the data buffer have been decoded,
637 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
638 * this algorithm will covered all buffer */
639 RF_ASSERT(count
== numRowInEncMatix
);
640 RF_Free((char *) P
, bytesPerEU
);
641 RF_Free((char *) temp
, bytesPerEU
);
/***************************************************************************************
 * This function is called by the double degraded read DAG,
 * EO_200_CreateReadDAG.
 ***************************************************************************************/
651 rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t
*node
)
654 int np
= node
->numParams
;
655 RF_AccessStripeMap_t
*asmap
= (RF_AccessStripeMap_t
*) node
->params
[np
- 1].p
;
656 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->params
[np
- 2].p
;
657 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & (raidPtr
->Layout
);
658 int i
, prm
, sector
, nresults
= node
->numResults
;
659 RF_SectorCount_t secPerSU
= layoutPtr
->sectorsPerStripeUnit
;
661 int two
= 0, mallc_one
= 0, mallc_two
= 0; /* flags to indicate if
662 * memory is allocated */
663 int bytesPerSector
= rf_RaidAddressToByte(raidPtr
, 1);
664 RF_PhysDiskAddr_t
*ppda
, *ppda2
, *epda
, *epda2
, *pda
, *pda0
, *pda1
,
666 RF_RowCol_t fcol
[2], fsuoff
[2], fsuend
[2], numDataCol
= layoutPtr
->numDataCol
;
667 char **buf
, *ebuf
, *pbuf
, *dest
[2];
668 long *suoff
= NULL
, *suend
= NULL
, *prmToCol
= NULL
,
669 psuoff
= 0, esuoff
= 0;
670 RF_SectorNum_t startSector
, endSector
;
672 RF_AccTraceEntry_t
*tracerec
= node
->dagHdr
->tracerec
;
674 RF_ETIMER_START(timer
);
676 /* Find out the number of parameters which are pdas for data
678 for (i
= 0; i
<= np
; i
++)
679 if (((RF_PhysDiskAddr_t
*) node
->params
[i
].p
)->type
!= RF_PDA_TYPE_DATA
) {
683 RF_Malloc(buf
, numDataCol
* sizeof(char *), (char **));
684 if (ndataParam
!= 0) {
685 RF_Malloc(suoff
, ndataParam
* sizeof(long), (long *));
686 RF_Malloc(suend
, ndataParam
* sizeof(long), (long *));
687 RF_Malloc(prmToCol
, ndataParam
* sizeof(long), (long *));
689 if (asmap
->failedPDAs
[1] &&
690 (asmap
->failedPDAs
[1]->numSector
+ asmap
->failedPDAs
[0]->numSector
< secPerSU
)) {
691 RF_ASSERT(0); /* currently, no support for this situation */
692 ppda
= node
->params
[np
- 6].p
;
693 ppda2
= node
->params
[np
- 5].p
;
694 RF_ASSERT(ppda2
->type
== RF_PDA_TYPE_PARITY
);
695 epda
= node
->params
[np
- 4].p
;
696 epda2
= node
->params
[np
- 3].p
;
697 RF_ASSERT(epda2
->type
== RF_PDA_TYPE_Q
);
700 ppda
= node
->params
[np
- 4].p
;
701 epda
= node
->params
[np
- 3].p
;
702 psuoff
= rf_StripeUnitOffset(layoutPtr
, ppda
->startSector
);
703 esuoff
= rf_StripeUnitOffset(layoutPtr
, epda
->startSector
);
704 RF_ASSERT(psuoff
== esuoff
);
707 the followings have three goals:
708 1. determine the startSector to begin decoding and endSector to end decoding.
709 2. determine the colume numbers of the two failed disks.
710 3. determine the offset and end offset of the access within each failed stripe unit.
713 /* find the startSector to begin decoding */
714 pda
= node
->results
[0];
715 memset(pda
->bufPtr
, 0, bytesPerSector
* pda
->numSector
);
716 fsuoff
[0] = rf_StripeUnitOffset(layoutPtr
, pda
->startSector
);
717 fsuend
[0] = fsuoff
[0] + pda
->numSector
;
720 startSector
= fsuoff
[0];
721 endSector
= fsuend
[0];
723 /* find out the column of failed disk being accessed */
724 fcol
[0] = rf_EUCol(layoutPtr
, pda
->raidAddress
);
726 /* find out the other failed colume not accessed */
727 sosAddr
= rf_RaidAddressOfPrevStripeBoundary(layoutPtr
, asmap
->raidAddress
);
728 for (i
= 0; i
< numDataCol
; i
++) {
729 npda
.raidAddress
= sosAddr
+ (i
* secPerSU
);
730 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, npda
.raidAddress
, &(npda
.col
), &(npda
.startSector
), 0);
731 /* skip over dead disks */
732 if (RF_DEAD_DISK(raidPtr
->Disks
[npda
.col
].status
))
736 RF_ASSERT(i
< numDataCol
);
739 RF_ASSERT(nresults
== 2);
740 pda0
= node
->results
[0];
741 memset(pda0
->bufPtr
, 0, bytesPerSector
* pda0
->numSector
);
742 pda1
= node
->results
[1];
743 memset(pda1
->bufPtr
, 0, bytesPerSector
* pda1
->numSector
);
744 /* determine the failed colume numbers of the two failed
746 fcol
[0] = rf_EUCol(layoutPtr
, pda0
->raidAddress
);
747 fcol
[1] = rf_EUCol(layoutPtr
, pda1
->raidAddress
);
748 /* determine the offset and end offset of the access within
749 * each failed stripe unit. */
750 fsuoff
[0] = rf_StripeUnitOffset(layoutPtr
, pda0
->startSector
);
751 fsuend
[0] = fsuoff
[0] + pda0
->numSector
;
752 fsuoff
[1] = rf_StripeUnitOffset(layoutPtr
, pda1
->startSector
);
753 fsuend
[1] = fsuoff
[1] + pda1
->numSector
;
754 /* determine the startSector to begin decoding */
755 startSector
= RF_MIN(pda0
->startSector
, pda1
->startSector
);
756 /* determine the endSector to end decoding */
757 endSector
= RF_MAX(fsuend
[0], fsuend
[1]);
760 assign the beginning sector and the end sector for each parameter
761 find out the corresponding colume # for each parameter
763 for (prm
= 0; prm
< ndataParam
; prm
++) {
764 pda
= node
->params
[prm
].p
;
765 suoff
[prm
] = rf_StripeUnitOffset(layoutPtr
, pda
->startSector
);
766 suend
[prm
] = suoff
[prm
] + pda
->numSector
;
767 prmToCol
[prm
] = rf_EUCol(layoutPtr
, pda
->raidAddress
);
769 /* 'sector' is the sector for the current decoding algorithm. For each
770 * sector in the failed SU, find out the corresponding parameters that
771 * cover the current sector and that are needed for decoding of this
772 * sector in failed SU. 2. Find out if sector is in the shadow of any
773 * accessed failed SU. If not, malloc a temporary space of a sector in
775 for (sector
= startSector
; sector
< endSector
; sector
++) {
777 if (!(fsuoff
[0] <= sector
&& sector
< fsuend
[0]) && !(fsuoff
[1] <= sector
&& sector
< fsuend
[1]))
779 for (prm
= 0; prm
< ndataParam
; prm
++)
780 if (suoff
[prm
] <= sector
&& sector
< suend
[prm
])
781 buf
[(prmToCol
[prm
])] = (char *)((RF_PhysDiskAddr_t
*) node
->params
[prm
].p
)->bufPtr
+
782 rf_RaidAddressToByte(raidPtr
, sector
- suoff
[prm
]);
783 /* find out if sector is in the shadow of any accessed failed
784 * SU. If yes, assign dest[0], dest[1] to point at suitable
785 * position of the buffer corresponding to failed SUs. if no,
786 * malloc a temporary space of a sector in size for
787 * destination of decoding. */
788 RF_ASSERT(nresults
== 1 || nresults
== 2);
790 dest
[0] = (char *)((RF_PhysDiskAddr_t
*) node
->results
[0])->bufPtr
+ rf_RaidAddressToByte(raidPtr
, sector
- fsuoff
[0]);
791 /* Always malloc temp buffer to dest[1] */
792 RF_Malloc(dest
[1], bytesPerSector
, (char *));
793 memset(dest
[1], 0, bytesPerSector
);
796 if (fsuoff
[0] <= sector
&& sector
< fsuend
[0])
797 dest
[0] = (char *)((RF_PhysDiskAddr_t
*) node
->results
[0])->bufPtr
+ rf_RaidAddressToByte(raidPtr
, sector
- fsuoff
[0]);
799 RF_Malloc(dest
[0], bytesPerSector
, (char *));
800 memset(dest
[0], 0, bytesPerSector
);
803 if (fsuoff
[1] <= sector
&& sector
< fsuend
[1])
804 dest
[1] = (char *)((RF_PhysDiskAddr_t
*) node
->results
[1])->bufPtr
+ rf_RaidAddressToByte(raidPtr
, sector
- fsuoff
[1]);
806 RF_Malloc(dest
[1], bytesPerSector
, (char *));
807 memset(dest
[1], 0, bytesPerSector
);
810 RF_ASSERT(mallc_one
== 0 || mallc_two
== 0);
812 pbuf
= (char *)ppda
->bufPtr
+ rf_RaidAddressToByte(raidPtr
, sector
- psuoff
);
813 ebuf
= (char *)epda
->bufPtr
+ rf_RaidAddressToByte(raidPtr
, sector
- esuoff
);
815 * After finish finding all needed sectors, call doubleEOdecode function for decoding
816 * one sector to destination.
818 rf_doubleEOdecode(raidPtr
, buf
, dest
, fcol
, pbuf
, ebuf
);
819 /* free all allocated memory, and mark flag to indicate no
820 * memory is being allocated */
822 RF_Free(dest
[0], bytesPerSector
);
824 RF_Free(dest
[1], bytesPerSector
);
825 mallc_one
= mallc_two
= 0;
827 RF_Free(buf
, numDataCol
* sizeof(char *));
828 if (ndataParam
!= 0) {
829 RF_Free(suoff
, ndataParam
* sizeof(long));
830 RF_Free(suend
, ndataParam
* sizeof(long));
831 RF_Free(prmToCol
, ndataParam
* sizeof(long));
833 RF_ETIMER_STOP(timer
);
834 RF_ETIMER_EVAL(timer
);
836 tracerec
->q_us
+= RF_ETIMER_VAL_US(timer
);
838 rf_GenericWakeupFunc(node
, 0);
840 return (0); /* XXX is this even close!!?!?!!? GO */
845 /* Currently, this function only allows access to one of the two failed SUs.
846 * Also, asmap->numStripeUnitsAccessed is limited to one; RAIDframe breaks a large access into
847 * many single-stripe-unit accesses.
/*
 * rf_EOWriteDoubleRecoveryFunc -- DAG node function for a write that hits
 * one failed stripe unit while a second column has also failed (the
 * in-body comments call this "double recovery"; the function asserts
 * asmap->numDataFailed == 1).
 *
 * Inputs read from the node:
 *   node->params[np - 1].p            access stripe map (asmap)
 *   node->params[np - 2].p            RAID descriptor (raidPtr)
 *   node->params[0 .. numDataCol-3].p PDAs whose bufPtr is stored in
 *                                     buf[col], col = rf_EUCol(raidAddress)
 *   node->params[numDataCol].p        PDA whose bufPtr holds the new data
 *   node->results[0], results[1]      P and E PDAs (ppda, epda)
 *
 * Visible steps:
 *   1. Compute the sector range [startSector, endSector) of the failed PDA
 *      (asmap->failedPDAs[0]) within its stripe unit.
 *   2. Allocate and zero two scratch buffers olddata[0..1] of
 *      fpda->numSector * bytesPerSector bytes and call rf_doubleEOdecode()
 *      once per sector, advancing buf[j] (non-failed columns), dest[],
 *      pbuf and ebuf by bytesPerSector each time.
 *   3. rf_bxor() the new data into olddata[0], pass olddata[0] to
 *      rf_e_encToBuf() with epda->bufPtr as destination, then rf_bxor()
 *      olddata[0] into ppda->bufPtr.
 *   4. Free olddata[0], olddata[1] and buf, add the elapsed time to
 *      tracerec->q_us, and call rf_GenericWakeupFunc(node, 0).
 *
 * olddata[1] is only used as the second decode destination; the visible
 * code frees it without reading it.
 *
 * NOTE(review): this listing carries embedded line numbers and some
 * original lines are absent.  The return type, the braces, the
 * declarations of prm, i, j, sosAddr, numbytes and timer, the
 * initialisation of pbuf/ebuf, the body of the dead-disk test and the
 * assignment of fcol[1] are not visible here -- confirm against the
 * pristine file before changing any code.
 */
851 rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t
*node
)
853 int np
= node
->numParams
;
854 RF_AccessStripeMap_t
*asmap
= (RF_AccessStripeMap_t
*) node
->params
[np
- 1].p
;
855 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->params
[np
- 2].p
;
856 RF_RaidLayout_t
*layoutPtr
= (RF_RaidLayout_t
*) & (raidPtr
->Layout
);
857 RF_SectorNum_t sector
;
858 RF_RowCol_t col
, scol
;
860 RF_SectorCount_t secPerSU
= layoutPtr
->sectorsPerStripeUnit
;
862 unsigned bytesPerSector
= rf_RaidAddressToByte(raidPtr
, 1);
864 RF_SectorNum_t startSector
, endSector
;
865 RF_PhysDiskAddr_t
*ppda
, *epda
, *pda
, *fpda
, npda
;
866 RF_RowCol_t fcol
[2], numDataCol
= layoutPtr
->numDataCol
;
867 char **buf
; /* buf[0], buf[1], buf[2], ...etc. point to
868 * buffer storing data read from col0, col1,
870 char *ebuf
, *pbuf
, *dest
[2], *olddata
[2];
872 RF_AccTraceEntry_t
*tracerec
= node
->dagHdr
->tracerec
;
874 RF_ASSERT(asmap
->numDataFailed
== 1); /* currently only support this
875 * case, the other failed SU
876 * is not being accessed */
877 RF_ETIMER_START(timer
);
878 RF_Malloc(buf
, numDataCol
* sizeof(char *), (char **));
880 ppda
= node
->results
[0];/* Instead of being buffers, node->results[0]
881 * and [1] are Ppda and Epda */
882 epda
= node
->results
[1];
883 fpda
= asmap
->failedPDAs
[0];
885 /* First, recover the failed old SU using EvenOdd double decoding */
886 /* determine the startSector and endSector for decoding */
887 startSector
= rf_StripeUnitOffset(layoutPtr
, fpda
->startSector
);
888 endSector
= startSector
+ fpda
->numSector
;
889 /* Assign buf[col] pointers to point to each non-failed column and
890 * initialize the pbuf and ebuf to point at the beginning of each
891 * source buffers and destination buffers */
892 for (prm
= 0; prm
< numDataCol
- 2; prm
++) {
893 pda
= (RF_PhysDiskAddr_t
*) node
->params
[prm
].p
;
894 col
= rf_EUCol(layoutPtr
, pda
->raidAddress
);
895 buf
[col
] = pda
->bufPtr
;
897 /* pbuf and ebuf: they will change values as double recovery decoding
901 /* find out the logical column numbers in the encoding matrix of the
902 * two failed columns */
903 fcol
[0] = rf_EUCol(layoutPtr
, fpda
->raidAddress
);
905 /* find out the other failed column not accessed this time */
906 sosAddr
= rf_RaidAddressOfPrevStripeBoundary(layoutPtr
, asmap
->raidAddress
);
907 for (i
= 0; i
< numDataCol
; i
++) {
908 npda
.raidAddress
= sosAddr
+ (i
* secPerSU
);
909 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, npda
.raidAddress
, &(npda
.col
), &(npda
.startSector
), 0);
910 /* skip over dead disks */
911 if (RF_DEAD_DISK(raidPtr
->Disks
[npda
.col
].status
))
915 RF_ASSERT(i
< numDataCol
);
917 /* assign temporary space to put recovered failed SU */
918 numbytes
= fpda
->numSector
* bytesPerSector
;
919 RF_Malloc(olddata
[0], numbytes
, (char *));
920 RF_Malloc(olddata
[1], numbytes
, (char *));
921 dest
[0] = olddata
[0];
922 dest
[1] = olddata
[1];
923 memset(olddata
[0], 0, numbytes
);
924 memset(olddata
[1], 0, numbytes
);
925 /* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j]
926 * have already pointed at the beginning of each source buffers and
927 * destination buffers */
928 for (sector
= startSector
, i
= 0; sector
< endSector
; sector
++, i
++) {
929 rf_doubleEOdecode(raidPtr
, buf
, dest
, fcol
, pbuf
, ebuf
);
930 for (j
= 0; j
< numDataCol
; j
++)
931 if ((j
!= fcol
[0]) && (j
!= fcol
[1]))
932 buf
[j
] += bytesPerSector
;
933 dest
[0] += bytesPerSector
;
934 dest
[1] += bytesPerSector
;
935 ebuf
+= bytesPerSector
;
936 pbuf
+= bytesPerSector
;
938 /* after recovery, the buffer pointed by olddata[0] is the old failed
939 * data. With new writing data and this old data, use small write to
940 * calculate the new redundant information */
941 /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
942 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
943 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
944 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
945 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
946 * wudNodes; For current implementation, we assume the simplest case:
947 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
948 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
949 * data to be written to the failed disk. We first bxor the new data
950 * into the old recovered data, then do the same things as small
953 rf_bxor(((RF_PhysDiskAddr_t
*) node
->params
[numDataCol
].p
)->bufPtr
, olddata
[0], numbytes
);
954 /* do new 'E' calculation */
955 /* find out the corresponding column in encoding matrix for write
956 * column to be encoded into redundant disk 'E' */
957 scol
= rf_EUCol(layoutPtr
, fpda
->raidAddress
);
958 /* olddata[0] now is source buffer pointer; epda->bufPtr is the dest
960 rf_e_encToBuf(raidPtr
, scol
, olddata
[0], RF_EO_MATRIX_DIM
- 2, epda
->bufPtr
, fpda
->numSector
);
962 /* do new 'P' calculation */
963 rf_bxor(olddata
[0], ppda
->bufPtr
, numbytes
);
964 /* Free the allocated buffer */
965 RF_Free(olddata
[0], numbytes
);
966 RF_Free(olddata
[1], numbytes
);
967 RF_Free(buf
, numDataCol
* sizeof(char *));
/* Stop the timer and add the elapsed microseconds to the trace record. */
969 RF_ETIMER_STOP(timer
);
970 RF_ETIMER_EVAL(timer
);
972 tracerec
->q_us
+= RF_ETIMER_VAL_US(timer
);
/* Hand the node to the generic wakeup routine with a status argument of 0. */
974 rf_GenericWakeupFunc(node
, 0);
977 #endif /* RF_INCLUDE_EVENODD > 0 */