1 /* $NetBSD: rf_dagdegrd.c,v 1.26 2006/10/12 01:31:50 christos Exp $ */
3 * Copyright (c) 1995 Carnegie-Mellon University.
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 * Carnegie Mellon requests users of this software to return to
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
32 * code for creating degraded read DAGs
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_dagdegrd.c,v 1.26 2006/10/12 01:31:50 christos Exp $");
38 #include <dev/raidframe/raidframevar.h>
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_general.h"
47 #include "rf_dagdegrd.h"
/******************************************************************************
 *
 * General comments on DAG creation:
 *
 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
 * commit node, usually called "Cmt."  If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * does not make changes to permanent state, the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll-forward
 * through the graph, blindly executing nodes until it reaches the end.
 * If a graph reaches the end, it is assumed to have completed successfully.
 *
 * A graph has only 1 Cmt node.
 *
 */
70 /******************************************************************************
72 * The following wrappers map the standard DAG creation interface to the
73 * DAG creation routines. Additionally, these wrappers enable experimentation
74 * with new DAG structures by providing an extra level of indirection, allowing
75 * the DAG creation routines to be replaced at this single point.
79 rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t
*raidPtr
,
80 RF_AccessStripeMap_t
*asmap
,
81 RF_DagHeader_t
*dag_h
,
83 RF_RaidAccessFlags_t flags
,
84 RF_AllocListElem_t
*allocList
)
86 rf_CreateDegradedReadDAG(raidPtr
, asmap
, dag_h
, bp
, flags
, allocList
,
87 &rf_xorRecoveryFuncs
);
91 /******************************************************************************
93 * DAG creation code begins here
97 /******************************************************************************
98 * Create a degraded read DAG for RAID level 1
100 * Hdr -> Nil -> R(p/s)d -> Commit -> Trm
102 * The "Rd" node reads data from the surviving disk in the mirror pair
103 * Rpd - read of primary copy
104 * Rsd - read of secondary copy
106 * Parameters: raidPtr - description of the physical array
107 * asmap - logical & physical addresses for this access
108 * bp - buffer ptr (for holding write data)
109 * flags - general flags (e.g. disk locking)
110 * allocList - list of memory allocated in DAG creation
111 *****************************************************************************/
114 rf_CreateRaidOneDegradedReadDAG(RF_Raid_t
*raidPtr
,
115 RF_AccessStripeMap_t
*asmap
,
116 RF_DagHeader_t
*dag_h
,
118 RF_RaidAccessFlags_t flags
,
119 RF_AllocListElem_t
*allocList
)
121 RF_DagNode_t
*rdNode
, *blockNode
, *commitNode
, *termNode
;
122 RF_StripeNum_t parityStripeID
;
123 RF_ReconUnitNum_t which_ru
;
124 RF_PhysDiskAddr_t
*pda
;
128 parityStripeID
= rf_RaidAddressToParityStripeID(&(raidPtr
->Layout
),
129 asmap
->raidAddress
, &which_ru
);
132 printf("[Creating RAID level 1 degraded read DAG]\n");
135 dag_h
->creator
= "RaidOneDegradedReadDAG";
136 /* alloc the Wnd nodes and the Wmir node */
137 if (asmap
->numDataFailed
== 0)
138 useMirror
= RF_FALSE
;
142 /* total number of nodes = 1 + (block + commit + terminator) */
144 rdNode
= rf_AllocDAGNode();
145 rdNode
->list_next
= dag_h
->nodes
;
146 dag_h
->nodes
= rdNode
;
148 blockNode
= rf_AllocDAGNode();
149 blockNode
->list_next
= dag_h
->nodes
;
150 dag_h
->nodes
= blockNode
;
152 commitNode
= rf_AllocDAGNode();
153 commitNode
->list_next
= dag_h
->nodes
;
154 dag_h
->nodes
= commitNode
;
156 termNode
= rf_AllocDAGNode();
157 termNode
->list_next
= dag_h
->nodes
;
158 dag_h
->nodes
= termNode
;
160 /* this dag can not commit until the commit node is reached. errors
161 * prior to the commit point imply the dag has failed and must be
163 dag_h
->numCommitNodes
= 1;
164 dag_h
->numCommits
= 0;
165 dag_h
->numSuccedents
= 1;
167 /* initialize the block, commit, and terminator nodes */
168 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
169 NULL
, 1, 0, 0, 0, dag_h
, "Nil", allocList
);
170 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
171 NULL
, 1, 1, 0, 0, dag_h
, "Cmt", allocList
);
172 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
, rf_TerminateUndoFunc
,
173 NULL
, 0, 1, 0, 0, dag_h
, "Trm", allocList
);
175 pda
= asmap
->physInfo
;
176 RF_ASSERT(pda
!= NULL
);
177 /* parityInfo must describe entire parity unit */
178 RF_ASSERT(asmap
->parityInfo
->next
== NULL
);
180 /* initialize the data node */
182 /* read primary copy of data */
183 rf_InitNode(rdNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
184 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
, "Rpd", allocList
);
185 rdNode
->params
[0].p
= pda
;
186 rdNode
->params
[1].p
= pda
->bufPtr
;
187 rdNode
->params
[2].v
= parityStripeID
;
188 rdNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
191 /* read secondary copy of data */
192 rf_InitNode(rdNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
193 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
, "Rsd", allocList
);
194 rdNode
->params
[0].p
= asmap
->parityInfo
;
195 rdNode
->params
[1].p
= pda
->bufPtr
;
196 rdNode
->params
[2].v
= parityStripeID
;
197 rdNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
201 /* connect header to block node */
202 RF_ASSERT(dag_h
->numSuccedents
== 1);
203 RF_ASSERT(blockNode
->numAntecedents
== 0);
204 dag_h
->succedents
[0] = blockNode
;
206 /* connect block node to rdnode */
207 RF_ASSERT(blockNode
->numSuccedents
== 1);
208 RF_ASSERT(rdNode
->numAntecedents
== 1);
209 blockNode
->succedents
[0] = rdNode
;
210 rdNode
->antecedents
[0] = blockNode
;
211 rdNode
->antType
[0] = rf_control
;
213 /* connect rdnode to commit node */
214 RF_ASSERT(rdNode
->numSuccedents
== 1);
215 RF_ASSERT(commitNode
->numAntecedents
== 1);
216 rdNode
->succedents
[0] = commitNode
;
217 commitNode
->antecedents
[0] = rdNode
;
218 commitNode
->antType
[0] = rf_control
;
220 /* connect commit node to terminator */
221 RF_ASSERT(commitNode
->numSuccedents
== 1);
222 RF_ASSERT(termNode
->numAntecedents
== 1);
223 RF_ASSERT(termNode
->numSuccedents
== 0);
224 commitNode
->succedents
[0] = termNode
;
225 termNode
->antecedents
[0] = commitNode
;
226 termNode
->antType
[0] = rf_control
;
231 /******************************************************************************
233 * creates a DAG to perform a degraded-mode read of data within one stripe.
234 * This DAG is as follows:
236 * Hdr -> Block -> Rud -> Xor -> Cmt -> T
240 * Each R node is a successor of the L node
241 * One successor arc from each R node goes to C, and the other to X
242 * There is one Rud for each chunk of surviving user data requested by the
243 * user, and one Rrd for each chunk of surviving user data _not_ being read by
245 * R = read, ud = user data, rd = recovery (surviving) data, p = parity
246 * X = XOR, C = Commit, T = terminate
248 * The block node guarantees a single source node.
250 * Note: The target buffer for the XOR node is set to the actual user buffer
251 * where the failed data is supposed to end up. This buffer is zero'd by the
252 * code here. Thus, if you create a degraded read dag, use it, and then
253 * re-use, you have to be sure to zero the target buffer prior to the re-use.
255 * The recfunc argument at the end specifies the name and function used for
259 *****************************************************************************/
262 rf_CreateDegradedReadDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
263 RF_DagHeader_t
*dag_h
, void *bp
,
264 RF_RaidAccessFlags_t flags
,
265 RF_AllocListElem_t
*allocList
,
266 const RF_RedFuncs_t
*recFunc
)
268 RF_DagNode_t
*rudNodes
, *rrdNodes
, *xorNode
, *blockNode
;
269 RF_DagNode_t
*commitNode
, *rpNode
, *termNode
;
270 RF_DagNode_t
*tmpNode
, *tmprudNode
, *tmprrdNode
;
271 int nNodes
, nRrdNodes
, nRudNodes
, nXorBufs
, i
;
273 RF_SectorCount_t sectorsPerSU
;
274 RF_ReconUnitNum_t which_ru
;
275 char overlappingPDAs
[RF_MAXCOL
];/* a temporary array of flags */
276 RF_AccessStripeMapHeader_t
*new_asm_h
[2];
277 RF_PhysDiskAddr_t
*pda
, *parityPDA
;
278 RF_StripeNum_t parityStripeID
;
279 RF_PhysDiskAddr_t
*failedPDA
;
280 RF_RaidLayout_t
*layoutPtr
;
283 layoutPtr
= &(raidPtr
->Layout
);
284 /* failedPDA points to the pda within the asm that targets the failed
286 failedPDA
= asmap
->failedPDAs
[0];
287 parityStripeID
= rf_RaidAddressToParityStripeID(layoutPtr
,
288 asmap
->raidAddress
, &which_ru
);
289 sectorsPerSU
= layoutPtr
->sectorsPerStripeUnit
;
293 printf("[Creating degraded read DAG]\n");
296 RF_ASSERT(asmap
->numDataFailed
== 1);
297 dag_h
->creator
= "DegradedReadDAG";
300 * generate two ASMs identifying the surviving data we need
301 * in order to recover the lost data
304 /* overlappingPDAs array must be zero'd */
305 memset(overlappingPDAs
, 0, RF_MAXCOL
);
306 rf_GenerateFailedAccessASMs(raidPtr
, asmap
, failedPDA
, dag_h
, new_asm_h
, &nXorBufs
,
307 &rpBuf
, overlappingPDAs
, allocList
);
310 * create all the nodes at once
312 * -1 because no access is generated for the failed pda
314 nRudNodes
= asmap
->numStripeUnitsAccessed
- 1;
315 nRrdNodes
= ((new_asm_h
[0]) ? new_asm_h
[0]->stripeMap
->numStripeUnitsAccessed
: 0) +
316 ((new_asm_h
[1]) ? new_asm_h
[1]->stripeMap
->numStripeUnitsAccessed
: 0);
317 nNodes
= 5 + nRudNodes
+ nRrdNodes
; /* lock, unlock, xor, Rp, Rud,
320 blockNode
= rf_AllocDAGNode();
321 blockNode
->list_next
= dag_h
->nodes
;
322 dag_h
->nodes
= blockNode
;
324 commitNode
= rf_AllocDAGNode();
325 commitNode
->list_next
= dag_h
->nodes
;
326 dag_h
->nodes
= commitNode
;
328 xorNode
= rf_AllocDAGNode();
329 xorNode
->list_next
= dag_h
->nodes
;
330 dag_h
->nodes
= xorNode
;
332 rpNode
= rf_AllocDAGNode();
333 rpNode
->list_next
= dag_h
->nodes
;
334 dag_h
->nodes
= rpNode
;
336 termNode
= rf_AllocDAGNode();
337 termNode
->list_next
= dag_h
->nodes
;
338 dag_h
->nodes
= termNode
;
340 for (i
= 0; i
< nRudNodes
; i
++) {
341 tmpNode
= rf_AllocDAGNode();
342 tmpNode
->list_next
= dag_h
->nodes
;
343 dag_h
->nodes
= tmpNode
;
345 rudNodes
= dag_h
->nodes
;
347 for (i
= 0; i
< nRrdNodes
; i
++) {
348 tmpNode
= rf_AllocDAGNode();
349 tmpNode
->list_next
= dag_h
->nodes
;
350 dag_h
->nodes
= tmpNode
;
352 rrdNodes
= dag_h
->nodes
;
354 /* initialize nodes */
355 dag_h
->numCommitNodes
= 1;
356 dag_h
->numCommits
= 0;
357 /* this dag can not commit until the commit node is reached errors
358 * prior to the commit point imply the dag has failed */
359 dag_h
->numSuccedents
= 1;
361 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
362 NULL
, nRudNodes
+ nRrdNodes
+ 1, 0, 0, 0, dag_h
, "Nil", allocList
);
363 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
364 NULL
, 1, 1, 0, 0, dag_h
, "Cmt", allocList
);
365 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
, rf_TerminateUndoFunc
,
366 NULL
, 0, 1, 0, 0, dag_h
, "Trm", allocList
);
367 rf_InitNode(xorNode
, rf_wait
, RF_FALSE
, recFunc
->simple
, rf_NullNodeUndoFunc
,
368 NULL
, 1, nRudNodes
+ nRrdNodes
+ 1, 2 * nXorBufs
+ 2, 1, dag_h
,
369 recFunc
->SimpleName
, allocList
);
371 /* fill in the Rud nodes */
372 tmprudNode
= rudNodes
;
373 for (pda
= asmap
->physInfo
, i
= 0; i
< nRudNodes
; i
++, pda
= pda
->next
) {
374 if (pda
== failedPDA
) {
378 rf_InitNode(tmprudNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
,
379 rf_DiskReadUndoFunc
, rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
,
382 tmprudNode
->params
[0].p
= pda
;
383 tmprudNode
->params
[1].p
= pda
->bufPtr
;
384 tmprudNode
->params
[2].v
= parityStripeID
;
385 tmprudNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
386 tmprudNode
= tmprudNode
->list_next
;
389 /* fill in the Rrd nodes */
391 tmprrdNode
= rrdNodes
;
393 for (pda
= new_asm_h
[0]->stripeMap
->physInfo
;
394 i
< new_asm_h
[0]->stripeMap
->numStripeUnitsAccessed
;
395 i
++, pda
= pda
->next
) {
396 rf_InitNode(tmprrdNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
,
397 rf_DiskReadUndoFunc
, rf_GenericWakeupFunc
, 1, 1, 4, 0,
398 dag_h
, "Rrd", allocList
);
400 tmprrdNode
->params
[0].p
= pda
;
401 tmprrdNode
->params
[1].p
= pda
->bufPtr
;
402 tmprrdNode
->params
[2].v
= parityStripeID
;
403 tmprrdNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
404 tmprrdNode
= tmprrdNode
->list_next
;
408 /* tmprrdNode = rrdNodes; */ /* don't set this here -- old code was using i+j, which means
409 we need to just continue using tmprrdNode for the next 'j' elements. */
410 for (j
= 0, pda
= new_asm_h
[1]->stripeMap
->physInfo
;
411 j
< new_asm_h
[1]->stripeMap
->numStripeUnitsAccessed
;
412 j
++, pda
= pda
->next
) {
413 rf_InitNode(tmprrdNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
,
414 rf_DiskReadUndoFunc
, rf_GenericWakeupFunc
, 1, 1, 4, 0,
415 dag_h
, "Rrd", allocList
);
417 tmprrdNode
->params
[0].p
= pda
;
418 tmprrdNode
->params
[1].p
= pda
->bufPtr
;
419 tmprrdNode
->params
[2].v
= parityStripeID
;
420 tmprrdNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
421 tmprrdNode
= tmprrdNode
->list_next
;
424 /* make a PDA for the parity unit */
425 parityPDA
= rf_AllocPhysDiskAddr();
426 parityPDA
->next
= dag_h
->pda_cleanup_list
;
427 dag_h
->pda_cleanup_list
= parityPDA
;
428 parityPDA
->col
= asmap
->parityInfo
->col
;
429 parityPDA
->startSector
= ((asmap
->parityInfo
->startSector
/ sectorsPerSU
)
430 * sectorsPerSU
) + (failedPDA
->startSector
% sectorsPerSU
);
431 parityPDA
->numSector
= failedPDA
->numSector
;
433 /* initialize the Rp node */
434 rf_InitNode(rpNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
435 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
, "Rp ", allocList
);
436 rpNode
->params
[0].p
= parityPDA
;
437 rpNode
->params
[1].p
= rpBuf
;
438 rpNode
->params
[2].v
= parityStripeID
;
439 rpNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
442 * the last and nastiest step is to assign all
443 * the parameters of the Xor node
446 tmprrdNode
= rrdNodes
;
447 for (i
= 0; i
< nRrdNodes
; i
++) {
448 /* all the Rrd nodes need to be xored together */
449 xorNode
->params
[paramNum
++] = tmprrdNode
->params
[0];
450 xorNode
->params
[paramNum
++] = tmprrdNode
->params
[1];
451 tmprrdNode
= tmprrdNode
->list_next
;
453 tmprudNode
= rudNodes
;
454 for (i
= 0; i
< nRudNodes
; i
++) {
455 /* any Rud nodes that overlap the failed access need to be
457 if (overlappingPDAs
[i
]) {
458 pda
= rf_AllocPhysDiskAddr();
459 memcpy((char *) pda
, (char *) tmprudNode
->params
[0].p
, sizeof(RF_PhysDiskAddr_t
));
460 /* add it into the pda_cleanup_list *after* the copy, TYVM */
461 pda
->next
= dag_h
->pda_cleanup_list
;
462 dag_h
->pda_cleanup_list
= pda
;
463 rf_RangeRestrictPDA(raidPtr
, failedPDA
, pda
, RF_RESTRICT_DOBUFFER
, 0);
464 xorNode
->params
[paramNum
++].p
= pda
;
465 xorNode
->params
[paramNum
++].p
= pda
->bufPtr
;
467 tmprudNode
= tmprudNode
->list_next
;
470 /* install parity pda as last set of params to be xor'd */
471 xorNode
->params
[paramNum
++].p
= parityPDA
;
472 xorNode
->params
[paramNum
++].p
= rpBuf
;
475 * the last 2 params to the recovery xor node are
476 * the failed PDA and the raidPtr
478 xorNode
->params
[paramNum
++].p
= failedPDA
;
479 xorNode
->params
[paramNum
++].p
= raidPtr
;
480 RF_ASSERT(paramNum
== 2 * nXorBufs
+ 2);
483 * The xor node uses results[0] as the target buffer.
484 * Set pointer and zero the buffer. In the kernel, this
485 * may be a user buffer in which case we have to remap it.
487 xorNode
->results
[0] = failedPDA
->bufPtr
;
488 memset(failedPDA
->bufPtr
, 0, rf_RaidAddressToByte(raidPtr
,
489 failedPDA
->numSector
));
491 /* connect nodes to form graph */
492 /* connect the header to the block node */
493 RF_ASSERT(dag_h
->numSuccedents
== 1);
494 RF_ASSERT(blockNode
->numAntecedents
== 0);
495 dag_h
->succedents
[0] = blockNode
;
497 /* connect the block node to the read nodes */
498 RF_ASSERT(blockNode
->numSuccedents
== (1 + nRrdNodes
+ nRudNodes
));
499 RF_ASSERT(rpNode
->numAntecedents
== 1);
500 blockNode
->succedents
[0] = rpNode
;
501 rpNode
->antecedents
[0] = blockNode
;
502 rpNode
->antType
[0] = rf_control
;
503 tmprrdNode
= rrdNodes
;
504 for (i
= 0; i
< nRrdNodes
; i
++) {
505 RF_ASSERT(tmprrdNode
->numSuccedents
== 1);
506 blockNode
->succedents
[1 + i
] = tmprrdNode
;
507 tmprrdNode
->antecedents
[0] = blockNode
;
508 tmprrdNode
->antType
[0] = rf_control
;
509 tmprrdNode
= tmprrdNode
->list_next
;
511 tmprudNode
= rudNodes
;
512 for (i
= 0; i
< nRudNodes
; i
++) {
513 RF_ASSERT(tmprudNode
->numSuccedents
== 1);
514 blockNode
->succedents
[1 + nRrdNodes
+ i
] = tmprudNode
;
515 tmprudNode
->antecedents
[0] = blockNode
;
516 tmprudNode
->antType
[0] = rf_control
;
517 tmprudNode
= tmprudNode
->list_next
;
520 /* connect the read nodes to the xor node */
521 RF_ASSERT(xorNode
->numAntecedents
== (1 + nRrdNodes
+ nRudNodes
));
522 RF_ASSERT(rpNode
->numSuccedents
== 1);
523 rpNode
->succedents
[0] = xorNode
;
524 xorNode
->antecedents
[0] = rpNode
;
525 xorNode
->antType
[0] = rf_trueData
;
526 tmprrdNode
= rrdNodes
;
527 for (i
= 0; i
< nRrdNodes
; i
++) {
528 RF_ASSERT(tmprrdNode
->numSuccedents
== 1);
529 tmprrdNode
->succedents
[0] = xorNode
;
530 xorNode
->antecedents
[1 + i
] = tmprrdNode
;
531 xorNode
->antType
[1 + i
] = rf_trueData
;
532 tmprrdNode
= tmprrdNode
->list_next
;
534 tmprudNode
= rudNodes
;
535 for (i
= 0; i
< nRudNodes
; i
++) {
536 RF_ASSERT(tmprudNode
->numSuccedents
== 1);
537 tmprudNode
->succedents
[0] = xorNode
;
538 xorNode
->antecedents
[1 + nRrdNodes
+ i
] = tmprudNode
;
539 xorNode
->antType
[1 + nRrdNodes
+ i
] = rf_trueData
;
540 tmprudNode
= tmprudNode
->list_next
;
543 /* connect the xor node to the commit node */
544 RF_ASSERT(xorNode
->numSuccedents
== 1);
545 RF_ASSERT(commitNode
->numAntecedents
== 1);
546 xorNode
->succedents
[0] = commitNode
;
547 commitNode
->antecedents
[0] = xorNode
;
548 commitNode
->antType
[0] = rf_control
;
550 /* connect the termNode to the commit node */
551 RF_ASSERT(commitNode
->numSuccedents
== 1);
552 RF_ASSERT(termNode
->numAntecedents
== 1);
553 RF_ASSERT(termNode
->numSuccedents
== 0);
554 commitNode
->succedents
[0] = termNode
;
555 termNode
->antType
[0] = rf_control
;
556 termNode
->antecedents
[0] = commitNode
;
559 #if (RF_INCLUDE_CHAINDECLUSTER > 0)
560 /******************************************************************************
561 * Create a degraded read DAG for Chained Declustering
563 * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm
565 * The "Rd" node reads data from the surviving disk in the mirror pair
566 * Rpd - read of primary copy
567 * Rsd - read of secondary copy
569 * Parameters: raidPtr - description of the physical array
570 * asmap - logical & physical addresses for this access
571 * bp - buffer ptr (for holding write data)
572 * flags - general flags (e.g. disk locking)
573 * allocList - list of memory allocated in DAG creation
574 *****************************************************************************/
577 rf_CreateRaidCDegradedReadDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
578 RF_DagHeader_t
*dag_h
, void *bp
,
579 RF_RaidAccessFlags_t flags
,
580 RF_AllocListElem_t
*allocList
)
582 RF_DagNode_t
*nodes
, *rdNode
, *blockNode
, *commitNode
, *termNode
;
583 RF_StripeNum_t parityStripeID
;
584 int useMirror
, i
, shiftable
;
585 RF_ReconUnitNum_t which_ru
;
586 RF_PhysDiskAddr_t
*pda
;
588 if ((asmap
->numDataFailed
+ asmap
->numParityFailed
) == 0) {
591 shiftable
= RF_FALSE
;
594 parityStripeID
= rf_RaidAddressToParityStripeID(&(raidPtr
->Layout
),
595 asmap
->raidAddress
, &which_ru
);
599 printf("[Creating RAID C degraded read DAG]\n");
602 dag_h
->creator
= "RaidCDegradedReadDAG";
603 /* alloc the Wnd nodes and the Wmir node */
604 if (asmap
->numDataFailed
== 0)
605 useMirror
= RF_FALSE
;
609 /* total number of nodes = 1 + (block + commit + terminator) */
610 RF_MallocAndAdd(nodes
, 4 * sizeof(RF_DagNode_t
), (RF_DagNode_t
*), allocList
);
614 blockNode
= &nodes
[i
];
616 commitNode
= &nodes
[i
];
618 termNode
= &nodes
[i
];
622 * This dag can not commit until the commit node is reached.
623 * Errors prior to the commit point imply the dag has failed
624 * and must be retried.
626 dag_h
->numCommitNodes
= 1;
627 dag_h
->numCommits
= 0;
628 dag_h
->numSuccedents
= 1;
630 /* initialize the block, commit, and terminator nodes */
631 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
632 NULL
, 1, 0, 0, 0, dag_h
, "Nil", allocList
);
633 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
634 NULL
, 1, 1, 0, 0, dag_h
, "Cmt", allocList
);
635 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
, rf_TerminateUndoFunc
,
636 NULL
, 0, 1, 0, 0, dag_h
, "Trm", allocList
);
638 pda
= asmap
->physInfo
;
639 RF_ASSERT(pda
!= NULL
);
640 /* parityInfo must describe entire parity unit */
641 RF_ASSERT(asmap
->parityInfo
->next
== NULL
);
643 /* initialize the data node */
645 rf_InitNode(rdNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
646 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
, "Rpd", allocList
);
647 if (shiftable
&& rf_compute_workload_shift(raidPtr
, pda
)) {
648 /* shift this read to the next disk in line */
649 rdNode
->params
[0].p
= asmap
->parityInfo
;
650 rdNode
->params
[1].p
= pda
->bufPtr
;
651 rdNode
->params
[2].v
= parityStripeID
;
652 rdNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
654 /* read primary copy */
655 rdNode
->params
[0].p
= pda
;
656 rdNode
->params
[1].p
= pda
->bufPtr
;
657 rdNode
->params
[2].v
= parityStripeID
;
658 rdNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
661 /* read secondary copy of data */
662 rf_InitNode(rdNode
, rf_wait
, RF_FALSE
, rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
663 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
, "Rsd", allocList
);
664 rdNode
->params
[0].p
= asmap
->parityInfo
;
665 rdNode
->params
[1].p
= pda
->bufPtr
;
666 rdNode
->params
[2].v
= parityStripeID
;
667 rdNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
670 /* connect header to block node */
671 RF_ASSERT(dag_h
->numSuccedents
== 1);
672 RF_ASSERT(blockNode
->numAntecedents
== 0);
673 dag_h
->succedents
[0] = blockNode
;
675 /* connect block node to rdnode */
676 RF_ASSERT(blockNode
->numSuccedents
== 1);
677 RF_ASSERT(rdNode
->numAntecedents
== 1);
678 blockNode
->succedents
[0] = rdNode
;
679 rdNode
->antecedents
[0] = blockNode
;
680 rdNode
->antType
[0] = rf_control
;
682 /* connect rdnode to commit node */
683 RF_ASSERT(rdNode
->numSuccedents
== 1);
684 RF_ASSERT(commitNode
->numAntecedents
== 1);
685 rdNode
->succedents
[0] = commitNode
;
686 commitNode
->antecedents
[0] = rdNode
;
687 commitNode
->antType
[0] = rf_control
;
689 /* connect commit node to terminator */
690 RF_ASSERT(commitNode
->numSuccedents
== 1);
691 RF_ASSERT(termNode
->numAntecedents
== 1);
692 RF_ASSERT(termNode
->numSuccedents
== 0);
693 commitNode
->succedents
[0] = termNode
;
694 termNode
->antecedents
[0] = commitNode
;
695 termNode
->antType
[0] = rf_control
;
697 #endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */
699 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
701 * XXX move this elsewhere?
704 rf_DD_GenerateFailedAccessASMs(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
705 RF_PhysDiskAddr_t
**pdap
, int *nNodep
,
706 RF_PhysDiskAddr_t
**pqpdap
, int *nPQNodep
,
707 RF_AllocListElem_t
*allocList
)
709 RF_RaidLayout_t
*layoutPtr
= &(raidPtr
->Layout
);
711 RF_SectorCount_t secPerSU
= layoutPtr
->sectorsPerStripeUnit
;
712 int numDataCol
= layoutPtr
->numDataCol
;
714 RF_SectorNum_t suoff
, suend
;
715 unsigned firstDataCol
, napdas
, count
;
716 RF_SectorNum_t fone_start
, fone_end
, ftwo_start
= 0, ftwo_end
= 0;
717 RF_PhysDiskAddr_t
*fone
= asmap
->failedPDAs
[0], *ftwo
= asmap
->failedPDAs
[1];
718 RF_PhysDiskAddr_t
*pda_p
;
719 RF_PhysDiskAddr_t
*phys_p
;
720 RF_RaidAddr_t sosAddr
;
722 /* determine how many pda's we will have to generate per unaccess
723 * stripe. If there is only one failed data unit, it is one; if two,
724 * possibly two, depending wether they overlap. */
726 fone_start
= rf_StripeUnitOffset(layoutPtr
, fone
->startSector
);
727 fone_end
= fone_start
+ fone
->numSector
;
729 #define CONS_PDA(if,start,num) \
730 pda_p->col = asmap->if->col; \
731 pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
732 pda_p->numSector = num; \
733 pda_p->next = NULL; \
734 RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
736 if (asmap
->numDataFailed
== 1) {
739 RF_MallocAndAdd(*pqpdap
, 2 * sizeof(RF_PhysDiskAddr_t
), (RF_PhysDiskAddr_t
*), allocList
);
742 CONS_PDA(parityInfo
, fone_start
, fone
->numSector
);
743 pda_p
->type
= RF_PDA_TYPE_PARITY
;
746 CONS_PDA(qInfo
, fone_start
, fone
->numSector
);
747 pda_p
->type
= RF_PDA_TYPE_Q
;
749 ftwo_start
= rf_StripeUnitOffset(layoutPtr
, ftwo
->startSector
);
750 ftwo_end
= ftwo_start
+ ftwo
->numSector
;
751 if (fone
->numSector
+ ftwo
->numSector
> secPerSU
) {
754 RF_MallocAndAdd(*pqpdap
, 2 * sizeof(RF_PhysDiskAddr_t
), (RF_PhysDiskAddr_t
*), allocList
);
756 CONS_PDA(parityInfo
, 0, secPerSU
);
757 pda_p
->type
= RF_PDA_TYPE_PARITY
;
759 CONS_PDA(qInfo
, 0, secPerSU
);
760 pda_p
->type
= RF_PDA_TYPE_Q
;
764 /* four of them, fone, then ftwo */
765 RF_MallocAndAdd(*pqpdap
, 4 * sizeof(RF_PhysDiskAddr_t
), (RF_PhysDiskAddr_t
*), allocList
);
767 CONS_PDA(parityInfo
, fone_start
, fone
->numSector
);
768 pda_p
->type
= RF_PDA_TYPE_PARITY
;
770 CONS_PDA(qInfo
, fone_start
, fone
->numSector
);
771 pda_p
->type
= RF_PDA_TYPE_Q
;
773 CONS_PDA(parityInfo
, ftwo_start
, ftwo
->numSector
);
774 pda_p
->type
= RF_PDA_TYPE_PARITY
;
776 CONS_PDA(qInfo
, ftwo_start
, ftwo
->numSector
);
777 pda_p
->type
= RF_PDA_TYPE_Q
;
780 /* figure out number of nonaccessed pda */
781 napdas
= PDAPerDisk
* (numDataCol
- asmap
->numStripeUnitsAccessed
- (ftwo
== NULL
? 1 : 0));
782 *nPQNodep
= PDAPerDisk
;
784 /* sweep over the over accessed pda's, figuring out the number of
785 * additional pda's to generate. Of course, skip the failed ones */
788 for (pda_p
= asmap
->physInfo
; pda_p
; pda_p
= pda_p
->next
) {
789 if ((pda_p
== fone
) || (pda_p
== ftwo
))
791 suoff
= rf_StripeUnitOffset(layoutPtr
, pda_p
->startSector
);
792 suend
= suoff
+ pda_p
->numSector
;
794 case 1: /* one failed PDA to overlap */
795 /* if a PDA doesn't contain the failed unit, it can
796 * only miss the start or end, not both */
797 if ((suoff
> fone_start
) || (suend
< fone_end
))
800 case 2: /* whole stripe */
801 if (suoff
) /* leak at begining */
803 if (suend
< numDataCol
) /* leak at end */
806 case 3: /* two disjoint units */
807 if ((suoff
> fone_start
) || (suend
< fone_end
))
809 if ((suoff
> ftwo_start
) || (suend
< ftwo_end
))
820 return; /* short circuit */
822 /* allocate up our list of pda's */
824 RF_MallocAndAdd(pda_p
, napdas
* sizeof(RF_PhysDiskAddr_t
),
825 (RF_PhysDiskAddr_t
*), allocList
);
828 /* linkem together */
829 for (i
= 0; i
< (napdas
- 1); i
++)
830 pda_p
[i
].next
= pda_p
+ (i
+ 1);
832 /* march through the one's up to the first accessed disk */
833 firstDataCol
= rf_RaidAddressToStripeUnitID(&(raidPtr
->Layout
), asmap
->physInfo
->raidAddress
) % numDataCol
;
834 sosAddr
= rf_RaidAddressOfPrevStripeBoundary(layoutPtr
, asmap
->raidAddress
);
835 for (i
= 0; i
< firstDataCol
; i
++) {
836 if ((pda_p
- (*pdap
)) == napdas
)
838 pda_p
->type
= RF_PDA_TYPE_DATA
;
839 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
);
840 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
841 /* skip over dead disks */
842 if (RF_DEAD_DISK(raidPtr
->Disks
[pda_p
->col
].status
))
846 pda_p
->numSector
= fone
->numSector
;
847 pda_p
->raidAddress
+= fone_start
;
848 pda_p
->startSector
+= fone_start
;
849 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
851 case 2: /* full stripe */
852 pda_p
->numSector
= secPerSU
;
853 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, secPerSU
), (char *), allocList
);
855 case 3: /* two slabs */
856 pda_p
->numSector
= fone
->numSector
;
857 pda_p
->raidAddress
+= fone_start
;
858 pda_p
->startSector
+= fone_start
;
859 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
861 pda_p
->type
= RF_PDA_TYPE_DATA
;
862 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
);
863 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
864 pda_p
->numSector
= ftwo
->numSector
;
865 pda_p
->raidAddress
+= ftwo_start
;
866 pda_p
->startSector
+= ftwo_start
;
867 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
875 /* march through the touched stripe units */
876 for (phys_p
= asmap
->physInfo
; phys_p
; phys_p
= phys_p
->next
, i
++) {
877 if ((phys_p
== asmap
->failedPDAs
[0]) || (phys_p
== asmap
->failedPDAs
[1]))
879 suoff
= rf_StripeUnitOffset(layoutPtr
, phys_p
->startSector
);
880 suend
= suoff
+ phys_p
->numSector
;
882 case 1: /* single buffer */
883 if (suoff
> fone_start
) {
884 RF_ASSERT(suend
>= fone_end
);
885 /* The data read starts after the mapped
886 * access, snip off the begining */
887 pda_p
->numSector
= suoff
- fone_start
;
888 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
) + fone_start
;
889 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
890 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
893 if (suend
< fone_end
) {
894 RF_ASSERT(suoff
<= fone_start
);
895 /* The data read stops before the end of the
896 * failed access, extend */
897 pda_p
->numSector
= fone_end
- suend
;
898 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
) + suend
; /* off by one? */
899 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
900 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
904 case 2: /* whole stripe unit */
905 RF_ASSERT((suoff
== 0) || (suend
== secPerSU
));
906 if (suend
< secPerSU
) { /* short read, snip from end
908 pda_p
->numSector
= secPerSU
- suend
;
909 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
) + suend
; /* off by one? */
910 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
911 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
914 if (suoff
> 0) { /* short at front */
915 pda_p
->numSector
= suoff
;
916 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
);
917 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
918 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
922 case 3: /* two nonoverlapping failures */
923 if ((suoff
> fone_start
) || (suend
< fone_end
)) {
924 if (suoff
> fone_start
) {
925 RF_ASSERT(suend
>= fone_end
);
926 /* The data read starts after the
927 * mapped access, snip off the
929 pda_p
->numSector
= suoff
- fone_start
;
930 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
) + fone_start
;
931 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
932 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
935 if (suend
< fone_end
) {
936 RF_ASSERT(suoff
<= fone_start
);
937 /* The data read stops before the end
938 * of the failed access, extend */
939 pda_p
->numSector
= fone_end
- suend
;
940 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
) + suend
; /* off by one? */
941 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
942 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
946 if ((suoff
> ftwo_start
) || (suend
< ftwo_end
)) {
947 if (suoff
> ftwo_start
) {
948 RF_ASSERT(suend
>= ftwo_end
);
949 /* The data read starts after the
950 * mapped access, snip off the
952 pda_p
->numSector
= suoff
- ftwo_start
;
953 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
) + ftwo_start
;
954 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
955 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
958 if (suend
< ftwo_end
) {
959 RF_ASSERT(suoff
<= ftwo_start
);
960 /* The data read stops before the end
961 * of the failed access, extend */
962 pda_p
->numSector
= ftwo_end
- suend
;
963 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
) + suend
; /* off by one? */
964 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
965 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
975 /* after the last accessed disk */
976 for (; i
< numDataCol
; i
++) {
977 if ((pda_p
- (*pdap
)) == napdas
)
979 pda_p
->type
= RF_PDA_TYPE_DATA
;
980 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
);
981 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
982 /* skip over dead disks */
983 if (RF_DEAD_DISK(raidPtr
->Disks
[pda_p
->col
].status
))
987 pda_p
->numSector
= fone
->numSector
;
988 pda_p
->raidAddress
+= fone_start
;
989 pda_p
->startSector
+= fone_start
;
990 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
992 case 2: /* full stripe */
993 pda_p
->numSector
= secPerSU
;
994 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, secPerSU
), (char *), allocList
);
996 case 3: /* two slabs */
997 pda_p
->numSector
= fone
->numSector
;
998 pda_p
->raidAddress
+= fone_start
;
999 pda_p
->startSector
+= fone_start
;
1000 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
1002 pda_p
->type
= RF_PDA_TYPE_DATA
;
1003 pda_p
->raidAddress
= sosAddr
+ (i
* secPerSU
);
1004 (raidPtr
->Layout
.map
->MapSector
) (raidPtr
, pda_p
->raidAddress
, &(pda_p
->col
), &(pda_p
->startSector
), 0);
1005 pda_p
->numSector
= ftwo
->numSector
;
1006 pda_p
->raidAddress
+= ftwo_start
;
1007 pda_p
->startSector
+= ftwo_start
;
1008 RF_MallocAndAdd(pda_p
->bufPtr
, rf_RaidAddressToByte(raidPtr
, pda_p
->numSector
), (char *), allocList
);
1016 RF_ASSERT(pda_p
- *pdap
== napdas
);
/*
 * INIT_DISK_NODE(node, name): initialise one disk-read DAG node and wire it
 * into the graph.
 *
 *   node - pointer to the node to set up (it is dereferenced with ->).
 *   name - name string handed to rf_InitNode.
 *
 * The expansion calls rf_InitNode with rf_DiskReadFunc, rf_DiskReadUndoFunc
 * and rf_GenericWakeupFunc and the counts 2,1,4,0, then stores:
 *   succedents[0]  = unblockNode
 *   succedents[1]  = recoveryNode
 *   antecedents[0] = blockNode, with antType[0] = rf_control
 *
 * The macro is not hygienic: it picks up dag_h, allocList, blockNode,
 * unblockNode and recoveryNode from the scope where it is expanded.
 *
 * It expands to several statements that are not wrapped in a do/while(0),
 * and the last one has no terminating semicolon.  Use it only as a full
 * statement followed by ';', never as the unbraced body of an if/else/loop.
 */
1019 #define INIT_DISK_NODE(node,name) \
1020 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
1021 (node)->succedents[0] = unblockNode; \
1022 (node)->succedents[1] = recoveryNode; \
1023 (node)->antecedents[0] = blockNode; \
1024 (node)->antType[0] = rf_control
/*
 * DISK_NODE_PARAMS(_node_, _p_): fill in the four params of a disk-read
 * node.
 *
 *   _node_ - the node object itself, not a pointer (it is accessed with '.').
 *   _p_    - pointer to the physical disk address for this read; it must
 *            have a bufPtr member.
 *
 * Values stored:
 *   params[0].p = _p_
 *   params[1].p = _p_->bufPtr
 *   params[2].v = parityStripeID
 *   params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)
 *
 * parityStripeID and which_ru are taken from the scope where the macro is
 * expanded.  _p_ is not parenthesised in the first assignment, so pass a
 * simple expression.  Like INIT_DISK_NODE, this expands to several
 * statements with no trailing semicolon and no do/while(0) wrapper.
 */
1026 #define DISK_NODE_PARAMS(_node_,_p_) \
1027 (_node_).params[0].p = _p_ ; \
1028 (_node_).params[1].p = (_p_)->bufPtr; \
1029 (_node_).params[2].v = parityStripeID; \
1030 (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)
/*
 * rf_DoubleDegRead: fill in dag_h with the "DoubleDegRead" DAG for the
 * access described by asmap, where asmap->failedPDAs[0] (and, when
 * asmap->numDataFailed is 2, asmap->failedPDAs[1]) name the failed units.
 *
 * Graph wiring done by the code visible here:
 *   - dag_h has one succedent, the block node ("Nil").
 *   - the block node's succedents are the nReadNodes disk-read nodes.  The
 *     loops address them as rudNodes + i for i in [0, nReadNodes), i.e. the
 *     Rud, Rrd, Rp and Rq nodes are treated as one contiguous run that
 *     begins at rudNodes.
 *   - each read node is set up with INIT_DISK_NODE, which makes the unblock
 *     node and the recovery node its succedents.
 *   - the unblock node ("Nil") and the recovery node both have the
 *     terminate node ("Trm") as their succedent.
 *
 * Parameters:
 *   raidPtr               - array descriptor; &raidPtr->Layout is used to
 *                           compute the parity stripe ID, and raidPtr is
 *                           stored as a recovery-node param.
 *   asmap                 - access stripe map; physInfo, failedPDAs[],
 *                           numStripeUnitsAccessed, numDataFailed and
 *                           raidAddress are read, and asmap itself is
 *                           stored as a recovery-node param.
 *   dag_h                 - DAG header to fill in.
 *   bp, flags             - not referenced by the code visible here.
 *   allocList             - allocation list passed to RF_MallocAndAdd and
 *                           rf_InitNode.
 *   redundantReadNodeName - name given to the rqNodes read nodes.
 *   recoveryNodeName      - name given to the recovery node.
 *   recovFunc             - function passed to rf_InitNode for the
 *                           recovery node.
 *
 * NOTE(review): this view of the function appears to be missing lines;
 * confirm the following against the full file:
 *   - the return type and the enclosing braces are not visible;
 *   - i is never assigned before the &nodes[i] uses, yet
 *     RF_ASSERT(i == nNodes) expects it to have been advanced past every
 *     group of nodes;
 *   - npdas and pqPDAs are produced by rf_DD_GenerateFailedAccessASMs but
 *     no read of them (e.g. an assignment to pda) is visible before the
 *     Rrd and Rp/Rq nodes are built;
 *   - the Rud loop has no visible increment of i, and the statement
 *     guarded by its failed-PDA test is not visible.
 */
1033 rf_DoubleDegRead(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
1034 RF_DagHeader_t
*dag_h
, void *bp
,
1035 RF_RaidAccessFlags_t flags
,
1036 RF_AllocListElem_t
*allocList
,
1037 const char *redundantReadNodeName
,
1038 const char *recoveryNodeName
,
1039 int (*recovFunc
) (RF_DagNode_t
*))
1041 RF_RaidLayout_t
*layoutPtr
= &(raidPtr
->Layout
);
1042 RF_DagNode_t
*nodes
, *rudNodes
, *rrdNodes
, *recoveryNode
, *blockNode
,
1043 *unblockNode
, *rpNodes
, *rqNodes
, *termNode
;
1044 RF_PhysDiskAddr_t
*pda
, *pqPDAs
;
1045 RF_PhysDiskAddr_t
*npdas
;
1046 int nNodes
, nRrdNodes
, nRudNodes
, i
;
1047 RF_ReconUnitNum_t which_ru
;
1048 int nReadNodes
, nPQNodes
;
1049 RF_PhysDiskAddr_t
*failedPDA
= asmap
->failedPDAs
[0];
1050 RF_PhysDiskAddr_t
*failedPDAtwo
= asmap
->failedPDAs
[1];
/* parityStripeID and which_ru are not used directly below; they are read
 * by the DISK_NODE_PARAMS macro.  which_ru has no initialiser and is
 * passed by address here, so this call is presumably what sets it. */
1051 RF_StripeNum_t parityStripeID
= rf_RaidAddressToParityStripeID(layoutPtr
, asmap
->raidAddress
, &which_ru
);
1055 printf("[Creating Double Degraded Read DAG]\n");
/* npdas, nRrdNodes, pqPDAs and nPQNodes are passed by address; nRrdNodes
 * and nPQNodes are read immediately afterwards with no other assignment,
 * so they must be set by this call. */
1057 rf_DD_GenerateFailedAccessASMs(raidPtr
, asmap
, &npdas
, &nRrdNodes
, &pqPDAs
, &nPQNodes
, allocList
);
/* Rud nodes: one per accessed stripe unit that is not a failed data unit. */
1059 nRudNodes
= asmap
->numStripeUnitsAccessed
- (asmap
->numDataFailed
);
/* 2 * nPQNodes: each count contributes one rpNodes entry and one rqNodes
 * entry (see the nPQNodes == 2 case further down). */
1060 nReadNodes
= nRrdNodes
+ nRudNodes
+ 2 * nPQNodes
;
1061 nNodes
= 4 /* block, unblock, recovery, term */ + nReadNodes
;
/* One array holds every node of the DAG; the pointers below are carved
 * out of it. */
1063 RF_MallocAndAdd(nodes
, nNodes
* sizeof(RF_DagNode_t
), (RF_DagNode_t
*), allocList
);
/* NOTE(review): as shown, every pointer below is &nodes[i] with no visible
 * change to i in between; the statements that advance i are not in this
 * view.  The assert after the last assignment requires i == nNodes. */
1065 blockNode
= &nodes
[i
];
1067 unblockNode
= &nodes
[i
];
1069 recoveryNode
= &nodes
[i
];
1071 termNode
= &nodes
[i
];
1073 rudNodes
= &nodes
[i
];
1075 rrdNodes
= &nodes
[i
];
1077 rpNodes
= &nodes
[i
];
1079 rqNodes
= &nodes
[i
];
1081 RF_ASSERT(i
== nNodes
);
/* DAG header: execution starts at the block node. */
1083 dag_h
->numSuccedents
= 1;
1084 dag_h
->succedents
[0] = blockNode
;
1085 dag_h
->creator
= "DoubleDegRead";
1086 dag_h
->numCommits
= 0;
1087 dag_h
->numCommitNodes
= 1; /* unblock */
/* Terminate node: its two antecedents are the unblock node and the
 * recovery node, both with rf_control edges. */
1089 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
, rf_TerminateUndoFunc
, NULL
, 0, 2, 0, 0, dag_h
, "Trm", allocList
);
1090 termNode
->antecedents
[0] = unblockNode
;
1091 termNode
->antType
[0] = rf_control
;
1092 termNode
->antecedents
[1] = recoveryNode
;
1093 termNode
->antType
[1] = rf_control
;
1095 /* init the block and unblock nodes */
1096 /* The block node has all nodes except itself, unblock and recovery as
1097 * successors. Similarly for predecessors of the unblock. */
1098 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
, NULL
, nReadNodes
, 0, 0, 0, dag_h
, "Nil", allocList
);
/* The unblock node is the only node initialised with RF_TRUE as the third
 * rf_InitNode argument; dag_h->numCommitNodes above is annotated "unblock". */
1099 rf_InitNode(unblockNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
, NULL
, 1, nReadNodes
, 0, 0, dag_h
, "Nil", allocList
);
/* Hook every read node (addressed as rudNodes + i) between block and
 * unblock. */
1101 for (i
= 0; i
< nReadNodes
; i
++) {
1102 blockNode
->succedents
[i
] = rudNodes
+ i
;
1103 unblockNode
->antecedents
[i
] = rudNodes
+ i
;
1104 unblockNode
->antType
[i
] = rf_control
;
1106 unblockNode
->succedents
[0] = termNode
;
1108 /* The recovery node has all the reads as predecessors, and the term
1109 * node as successors. It gets a pda as a param from each of the read
1110 * nodes plus the raidPtr. For each failed unit it has a result pda. */
1111 rf_InitNode(recoveryNode
, rf_wait
, RF_FALSE
, recovFunc
, rf_NullNodeUndoFunc
, NULL
,
1113 nReadNodes
, /* preds */
1114 nReadNodes
+ 2, /* params */
1115 asmap
->numDataFailed
, /* results */
1116 dag_h
, recoveryNodeName
, allocList
);
1118 recoveryNode
->succedents
[0] = termNode
;
/* Every read node is an rf_trueData antecedent of the recovery node. */
1119 for (i
= 0; i
< nReadNodes
; i
++) {
1120 recoveryNode
->antecedents
[i
] = rudNodes
+ i
;
1121 recoveryNode
->antType
[i
] = rf_trueData
;
/* Read-node construction follows.  Each node is set up with INIT_DISK_NODE
 * and then DISK_NODE_PARAMS(node, pda):
 *   "Rud" nodes walk the asmap->physInfo list and test each entry against
 *         failedPDA and failedPDAtwo;
 *   "Rrd" nodes walk a pda list via pda->next for nRrdNodes entries;
 *   "Rp" and redundantReadNodeName nodes are built once, and a second
 *         time (at index 1) when nPQNodes is 2.
 * NOTE(review): the comment opened on the next line is not closed in this
 * view, and the assignments that repoint pda for the Rrd and Rp/Rq groups
 * are not visible; confirm against the full file. */
1124 /* build the read nodes, then come back and fill in recovery params
1126 pda
= asmap
->physInfo
;
1127 for (i
= 0; i
< nRudNodes
; pda
= pda
->next
) {
1128 if ((pda
== failedPDA
) || (pda
== failedPDAtwo
))
1130 INIT_DISK_NODE(rudNodes
+ i
, "Rud");
1132 DISK_NODE_PARAMS(rudNodes
[i
], pda
);
1137 for (i
= 0; i
< nRrdNodes
; i
++, pda
= pda
->next
) {
1138 INIT_DISK_NODE(rrdNodes
+ i
, "Rrd");
1140 DISK_NODE_PARAMS(rrdNodes
[i
], pda
);
1143 /* redundancy pdas */
1145 INIT_DISK_NODE(rpNodes
, "Rp");
1147 DISK_NODE_PARAMS(rpNodes
[0], pda
);
1149 INIT_DISK_NODE(rqNodes
, redundantReadNodeName
);
1151 DISK_NODE_PARAMS(rqNodes
[0], pda
);
/* Second rp/rq pair, only when nPQNodes is 2. */
1152 if (nPQNodes
== 2) {
1154 INIT_DISK_NODE(rpNodes
+ 1, "Rp");
1156 DISK_NODE_PARAMS(rpNodes
[1], pda
);
1158 INIT_DISK_NODE(rqNodes
+ 1, redundantReadNodeName
);
1160 DISK_NODE_PARAMS(rqNodes
[1], pda
);
1162 /* fill in recovery node params */
/* Params 0 .. nReadNodes-1 copy each read node's params[0], which
 * DISK_NODE_PARAMS set to that node's pda.  The loop leaves
 * i == nReadNodes, so the two i++ stores that follow fill the last two of
 * the nReadNodes + 2 param slots with raidPtr and asmap. */
1163 for (i
= 0; i
< nReadNodes
; i
++)
1164 recoveryNode
->params
[i
] = rudNodes
[i
].params
[0]; /* pda */
1165 recoveryNode
->params
[i
++].p
= (void *) raidPtr
;
1166 recoveryNode
->params
[i
++].p
= (void *) asmap
;
/* Results: failedPDA is always stored; failedPDAtwo only when two data
 * units failed. */
1167 recoveryNode
->results
[0] = failedPDA
;
1168 if (asmap
->numDataFailed
== 2)
1169 recoveryNode
->results
[1] = failedPDAtwo
;
1171 /* zero fill the target data buffers? */
1174 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */