1 /* $NetBSD: rf_dagffwr.c,v 1.32 2006/10/12 01:31:50 christos Exp $ */
3 * Copyright (c) 1995 Carnegie-Mellon University.
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 * Carnegie Mellon requests users of this software to return to
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
32 * code for creating fault-free DAGs
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.32 2006/10/12 01:31:50 christos Exp $");
39 #include <dev/raidframe/raidframevar.h>
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_dagffrd.h"
47 #include "rf_general.h"
48 #include "rf_dagffwr.h"
/******************************************************************************
 *
 * General comments on DAG creation:
 *
 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
 * commit node, usually called "Cmt."  If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * does not make changes to permanent state, the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll-forward
 * through the graph, blindly executing nodes until it reaches the end.
 * If a graph reaches the end, it is assumed to have completed successfully.
 *
 * A graph has only 1 Cmt node.
 *
 */
/******************************************************************************
 *
 * The following wrappers map the standard DAG creation interface to the
 * DAG creation routines.  Additionally, these wrappers enable experimentation
 * with new DAG structures by providing an extra level of indirection, allowing
 * the DAG creation routines to be replaced at this single point.
 */
80 rf_CreateNonRedundantWriteDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
81 RF_DagHeader_t
*dag_h
, void *bp
,
82 RF_RaidAccessFlags_t flags
,
83 RF_AllocListElem_t
*allocList
,
86 rf_CreateNonredundantDAG(raidPtr
, asmap
, dag_h
, bp
, flags
, allocList
,
91 rf_CreateRAID0WriteDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
92 RF_DagHeader_t
*dag_h
, void *bp
,
93 RF_RaidAccessFlags_t flags
,
94 RF_AllocListElem_t
*allocList
,
97 rf_CreateNonredundantDAG(raidPtr
, asmap
, dag_h
, bp
, flags
, allocList
,
102 rf_CreateSmallWriteDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
103 RF_DagHeader_t
*dag_h
, void *bp
,
104 RF_RaidAccessFlags_t flags
,
105 RF_AllocListElem_t
*allocList
)
107 /* "normal" rollaway */
108 rf_CommonCreateSmallWriteDAG(raidPtr
, asmap
, dag_h
, bp
, flags
,
109 allocList
, &rf_xorFuncs
, NULL
);
113 rf_CreateLargeWriteDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
114 RF_DagHeader_t
*dag_h
, void *bp
,
115 RF_RaidAccessFlags_t flags
,
116 RF_AllocListElem_t
*allocList
)
118 /* "normal" rollaway */
119 rf_CommonCreateLargeWriteDAG(raidPtr
, asmap
, dag_h
, bp
, flags
,
120 allocList
, 1, rf_RegularXorFunc
, RF_TRUE
);
124 /******************************************************************************
126 * DAG creation code begins here
130 /******************************************************************************
132 * creates a DAG to perform a large-write operation:
135 * H -- block- Rod - Xor - Cmt - Wnd --- T
139 * The XOR node also does the Q calculation in the P+Q architecture.
140 * All nodes are before the commit node (Cmt) are assumed to be atomic and
141 * undoable - or - they make no changes to permanent state.
143 * Rod = read old data
145 * Wnp = write new parity
146 * Wnd = write new data
147 * Wnq = write new "q"
148 * [] denotes optional segments in the graph
150 * Parameters: raidPtr - description of the physical array
151 * asmap - logical & physical addresses for this access
152 * bp - buffer ptr (holds write data)
153 * flags - general flags (e.g. disk locking)
154 * allocList - list of memory allocated in DAG creation
155 * nfaults - number of faults array can tolerate
156 * (equal to # redundancy units in stripe)
157 * redfuncs - list of redundancy generating functions
159 *****************************************************************************/
162 rf_CommonCreateLargeWriteDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
163 RF_DagHeader_t
*dag_h
, void *bp
,
164 RF_RaidAccessFlags_t flags
,
165 RF_AllocListElem_t
*allocList
,
166 int nfaults
, int (*redFunc
) (RF_DagNode_t
*),
167 int allowBufferRecycle
)
169 RF_DagNode_t
*wndNodes
, *rodNodes
, *xorNode
, *wnpNode
, *tmpNode
;
170 RF_DagNode_t
*wnqNode
, *blockNode
, *commitNode
, *termNode
;
171 int nWndNodes
, nRodNodes
, i
, nodeNum
, asmNum
;
172 RF_AccessStripeMapHeader_t
*new_asm_h
[2];
173 RF_StripeNum_t parityStripeID
;
174 char *sosBuffer
, *eosBuffer
;
175 RF_ReconUnitNum_t which_ru
;
176 RF_RaidLayout_t
*layoutPtr
;
177 RF_PhysDiskAddr_t
*pda
;
179 layoutPtr
= &(raidPtr
->Layout
);
180 parityStripeID
= rf_RaidAddressToParityStripeID(layoutPtr
,
186 printf("[Creating large-write DAG]\n");
189 dag_h
->creator
= "LargeWriteDAG";
191 dag_h
->numCommitNodes
= 1;
192 dag_h
->numCommits
= 0;
193 dag_h
->numSuccedents
= 1;
195 /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
196 nWndNodes
= asmap
->numStripeUnitsAccessed
;
198 for (i
= 0; i
< nWndNodes
; i
++) {
199 tmpNode
= rf_AllocDAGNode();
200 tmpNode
->list_next
= dag_h
->nodes
;
201 dag_h
->nodes
= tmpNode
;
203 wndNodes
= dag_h
->nodes
;
205 xorNode
= rf_AllocDAGNode();
206 xorNode
->list_next
= dag_h
->nodes
;
207 dag_h
->nodes
= xorNode
;
209 wnpNode
= rf_AllocDAGNode();
210 wnpNode
->list_next
= dag_h
->nodes
;
211 dag_h
->nodes
= wnpNode
;
213 blockNode
= rf_AllocDAGNode();
214 blockNode
->list_next
= dag_h
->nodes
;
215 dag_h
->nodes
= blockNode
;
217 commitNode
= rf_AllocDAGNode();
218 commitNode
->list_next
= dag_h
->nodes
;
219 dag_h
->nodes
= commitNode
;
221 termNode
= rf_AllocDAGNode();
222 termNode
->list_next
= dag_h
->nodes
;
223 dag_h
->nodes
= termNode
;
225 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
227 wnqNode
= rf_AllocDAGNode();
231 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
234 rf_MapUnaccessedPortionOfStripe(raidPtr
, layoutPtr
, asmap
, dag_h
,
235 new_asm_h
, &nRodNodes
, &sosBuffer
,
236 &eosBuffer
, allocList
);
238 for (i
= 0; i
< nRodNodes
; i
++) {
239 tmpNode
= rf_AllocDAGNode();
240 tmpNode
->list_next
= dag_h
->nodes
;
241 dag_h
->nodes
= tmpNode
;
243 rodNodes
= dag_h
->nodes
;
248 /* begin node initialization */
250 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
,
251 rf_NullNodeUndoFunc
, NULL
, nRodNodes
, 0, 0, 0,
252 dag_h
, "Nil", allocList
);
254 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
,
255 rf_NullNodeUndoFunc
, NULL
, 1, 0, 0, 0,
256 dag_h
, "Nil", allocList
);
259 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
,
260 rf_NullNodeUndoFunc
, NULL
, nWndNodes
+ nfaults
, 1, 0, 0,
261 dag_h
, "Cmt", allocList
);
262 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
,
263 rf_TerminateUndoFunc
, NULL
, 0, nWndNodes
+ nfaults
, 0, 0,
264 dag_h
, "Trm", allocList
);
266 /* initialize the Rod nodes */
268 for (nodeNum
= asmNum
= 0; asmNum
< 2; asmNum
++) {
269 if (new_asm_h
[asmNum
]) {
270 pda
= new_asm_h
[asmNum
]->stripeMap
->physInfo
;
272 rf_InitNode(tmpNode
, rf_wait
,
273 RF_FALSE
, rf_DiskReadFunc
,
275 rf_GenericWakeupFunc
,
278 tmpNode
->params
[0].p
= pda
;
279 tmpNode
->params
[1].p
= pda
->bufPtr
;
280 tmpNode
->params
[2].v
= parityStripeID
;
281 tmpNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
285 tmpNode
= tmpNode
->list_next
;
289 RF_ASSERT(nodeNum
== nRodNodes
);
291 /* initialize the wnd nodes */
292 pda
= asmap
->physInfo
;
294 for (i
= 0; i
< nWndNodes
; i
++) {
295 rf_InitNode(tmpNode
, rf_wait
, RF_FALSE
,
296 rf_DiskWriteFunc
, rf_DiskWriteUndoFunc
,
297 rf_GenericWakeupFunc
, 1, 1, 4, 0,
298 dag_h
, "Wnd", allocList
);
299 RF_ASSERT(pda
!= NULL
);
300 tmpNode
->params
[0].p
= pda
;
301 tmpNode
->params
[1].p
= pda
->bufPtr
;
302 tmpNode
->params
[2].v
= parityStripeID
;
303 tmpNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
305 tmpNode
= tmpNode
->list_next
;
308 /* initialize the redundancy node */
310 rf_InitNode(xorNode
, rf_wait
, RF_FALSE
, redFunc
,
311 rf_NullNodeUndoFunc
, NULL
, 1,
312 nRodNodes
, 2 * (nWndNodes
+ nRodNodes
) + 1,
313 nfaults
, dag_h
, "Xr ", allocList
);
315 rf_InitNode(xorNode
, rf_wait
, RF_FALSE
, redFunc
,
316 rf_NullNodeUndoFunc
, NULL
, 1,
317 1, 2 * (nWndNodes
+ nRodNodes
) + 1,
318 nfaults
, dag_h
, "Xr ", allocList
);
320 xorNode
->flags
|= RF_DAGNODE_FLAG_YIELD
;
322 for (i
= 0; i
< nWndNodes
; i
++) {
324 xorNode
->params
[2 * i
+ 0] = tmpNode
->params
[0];
326 xorNode
->params
[2 * i
+ 1] = tmpNode
->params
[1];
327 tmpNode
= tmpNode
->list_next
;
330 for (i
= 0; i
< nRodNodes
; i
++) {
332 xorNode
->params
[2 * (nWndNodes
+ i
) + 0] = tmpNode
->params
[0];
334 xorNode
->params
[2 * (nWndNodes
+ i
) + 1] = tmpNode
->params
[1];
335 tmpNode
= tmpNode
->list_next
;
337 /* xor node needs to get at RAID information */
338 xorNode
->params
[2 * (nWndNodes
+ nRodNodes
)].p
= raidPtr
;
341 * Look for an Rod node that reads a complete SU. If none,
342 * alloc a buffer to receive the parity info. Note that we
343 * can't use a new data buffer because it will not have gotten
344 * written when the xor occurs. */
345 if (allowBufferRecycle
) {
347 for (i
= 0; i
< nRodNodes
; i
++) {
348 if (((RF_PhysDiskAddr_t
*) tmpNode
->params
[0].p
)->numSector
== raidPtr
->Layout
.sectorsPerStripeUnit
)
350 tmpNode
= tmpNode
->list_next
;
353 if ((!allowBufferRecycle
) || (i
== nRodNodes
)) {
354 xorNode
->results
[0] = rf_AllocBuffer(raidPtr
, dag_h
, rf_RaidAddressToByte(raidPtr
, raidPtr
->Layout
.sectorsPerStripeUnit
));
356 /* this works because the only way we get here is if
357 allowBufferRecycle is true and we went through the
358 above for loop, and exited via the break before
359 i==nRodNodes was true. That means tmpNode will
360 still point to a valid node -- the one we want for
362 xorNode
->results
[0] = tmpNode
->params
[1].p
;
365 /* initialize the Wnp node */
366 rf_InitNode(wnpNode
, rf_wait
, RF_FALSE
, rf_DiskWriteFunc
,
367 rf_DiskWriteUndoFunc
, rf_GenericWakeupFunc
, 1, 1, 4, 0,
368 dag_h
, "Wnp", allocList
);
369 wnpNode
->params
[0].p
= asmap
->parityInfo
;
370 wnpNode
->params
[1].p
= xorNode
->results
[0];
371 wnpNode
->params
[2].v
= parityStripeID
;
372 wnpNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
373 /* parityInfo must describe entire parity unit */
374 RF_ASSERT(asmap
->parityInfo
->next
== NULL
);
376 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
379 * We never try to recycle a buffer for the Q calcuation
380 * in addition to the parity. This would cause two buffers
381 * to get smashed during the P and Q calculation, guaranteeing
382 * one would be wrong.
384 RF_MallocAndAdd(xorNode
->results
[1],
385 rf_RaidAddressToByte(raidPtr
, raidPtr
->Layout
.sectorsPerStripeUnit
),
386 (void *), allocList
);
387 rf_InitNode(wnqNode
, rf_wait
, RF_FALSE
, rf_DiskWriteFunc
,
388 rf_DiskWriteUndoFunc
, rf_GenericWakeupFunc
,
389 1, 1, 4, 0, dag_h
, "Wnq", allocList
);
390 wnqNode
->params
[0].p
= asmap
->qInfo
;
391 wnqNode
->params
[1].p
= xorNode
->results
[1];
392 wnqNode
->params
[2].v
= parityStripeID
;
393 wnqNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
394 /* parityInfo must describe entire parity unit */
395 RF_ASSERT(asmap
->parityInfo
->next
== NULL
);
399 * Connect nodes to form graph.
402 /* connect dag header to block node */
403 RF_ASSERT(blockNode
->numAntecedents
== 0);
404 dag_h
->succedents
[0] = blockNode
;
407 /* connect the block node to the Rod nodes */
408 RF_ASSERT(blockNode
->numSuccedents
== nRodNodes
);
409 RF_ASSERT(xorNode
->numAntecedents
== nRodNodes
);
411 for (i
= 0; i
< nRodNodes
; i
++) {
412 RF_ASSERT(tmpNode
->numAntecedents
== 1);
413 blockNode
->succedents
[i
] = tmpNode
;
414 tmpNode
->antecedents
[0] = blockNode
;
415 tmpNode
->antType
[0] = rf_control
;
417 /* connect the Rod nodes to the Xor node */
418 RF_ASSERT(tmpNode
->numSuccedents
== 1);
419 tmpNode
->succedents
[0] = xorNode
;
420 xorNode
->antecedents
[i
] = tmpNode
;
421 xorNode
->antType
[i
] = rf_trueData
;
422 tmpNode
= tmpNode
->list_next
;
425 /* connect the block node to the Xor node */
426 RF_ASSERT(blockNode
->numSuccedents
== 1);
427 RF_ASSERT(xorNode
->numAntecedents
== 1);
428 blockNode
->succedents
[0] = xorNode
;
429 xorNode
->antecedents
[0] = blockNode
;
430 xorNode
->antType
[0] = rf_control
;
433 /* connect the xor node to the commit node */
434 RF_ASSERT(xorNode
->numSuccedents
== 1);
435 RF_ASSERT(commitNode
->numAntecedents
== 1);
436 xorNode
->succedents
[0] = commitNode
;
437 commitNode
->antecedents
[0] = xorNode
;
438 commitNode
->antType
[0] = rf_control
;
440 /* connect the commit node to the write nodes */
441 RF_ASSERT(commitNode
->numSuccedents
== nWndNodes
+ nfaults
);
443 for (i
= 0; i
< nWndNodes
; i
++) {
444 RF_ASSERT(wndNodes
->numAntecedents
== 1);
445 commitNode
->succedents
[i
] = tmpNode
;
446 tmpNode
->antecedents
[0] = commitNode
;
447 tmpNode
->antType
[0] = rf_control
;
448 tmpNode
= tmpNode
->list_next
;
450 RF_ASSERT(wnpNode
->numAntecedents
== 1);
451 commitNode
->succedents
[nWndNodes
] = wnpNode
;
452 wnpNode
->antecedents
[0] = commitNode
;
453 wnpNode
->antType
[0] = rf_trueData
;
454 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
456 RF_ASSERT(wnqNode
->numAntecedents
== 1);
457 commitNode
->succedents
[nWndNodes
+ 1] = wnqNode
;
458 wnqNode
->antecedents
[0] = commitNode
;
459 wnqNode
->antType
[0] = rf_trueData
;
462 /* connect the write nodes to the term node */
463 RF_ASSERT(termNode
->numAntecedents
== nWndNodes
+ nfaults
);
464 RF_ASSERT(termNode
->numSuccedents
== 0);
466 for (i
= 0; i
< nWndNodes
; i
++) {
467 RF_ASSERT(wndNodes
->numSuccedents
== 1);
468 tmpNode
->succedents
[0] = termNode
;
469 termNode
->antecedents
[i
] = tmpNode
;
470 termNode
->antType
[i
] = rf_control
;
471 tmpNode
= tmpNode
->list_next
;
473 RF_ASSERT(wnpNode
->numSuccedents
== 1);
474 wnpNode
->succedents
[0] = termNode
;
475 termNode
->antecedents
[nWndNodes
] = wnpNode
;
476 termNode
->antType
[nWndNodes
] = rf_control
;
477 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
479 RF_ASSERT(wnqNode
->numSuccedents
== 1);
480 wnqNode
->succedents
[0] = termNode
;
481 termNode
->antecedents
[nWndNodes
+ 1] = wnqNode
;
482 termNode
->antType
[nWndNodes
+ 1] = rf_control
;
486 /******************************************************************************
488 * creates a DAG to perform a small-write operation (either raid 5 or pq),
489 * which is as follows:
491 * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
492 * \- Rod X / \----> Wnd [Und]-/
493 * [\- Rod X / \---> Wnd [Und]-/]
494 * [\- Roq -> Q / \--> Wnq [Unq]-/]
496 * Rop = read old parity
497 * Rod = read old data
500 * Und = unlock data disk
501 * Unp = unlock parity disk
502 * Unq = unlock q disk
503 * Wnp = write new parity
504 * Wnd = write new data
505 * Wnq = write new "q"
506 * [ ] denotes optional segments in the graph
508 * Parameters: raidPtr - description of the physical array
509 * asmap - logical & physical addresses for this access
510 * bp - buffer ptr (holds write data)
511 * flags - general flags (e.g. disk locking)
512 * allocList - list of memory allocated in DAG creation
513 * pfuncs - list of parity generating functions
514 * qfuncs - list of q generating functions
516 * A null qfuncs indicates single fault tolerant
517 *****************************************************************************/
520 rf_CommonCreateSmallWriteDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
521 RF_DagHeader_t
*dag_h
, void *bp
,
522 RF_RaidAccessFlags_t flags
,
523 RF_AllocListElem_t
*allocList
,
524 const RF_RedFuncs_t
*pfuncs
,
525 const RF_RedFuncs_t
*qfuncs
)
527 RF_DagNode_t
*readDataNodes
, *readParityNodes
, *readQNodes
, *termNode
;
528 RF_DagNode_t
*tmpNode
, *tmpreadDataNode
, *tmpreadParityNode
;
529 RF_DagNode_t
*xorNodes
, *qNodes
, *blockNode
, *commitNode
;
530 RF_DagNode_t
*writeDataNodes
, *writeParityNodes
, *writeQNodes
;
531 RF_DagNode_t
*tmpxorNode
, *tmpqNode
, *tmpwriteDataNode
, *tmpreadQNode
;
532 RF_DagNode_t
*tmpwriteParityNode
;
533 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
534 RF_DagNode_t
*tmpwriteQNode
;
536 int i
, j
, nNodes
, totalNumNodes
;
537 RF_ReconUnitNum_t which_ru
;
538 int (*func
) (RF_DagNode_t
*), (*undoFunc
) (RF_DagNode_t
*);
539 int (*qfunc
) (RF_DagNode_t
*);
540 int numDataNodes
, numParityNodes
;
541 RF_StripeNum_t parityStripeID
;
542 RF_PhysDiskAddr_t
*pda
;
543 const char *name
, *qname
;
546 nfaults
= qfuncs
? 2 : 1;
548 parityStripeID
= rf_RaidAddressToParityStripeID(&(raidPtr
->Layout
),
549 asmap
->raidAddress
, &which_ru
);
550 pda
= asmap
->physInfo
;
551 numDataNodes
= asmap
->numStripeUnitsAccessed
;
552 numParityNodes
= (asmap
->parityInfo
->next
) ? 2 : 1;
556 printf("[Creating small-write DAG]\n");
559 RF_ASSERT(numDataNodes
> 0);
560 dag_h
->creator
= "SmallWriteDAG";
562 dag_h
->numCommitNodes
= 1;
563 dag_h
->numCommits
= 0;
564 dag_h
->numSuccedents
= 1;
567 * DAG creation occurs in four steps:
568 * 1. count the number of nodes in the DAG
569 * 2. create the nodes
570 * 3. initialize the nodes
571 * 4. connect the nodes
575 * Step 1. compute number of nodes in the graph
578 /* number of nodes: a read and write for each data unit a
579 * redundancy computation node for each parity node (nfaults *
580 * nparity) a read and write for each parity unit a block and
581 * commit node (2) a terminate node if atomic RMW an unlock
582 * node for each data unit, redundancy unit */
583 totalNumNodes
= (2 * numDataNodes
) + (nfaults
* numParityNodes
)
584 + (nfaults
* 2 * numParityNodes
) + 3;
586 * Step 2. create the nodes
589 blockNode
= rf_AllocDAGNode();
590 blockNode
->list_next
= dag_h
->nodes
;
591 dag_h
->nodes
= blockNode
;
593 commitNode
= rf_AllocDAGNode();
594 commitNode
->list_next
= dag_h
->nodes
;
595 dag_h
->nodes
= commitNode
;
597 for (i
= 0; i
< numDataNodes
; i
++) {
598 tmpNode
= rf_AllocDAGNode();
599 tmpNode
->list_next
= dag_h
->nodes
;
600 dag_h
->nodes
= tmpNode
;
602 readDataNodes
= dag_h
->nodes
;
604 for (i
= 0; i
< numParityNodes
; i
++) {
605 tmpNode
= rf_AllocDAGNode();
606 tmpNode
->list_next
= dag_h
->nodes
;
607 dag_h
->nodes
= tmpNode
;
609 readParityNodes
= dag_h
->nodes
;
611 for (i
= 0; i
< numDataNodes
; i
++) {
612 tmpNode
= rf_AllocDAGNode();
613 tmpNode
->list_next
= dag_h
->nodes
;
614 dag_h
->nodes
= tmpNode
;
616 writeDataNodes
= dag_h
->nodes
;
618 for (i
= 0; i
< numParityNodes
; i
++) {
619 tmpNode
= rf_AllocDAGNode();
620 tmpNode
->list_next
= dag_h
->nodes
;
621 dag_h
->nodes
= tmpNode
;
623 writeParityNodes
= dag_h
->nodes
;
625 for (i
= 0; i
< numParityNodes
; i
++) {
626 tmpNode
= rf_AllocDAGNode();
627 tmpNode
->list_next
= dag_h
->nodes
;
628 dag_h
->nodes
= tmpNode
;
630 xorNodes
= dag_h
->nodes
;
632 termNode
= rf_AllocDAGNode();
633 termNode
->list_next
= dag_h
->nodes
;
634 dag_h
->nodes
= termNode
;
636 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
638 for (i
= 0; i
< numParityNodes
; i
++) {
639 tmpNode
= rf_AllocDAGNode();
640 tmpNode
->list_next
= dag_h
->nodes
;
641 dag_h
->nodes
= tmpNode
;
643 readQNodes
= dag_h
->nodes
;
645 for (i
= 0; i
< numParityNodes
; i
++) {
646 tmpNode
= rf_AllocDAGNode();
647 tmpNode
->list_next
= dag_h
->nodes
;
648 dag_h
->nodes
= tmpNode
;
650 writeQNodes
= dag_h
->nodes
;
652 for (i
= 0; i
< numParityNodes
; i
++) {
653 tmpNode
= rf_AllocDAGNode();
654 tmpNode
->list_next
= dag_h
->nodes
;
655 dag_h
->nodes
= tmpNode
;
657 qNodes
= dag_h
->nodes
;
660 readQNodes
= writeQNodes
= qNodes
= NULL
;
661 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
666 * Step 3. initialize the nodes
668 /* initialize block node (Nil) */
669 nNodes
= numDataNodes
+ (nfaults
* numParityNodes
);
670 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
,
671 rf_NullNodeUndoFunc
, NULL
, nNodes
, 0, 0, 0,
672 dag_h
, "Nil", allocList
);
674 /* initialize commit node (Cmt) */
675 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
,
676 rf_NullNodeUndoFunc
, NULL
, nNodes
,
677 (nfaults
* numParityNodes
), 0, 0, dag_h
, "Cmt", allocList
);
679 /* initialize terminate node (Trm) */
680 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
,
681 rf_TerminateUndoFunc
, NULL
, 0, nNodes
, 0, 0,
682 dag_h
, "Trm", allocList
);
684 /* initialize nodes which read old data (Rod) */
685 tmpreadDataNode
= readDataNodes
;
686 for (i
= 0; i
< numDataNodes
; i
++) {
687 rf_InitNode(tmpreadDataNode
, rf_wait
, RF_FALSE
,
688 rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
689 rf_GenericWakeupFunc
, (nfaults
* numParityNodes
),
690 1, 4, 0, dag_h
, "Rod", allocList
);
691 RF_ASSERT(pda
!= NULL
);
692 /* physical disk addr desc */
693 tmpreadDataNode
->params
[0].p
= pda
;
694 /* buffer to hold old data */
695 tmpreadDataNode
->params
[1].p
= rf_AllocBuffer(raidPtr
, dag_h
, pda
->numSector
<< raidPtr
->logBytesPerSector
);
696 tmpreadDataNode
->params
[2].v
= parityStripeID
;
697 tmpreadDataNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
700 for (j
= 0; j
< tmpreadDataNode
->numSuccedents
; j
++) {
701 tmpreadDataNode
->propList
[j
] = NULL
;
703 tmpreadDataNode
= tmpreadDataNode
->list_next
;
706 /* initialize nodes which read old parity (Rop) */
707 pda
= asmap
->parityInfo
;
709 tmpreadParityNode
= readParityNodes
;
710 for (i
= 0; i
< numParityNodes
; i
++) {
711 RF_ASSERT(pda
!= NULL
);
712 rf_InitNode(tmpreadParityNode
, rf_wait
, RF_FALSE
,
713 rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
714 rf_GenericWakeupFunc
, numParityNodes
, 1, 4, 0,
715 dag_h
, "Rop", allocList
);
716 tmpreadParityNode
->params
[0].p
= pda
;
717 /* buffer to hold old parity */
718 tmpreadParityNode
->params
[1].p
= rf_AllocBuffer(raidPtr
, dag_h
, pda
->numSector
<< raidPtr
->logBytesPerSector
);
719 tmpreadParityNode
->params
[2].v
= parityStripeID
;
720 tmpreadParityNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
723 for (j
= 0; j
< tmpreadParityNode
->numSuccedents
; j
++) {
724 tmpreadParityNode
->propList
[0] = NULL
;
726 tmpreadParityNode
= tmpreadParityNode
->list_next
;
729 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
730 /* initialize nodes which read old Q (Roq) */
733 tmpreadQNode
= readQNodes
;
734 for (i
= 0; i
< numParityNodes
; i
++) {
735 RF_ASSERT(pda
!= NULL
);
736 rf_InitNode(tmpreadQNode
, rf_wait
, RF_FALSE
,
737 rf_DiskReadFunc
, rf_DiskReadUndoFunc
,
738 rf_GenericWakeupFunc
, numParityNodes
,
739 1, 4, 0, dag_h
, "Roq", allocList
);
740 tmpreadQNode
->params
[0].p
= pda
;
741 /* buffer to hold old Q */
742 tmpreadQNode
->params
[1].p
= rf_AllocBuffer(raidPtr
, dag_h
,
743 pda
->numSector
<< raidPtr
->logBytesPerSector
);
744 tmpreadQNode
->params
[2].v
= parityStripeID
;
745 tmpreadQNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
748 for (j
= 0; j
< tmpreadQNode
->numSuccedents
; j
++) {
749 tmpreadQNode
->propList
[0] = NULL
;
751 tmpreadQNode
= tmpreadQNode
->list_next
;
755 /* initialize nodes which write new data (Wnd) */
756 pda
= asmap
->physInfo
;
757 tmpwriteDataNode
= writeDataNodes
;
758 for (i
= 0; i
< numDataNodes
; i
++) {
759 RF_ASSERT(pda
!= NULL
);
760 rf_InitNode(tmpwriteDataNode
, rf_wait
, RF_FALSE
,
761 rf_DiskWriteFunc
, rf_DiskWriteUndoFunc
,
762 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
,
764 /* physical disk addr desc */
765 tmpwriteDataNode
->params
[0].p
= pda
;
766 /* buffer holding new data to be written */
767 tmpwriteDataNode
->params
[1].p
= pda
->bufPtr
;
768 tmpwriteDataNode
->params
[2].v
= parityStripeID
;
769 tmpwriteDataNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
772 tmpwriteDataNode
= tmpwriteDataNode
->list_next
;
776 * Initialize nodes which compute new parity and Q.
779 * We use the simple XOR func in the double-XOR case, and when
780 * we're accessing only a portion of one stripe unit. The
781 * distinction between the two is that the regular XOR func
782 * assumes that the targbuf is a full SU in size, and examines
783 * the pda associated with the buffer to decide where within
784 * the buffer to XOR the data, whereas the simple XOR func
785 * just XORs the data into the start of the buffer. */
786 if ((numParityNodes
== 2) || ((numDataNodes
== 1)
787 && (asmap
->totalSectorsAccessed
<
788 raidPtr
->Layout
.sectorsPerStripeUnit
))) {
789 func
= pfuncs
->simple
;
790 undoFunc
= rf_NullNodeUndoFunc
;
791 name
= pfuncs
->SimpleName
;
793 qfunc
= qfuncs
->simple
;
794 qname
= qfuncs
->SimpleName
;
800 func
= pfuncs
->regular
;
801 undoFunc
= rf_NullNodeUndoFunc
;
802 name
= pfuncs
->RegularName
;
804 qfunc
= qfuncs
->regular
;
805 qname
= qfuncs
->RegularName
;
812 * Initialize the xor nodes: params are {pda,buf}
813 * from {Rod,Wnd,Rop} nodes, and raidPtr
815 if (numParityNodes
== 2) {
816 /* double-xor case */
817 tmpxorNode
= xorNodes
;
818 tmpreadDataNode
= readDataNodes
;
819 tmpreadParityNode
= readParityNodes
;
820 tmpwriteDataNode
= writeDataNodes
;
822 tmpreadQNode
= readQNodes
;
823 for (i
= 0; i
< numParityNodes
; i
++) {
824 /* note: no wakeup func for xor */
825 rf_InitNode(tmpxorNode
, rf_wait
, RF_FALSE
, func
,
827 (numDataNodes
+ numParityNodes
),
828 7, 1, dag_h
, name
, allocList
);
829 tmpxorNode
->flags
|= RF_DAGNODE_FLAG_YIELD
;
830 tmpxorNode
->params
[0] = tmpreadDataNode
->params
[0];
831 tmpxorNode
->params
[1] = tmpreadDataNode
->params
[1];
832 tmpxorNode
->params
[2] = tmpreadParityNode
->params
[0];
833 tmpxorNode
->params
[3] = tmpreadParityNode
->params
[1];
834 tmpxorNode
->params
[4] = tmpwriteDataNode
->params
[0];
835 tmpxorNode
->params
[5] = tmpwriteDataNode
->params
[1];
836 tmpxorNode
->params
[6].p
= raidPtr
;
837 /* use old parity buf as target buf */
838 tmpxorNode
->results
[0] = tmpreadParityNode
->params
[1].p
;
839 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
841 /* note: no wakeup func for qor */
842 rf_InitNode(tmpqNode
, rf_wait
, RF_FALSE
,
843 qfunc
, undoFunc
, NULL
, 1,
844 (numDataNodes
+ numParityNodes
),
845 7, 1, dag_h
, qname
, allocList
);
846 tmpqNode
->params
[0] = tmpreadDataNode
->params
[0];
847 tmpqNode
->params
[1] = tmpreadDataNode
->params
[1];
848 tmpqNode
->params
[2] = tmpreadQNode
->.params
[0];
849 tmpqNode
->params
[3] = tmpreadQNode
->params
[1];
850 tmpqNode
->params
[4] = tmpwriteDataNode
->params
[0];
851 tmpqNode
->params
[5] = tmpwriteDataNode
->params
[1];
852 tmpqNode
->params
[6].p
= raidPtr
;
853 /* use old Q buf as target buf */
854 tmpqNode
->results
[0] = tmpreadQNode
->params
[1].p
;
855 tmpqNode
= tmpqNode
->list_next
;
856 tmpreadQNodes
= tmpreadQNodes
->list_next
;
859 tmpxorNode
= tmpxorNode
->list_next
;
860 tmpreadDataNode
= tmpreadDataNode
->list_next
;
861 tmpreadParityNode
= tmpreadParityNode
->list_next
;
862 tmpwriteDataNode
= tmpwriteDataNode
->list_next
;
865 /* there is only one xor node in this case */
866 rf_InitNode(xorNodes
, rf_wait
, RF_FALSE
, func
,
867 undoFunc
, NULL
, 1, (numDataNodes
+ numParityNodes
),
868 (2 * (numDataNodes
+ numDataNodes
+ 1) + 1), 1,
869 dag_h
, name
, allocList
);
870 xorNodes
->flags
|= RF_DAGNODE_FLAG_YIELD
;
871 tmpreadDataNode
= readDataNodes
;
872 for (i
= 0; i
< numDataNodes
; i
++) { /* used to be"numDataNodes + 1" until we factored
873 out the "+1" into the "deal with Rop separately below */
874 /* set up params related to Rod nodes */
875 xorNodes
->params
[2 * i
+ 0] = tmpreadDataNode
->params
[0]; /* pda */
876 xorNodes
->params
[2 * i
+ 1] = tmpreadDataNode
->params
[1]; /* buffer ptr */
877 tmpreadDataNode
= tmpreadDataNode
->list_next
;
879 /* deal with Rop separately */
880 xorNodes
->params
[2 * numDataNodes
+ 0] = readParityNodes
->params
[0]; /* pda */
881 xorNodes
->params
[2 * numDataNodes
+ 1] = readParityNodes
->params
[1]; /* buffer ptr */
883 tmpwriteDataNode
= writeDataNodes
;
884 for (i
= 0; i
< numDataNodes
; i
++) {
885 /* set up params related to Wnd and Wnp nodes */
886 xorNodes
->params
[2 * (numDataNodes
+ 1 + i
) + 0] = /* pda */
887 tmpwriteDataNode
->params
[0];
888 xorNodes
->params
[2 * (numDataNodes
+ 1 + i
) + 1] = /* buffer ptr */
889 tmpwriteDataNode
->params
[1];
890 tmpwriteDataNode
= tmpwriteDataNode
->list_next
;
892 /* xor node needs to get at RAID information */
893 xorNodes
->params
[2 * (numDataNodes
+ numDataNodes
+ 1)].p
= raidPtr
;
894 xorNodes
->results
[0] = readParityNodes
->params
[1].p
;
895 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
897 rf_InitNode(qNodes
, rf_wait
, RF_FALSE
, qfunc
,
899 (numDataNodes
+ numParityNodes
),
900 (2 * (numDataNodes
+ numDataNodes
+ 1) + 1), 1,
901 dag_h
, qname
, allocList
);
902 tmpreadDataNode
= readDataNodes
;
903 for (i
= 0; i
< numDataNodes
; i
++) {
904 /* set up params related to Rod */
905 qNodes
->params
[2 * i
+ 0] = tmpreadDataNode
->params
[0]; /* pda */
906 qNodes
->params
[2 * i
+ 1] = tmpreadDataNode
->params
[1]; /* buffer ptr */
907 tmpreadDataNode
= tmpreadDataNode
->list_next
;
910 qNodes
->params
[2 * numDataNodes
+ 0] = /* pda */
911 readQNodes
->params
[0];
912 qNodes
->params
[2 * numDataNodes
+ 1] = /* buffer ptr */
913 readQNodes
->params
[1];
914 tmpwriteDataNode
= writeDataNodes
;
915 for (i
= 0; i
< numDataNodes
; i
++) {
916 /* set up params related to Wnd nodes */
917 qNodes
->params
[2 * (numDataNodes
+ 1 + i
) + 0] = /* pda */
918 tmpwriteDataNode
->params
[0];
919 qNodes
->params
[2 * (numDataNodes
+ 1 + i
) + 1] = /* buffer ptr */
920 tmpwriteDataNode
->params
[1];
921 tmpwriteDataNode
= tmpwriteDataNode
->list_next
;
923 /* xor node needs to get at RAID information */
924 qNodes
->params
[2 * (numDataNodes
+ numDataNodes
+ 1)].p
= raidPtr
;
925 qNodes
->results
[0] = readQNodes
->params
[1].p
;
930 /* initialize nodes which write new parity (Wnp) */
931 pda
= asmap
->parityInfo
;
932 tmpwriteParityNode
= writeParityNodes
;
933 tmpxorNode
= xorNodes
;
934 for (i
= 0; i
< numParityNodes
; i
++) {
935 rf_InitNode(tmpwriteParityNode
, rf_wait
, RF_FALSE
,
936 rf_DiskWriteFunc
, rf_DiskWriteUndoFunc
,
937 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
,
939 RF_ASSERT(pda
!= NULL
);
940 tmpwriteParityNode
->params
[0].p
= pda
; /* param 1 (bufPtr)
941 * filled in by xor node */
942 tmpwriteParityNode
->params
[1].p
= tmpxorNode
->results
[0]; /* buffer pointer for
945 tmpwriteParityNode
->params
[2].v
= parityStripeID
;
946 tmpwriteParityNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
949 tmpwriteParityNode
= tmpwriteParityNode
->list_next
;
950 tmpxorNode
= tmpxorNode
->list_next
;
953 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
954 /* initialize nodes which write new Q (Wnq) */
957 tmpwriteQNode
= writeQNodes
;
959 for (i
= 0; i
< numParityNodes
; i
++) {
960 rf_InitNode(tmpwriteQNode
, rf_wait
, RF_FALSE
,
961 rf_DiskWriteFunc
, rf_DiskWriteUndoFunc
,
962 rf_GenericWakeupFunc
, 1, 1, 4, 0, dag_h
,
964 RF_ASSERT(pda
!= NULL
);
965 tmpwriteQNode
->params
[0].p
= pda
; /* param 1 (bufPtr)
966 * filled in by xor node */
967 tmpwriteQNode
->params
[1].p
= tmpqNode
->results
[0]; /* buffer pointer for
970 tmpwriteQNode
->params
[2].v
= parityStripeID
;
971 tmpwriteQNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
,
974 tmpwriteQNode
= tmpwriteQNode
->list_next
;
975 tmpqNode
= tmpqNode
->list_next
;
980 * Step 4. connect the nodes.
983 /* connect header to block node */
984 dag_h
->succedents
[0] = blockNode
;
986 /* connect block node to read old data nodes */
987 RF_ASSERT(blockNode
->numSuccedents
== (numDataNodes
+ (numParityNodes
* nfaults
)));
988 tmpreadDataNode
= readDataNodes
;
989 for (i
= 0; i
< numDataNodes
; i
++) {
990 blockNode
->succedents
[i
] = tmpreadDataNode
;
991 RF_ASSERT(tmpreadDataNode
->numAntecedents
== 1);
992 tmpreadDataNode
->antecedents
[0] = blockNode
;
993 tmpreadDataNode
->antType
[0] = rf_control
;
994 tmpreadDataNode
= tmpreadDataNode
->list_next
;
997 /* connect block node to read old parity nodes */
998 tmpreadParityNode
= readParityNodes
;
999 for (i
= 0; i
< numParityNodes
; i
++) {
1000 blockNode
->succedents
[numDataNodes
+ i
] = tmpreadParityNode
;
1001 RF_ASSERT(tmpreadParityNode
->numAntecedents
== 1);
1002 tmpreadParityNode
->antecedents
[0] = blockNode
;
1003 tmpreadParityNode
->antType
[0] = rf_control
;
1004 tmpreadParityNode
= tmpreadParityNode
->list_next
;
1007 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1008 /* connect block node to read old Q nodes */
1010 tmpreadQNode
= readQNodes
;
1011 for (i
= 0; i
< numParityNodes
; i
++) {
1012 blockNode
->succedents
[numDataNodes
+ numParityNodes
+ i
] = tmpreadQNode
;
1013 RF_ASSERT(tmpreadQNode
->numAntecedents
== 1);
1014 tmpreadQNode
->antecedents
[0] = blockNode
;
1015 tmpreadQNode
->antType
[0] = rf_control
;
1016 tmpreadQNode
= tmpreadQNode
->list_next
;
1020 /* connect read old data nodes to xor nodes */
1021 tmpreadDataNode
= readDataNodes
;
1022 for (i
= 0; i
< numDataNodes
; i
++) {
1023 RF_ASSERT(tmpreadDataNode
->numSuccedents
== (nfaults
* numParityNodes
));
1024 tmpxorNode
= xorNodes
;
1025 for (j
= 0; j
< numParityNodes
; j
++) {
1026 RF_ASSERT(tmpxorNode
->numAntecedents
== numDataNodes
+ numParityNodes
);
1027 tmpreadDataNode
->succedents
[j
] = tmpxorNode
;
1028 tmpxorNode
->antecedents
[i
] = tmpreadDataNode
;
1029 tmpxorNode
->antType
[i
] = rf_trueData
;
1030 tmpxorNode
= tmpxorNode
->list_next
;
1032 tmpreadDataNode
= tmpreadDataNode
->list_next
;
1035 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1036 /* connect read old data nodes to q nodes */
1038 tmpreadDataNode
= readDataNodes
;
1039 for (i
= 0; i
< numDataNodes
; i
++) {
1041 for (j
= 0; j
< numParityNodes
; j
++) {
1042 RF_ASSERT(tmpqNode
->numAntecedents
== numDataNodes
+ numParityNodes
);
1043 tmpreadDataNode
->succedents
[numParityNodes
+ j
] = tmpqNode
;
1044 tmpqNode
->antecedents
[i
] = tmpreadDataNode
;
1045 tmpqNode
->antType
[i
] = rf_trueData
;
1046 tmpqNode
= tmpqNode
->list_next
;
1048 tmpreadDataNode
= tmpreadDataNode
->list_next
;
1052 /* connect read old parity nodes to xor nodes */
1053 tmpreadParityNode
= readParityNodes
;
1054 for (i
= 0; i
< numParityNodes
; i
++) {
1055 RF_ASSERT(tmpreadParityNode
->numSuccedents
== numParityNodes
);
1056 tmpxorNode
= xorNodes
;
1057 for (j
= 0; j
< numParityNodes
; j
++) {
1058 tmpreadParityNode
->succedents
[j
] = tmpxorNode
;
1059 tmpxorNode
->antecedents
[numDataNodes
+ i
] = tmpreadParityNode
;
1060 tmpxorNode
->antType
[numDataNodes
+ i
] = rf_trueData
;
1061 tmpxorNode
= tmpxorNode
->list_next
;
1063 tmpreadParityNode
= tmpreadParityNode
->list_next
;
1066 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1067 /* connect read old q nodes to q nodes */
1069 tmpreadParityNode
= readParityNodes
;
1070 tmpreadQNode
= readQNodes
;
1071 for (i
= 0; i
< numParityNodes
; i
++) {
1072 RF_ASSERT(tmpreadParityNode
->numSuccedents
== numParityNodes
);
1074 for (j
= 0; j
< numParityNodes
; j
++) {
1075 tmpreadQNode
->succedents
[j
] = tmpqNode
;
1076 tmpqNode
->antecedents
[numDataNodes
+ i
] = tmpreadQNodes
;
1077 tmpqNode
->antType
[numDataNodes
+ i
] = rf_trueData
;
1078 tmpqNode
= tmpqNode
->list_next
;
1080 tmpreadParityNode
= tmpreadParityNode
->list_next
;
1081 tmpreadQNode
= tmpreadQNode
->list_next
;
1085 /* connect xor nodes to commit node */
1086 RF_ASSERT(commitNode
->numAntecedents
== (nfaults
* numParityNodes
));
1087 tmpxorNode
= xorNodes
;
1088 for (i
= 0; i
< numParityNodes
; i
++) {
1089 RF_ASSERT(tmpxorNode
->numSuccedents
== 1);
1090 tmpxorNode
->succedents
[0] = commitNode
;
1091 commitNode
->antecedents
[i
] = tmpxorNode
;
1092 commitNode
->antType
[i
] = rf_control
;
1093 tmpxorNode
= tmpxorNode
->list_next
;
1096 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1097 /* connect q nodes to commit node */
1100 for (i
= 0; i
< numParityNodes
; i
++) {
1101 RF_ASSERT(tmpqNode
->numSuccedents
== 1);
1102 tmpqNode
->succedents
[0] = commitNode
;
1103 commitNode
->antecedents
[i
+ numParityNodes
] = tmpqNode
;
1104 commitNode
->antType
[i
+ numParityNodes
] = rf_control
;
1105 tmpqNode
= tmpqNode
->list_next
;
1109 /* connect commit node to write nodes */
1110 RF_ASSERT(commitNode
->numSuccedents
== (numDataNodes
+ (nfaults
* numParityNodes
)));
1111 tmpwriteDataNode
= writeDataNodes
;
1112 for (i
= 0; i
< numDataNodes
; i
++) {
1113 RF_ASSERT(tmpwriteDataNode
->numAntecedents
== 1);
1114 commitNode
->succedents
[i
] = tmpwriteDataNode
;
1115 tmpwriteDataNode
->antecedents
[0] = commitNode
;
1116 tmpwriteDataNode
->antType
[0] = rf_trueData
;
1117 tmpwriteDataNode
= tmpwriteDataNode
->list_next
;
1119 tmpwriteParityNode
= writeParityNodes
;
1120 for (i
= 0; i
< numParityNodes
; i
++) {
1121 RF_ASSERT(tmpwriteParityNode
->numAntecedents
== 1);
1122 commitNode
->succedents
[i
+ numDataNodes
] = tmpwriteParityNode
;
1123 tmpwriteParityNode
->antecedents
[0] = commitNode
;
1124 tmpwriteParityNode
->antType
[0] = rf_trueData
;
1125 tmpwriteParityNode
= tmpwriteParityNode
->list_next
;
1127 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1129 tmpwriteQNode
= writeQNodes
;
1130 for (i
= 0; i
< numParityNodes
; i
++) {
1131 RF_ASSERT(tmpwriteQNode
->numAntecedents
== 1);
1132 commitNode
->succedents
[i
+ numDataNodes
+ numParityNodes
] = tmpwriteQNode
;
1133 tmpwriteQNode
->antecedents
[0] = commitNode
;
1134 tmpwriteQNode
->antType
[0] = rf_trueData
;
1135 tmpwriteQNode
= tmpwriteQNode
->list_next
;
1139 RF_ASSERT(termNode
->numAntecedents
== (numDataNodes
+ (nfaults
* numParityNodes
)));
1140 RF_ASSERT(termNode
->numSuccedents
== 0);
1141 tmpwriteDataNode
= writeDataNodes
;
1142 for (i
= 0; i
< numDataNodes
; i
++) {
1143 /* connect write new data nodes to term node */
1144 RF_ASSERT(tmpwriteDataNode
->numSuccedents
== 1);
1145 RF_ASSERT(termNode
->numAntecedents
== (numDataNodes
+ (nfaults
* numParityNodes
)));
1146 tmpwriteDataNode
->succedents
[0] = termNode
;
1147 termNode
->antecedents
[i
] = tmpwriteDataNode
;
1148 termNode
->antType
[i
] = rf_control
;
1149 tmpwriteDataNode
= tmpwriteDataNode
->list_next
;
1152 tmpwriteParityNode
= writeParityNodes
;
1153 for (i
= 0; i
< numParityNodes
; i
++) {
1154 RF_ASSERT(tmpwriteParityNode
->numSuccedents
== 1);
1155 tmpwriteParityNode
->succedents
[0] = termNode
;
1156 termNode
->antecedents
[numDataNodes
+ i
] = tmpwriteParityNode
;
1157 termNode
->antType
[numDataNodes
+ i
] = rf_control
;
1158 tmpwriteParityNode
= tmpwriteParityNode
->list_next
;
1161 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1163 tmpwriteQNode
= writeQNodes
;
1164 for (i
= 0; i
< numParityNodes
; i
++) {
1165 RF_ASSERT(tmpwriteQNode
->numSuccedents
== 1);
1166 tmpwriteQNode
->succedents
[0] = termNode
;
1167 termNode
->antecedents
[numDataNodes
+ numParityNodes
+ i
] = tmpwriteQNode
;
1168 termNode
->antType
[numDataNodes
+ numParityNodes
+ i
] = rf_control
;
1169 tmpwriteQNode
= tmpwriteQNode
->list_next
;
1176 /******************************************************************************
1177 * create a write graph (fault-free or degraded) for RAID level 1
1179 * Hdr -> Commit -> Wpd -> Nil -> Trm
1182 * The "Wpd" node writes data to the primary copy in the mirror pair
1183 * The "Wsd" node writes data to the secondary copy in the mirror pair
1185 * Parameters: raidPtr - description of the physical array
1186 * asmap - logical & physical addresses for this access
1187 * bp - buffer ptr (holds write data)
1188 * flags - general flags (e.g. disk locking)
1189 * allocList - list of memory allocated in DAG creation
1190 *****************************************************************************/
1193 rf_CreateRaidOneWriteDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
1194 RF_DagHeader_t
*dag_h
, void *bp
,
1195 RF_RaidAccessFlags_t flags
,
1196 RF_AllocListElem_t
*allocList
)
1198 RF_DagNode_t
*unblockNode
, *termNode
, *commitNode
;
1199 RF_DagNode_t
*wndNode
, *wmirNode
;
1200 RF_DagNode_t
*tmpNode
, *tmpwndNode
, *tmpwmirNode
;
1201 int nWndNodes
, nWmirNodes
, i
;
1202 RF_ReconUnitNum_t which_ru
;
1203 RF_PhysDiskAddr_t
*pda
, *pdaP
;
1204 RF_StripeNum_t parityStripeID
;
1206 parityStripeID
= rf_RaidAddressToParityStripeID(&(raidPtr
->Layout
),
1207 asmap
->raidAddress
, &which_ru
);
1210 printf("[Creating RAID level 1 write DAG]\n");
1213 dag_h
->creator
= "RaidOneWriteDAG";
1215 /* 2 implies access not SU aligned */
1216 nWmirNodes
= (asmap
->parityInfo
->next
) ? 2 : 1;
1217 nWndNodes
= (asmap
->physInfo
->next
) ? 2 : 1;
1219 /* alloc the Wnd nodes and the Wmir node */
1220 if (asmap
->numDataFailed
== 1)
1222 if (asmap
->numParityFailed
== 1)
1225 /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
1227 for (i
= 0; i
< nWndNodes
; i
++) {
1228 tmpNode
= rf_AllocDAGNode();
1229 tmpNode
->list_next
= dag_h
->nodes
;
1230 dag_h
->nodes
= tmpNode
;
1232 wndNode
= dag_h
->nodes
;
1234 for (i
= 0; i
< nWmirNodes
; i
++) {
1235 tmpNode
= rf_AllocDAGNode();
1236 tmpNode
->list_next
= dag_h
->nodes
;
1237 dag_h
->nodes
= tmpNode
;
1239 wmirNode
= dag_h
->nodes
;
1241 commitNode
= rf_AllocDAGNode();
1242 commitNode
->list_next
= dag_h
->nodes
;
1243 dag_h
->nodes
= commitNode
;
1245 unblockNode
= rf_AllocDAGNode();
1246 unblockNode
->list_next
= dag_h
->nodes
;
1247 dag_h
->nodes
= unblockNode
;
1249 termNode
= rf_AllocDAGNode();
1250 termNode
->list_next
= dag_h
->nodes
;
1251 dag_h
->nodes
= termNode
;
1253 /* this dag can commit immediately */
1254 dag_h
->numCommitNodes
= 1;
1255 dag_h
->numCommits
= 0;
1256 dag_h
->numSuccedents
= 1;
1258 /* initialize the commit, unblock, and term nodes */
1259 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
,
1260 rf_NullNodeUndoFunc
, NULL
, (nWndNodes
+ nWmirNodes
),
1261 0, 0, 0, dag_h
, "Cmt", allocList
);
1262 rf_InitNode(unblockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
,
1263 rf_NullNodeUndoFunc
, NULL
, 1, (nWndNodes
+ nWmirNodes
),
1264 0, 0, dag_h
, "Nil", allocList
);
1265 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
,
1266 rf_TerminateUndoFunc
, NULL
, 0, 1, 0, 0,
1267 dag_h
, "Trm", allocList
);
1269 /* initialize the wnd nodes */
1270 if (nWndNodes
> 0) {
1271 pda
= asmap
->physInfo
;
1272 tmpwndNode
= wndNode
;
1273 for (i
= 0; i
< nWndNodes
; i
++) {
1274 rf_InitNode(tmpwndNode
, rf_wait
, RF_FALSE
,
1275 rf_DiskWriteFunc
, rf_DiskWriteUndoFunc
,
1276 rf_GenericWakeupFunc
, 1, 1, 4, 0,
1277 dag_h
, "Wpd", allocList
);
1278 RF_ASSERT(pda
!= NULL
);
1279 tmpwndNode
->params
[0].p
= pda
;
1280 tmpwndNode
->params
[1].p
= pda
->bufPtr
;
1281 tmpwndNode
->params
[2].v
= parityStripeID
;
1282 tmpwndNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
1284 tmpwndNode
= tmpwndNode
->list_next
;
1286 RF_ASSERT(pda
== NULL
);
1288 /* initialize the mirror nodes */
1289 if (nWmirNodes
> 0) {
1290 pda
= asmap
->physInfo
;
1291 pdaP
= asmap
->parityInfo
;
1292 tmpwmirNode
= wmirNode
;
1293 for (i
= 0; i
< nWmirNodes
; i
++) {
1294 rf_InitNode(tmpwmirNode
, rf_wait
, RF_FALSE
,
1295 rf_DiskWriteFunc
, rf_DiskWriteUndoFunc
,
1296 rf_GenericWakeupFunc
, 1, 1, 4, 0,
1297 dag_h
, "Wsd", allocList
);
1298 RF_ASSERT(pda
!= NULL
);
1299 tmpwmirNode
->params
[0].p
= pdaP
;
1300 tmpwmirNode
->params
[1].p
= pda
->bufPtr
;
1301 tmpwmirNode
->params
[2].v
= parityStripeID
;
1302 tmpwmirNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, which_ru
);
1305 tmpwmirNode
= tmpwmirNode
->list_next
;
1307 RF_ASSERT(pda
== NULL
);
1308 RF_ASSERT(pdaP
== NULL
);
1310 /* link the header node to the commit node */
1311 RF_ASSERT(dag_h
->numSuccedents
== 1);
1312 RF_ASSERT(commitNode
->numAntecedents
== 0);
1313 dag_h
->succedents
[0] = commitNode
;
1315 /* link the commit node to the write nodes */
1316 RF_ASSERT(commitNode
->numSuccedents
== (nWndNodes
+ nWmirNodes
));
1317 tmpwndNode
= wndNode
;
1318 for (i
= 0; i
< nWndNodes
; i
++) {
1319 RF_ASSERT(tmpwndNode
->numAntecedents
== 1);
1320 commitNode
->succedents
[i
] = tmpwndNode
;
1321 tmpwndNode
->antecedents
[0] = commitNode
;
1322 tmpwndNode
->antType
[0] = rf_control
;
1323 tmpwndNode
= tmpwndNode
->list_next
;
1325 tmpwmirNode
= wmirNode
;
1326 for (i
= 0; i
< nWmirNodes
; i
++) {
1327 RF_ASSERT(tmpwmirNode
->numAntecedents
== 1);
1328 commitNode
->succedents
[i
+ nWndNodes
] = tmpwmirNode
;
1329 tmpwmirNode
->antecedents
[0] = commitNode
;
1330 tmpwmirNode
->antType
[0] = rf_control
;
1331 tmpwmirNode
= tmpwmirNode
->list_next
;
1334 /* link the write nodes to the unblock node */
1335 RF_ASSERT(unblockNode
->numAntecedents
== (nWndNodes
+ nWmirNodes
));
1336 tmpwndNode
= wndNode
;
1337 for (i
= 0; i
< nWndNodes
; i
++) {
1338 RF_ASSERT(tmpwndNode
->numSuccedents
== 1);
1339 tmpwndNode
->succedents
[0] = unblockNode
;
1340 unblockNode
->antecedents
[i
] = tmpwndNode
;
1341 unblockNode
->antType
[i
] = rf_control
;
1342 tmpwndNode
= tmpwndNode
->list_next
;
1344 tmpwmirNode
= wmirNode
;
1345 for (i
= 0; i
< nWmirNodes
; i
++) {
1346 RF_ASSERT(tmpwmirNode
->numSuccedents
== 1);
1347 tmpwmirNode
->succedents
[0] = unblockNode
;
1348 unblockNode
->antecedents
[i
+ nWndNodes
] = tmpwmirNode
;
1349 unblockNode
->antType
[i
+ nWndNodes
] = rf_control
;
1350 tmpwmirNode
= tmpwmirNode
->list_next
;
1353 /* link the unblock node to the term node */
1354 RF_ASSERT(unblockNode
->numSuccedents
== 1);
1355 RF_ASSERT(termNode
->numAntecedents
== 1);
1356 RF_ASSERT(termNode
->numSuccedents
== 0);
1357 unblockNode
->succedents
[0] = termNode
;
1358 termNode
->antecedents
[0] = unblockNode
;
1359 termNode
->antType
[0] = rf_control
;