/* $NetBSD: rf_dagdegwr.c,v 1.29 2006/10/12 01:31:50 christos Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * code for creating degraded write DAGs
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.29 2006/10/12 01:31:50 christos Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_archs.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_general.h"
#include "rf_dagdegwr.h"
#include "rf_map.h"
/******************************************************************************
 *
 * General comments on DAG creation:
 *
 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
 * commit node, usually called "Cmt."  If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * makes no changes to permanent state, the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll
 * forward through the graph, blindly executing nodes until it reaches the
 * end.  If a graph reaches the end, it is assumed to have completed
 * successfully.
 *
 * A graph has only 1 Cmt node.
 *
 */
/******************************************************************************
 *
 * The following wrappers map the standard DAG creation interface to the
 * DAG creation routines.  Additionally, these wrappers enable experimentation
 * with new DAG structures by providing an extra level of indirection, allowing
 * the DAG creation routines to be replaced at this single point.
 */
RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
{
    rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
        flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
}
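/*
 * (The trailing arguments passed to the common routine above correspond to
 * nfaults == 1, the recovery XOR function used for the "Xrc" node, and
 * allowBufferRecycle == RF_TRUE; see the parameter list of
 * rf_CommonCreateSimpleDegradedWriteDAG below.)
 */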
void
rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
                          RF_DagHeader_t *dag_h, void *bp,
                          RF_RaidAccessFlags_t flags,
                          RF_AllocListElem_t *allocList)
{

    RF_ASSERT(asmap->numDataFailed == 1);
    dag_h->creator = "DegradedWriteDAG";

    /*
     * If the access writes only a portion of the failed unit, and also
     * writes some portion of at least one surviving unit, we create two
     * DAGs, one for the failed component and one for the non-failed
     * component, and do them sequentially.  Note that the fact that we're
     * accessing only a portion of the failed unit indicates that the
     * access either starts or ends in the failed unit, and hence we need
     * to create only two dags.  This is inefficient in that the same data
     * or parity can get read and written twice using this structure.  I
     * need to fix this to do the access all at once.
     */
    RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
        asmap->failedPDAs[0]->numSector !=
        raidPtr->Layout.sectorsPerStripeUnit));
    rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
        allocList);
}
/******************************************************************************
 *
 * DAG creation code begins here
 */



/******************************************************************************
 *
 * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
 * write, which is as follows
 *
 * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
 *                  \  {Rod} /            \  Wnd ---/
 *
 * commit nodes: Xor, Wnd
 *
 * IMPORTANT:
 * This DAG generator does not work for double-degraded archs since it does
 * not generate Q.
 *
 * This dag is essentially identical to the large-write dag, except that the
 * write to the failed data unit is suppressed.
 *
 * IMPORTANT:  this dag does not work in the case where the access writes only
 * a portion of the failed unit, and also writes some portion of at least one
 * surviving SU.  this case is handled in CreateDegradedWriteDAG above.
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * This dag is used whenever one of the data units in a write has failed.
 * If it is the parity unit that failed, the nonredundant write dag (below)
 * is used.
 *****************************************************************************/
void
rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
                                      RF_AccessStripeMap_t *asmap,
                                      RF_DagHeader_t *dag_h, void *bp,
                                      RF_RaidAccessFlags_t flags,
                                      RF_AllocListElem_t *allocList,
                                      int nfaults,
                                      int (*redFunc) (RF_DagNode_t *),
                                      int allowBufferRecycle)
{
    int nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
        rdnodesFaked;
    RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
    RF_DagNode_t *wndNodes, *rrdNodes, *xorNode, *commitNode;
    RF_DagNode_t *tmpNode, *tmpwndNode, *tmprrdNode;
    RF_SectorCount_t sectorsPerSU;
    RF_ReconUnitNum_t which_ru;
    char *xorTargetBuf = NULL;          /* the target buffer for the XOR
                                         * operation */
    char overlappingPDAs[RF_MAXCOL];    /* a temporary array of flags */
    RF_AccessStripeMapHeader_t *new_asm_h[2];
    RF_PhysDiskAddr_t *pda, *parityPDA;
    RF_StripeNum_t parityStripeID;
    RF_PhysDiskAddr_t *failedPDA;
    RF_RaidLayout_t *layoutPtr;
    layoutPtr = &(raidPtr->Layout);
    parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
        asmap->raidAddress, &which_ru);
    sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    /* failedPDA points to the pda within the asm that targets the failed
     * disk */
    failedPDA = asmap->failedPDAs[0];

    printf("[Creating degraded-write DAG]\n");

    RF_ASSERT(asmap->numDataFailed == 1);
    dag_h->creator = "SimpleDegradedWriteDAG";
    /*
     * Generate two ASMs identifying the surviving data
     * we need in order to recover the lost data.
     */
    /* overlappingPDAs array must be zero'd */
    memset(overlappingPDAs, 0, RF_MAXCOL);
    rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
        &nXorBufs, NULL, overlappingPDAs, allocList);
    /* create all the nodes at once */
    nWndNodes = asmap->numStripeUnitsAccessed - 1;  /* no access is
                                                     * generated for the
                                                     * failed pda */

    nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
        ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
    /*
     * XXX
     *
     * There's a bug with a complete stripe overwrite- that means 0 reads
     * of old data, and the rest of the DAG generation code doesn't like
     * that.  A release is coming, and I don't wanna risk breaking a
     * critical DAG generator, so here's what I'm gonna do- if there's no
     * read nodes, I'm gonna fake there being a read node, and I'm gonna
     * swap in a no-op node in its place (to make all the link-up code
     * happy).  This should be fixed at some point.  --jimz
     */
    if (nRrdNodes == 0) {
        nRrdNodes = 1;
        rdnodesFaked = 1;
    } else {
        rdnodesFaked = 0;
    }

    /* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
    nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
    blockNode = rf_AllocDAGNode();
    blockNode->list_next = dag_h->nodes;
    dag_h->nodes = blockNode;

    commitNode = rf_AllocDAGNode();
    commitNode->list_next = dag_h->nodes;
    dag_h->nodes = commitNode;

    unblockNode = rf_AllocDAGNode();
    unblockNode->list_next = dag_h->nodes;
    dag_h->nodes = unblockNode;

    termNode = rf_AllocDAGNode();
    termNode->list_next = dag_h->nodes;
    dag_h->nodes = termNode;

    xorNode = rf_AllocDAGNode();
    xorNode->list_next = dag_h->nodes;
    dag_h->nodes = xorNode;

    wnpNode = rf_AllocDAGNode();
    wnpNode->list_next = dag_h->nodes;
    dag_h->nodes = wnpNode;
    for (i = 0; i < nWndNodes; i++) {
        tmpNode = rf_AllocDAGNode();
        tmpNode->list_next = dag_h->nodes;
        dag_h->nodes = tmpNode;
    }
    wndNodes = dag_h->nodes;
    for (i = 0; i < nRrdNodes; i++) {
        tmpNode = rf_AllocDAGNode();
        tmpNode->list_next = dag_h->nodes;
        dag_h->nodes = tmpNode;
    }
    rrdNodes = dag_h->nodes;
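    /*
     * Note: rf_AllocDAGNode() results are pushed onto the head of the
     * dag_h->nodes list, so wndNodes and rrdNodes each point at the head
     * of the group just allocated, and walking list_next for the group's
     * count visits exactly that group's nodes.
     */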
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
    if (nfaults == 2) {
        wnqNode = rf_AllocDAGNode();
        wnqNode->list_next = dag_h->nodes;
        dag_h->nodes = wnqNode;
    } else {
#endif
        wnqNode = NULL;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
    }
#endif
    /* this dag can not commit until all rrd and xor Nodes have completed */
    dag_h->numCommitNodes = 1;
    dag_h->numCommits = 0;
    dag_h->numSuccedents = 1;

    RF_ASSERT(nRrdNodes > 0);
    rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
    rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
    rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
    rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
        NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL,
        1, nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
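    /*
     * For reference: rf_InitNode's argument order is (node, initial
     * status, commit flag, doFunc, undoFunc, wakeFunc, nSuccedents,
     * nAntecedents, nParams, nResults, dag_h, name, allocList).  The Xrc
     * node above therefore gets one successor (Cmt), nRrdNodes
     * antecedents, 2 * nXorBufs + 2 params, and nfaults results.
     */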
    /*
     * Fill in the Rrd nodes.  If any of the rrd buffers are the same size
     * as the failed buffer, save a pointer to it so we can use it as the
     * target of the XOR.  The pdas in the rrd nodes have been
     * range-restricted, so if a buffer is the same size as the failed
     * buffer, it must also be at the same alignment within the SU.
     */
    i = 0;
    tmprrdNode = rrdNodes;
    if (new_asm_h[0]) {
        for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
             i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
             i++, pda = pda->next) {
            rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
                rf_DiskReadUndoFunc, rf_GenericWakeupFunc,
                1, 1, 4, 0, dag_h, "Rrd", allocList);
            tmprrdNode->params[0].p = pda;
            tmprrdNode->params[1].p = pda->bufPtr;
            tmprrdNode->params[2].v = parityStripeID;
            tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
            tmprrdNode = tmprrdNode->list_next;
        }
    }
    /* i now equals the number of stripe units accessed in new_asm_h[0] */
    /* Note that for tmprrdNode, this means a continuation from above, so
     * no need to assign it anything... */
    if (new_asm_h[1]) {
        for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
             j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
             j++, pda = pda->next) {
            rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
                rf_DiskReadUndoFunc, rf_GenericWakeupFunc,
                1, 1, 4, 0, dag_h, "Rrd", allocList);
            tmprrdNode->params[0].p = pda;
            tmprrdNode->params[1].p = pda->bufPtr;
            tmprrdNode->params[2].v = parityStripeID;
            tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
            if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
                xorTargetBuf = pda->bufPtr;
            tmprrdNode = tmprrdNode->list_next;
        }
    }
    if (rdnodesFaked) {
        /*
         * This is where we'll init that fake noop read node
         * (XXX should the wakeup func be different?)
         */
        /* note that rrdNodes will just be a single node... */
        rf_InitNode(rrdNodes, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
            NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
    }
    /*
     * Make a PDA for the parity unit.  The parity PDA should start at
     * the same offset into the SU as the failed PDA.
     */
    /* Danner comment: I don't think this copy is really necessary.  We
     * are in one of two cases here.  (1) The entire failed unit is
     * written.  Then asmap->parityInfo will describe the entire parity.
     * (2) We are only writing a subset of the failed unit and nothing
     * else.  Then the asmap->parityInfo describes the failed unit and
     * the copy can also be avoided. */

    parityPDA = rf_AllocPhysDiskAddr();
    parityPDA->next = dag_h->pda_cleanup_list;
    dag_h->pda_cleanup_list = parityPDA;
    parityPDA->col = asmap->parityInfo->col;
    parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
        * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
    parityPDA->numSector = failedPDA->numSector;

    if (!xorTargetBuf)
        xorTargetBuf = rf_AllocBuffer(raidPtr, dag_h,
            rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
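    /*
     * Worked example of the offset arithmetic above (illustrative numbers
     * only): with sectorsPerSU = 64, parityInfo->startSector = 130 and
     * failedPDA->startSector = 70, we get (130 / 64) * 64 + (70 % 64) =
     * 128 + 6 = 134, i.e. the parity write starts at the same intra-SU
     * offset (6) as the failed data.
     */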
    /* init the Wnp node */
    rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
        rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
    wnpNode->params[0].p = parityPDA;
    wnpNode->params[1].p = xorTargetBuf;
    wnpNode->params[2].v = parityStripeID;
    wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
    /* fill in the Wnq Node */
    if (nfaults == 2) {
        RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
            (RF_PhysDiskAddr_t *), allocList);
        parityPDA->col = asmap->qInfo->col;
        parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
            * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
        parityPDA->numSector = failedPDA->numSector;

        rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
            rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
        wnqNode->params[0].p = parityPDA;
        RF_MallocAndAdd(xorNode->results[1],
            rf_RaidAddressToByte(raidPtr, failedPDA->numSector),
            (char *), allocList);
        wnqNode->params[1].p = xorNode->results[1];
        wnqNode->params[2].v = parityStripeID;
        wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
    }
#endif
    /* fill in the Wnd nodes */
    tmpwndNode = wndNodes;
    for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
        if (pda == failedPDA) {
            i--;
            continue;
        }
        rf_InitNode(tmpwndNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
            rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
        tmpwndNode->params[0].p = pda;
        tmpwndNode->params[1].p = pda->bufPtr;
        tmpwndNode->params[2].v = parityStripeID;
        tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
        tmpwndNode = tmpwndNode->list_next;
    }
    /* fill in the results of the xor node */
    xorNode->results[0] = xorTargetBuf;
    /* fill in the params of the xor node */
    paramNum = 0;
    if (rdnodesFaked == 0) {
        tmprrdNode = rrdNodes;
        for (i = 0; i < nRrdNodes; i++) {
            /* all the Rrd nodes need to be xored together */
            xorNode->params[paramNum++] = tmprrdNode->params[0];
            xorNode->params[paramNum++] = tmprrdNode->params[1];
            tmprrdNode = tmprrdNode->list_next;
        }
    }
    tmpwndNode = wndNodes;
    for (i = 0; i < nWndNodes; i++) {
        /* any Wnd nodes that overlap the failed access need to be
         * xored in */
        if (overlappingPDAs[i]) {
            pda = rf_AllocPhysDiskAddr();
            memcpy((char *) pda, (char *) tmpwndNode->params[0].p,
                sizeof(RF_PhysDiskAddr_t));
            /* add it into the pda_cleanup_list *after* the copy, TYVM */
            pda->next = dag_h->pda_cleanup_list;
            dag_h->pda_cleanup_list = pda;
            rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
            xorNode->params[paramNum++].p = pda;
            xorNode->params[paramNum++].p = pda->bufPtr;
        }
        tmpwndNode = tmpwndNode->list_next;
    }

    /*
     * Install the failed PDA into the xor param list so that the
     * new data gets xor'd in.
     */
    xorNode->params[paramNum++].p = failedPDA;
    xorNode->params[paramNum++].p = failedPDA->bufPtr;

    /*
     * The last 2 params to the recovery xor node are always the failed
     * PDA and the raidPtr.  Install the failedPDA even though we have
     * just done so above.  This allows us to use the same XOR function
     * for both degraded reads and degraded writes.
     */
    xorNode->params[paramNum++].p = failedPDA;
    xorNode->params[paramNum++].p = raidPtr;
    RF_ASSERT(paramNum == 2 * nXorBufs + 2);
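    /*
     * Sketch of the resulting Xrc param layout (assuming, purely for
     * illustration, two Rrd nodes and one overlapping Wnd node):
     *   params[0..3]  (pda, buf) pairs for the two old-data reads
     *   params[4..5]  (pda, buf) for the range-restricted overlapping write
     *   params[6..7]  (failedPDA, failedPDA->bufPtr) - the new data
     *   params[8..9]  failedPDA and raidPtr, consumed by the redFunc
     * nXorBufs counts the (pda, buf) pairs, hence the assertion above.
     */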
    /*
     * Code to link nodes begins here
     */

    /* link header to block node */
    RF_ASSERT(blockNode->numAntecedents == 0);
    dag_h->succedents[0] = blockNode;

    /* link block node to rd nodes */
    RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
    tmprrdNode = rrdNodes;
    for (i = 0; i < nRrdNodes; i++) {
        RF_ASSERT(tmprrdNode->numAntecedents == 1);
        blockNode->succedents[i] = tmprrdNode;
        tmprrdNode->antecedents[0] = blockNode;
        tmprrdNode->antType[0] = rf_control;
        tmprrdNode = tmprrdNode->list_next;
    }

    /* link read nodes to xor node */
    RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
    tmprrdNode = rrdNodes;
    for (i = 0; i < nRrdNodes; i++) {
        RF_ASSERT(tmprrdNode->numSuccedents == 1);
        tmprrdNode->succedents[0] = xorNode;
        xorNode->antecedents[i] = tmprrdNode;
        xorNode->antType[i] = rf_trueData;
        tmprrdNode = tmprrdNode->list_next;
    }

    /* link xor node to commit node */
    RF_ASSERT(xorNode->numSuccedents == 1);
    RF_ASSERT(commitNode->numAntecedents == 1);
    xorNode->succedents[0] = commitNode;
    commitNode->antecedents[0] = xorNode;
    commitNode->antType[0] = rf_control;
    /* link commit node to wnd nodes */
    RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
    tmpwndNode = wndNodes;
    for (i = 0; i < nWndNodes; i++) {
        RF_ASSERT(tmpwndNode->numAntecedents == 1);
        commitNode->succedents[i] = tmpwndNode;
        tmpwndNode->antecedents[0] = commitNode;
        tmpwndNode->antType[0] = rf_control;
        tmpwndNode = tmpwndNode->list_next;
    }

    /* link the commit node to wnp, wnq nodes */
    RF_ASSERT(wnpNode->numAntecedents == 1);
    commitNode->succedents[nWndNodes] = wnpNode;
    wnpNode->antecedents[0] = commitNode;
    wnpNode->antType[0] = rf_control;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
    if (nfaults == 2) {
        RF_ASSERT(wnqNode->numAntecedents == 1);
        commitNode->succedents[nWndNodes + 1] = wnqNode;
        wnqNode->antecedents[0] = commitNode;
        wnqNode->antType[0] = rf_control;
    }
#endif

    /* link write new data nodes to unblock node */
    RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
    tmpwndNode = wndNodes;
    for (i = 0; i < nWndNodes; i++) {
        RF_ASSERT(tmpwndNode->numSuccedents == 1);
        tmpwndNode->succedents[0] = unblockNode;
        unblockNode->antecedents[i] = tmpwndNode;
        unblockNode->antType[i] = rf_control;
        tmpwndNode = tmpwndNode->list_next;
    }

    /* link write new parity node to unblock node */
    RF_ASSERT(wnpNode->numSuccedents == 1);
    wnpNode->succedents[0] = unblockNode;
    unblockNode->antecedents[nWndNodes] = wnpNode;
    unblockNode->antType[nWndNodes] = rf_control;

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
    /* link write new q node to unblock node */
    if (nfaults == 2) {
        RF_ASSERT(wnqNode->numSuccedents == 1);
        wnqNode->succedents[0] = unblockNode;
        unblockNode->antecedents[nWndNodes + 1] = wnqNode;
        unblockNode->antType[nWndNodes + 1] = rf_control;
    }
#endif

    /* link unblock node to term node */
    RF_ASSERT(unblockNode->numSuccedents == 1);
    RF_ASSERT(termNode->numAntecedents == 1);
    RF_ASSERT(termNode->numSuccedents == 0);
    unblockNode->succedents[0] = termNode;
    termNode->antecedents[0] = unblockNode;
    termNode->antType[0] = rf_control;
}
#define CONS_PDA(if,start,num) \
    pda_p->col = asmap->if->col; \
    pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
    pda_p->numSector = num; \
    pda_p->next = NULL; \
    RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, num), (char *), allocList)
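/*
 * CONS_PDA fills in whatever pda_p currently points at, taking the column
 * from the named asmap field ("if" is a field name such as parityInfo or
 * qInfo), aligning startSector to the given intra-SU offset, and
 * allocating the buffer.  It relies on pda_p, asmap, secPerSU, raidPtr and
 * allocList being in scope, and callers advance pda_p between uses.
 */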
#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
void
rf_WriteGenerateFailedAccessASMs(
    RF_Raid_t *raidPtr,
    RF_AccessStripeMap_t *asmap,
    RF_PhysDiskAddr_t **pdap,
    int *nNodep,
    RF_PhysDiskAddr_t **pqpdap,
    int *nPQNodep,
    RF_AllocListElem_t *allocList)
{
    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    int PDAPerDisk, state, i;
    unsigned napdas;
    RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    int numDataCol = layoutPtr->numDataCol;
    RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
    RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
    RF_PhysDiskAddr_t *pda_p;
    RF_RaidAddr_t sosAddr;
    /* determine how many pda's we will have to generate per unaccessed
     * stripe unit.  If there is only one failed data unit, it is one; if
     * two, possibly two, depending on whether they overlap. */

    fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
    fone_end = fone_start + fone->numSector;

    if (asmap->numDataFailed == 1) {
        PDAPerDisk = 1;
        state = 1;
        RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
            (RF_PhysDiskAddr_t *), allocList);
        pda_p = *pqpdap;
        /* build p */
        CONS_PDA(parityInfo, fone_start, fone->numSector);
        pda_p->type = RF_PDA_TYPE_PARITY;
        pda_p++;
        /* build q */
        CONS_PDA(qInfo, fone_start, fone->numSector);
        pda_p->type = RF_PDA_TYPE_Q;
    } else {
        ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
        ftwo_end = ftwo_start + ftwo->numSector;
        if (fone->numSector + ftwo->numSector > secPerSU) {
            PDAPerDisk = 1;
            state = 2;
            RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
                (RF_PhysDiskAddr_t *), allocList);
            pda_p = *pqpdap;
            CONS_PDA(parityInfo, 0, secPerSU);
            pda_p->type = RF_PDA_TYPE_PARITY;
            pda_p++;
            CONS_PDA(qInfo, 0, secPerSU);
            pda_p->type = RF_PDA_TYPE_Q;
        } else {
            PDAPerDisk = 2;
            state = 3;
            /* four of them, fone, then ftwo */
            RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t),
                (RF_PhysDiskAddr_t *), allocList);
            pda_p = *pqpdap;
            CONS_PDA(parityInfo, fone_start, fone->numSector);
            pda_p->type = RF_PDA_TYPE_PARITY;
            pda_p++;
            CONS_PDA(qInfo, fone_start, fone->numSector);
            pda_p->type = RF_PDA_TYPE_Q;
            pda_p++;
            CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
            pda_p->type = RF_PDA_TYPE_PARITY;
            pda_p++;
            CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
            pda_p->type = RF_PDA_TYPE_Q;
        }
    }
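    /*
     * (Comment added for clarity: "state" records which of the three
     * shapes the per-disk recovery reads take in the switch further down -
     * 1: just fone's range, 2: a full stripe unit, 3: two separate slabs,
     * fone's and ftwo's.)
     */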
    /* figure out number of nonaccessed pda */
    napdas = PDAPerDisk * (numDataCol - 2);
    *nPQNodep = PDAPerDisk;
    *nNodep = napdas;
    if (napdas == 0)
        return;     /* short circuit */

    /* allocate up our list of pda's */

    RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t),
        (RF_PhysDiskAddr_t *), allocList);
    *pdap = pda_p;
    /* linkem together */
    for (i = 0; i < (napdas - 1); i++)
        pda_p[i].next = pda_p + (i + 1);

    sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    for (i = 0; i < numDataCol; i++) {
        if ((pda_p - (*pdap)) == napdas)
            continue;
        pda_p->type = RF_PDA_TYPE_DATA;
        pda_p->raidAddress = sosAddr + (i * secPerSU);
        (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
            &(pda_p->col), &(pda_p->startSector), 0);
        /* skip over dead disks */
        if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
            continue;
        switch (state) {
        case 1: /* fone */
            pda_p->numSector = fone->numSector;
            pda_p->raidAddress += fone_start;
            pda_p->startSector += fone_start;
            RF_MallocAndAdd(pda_p->bufPtr,
                rf_RaidAddressToByte(raidPtr, pda_p->numSector),
                (char *), allocList);
            break;
        case 2: /* full stripe */
            pda_p->numSector = secPerSU;
            RF_MallocAndAdd(pda_p->bufPtr,
                rf_RaidAddressToByte(raidPtr, secPerSU),
                (char *), allocList);
            break;
        case 3: /* two slabs */
            pda_p->numSector = fone->numSector;
            pda_p->raidAddress += fone_start;
            pda_p->startSector += fone_start;
            RF_MallocAndAdd(pda_p->bufPtr,
                rf_RaidAddressToByte(raidPtr, pda_p->numSector),
                (char *), allocList);
            pda_p++;
            pda_p->type = RF_PDA_TYPE_DATA;
            pda_p->raidAddress = sosAddr + (i * secPerSU);
            (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
                &(pda_p->col), &(pda_p->startSector), 0);
            pda_p->numSector = ftwo->numSector;
            pda_p->raidAddress += ftwo_start;
            pda_p->startSector += ftwo_start;
            RF_MallocAndAdd(pda_p->bufPtr,
                rf_RaidAddressToByte(raidPtr, pda_p->numSector),
                (char *), allocList);
            break;
        }
        pda_p++;
    }

    RF_ASSERT(pda_p - *pdap == napdas);
}
#define DISK_NODE_PDA(node)  ((node)->params[0].p)

#define DISK_NODE_PARAMS(_node_,_p_) \
    (_node_).params[0].p = _p_ ; \
    (_node_).params[1].p = (_p_)->bufPtr; \
    (_node_).params[2].v = parityStripeID; \
    (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)
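/*
 * Every disk read/write node in this file uses the same four params:
 * params[0] is the PDA, params[1] the data buffer, params[2] the parity
 * stripe ID, and params[3] the priority/reconstruction-unit word built by
 * RF_CREATE_PARAM3.  DISK_NODE_PARAMS centralizes that convention.
 */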
void
rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
                       RF_DagHeader_t *dag_h, void *bp,
                       RF_RaidAccessFlags_t flags,
                       RF_AllocListElem_t *allocList,
                       const char *redundantReadNodeName,
                       const char *redundantWriteNodeName,
                       const char *recoveryNodeName,
                       int (*recovFunc) (RF_DagNode_t *))
{
    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
        *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
    RF_PhysDiskAddr_t *pda, *pqPDAs;
    RF_PhysDiskAddr_t *npdas;
    int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, nPQNodes, i;
    RF_ReconUnitNum_t which_ru;
    RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
        asmap->raidAddress, &which_ru);
    /* simple small write case - First part looks like a reconstruct-read
     * of the failed data units.  Then a write of all data units not
     * failed. */

    /*
     * Hdr | ------Block- /  /         \   Rrd    Rrd ...  Rrd  Rp Rq \  \
     * \    -------PQ----- /   \   \ Wud    Wp  WQ       \    |   /
     * --Unblock- | T
     *
     * Rrd = read recovery data  (potentially none)
     * Wud = write user data (not incl. failed disks)
     * Wp = Write P (could be two)
     * Wq = Write Q (could be two)
     */

    rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes,
        &pqPDAs, &nPQNodes, allocList);
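    /*
     * (On return: npdas heads the list of recovery-read PDAs and nRrdNodes
     * counts them; pqPDAs points at the P/Q PDAs and nPQNodes is the
     * number of P (and of Q) units, 1 or 2.)
     */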
    RF_ASSERT(asmap->numDataFailed == 1);

    nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
    nReadNodes = nRrdNodes + 2 * nPQNodes;
    nWriteNodes = nWudNodes + 2 * nPQNodes;
    nNodes = 4 + nReadNodes + nWriteNodes;

    RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t),
        (RF_DagNode_t *), allocList);
    blockNode = nodes;
    unblockNode = blockNode + 1;
    termNode = unblockNode + 1;
    recoveryNode = termNode + 1;
    rrdNodes = recoveryNode + 1;
    rpNodes = rrdNodes + nRrdNodes;
    rqNodes = rpNodes + nPQNodes;
    wudNodes = rqNodes + nPQNodes;
    wpNodes = wudNodes + nWudNodes;
    wqNodes = wpNodes + nPQNodes;
    dag_h->creator = "PQ_DDSimpleSmallWrite";
    dag_h->numSuccedents = 1;
    dag_h->succedents[0] = blockNode;
    rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
        NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    termNode->antecedents[0] = unblockNode;
    termNode->antType[0] = rf_control;

    /* init the block and unblock nodes */
    /* The block node has all the read nodes as successors */
    rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
    for (i = 0; i < nReadNodes; i++)
        blockNode->succedents[i] = rrdNodes + i;

    /* The unblock node has all the writes as successors */
    rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
    for (i = 0; i < nWriteNodes; i++) {
        unblockNode->antecedents[i] = wudNodes + i;
        unblockNode->antType[i] = rf_control;
    }
    unblockNode->succedents[0] = termNode;
#define INIT_READ_NODE(node,name) \
    rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, \
        rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
    (node)->succedents[0] = recoveryNode; \
    (node)->antecedents[0] = blockNode; \
    (node)->antType[0] = rf_control;
    /* build the read nodes */
    pda = npdas;
    for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
        INIT_READ_NODE(rrdNodes + i, "rrd");
        DISK_NODE_PARAMS(rrdNodes[i], pda);
    }

    /* read redundancy pdas */
    pda = pqPDAs;
    INIT_READ_NODE(rpNodes, "Rp");
    DISK_NODE_PARAMS(rpNodes[0], pda);
    pda++;
    INIT_READ_NODE(rqNodes, redundantReadNodeName);
    DISK_NODE_PARAMS(rqNodes[0], pda);
    if (nPQNodes == 2) {
        pda++;
        INIT_READ_NODE(rpNodes + 1, "Rp");
        DISK_NODE_PARAMS(rpNodes[1], pda);
        pda++;
        INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
        DISK_NODE_PARAMS(rqNodes[1], pda);
    }
    /* the recovery node has all reads as predecessors and all writes as
     * successors.  It generates a result for every write P or write Q
     * node.  As parameters, it takes a pda per read and a pda per stripe
     * of user data written.  It also takes as the last params the raidPtr
     * and asm.  For results, it takes PDA for P & Q. */

    rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
        nWriteNodes,                    /* successors */
        nReadNodes,                     /* preds */
        nReadNodes + nWudNodes + 3,     /* params */
        2 * nPQNodes,                   /* results */
        dag_h, recoveryNodeName, allocList);
    for (i = 0; i < nReadNodes; i++) {
        recoveryNode->antecedents[i] = rrdNodes + i;
        recoveryNode->antType[i] = rf_control;
        recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
    }
    for (i = 0; i < nWudNodes; i++) {
        recoveryNode->succedents[i] = wudNodes + i;
    }
    recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
    recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
    recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;

    for (; i < nWriteNodes; i++)
        recoveryNode->succedents[i] = wudNodes + i;
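    /* (The loop above continues i past nWudNodes: the Wp/Wq nodes sit
     * immediately after the Wud nodes in the contiguous "nodes" array, so
     * wudNodes + i addresses them for nWudNodes <= i < nWriteNodes.) */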
    pda = pqPDAs;
    recoveryNode->results[0] = pda;
    pda++;
    recoveryNode->results[1] = pda;
    if (nPQNodes == 2) {
        pda++;
        recoveryNode->results[2] = pda;
        pda++;
        recoveryNode->results[3] = pda;
    }
#define INIT_WRITE_NODE(node,name) \
    rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, \
        rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
    (node)->succedents[0] = unblockNode; \
    (node)->antecedents[0] = recoveryNode; \
    (node)->antType[0] = rf_control;
    pda = asmap->physInfo;
    for (i = 0; i < nWudNodes; i++) {
        INIT_WRITE_NODE(wudNodes + i, "Wd");
        DISK_NODE_PARAMS(wudNodes[i], pda);
        recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
        pda = pda->next;
    }

    /* write redundancy pdas */
    pda = pqPDAs;
    INIT_WRITE_NODE(wpNodes, "Wp");
    DISK_NODE_PARAMS(wpNodes[0], pda);
    pda++;
    INIT_WRITE_NODE(wqNodes, "Wq");
    DISK_NODE_PARAMS(wqNodes[0], pda);
    if (nPQNodes == 2) {
        pda++;
        INIT_WRITE_NODE(wpNodes + 1, "Wp");
        DISK_NODE_PARAMS(wpNodes[1], pda);
        pda++;
        INIT_WRITE_NODE(wqNodes + 1, "Wq");
        DISK_NODE_PARAMS(wqNodes[1], pda);
    }
}
#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */