1 /* $NetBSD: rf_dagffrd.c,v 1.17 2006/10/12 01:31:50 christos Exp $ */
3 * Copyright (c) 1995 Carnegie-Mellon University.
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 * Carnegie Mellon requests users of this software to return to
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
32 * code for creating fault-free read DAGs
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagffrd.c,v 1.17 2006/10/12 01:31:50 christos Exp $");
39 #include <dev/raidframe/raidframevar.h>
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_general.h"
47 #include "rf_dagffrd.h"
49 /******************************************************************************
51 * General comments on DAG creation:
53 * All DAGs in this file use roll-away error recovery. Each DAG has a single
54 * commit node, usually called "Cmt." If an error occurs before the Cmt node
55 * is reached, the execution engine will halt forward execution and work
56 * backward through the graph, executing the undo functions. Assuming that
57 * each node in the graph prior to the Cmt node are undoable and atomic - or -
58 * does not make changes to permanent state, the graph will fail atomically.
59 * If an error occurs after the Cmt node executes, the engine will roll-forward
60 * through the graph, blindly executing nodes until it reaches the end.
61 * If a graph reaches the end, it is assumed to have completed successfully.
63 * A graph has only 1 Cmt node.
68 /******************************************************************************
70 * The following wrappers map the standard DAG creation interface to the
71 * DAG creation routines. Additionally, these wrappers enable experimentation
72 * with new DAG structures by providing an extra level of indirection, allowing
73 * the DAG creation routines to be replaced at this single point.
77 rf_CreateFaultFreeReadDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
78 RF_DagHeader_t
*dag_h
, void *bp
,
79 RF_RaidAccessFlags_t flags
,
80 RF_AllocListElem_t
*allocList
)
82 rf_CreateNonredundantDAG(raidPtr
, asmap
, dag_h
, bp
, flags
, allocList
,
87 /******************************************************************************
89 * DAG creation code begins here
92 /******************************************************************************
94 * creates a DAG to perform a nonredundant read or write of data within one
96 * For reads, this DAG is as follows:
99 * Header -- Block ---- read ---- Commit -- Terminate
102 * For writes, this DAG is as follows:
105 * Header -- Commit ---- write ---- Block -- Terminate
108 * There is one disk node per stripe unit accessed, and all disk nodes are in
111 * Tricky point here: The first disk node (read or write) is created
112 * normally. Subsequent disk nodes are created by copying the first one,
113 * and modifying a few params. The "succedents" and "antecedents" fields are
114 * _not_ re-created in each node, but rather left pointing to the same array
115 * that was malloc'd when the first node was created. Thus, it's essential
116 * that when this DAG is freed, the succedents and antecedents fields be freed
117 * in ONLY ONE of the read nodes. This does not apply to the "params" field
118 * because it is recreated for each READ node.
120 * Note that normal-priority accesses do not need to be tagged with their
121 * parity stripe ID, because they will never be promoted. Hence, I've
122 * commented-out the code to do this, and marked it with UNNEEDED.
124 *****************************************************************************/
127 rf_CreateNonredundantDAG(RF_Raid_t
*raidPtr
,
128 RF_AccessStripeMap_t
*asmap
, RF_DagHeader_t
*dag_h
, void *bp
,
129 RF_RaidAccessFlags_t flags
, RF_AllocListElem_t
*allocList
,
132 RF_DagNode_t
*diskNodes
, *blockNode
, *commitNode
, *termNode
;
133 RF_DagNode_t
*tmpNode
, *tmpdiskNode
;
134 RF_PhysDiskAddr_t
*pda
= asmap
->physInfo
;
135 int (*doFunc
) (RF_DagNode_t
*), (*undoFunc
) (RF_DagNode_t
*);
136 int i
, n
, totalNumNodes
;
139 n
= asmap
->numStripeUnitsAccessed
;
140 dag_h
->creator
= "NonredundantDAG";
142 RF_ASSERT(RF_IO_IS_R_OR_W(type
));
144 case RF_IO_TYPE_READ
:
145 doFunc
= rf_DiskReadFunc
;
146 undoFunc
= rf_DiskReadUndoFunc
;
150 printf("[Creating non-redundant read DAG]\n");
153 case RF_IO_TYPE_WRITE
:
154 doFunc
= rf_DiskWriteFunc
;
155 undoFunc
= rf_DiskWriteUndoFunc
;
159 printf("[Creating non-redundant write DAG]\n");
167 * For reads, the dag can not commit until the block node is reached.
168 * for writes, the dag commits immediately.
170 dag_h
->numCommitNodes
= 1;
171 dag_h
->numCommits
= 0;
172 dag_h
->numSuccedents
= 1;
177 * n data reads (or writes)
182 totalNumNodes
= n
+ 3;
184 for (i
= 0; i
< n
; i
++) {
185 tmpNode
= rf_AllocDAGNode();
186 tmpNode
->list_next
= dag_h
->nodes
;
187 dag_h
->nodes
= tmpNode
;
189 diskNodes
= dag_h
->nodes
;
191 blockNode
= rf_AllocDAGNode();
192 blockNode
->list_next
= dag_h
->nodes
;
193 dag_h
->nodes
= blockNode
;
195 commitNode
= rf_AllocDAGNode();
196 commitNode
->list_next
= dag_h
->nodes
;
197 dag_h
->nodes
= commitNode
;
199 termNode
= rf_AllocDAGNode();
200 termNode
->list_next
= dag_h
->nodes
;
201 dag_h
->nodes
= termNode
;
203 /* initialize nodes */
205 case RF_IO_TYPE_READ
:
206 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
207 NULL
, n
, 0, 0, 0, dag_h
, "Nil", allocList
);
208 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
209 NULL
, 1, n
, 0, 0, dag_h
, "Cmt", allocList
);
210 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
, rf_TerminateUndoFunc
,
211 NULL
, 0, 1, 0, 0, dag_h
, "Trm", allocList
);
213 case RF_IO_TYPE_WRITE
:
214 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
215 NULL
, 1, 0, 0, 0, dag_h
, "Nil", allocList
);
216 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
, rf_NullNodeUndoFunc
,
217 NULL
, n
, 1, 0, 0, dag_h
, "Cmt", allocList
);
218 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
, rf_TerminateUndoFunc
,
219 NULL
, 0, n
, 0, 0, dag_h
, "Trm", allocList
);
225 tmpdiskNode
= diskNodes
;
226 for (i
= 0; i
< n
; i
++) {
227 RF_ASSERT(pda
!= NULL
);
228 rf_InitNode(tmpdiskNode
, rf_wait
, RF_FALSE
, doFunc
, undoFunc
, rf_GenericWakeupFunc
,
229 1, 1, 4, 0, dag_h
, name
, allocList
);
230 tmpdiskNode
->params
[0].p
= pda
;
231 tmpdiskNode
->params
[1].p
= pda
->bufPtr
;
232 /* parity stripe id is not necessary */
233 tmpdiskNode
->params
[2].v
= 0;
234 tmpdiskNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, 0);
236 tmpdiskNode
= tmpdiskNode
->list_next
;
243 /* connect hdr to block node */
244 RF_ASSERT(blockNode
->numAntecedents
== 0);
245 dag_h
->succedents
[0] = blockNode
;
247 if (type
== RF_IO_TYPE_READ
) {
248 /* connecting a nonredundant read DAG */
249 RF_ASSERT(blockNode
->numSuccedents
== n
);
250 RF_ASSERT(commitNode
->numAntecedents
== n
);
251 tmpdiskNode
= diskNodes
;
252 for (i
= 0; i
< n
; i
++) {
253 /* connect block node to each read node */
254 RF_ASSERT(tmpdiskNode
->numAntecedents
== 1);
255 blockNode
->succedents
[i
] = tmpdiskNode
;
256 tmpdiskNode
->antecedents
[0] = blockNode
;
257 tmpdiskNode
->antType
[0] = rf_control
;
259 /* connect each read node to the commit node */
260 RF_ASSERT(tmpdiskNode
->numSuccedents
== 1);
261 tmpdiskNode
->succedents
[0] = commitNode
;
262 commitNode
->antecedents
[i
] = tmpdiskNode
;
263 commitNode
->antType
[i
] = rf_control
;
264 tmpdiskNode
= tmpdiskNode
->list_next
;
266 /* connect the commit node to the term node */
267 RF_ASSERT(commitNode
->numSuccedents
== 1);
268 RF_ASSERT(termNode
->numAntecedents
== 1);
269 RF_ASSERT(termNode
->numSuccedents
== 0);
270 commitNode
->succedents
[0] = termNode
;
271 termNode
->antecedents
[0] = commitNode
;
272 termNode
->antType
[0] = rf_control
;
274 /* connecting a nonredundant write DAG */
275 /* connect the block node to the commit node */
276 RF_ASSERT(blockNode
->numSuccedents
== 1);
277 RF_ASSERT(commitNode
->numAntecedents
== 1);
278 blockNode
->succedents
[0] = commitNode
;
279 commitNode
->antecedents
[0] = blockNode
;
280 commitNode
->antType
[0] = rf_control
;
282 RF_ASSERT(commitNode
->numSuccedents
== n
);
283 RF_ASSERT(termNode
->numAntecedents
== n
);
284 RF_ASSERT(termNode
->numSuccedents
== 0);
285 tmpdiskNode
= diskNodes
;
286 for (i
= 0; i
< n
; i
++) {
287 /* connect the commit node to each write node */
288 RF_ASSERT(tmpdiskNode
->numAntecedents
== 1);
289 commitNode
->succedents
[i
] = tmpdiskNode
;
290 tmpdiskNode
->antecedents
[0] = commitNode
;
291 tmpdiskNode
->antType
[0] = rf_control
;
293 /* connect each write node to the term node */
294 RF_ASSERT(tmpdiskNode
->numSuccedents
== 1);
295 tmpdiskNode
->succedents
[0] = termNode
;
296 termNode
->antecedents
[i
] = tmpdiskNode
;
297 termNode
->antType
[i
] = rf_control
;
298 tmpdiskNode
= tmpdiskNode
->list_next
;
302 /******************************************************************************
303 * Create a fault-free read DAG for RAID level 1
305 * Hdr -> Nil -> Rmir -> Cmt -> Trm
307 * The "Rmir" node schedules a read from the disk in the mirror pair with the
308 * shortest disk queue. the proper queue is selected at Rmir execution. this
309 * deferred mapping is unlike other archs in RAIDframe which generally fix
310 * mapping at DAG creation time.
312 * Parameters: raidPtr - description of the physical array
313 * asmap - logical & physical addresses for this access
314 * bp - buffer ptr (for holding read data)
315 * flags - general flags (e.g. disk locking)
316 * allocList - list of memory allocated in DAG creation
317 *****************************************************************************/
320 CreateMirrorReadDAG(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
321 RF_DagHeader_t
*dag_h
, void *bp
,
322 RF_RaidAccessFlags_t flags
, RF_AllocListElem_t
*allocList
,
323 int (*readfunc
) (RF_DagNode_t
* node
))
325 RF_DagNode_t
*readNodes
, *blockNode
, *commitNode
, *termNode
;
326 RF_DagNode_t
*tmpNode
, *tmpreadNode
;
327 RF_PhysDiskAddr_t
*data_pda
= asmap
->physInfo
;
328 RF_PhysDiskAddr_t
*parity_pda
= asmap
->parityInfo
;
329 int i
, n
, totalNumNodes
;
331 n
= asmap
->numStripeUnitsAccessed
;
332 dag_h
->creator
= "RaidOneReadDAG";
335 printf("[Creating RAID level 1 read DAG]\n");
339 * This dag can not commit until the commit node is reached
340 * errors prior to the commit point imply the dag has failed.
342 dag_h
->numCommitNodes
= 1;
343 dag_h
->numCommits
= 0;
344 dag_h
->numSuccedents
= 1;
354 totalNumNodes
= n
+ 3;
356 for (i
= 0; i
< n
; i
++) {
357 tmpNode
= rf_AllocDAGNode();
358 tmpNode
->list_next
= dag_h
->nodes
;
359 dag_h
->nodes
= tmpNode
;
361 readNodes
= dag_h
->nodes
;
363 blockNode
= rf_AllocDAGNode();
364 blockNode
->list_next
= dag_h
->nodes
;
365 dag_h
->nodes
= blockNode
;
367 commitNode
= rf_AllocDAGNode();
368 commitNode
->list_next
= dag_h
->nodes
;
369 dag_h
->nodes
= commitNode
;
371 termNode
= rf_AllocDAGNode();
372 termNode
->list_next
= dag_h
->nodes
;
373 dag_h
->nodes
= termNode
;
375 /* initialize nodes */
376 rf_InitNode(blockNode
, rf_wait
, RF_FALSE
, rf_NullNodeFunc
,
377 rf_NullNodeUndoFunc
, NULL
, n
, 0, 0, 0, dag_h
, "Nil", allocList
);
378 rf_InitNode(commitNode
, rf_wait
, RF_TRUE
, rf_NullNodeFunc
,
379 rf_NullNodeUndoFunc
, NULL
, 1, n
, 0, 0, dag_h
, "Cmt", allocList
);
380 rf_InitNode(termNode
, rf_wait
, RF_FALSE
, rf_TerminateFunc
,
381 rf_TerminateUndoFunc
, NULL
, 0, 1, 0, 0, dag_h
, "Trm", allocList
);
383 tmpreadNode
= readNodes
;
384 for (i
= 0; i
< n
; i
++) {
385 RF_ASSERT(data_pda
!= NULL
);
386 RF_ASSERT(parity_pda
!= NULL
);
387 rf_InitNode(tmpreadNode
, rf_wait
, RF_FALSE
, readfunc
,
388 rf_DiskReadMirrorUndoFunc
, rf_GenericWakeupFunc
, 1, 1, 5, 0, dag_h
,
390 tmpreadNode
->params
[0].p
= data_pda
;
391 tmpreadNode
->params
[1].p
= data_pda
->bufPtr
;
392 /* parity stripe id is not necessary */
393 tmpreadNode
->params
[2].p
= 0;
394 tmpreadNode
->params
[3].v
= RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY
, 0);
395 tmpreadNode
->params
[4].p
= parity_pda
;
396 data_pda
= data_pda
->next
;
397 parity_pda
= parity_pda
->next
;
398 tmpreadNode
= tmpreadNode
->list_next
;
405 /* connect hdr to block node */
406 RF_ASSERT(blockNode
->numAntecedents
== 0);
407 dag_h
->succedents
[0] = blockNode
;
409 /* connect block node to read nodes */
410 RF_ASSERT(blockNode
->numSuccedents
== n
);
411 tmpreadNode
= readNodes
;
412 for (i
= 0; i
< n
; i
++) {
413 RF_ASSERT(tmpreadNode
->numAntecedents
== 1);
414 blockNode
->succedents
[i
] = tmpreadNode
;
415 tmpreadNode
->antecedents
[0] = blockNode
;
416 tmpreadNode
->antType
[0] = rf_control
;
417 tmpreadNode
= tmpreadNode
->list_next
;
420 /* connect read nodes to commit node */
421 RF_ASSERT(commitNode
->numAntecedents
== n
);
422 tmpreadNode
= readNodes
;
423 for (i
= 0; i
< n
; i
++) {
424 RF_ASSERT(tmpreadNode
->numSuccedents
== 1);
425 tmpreadNode
->succedents
[0] = commitNode
;
426 commitNode
->antecedents
[i
] = tmpreadNode
;
427 commitNode
->antType
[i
] = rf_control
;
428 tmpreadNode
= tmpreadNode
->list_next
;
431 /* connect commit node to term node */
432 RF_ASSERT(commitNode
->numSuccedents
== 1);
433 RF_ASSERT(termNode
->numAntecedents
== 1);
434 RF_ASSERT(termNode
->numSuccedents
== 0);
435 commitNode
->succedents
[0] = termNode
;
436 termNode
->antecedents
[0] = commitNode
;
437 termNode
->antType
[0] = rf_control
;
441 rf_CreateMirrorIdleReadDAG(
443 RF_AccessStripeMap_t
* asmap
,
444 RF_DagHeader_t
* dag_h
,
446 RF_RaidAccessFlags_t flags
,
447 RF_AllocListElem_t
* allocList
)
449 CreateMirrorReadDAG(raidPtr
, asmap
, dag_h
, bp
, flags
, allocList
,
450 rf_DiskReadMirrorIdleFunc
);
453 #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0)
456 rf_CreateMirrorPartitionReadDAG(RF_Raid_t
*raidPtr
,
457 RF_AccessStripeMap_t
*asmap
,
458 RF_DagHeader_t
*dag_h
, void *bp
,
459 RF_RaidAccessFlags_t flags
,
460 RF_AllocListElem_t
*allocList
)
462 CreateMirrorReadDAG(raidPtr
, asmap
, dag_h
, bp
, flags
, allocList
,
463 rf_DiskReadMirrorPartitionFunc
);