1 /* $NetBSD: rf_dagutils.c,v 1.51 2007/03/04 06:02:36 christos Exp $ */
3 * Copyright (c) 1995 Carnegie-Mellon University.
6 * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 * Carnegie Mellon requests users of this software to return to
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
29 /******************************************************************************
31 * rf_dagutils.c -- utility routines for manipulating dags
33 *****************************************************************************/
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_dagutils.c,v 1.51 2007/03/04 06:02:36 christos Exp $");
38 #include <dev/raidframe/raidframevar.h>
41 #include "rf_threadstuff.h"
44 #include "rf_dagutils.h"
45 #include "rf_dagfuncs.h"
46 #include "rf_general.h"
48 #include "rf_shutdown.h"
50 #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
52 const RF_RedFuncs_t rf_xorFuncs
= {
53 rf_RegularXorFunc
, "Reg Xr",
54 rf_SimpleXorFunc
, "Simple Xr"};
56 const RF_RedFuncs_t rf_xorRecoveryFuncs
= {
57 rf_RecoveryXorFunc
, "Recovery Xr",
58 rf_RecoveryXorFunc
, "Recovery Xr"};
60 #if RF_DEBUG_VALIDATE_DAG
61 static void rf_RecurPrintDAG(RF_DagNode_t
*, int, int);
62 static void rf_PrintDAG(RF_DagHeader_t
*);
63 static int rf_ValidateBranch(RF_DagNode_t
*, int *, int *,
64 RF_DagNode_t
**, int);
65 static void rf_ValidateBranchVisitedBits(RF_DagNode_t
*, int, int);
66 static void rf_ValidateVisitedBits(RF_DagHeader_t
*);
67 #endif /* RF_DEBUG_VALIDATE_DAG */
69 /* The maximum number of nodes in a DAG is bounded by
71 (2 * raidPtr->Layout->numDataCol) + (1 * layoutPtr->numParityCol) +
72 (1 * 2 * layoutPtr->numParityCol) + 3
74 which is: 2*RF_MAXCOL+1*2+1*2*2+3
76 For RF_MAXCOL of 40, this works out to 89. We use this value to provide an estimate
77 on the maximum size needed for RF_DAGPCACHE_SIZE. For RF_MAXCOL of 40, this structure
78 would be 534 bytes. Too much to have on-hand in a RF_DagNode_t, but should be ok to
79 have a few kicking around.
81 #define RF_DAGPCACHE_SIZE ((2*RF_MAXCOL+1*2+1*2*2+3) *(RF_MAX(sizeof(RF_DagParam_t), sizeof(RF_DagNode_t *))))
84 /******************************************************************************
86 * InitNode - initialize a dag node
88 * the size of the propList array is always the same as that of the
91 *****************************************************************************/
93 rf_InitNode(RF_DagNode_t
*node
, RF_NodeStatus_t initstatus
, int commit
,
94 int (*doFunc
) (RF_DagNode_t
*node
),
95 int (*undoFunc
) (RF_DagNode_t
*node
),
96 int (*wakeFunc
) (RF_DagNode_t
*node
, int status
),
97 int nSucc
, int nAnte
, int nParam
, int nResult
,
98 RF_DagHeader_t
*hdr
, const char *name
, RF_AllocListElem_t
*alist
)
103 if (nAnte
> RF_MAX_ANTECEDENTS
)
105 node
->status
= initstatus
;
106 node
->commitNode
= commit
;
107 node
->doFunc
= doFunc
;
108 node
->undoFunc
= undoFunc
;
109 node
->wakeFunc
= wakeFunc
;
110 node
->numParams
= nParam
;
111 node
->numResults
= nResult
;
112 node
->numAntecedents
= nAnte
;
113 node
->numAntDone
= 0;
115 /* node->list_next = NULL */ /* Don't touch this here!
117 in use by the caller! */
118 node
->numSuccedents
= nSucc
;
121 node
->big_dag_ptrs
= NULL
;
122 node
->big_dag_params
= NULL
;
125 /* allocate all the pointers with one call to malloc */
126 nptrs
= nSucc
+ nAnte
+ nResult
+ nSucc
;
128 if (nptrs
<= RF_DAG_PTRCACHESIZE
) {
130 * The dag_ptrs field of the node is basically some scribble
131 * space to be used here. We could get rid of it, and always
132 * allocate the range of pointers, but that's expensive. So,
133 * we pick a "common case" size for the pointer cache. Hopefully,
135 * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
136 * only a little bit (least efficient case)
137 * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
140 ptrs
= (void **) node
->dag_ptrs
;
141 } else if (nptrs
<= (RF_DAGPCACHE_SIZE
/ sizeof(RF_DagNode_t
*))) {
142 node
->big_dag_ptrs
= rf_AllocDAGPCache();
143 ptrs
= (void **) node
->big_dag_ptrs
;
145 RF_MallocAndAdd(ptrs
, nptrs
* sizeof(void *),
148 node
->succedents
= (nSucc
) ? (RF_DagNode_t
**) ptrs
: NULL
;
149 node
->antecedents
= (nAnte
) ? (RF_DagNode_t
**) (ptrs
+ nSucc
) : NULL
;
150 node
->results
= (nResult
) ? (void **) (ptrs
+ nSucc
+ nAnte
) : NULL
;
151 node
->propList
= (nSucc
) ? (RF_PropHeader_t
**) (ptrs
+ nSucc
+ nAnte
+ nResult
) : NULL
;
154 if (nParam
<= RF_DAG_PARAMCACHESIZE
) {
155 node
->params
= (RF_DagParam_t
*) node
->dag_params
;
156 } else if (nParam
<= (RF_DAGPCACHE_SIZE
/ sizeof(RF_DagParam_t
))) {
157 node
->big_dag_params
= rf_AllocDAGPCache();
158 node
->params
= node
->big_dag_params
;
160 RF_MallocAndAdd(node
->params
,
161 nParam
* sizeof(RF_DagParam_t
),
162 (RF_DagParam_t
*), alist
);
171 /******************************************************************************
173 * allocation and deallocation routines
175 *****************************************************************************/
178 rf_FreeDAG(RF_DagHeader_t
*dag_h
)
180 RF_AccessStripeMapHeader_t
*asmap
, *t_asmap
;
181 RF_PhysDiskAddr_t
*pda
;
182 RF_DagNode_t
*tmpnode
;
183 RF_DagHeader_t
*nextDag
;
186 nextDag
= dag_h
->next
;
187 rf_FreeAllocList(dag_h
->allocList
);
188 for (asmap
= dag_h
->asmList
; asmap
;) {
191 rf_FreeAccessStripeMap(t_asmap
);
193 while (dag_h
->pda_cleanup_list
) {
194 pda
= dag_h
->pda_cleanup_list
;
195 dag_h
->pda_cleanup_list
= dag_h
->pda_cleanup_list
->next
;
196 rf_FreePhysDiskAddr(pda
);
198 while (dag_h
->nodes
) {
199 tmpnode
= dag_h
->nodes
;
200 dag_h
->nodes
= dag_h
->nodes
->list_next
;
201 rf_FreeDAGNode(tmpnode
);
203 rf_FreeDAGHeader(dag_h
);
208 #define RF_MAX_FREE_DAGH 128
209 #define RF_MIN_FREE_DAGH 32
211 #define RF_MAX_FREE_DAGNODE 512 /* XXX Tune this... */
212 #define RF_MIN_FREE_DAGNODE 128 /* XXX Tune this... */
214 #define RF_MAX_FREE_DAGLIST 128
215 #define RF_MIN_FREE_DAGLIST 32
217 #define RF_MAX_FREE_DAGPCACHE 128
218 #define RF_MIN_FREE_DAGPCACHE 8
220 #define RF_MAX_FREE_FUNCLIST 128
221 #define RF_MIN_FREE_FUNCLIST 32
223 #define RF_MAX_FREE_BUFFERS 128
224 #define RF_MIN_FREE_BUFFERS 32
226 static void rf_ShutdownDAGs(void *);
228 rf_ShutdownDAGs(void *ignored
)
230 pool_destroy(&rf_pools
.dagh
);
231 pool_destroy(&rf_pools
.dagnode
);
232 pool_destroy(&rf_pools
.daglist
);
233 pool_destroy(&rf_pools
.dagpcache
);
234 pool_destroy(&rf_pools
.funclist
);
238 rf_ConfigureDAGs(RF_ShutdownList_t
**listp
)
241 rf_pool_init(&rf_pools
.dagnode
, sizeof(RF_DagNode_t
),
242 "rf_dagnode_pl", RF_MIN_FREE_DAGNODE
, RF_MAX_FREE_DAGNODE
);
243 rf_pool_init(&rf_pools
.dagh
, sizeof(RF_DagHeader_t
),
244 "rf_dagh_pl", RF_MIN_FREE_DAGH
, RF_MAX_FREE_DAGH
);
245 rf_pool_init(&rf_pools
.daglist
, sizeof(RF_DagList_t
),
246 "rf_daglist_pl", RF_MIN_FREE_DAGLIST
, RF_MAX_FREE_DAGLIST
);
247 rf_pool_init(&rf_pools
.dagpcache
, RF_DAGPCACHE_SIZE
,
248 "rf_dagpcache_pl", RF_MIN_FREE_DAGPCACHE
, RF_MAX_FREE_DAGPCACHE
);
249 rf_pool_init(&rf_pools
.funclist
, sizeof(RF_FuncList_t
),
250 "rf_funclist_pl", RF_MIN_FREE_FUNCLIST
, RF_MAX_FREE_FUNCLIST
);
251 rf_ShutdownCreate(listp
, rf_ShutdownDAGs
, NULL
);
257 rf_AllocDAGHeader(void)
261 dh
= pool_get(&rf_pools
.dagh
, PR_WAITOK
);
262 memset((char *) dh
, 0, sizeof(RF_DagHeader_t
));
267 rf_FreeDAGHeader(RF_DagHeader_t
* dh
)
269 pool_put(&rf_pools
.dagh
, dh
);
273 rf_AllocDAGNode(void)
277 node
= pool_get(&rf_pools
.dagnode
, PR_WAITOK
);
278 memset(node
, 0, sizeof(RF_DagNode_t
));
283 rf_FreeDAGNode(RF_DagNode_t
*node
)
285 if (node
->big_dag_ptrs
) {
286 rf_FreeDAGPCache(node
->big_dag_ptrs
);
288 if (node
->big_dag_params
) {
289 rf_FreeDAGPCache(node
->big_dag_params
);
291 pool_put(&rf_pools
.dagnode
, node
);
295 rf_AllocDAGList(void)
297 RF_DagList_t
*dagList
;
299 dagList
= pool_get(&rf_pools
.daglist
, PR_WAITOK
);
300 memset(dagList
, 0, sizeof(RF_DagList_t
));
306 rf_FreeDAGList(RF_DagList_t
*dagList
)
308 pool_put(&rf_pools
.daglist
, dagList
);
312 rf_AllocDAGPCache(void)
315 p
= pool_get(&rf_pools
.dagpcache
, PR_WAITOK
);
316 memset(p
, 0, RF_DAGPCACHE_SIZE
);
322 rf_FreeDAGPCache(void *p
)
324 pool_put(&rf_pools
.dagpcache
, p
);
328 rf_AllocFuncList(void)
330 RF_FuncList_t
*funcList
;
332 funcList
= pool_get(&rf_pools
.funclist
, PR_WAITOK
);
333 memset(funcList
, 0, sizeof(RF_FuncList_t
));
339 rf_FreeFuncList(RF_FuncList_t
*funcList
)
341 pool_put(&rf_pools
.funclist
, funcList
);
344 /* allocates a stripe buffer -- a buffer large enough to hold all the data
349 rf_AllocStripeBuffer(RF_Raid_t
*raidPtr
, RF_DagHeader_t
*dag_h
,
352 RF_VoidPointerListElem_t
*vple
;
355 RF_ASSERT((size
<= (raidPtr
->numCol
* (raidPtr
->Layout
.sectorsPerStripeUnit
<<
356 raidPtr
->logBytesPerSector
))));
358 p
= malloc( raidPtr
->numCol
* (raidPtr
->Layout
.sectorsPerStripeUnit
<<
359 raidPtr
->logBytesPerSector
),
360 M_RAIDFRAME
, M_NOWAIT
);
362 RF_LOCK_MUTEX(raidPtr
->mutex
);
363 if (raidPtr
->stripebuf_count
> 0) {
364 vple
= raidPtr
->stripebuf
;
365 raidPtr
->stripebuf
= vple
->next
;
367 rf_FreeVPListElem(vple
);
368 raidPtr
->stripebuf_count
--;
371 printf("raid%d: Help! Out of emergency full-stripe buffers!\n", raidPtr
->raidid
);
374 RF_UNLOCK_MUTEX(raidPtr
->mutex
);
376 /* We didn't get a buffer... not much we can do other than wait,
377 and hope that someone frees up memory for us.. */
378 p
= malloc( raidPtr
->numCol
* (raidPtr
->Layout
.sectorsPerStripeUnit
<<
379 raidPtr
->logBytesPerSector
), M_RAIDFRAME
, M_WAITOK
);
382 memset(p
, 0, raidPtr
->numCol
* (raidPtr
->Layout
.sectorsPerStripeUnit
<< raidPtr
->logBytesPerSector
));
384 vple
= rf_AllocVPListElem();
386 vple
->next
= dag_h
->desc
->stripebufs
;
387 dag_h
->desc
->stripebufs
= vple
;
394 rf_FreeStripeBuffer(RF_Raid_t
*raidPtr
, RF_VoidPointerListElem_t
*vple
)
396 RF_LOCK_MUTEX(raidPtr
->mutex
);
397 if (raidPtr
->stripebuf_count
< raidPtr
->numEmergencyStripeBuffers
) {
398 /* just tack it in */
399 vple
->next
= raidPtr
->stripebuf
;
400 raidPtr
->stripebuf
= vple
;
401 raidPtr
->stripebuf_count
++;
403 free(vple
->p
, M_RAIDFRAME
);
404 rf_FreeVPListElem(vple
);
406 RF_UNLOCK_MUTEX(raidPtr
->mutex
);
409 /* allocates a buffer big enough to hold the data described by the
410 caller (ie. the data of the associated PDA). Glue this buffer
411 into our dag_h cleanup structure. */
414 rf_AllocBuffer(RF_Raid_t
*raidPtr
, RF_DagHeader_t
*dag_h
, int size
)
416 RF_VoidPointerListElem_t
*vple
;
419 p
= rf_AllocIOBuffer(raidPtr
, size
);
420 vple
= rf_AllocVPListElem();
422 vple
->next
= dag_h
->desc
->iobufs
;
423 dag_h
->desc
->iobufs
= vple
;
429 rf_AllocIOBuffer(RF_Raid_t
*raidPtr
, int size
)
431 RF_VoidPointerListElem_t
*vple
;
434 RF_ASSERT((size
<= (raidPtr
->Layout
.sectorsPerStripeUnit
<<
435 raidPtr
->logBytesPerSector
)));
437 p
= malloc( raidPtr
->Layout
.sectorsPerStripeUnit
<<
438 raidPtr
->logBytesPerSector
,
439 M_RAIDFRAME
, M_NOWAIT
);
441 RF_LOCK_MUTEX(raidPtr
->mutex
);
442 if (raidPtr
->iobuf_count
> 0) {
443 vple
= raidPtr
->iobuf
;
444 raidPtr
->iobuf
= vple
->next
;
446 rf_FreeVPListElem(vple
);
447 raidPtr
->iobuf_count
--;
450 printf("raid%d: Help! Out of emergency buffers!\n", raidPtr
->raidid
);
453 RF_UNLOCK_MUTEX(raidPtr
->mutex
);
455 /* We didn't get a buffer... not much we can do other than wait,
456 and hope that someone frees up memory for us.. */
457 p
= malloc( raidPtr
->Layout
.sectorsPerStripeUnit
<<
458 raidPtr
->logBytesPerSector
,
459 M_RAIDFRAME
, M_WAITOK
);
462 memset(p
, 0, raidPtr
->Layout
.sectorsPerStripeUnit
<< raidPtr
->logBytesPerSector
);
467 rf_FreeIOBuffer(RF_Raid_t
*raidPtr
, RF_VoidPointerListElem_t
*vple
)
469 RF_LOCK_MUTEX(raidPtr
->mutex
);
470 if (raidPtr
->iobuf_count
< raidPtr
->numEmergencyBuffers
) {
471 /* just tack it in */
472 vple
->next
= raidPtr
->iobuf
;
473 raidPtr
->iobuf
= vple
;
474 raidPtr
->iobuf_count
++;
476 free(vple
->p
, M_RAIDFRAME
);
477 rf_FreeVPListElem(vple
);
479 RF_UNLOCK_MUTEX(raidPtr
->mutex
);
484 #if RF_DEBUG_VALIDATE_DAG
485 /******************************************************************************
489 *****************************************************************************/
492 rf_NodeStatusString(RF_DagNode_t
*node
)
494 switch (node
->status
) {
509 rf_PrintNodeInfoString(RF_DagNode_t
*node
)
511 RF_PhysDiskAddr_t
*pda
;
512 int (*df
) (RF_DagNode_t
*) = node
->doFunc
;
516 if ((df
== rf_DiskReadFunc
) || (df
== rf_DiskWriteFunc
)
517 || (df
== rf_DiskReadMirrorIdleFunc
)
518 || (df
== rf_DiskReadMirrorPartitionFunc
)) {
519 pda
= (RF_PhysDiskAddr_t
*) node
->params
[0].p
;
520 bufPtr
= (void *) node
->params
[1].p
;
523 RF_ASSERT(!(lk
&& unlk
));
524 printf("c %d offs %ld nsect %d buf 0x%lx %s\n", pda
->col
,
525 (long) pda
->startSector
, (int) pda
->numSector
, (long) bufPtr
,
526 (lk
) ? "LOCK" : ((unlk
) ? "UNLK" : " "));
529 if ((df
== rf_SimpleXorFunc
) || (df
== rf_RegularXorFunc
)
530 || (df
== rf_RecoveryXorFunc
)) {
531 printf("result buf 0x%lx\n", (long) node
->results
[0]);
532 for (i
= 0; i
< node
->numParams
- 1; i
+= 2) {
533 pda
= (RF_PhysDiskAddr_t
*) node
->params
[i
].p
;
534 bufPtr
= (RF_PhysDiskAddr_t
*) node
->params
[i
+ 1].p
;
535 printf(" buf 0x%lx c%d offs %ld nsect %d\n",
536 (long) bufPtr
, pda
->col
,
537 (long) pda
->startSector
, (int) pda
->numSector
);
541 #if RF_INCLUDE_PARITYLOGGING > 0
542 if (df
== rf_ParityLogOverwriteFunc
|| df
== rf_ParityLogUpdateFunc
) {
543 for (i
= 0; i
< node
->numParams
- 1; i
+= 2) {
544 pda
= (RF_PhysDiskAddr_t
*) node
->params
[i
].p
;
545 bufPtr
= (RF_PhysDiskAddr_t
*) node
->params
[i
+ 1].p
;
546 printf(" c%d offs %ld nsect %d buf 0x%lx\n",
547 pda
->col
, (long) pda
->startSector
,
548 (int) pda
->numSector
, (long) bufPtr
);
552 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
554 if ((df
== rf_TerminateFunc
) || (df
== rf_NullNodeFunc
)) {
562 rf_RecurPrintDAG(RF_DagNode_t
*node
, int depth
, int unvisited
)
567 node
->visited
= (unvisited
) ? 0 : 1;
568 printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth
,
569 node
->nodeNum
, node
->commitNode
, node
->name
, rf_NodeStatusString(node
),
570 node
->numSuccedents
, node
->numSuccFired
, node
->numSuccDone
,
571 node
->numAntecedents
, node
->numAntDone
, node
->numParams
, node
->numResults
);
572 for (i
= 0; i
< node
->numSuccedents
; i
++) {
573 printf("%d%s", node
->succedents
[i
]->nodeNum
,
574 ((i
== node
->numSuccedents
- 1) ? "\0" : " "));
577 for (i
= 0; i
< node
->numAntecedents
; i
++) {
578 switch (node
->antType
[i
]) {
595 printf("%d(%s)%s", node
->antecedents
[i
]->nodeNum
, anttype
, (i
== node
->numAntecedents
- 1) ? "\0" : " ");
598 rf_PrintNodeInfoString(node
);
599 for (i
= 0; i
< node
->numSuccedents
; i
++) {
600 if (node
->succedents
[i
]->visited
== unvisited
)
601 rf_RecurPrintDAG(node
->succedents
[i
], depth
+ 1, unvisited
);
606 rf_PrintDAG(RF_DagHeader_t
*dag_h
)
612 switch (dag_h
->status
) {
617 status
= "rollForward";
619 case rf_rollBackward
:
620 status
= "rollBackward";
626 /* find out if visited bits are currently set or clear */
627 unvisited
= dag_h
->succedents
[0]->visited
;
629 printf("DAG type: %s\n", dag_h
->creator
);
630 printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
631 printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h
->nodeNum
,
632 status
, dag_h
->numSuccedents
, dag_h
->numCommitNodes
, dag_h
->numCommits
);
633 for (i
= 0; i
< dag_h
->numSuccedents
; i
++) {
634 printf("%d%s", dag_h
->succedents
[i
]->nodeNum
,
635 ((i
== dag_h
->numSuccedents
- 1) ? "\0" : " "));
638 for (i
= 0; i
< dag_h
->numSuccedents
; i
++) {
639 if (dag_h
->succedents
[i
]->visited
== unvisited
)
640 rf_RecurPrintDAG(dag_h
->succedents
[i
], 1, unvisited
);
644 /* assigns node numbers */
646 rf_AssignNodeNums(RF_DagHeader_t
* dag_h
)
648 int unvisited
, i
, nnum
;
652 unvisited
= dag_h
->succedents
[0]->visited
;
654 dag_h
->nodeNum
= nnum
++;
655 for (i
= 0; i
< dag_h
->numSuccedents
; i
++) {
656 node
= dag_h
->succedents
[i
];
657 if (node
->visited
== unvisited
) {
658 nnum
= rf_RecurAssignNodeNums(dag_h
->succedents
[i
], nnum
, unvisited
);
665 rf_RecurAssignNodeNums(RF_DagNode_t
*node
, int num
, int unvisited
)
669 node
->visited
= (unvisited
) ? 0 : 1;
671 node
->nodeNum
= num
++;
672 for (i
= 0; i
< node
->numSuccedents
; i
++) {
673 if (node
->succedents
[i
]->visited
== unvisited
) {
674 num
= rf_RecurAssignNodeNums(node
->succedents
[i
], num
, unvisited
);
679 /* set the header pointers in each node to "newptr" */
681 rf_ResetDAGHeaderPointers(RF_DagHeader_t
*dag_h
, RF_DagHeader_t
*newptr
)
684 for (i
= 0; i
< dag_h
->numSuccedents
; i
++)
685 if (dag_h
->succedents
[i
]->dagHdr
!= newptr
)
686 rf_RecurResetDAGHeaderPointers(dag_h
->succedents
[i
], newptr
);
690 rf_RecurResetDAGHeaderPointers(RF_DagNode_t
*node
, RF_DagHeader_t
*newptr
)
693 node
->dagHdr
= newptr
;
694 for (i
= 0; i
< node
->numSuccedents
; i
++)
695 if (node
->succedents
[i
]->dagHdr
!= newptr
)
696 rf_RecurResetDAGHeaderPointers(node
->succedents
[i
], newptr
);
701 rf_PrintDAGList(RF_DagHeader_t
* dag_h
)
705 for (; dag_h
; dag_h
= dag_h
->next
) {
706 rf_AssignNodeNums(dag_h
);
707 printf("\n\nDAG %d IN LIST:\n", i
++);
713 rf_ValidateBranch(RF_DagNode_t
*node
, int *scount
, int *acount
,
714 RF_DagNode_t
**nodes
, int unvisited
)
718 /* construct an array of node pointers indexed by node num */
719 node
->visited
= (unvisited
) ? 0 : 1;
720 nodes
[node
->nodeNum
] = node
;
722 if (node
->next
!= NULL
) {
723 printf("INVALID DAG: next pointer in node is not NULL\n");
726 if (node
->status
!= rf_wait
) {
727 printf("INVALID DAG: Node status is not wait\n");
730 if (node
->numAntDone
!= 0) {
731 printf("INVALID DAG: numAntDone is not zero\n");
734 if (node
->doFunc
== rf_TerminateFunc
) {
735 if (node
->numSuccedents
!= 0) {
736 printf("INVALID DAG: Terminator node has succedents\n");
740 if (node
->numSuccedents
== 0) {
741 printf("INVALID DAG: Non-terminator node has no succedents\n");
745 for (i
= 0; i
< node
->numSuccedents
; i
++) {
746 if (!node
->succedents
[i
]) {
747 printf("INVALID DAG: succedent %d of node %s is NULL\n", i
, node
->name
);
750 scount
[node
->succedents
[i
]->nodeNum
]++;
752 for (i
= 0; i
< node
->numAntecedents
; i
++) {
753 if (!node
->antecedents
[i
]) {
754 printf("INVALID DAG: antecedent %d of node %s is NULL\n", i
, node
->name
);
757 acount
[node
->antecedents
[i
]->nodeNum
]++;
759 for (i
= 0; i
< node
->numSuccedents
; i
++) {
760 if (node
->succedents
[i
]->visited
== unvisited
) {
761 if (rf_ValidateBranch(node
->succedents
[i
], scount
,
762 acount
, nodes
, unvisited
)) {
771 rf_ValidateBranchVisitedBits(RF_DagNode_t
*node
, int unvisited
, int rl
)
775 RF_ASSERT(node
->visited
== unvisited
);
776 for (i
= 0; i
< node
->numSuccedents
; i
++) {
777 if (node
->succedents
[i
] == NULL
) {
778 printf("node=%lx node->succedents[%d] is NULL\n", (long) node
, i
);
781 rf_ValidateBranchVisitedBits(node
->succedents
[i
], unvisited
, rl
+ 1);
784 /* NOTE: never call this on a big dag, because it is exponential
788 rf_ValidateVisitedBits(RF_DagHeader_t
*dag
)
792 unvisited
= dag
->succedents
[0]->visited
;
794 for (i
= 0; i
< dag
->numSuccedents
; i
++) {
795 if (dag
->succedents
[i
] == NULL
) {
796 printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag
, i
);
799 rf_ValidateBranchVisitedBits(dag
->succedents
[i
], unvisited
, 0);
802 /* validate a DAG. _at entry_ verify that:
803 * -- numNodesCompleted is zero
804 * -- node queue is null
805 * -- dag status is rf_enable
806 * -- next pointer is null on every node
807 * -- all nodes have status wait
808 * -- numAntDone is zero in all nodes
809 * -- terminator node has zero successors
810 * -- no other node besides terminator has zero successors
811 * -- no successor or antecedent pointer in a node is NULL
812 * -- number of times that each node appears as a successor of another node
813 * is equal to the antecedent count on that node
814 * -- number of times that each node appears as an antecedent of another node
815 * is equal to the succedent count on that node
819 rf_ValidateDAG(RF_DagHeader_t
*dag_h
)
822 int *scount
, *acount
;/* per-node successor and antecedent counts */
823 RF_DagNode_t
**nodes
; /* array of ptrs to nodes in dag */
826 int commitNodeCount
= 0;
828 if (rf_validateVisitedDebug
)
829 rf_ValidateVisitedBits(dag_h
);
831 if (dag_h
->numNodesCompleted
!= 0) {
832 printf("INVALID DAG: num nodes completed is %d, should be 0\n", dag_h
->numNodesCompleted
);
834 goto validate_dag_bad
;
836 if (dag_h
->status
!= rf_enable
) {
837 printf("INVALID DAG: not enabled\n");
839 goto validate_dag_bad
;
841 if (dag_h
->numCommits
!= 0) {
842 printf("INVALID DAG: numCommits != 0 (%d)\n", dag_h
->numCommits
);
844 goto validate_dag_bad
;
846 if (dag_h
->numSuccedents
!= 1) {
847 /* currently, all dags must have only one succedent */
848 printf("INVALID DAG: numSuccedents !1 (%d)\n", dag_h
->numSuccedents
);
850 goto validate_dag_bad
;
852 nodecount
= rf_AssignNodeNums(dag_h
);
854 unvisited
= dag_h
->succedents
[0]->visited
;
856 RF_Malloc(scount
, nodecount
* sizeof(int), (int *));
857 RF_Malloc(acount
, nodecount
* sizeof(int), (int *));
858 RF_Malloc(nodes
, nodecount
* sizeof(RF_DagNode_t
*),
860 for (i
= 0; i
< dag_h
->numSuccedents
; i
++) {
861 if ((dag_h
->succedents
[i
]->visited
== unvisited
)
862 && rf_ValidateBranch(dag_h
->succedents
[i
], scount
,
863 acount
, nodes
, unvisited
)) {
867 /* start at 1 to skip the header node */
868 for (i
= 1; i
< nodecount
; i
++) {
869 if (nodes
[i
]->commitNode
)
871 if (nodes
[i
]->doFunc
== NULL
) {
872 printf("INVALID DAG: node %s has an undefined doFunc\n", nodes
[i
]->name
);
874 goto validate_dag_out
;
876 if (nodes
[i
]->undoFunc
== NULL
) {
877 printf("INVALID DAG: node %s has an undefined doFunc\n", nodes
[i
]->name
);
879 goto validate_dag_out
;
881 if (nodes
[i
]->numAntecedents
!= scount
[nodes
[i
]->nodeNum
]) {
882 printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
883 nodes
[i
]->name
, nodes
[i
]->numAntecedents
, scount
[nodes
[i
]->nodeNum
]);
885 goto validate_dag_out
;
887 if (nodes
[i
]->numSuccedents
!= acount
[nodes
[i
]->nodeNum
]) {
888 printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
889 nodes
[i
]->name
, nodes
[i
]->numSuccedents
, acount
[nodes
[i
]->nodeNum
]);
891 goto validate_dag_out
;
895 if (dag_h
->numCommitNodes
!= commitNodeCount
) {
896 printf("INVALID DAG: incorrect commit node count. hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
897 dag_h
->numCommitNodes
, commitNodeCount
);
899 goto validate_dag_out
;
902 RF_Free(scount
, nodecount
* sizeof(int));
903 RF_Free(acount
, nodecount
* sizeof(int));
904 RF_Free(nodes
, nodecount
* sizeof(RF_DagNode_t
*));
906 rf_PrintDAGList(dag_h
);
908 if (rf_validateVisitedDebug
)
909 rf_ValidateVisitedBits(dag_h
);
914 rf_PrintDAGList(dag_h
);
918 #endif /* RF_DEBUG_VALIDATE_DAG */
920 /******************************************************************************
922 * misc construction routines
924 *****************************************************************************/
927 rf_redirect_asm(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
)
929 int ds
= (raidPtr
->Layout
.map
->flags
& RF_DISTRIBUTE_SPARE
) ? 1 : 0;
930 int fcol
= raidPtr
->reconControl
->fcol
;
931 int scol
= raidPtr
->reconControl
->spareCol
;
932 RF_PhysDiskAddr_t
*pda
;
934 RF_ASSERT(raidPtr
->status
== rf_rs_reconstructing
);
935 for (pda
= asmap
->physInfo
; pda
; pda
= pda
->next
) {
936 if (pda
->col
== fcol
) {
939 if (!rf_CheckRUReconstructed(raidPtr
->reconControl
->reconMap
,
945 /* printf("Remapped data for large write\n"); */
947 raidPtr
->Layout
.map
->MapSector(raidPtr
, pda
->raidAddress
,
948 &pda
->col
, &pda
->startSector
, RF_REMAP
);
954 for (pda
= asmap
->parityInfo
; pda
; pda
= pda
->next
) {
955 if (pda
->col
== fcol
) {
958 if (!rf_CheckRUReconstructed(raidPtr
->reconControl
->reconMap
, pda
->startSector
)) {
965 (raidPtr
->Layout
.map
->MapParity
) (raidPtr
, pda
->raidAddress
, &pda
->col
, &pda
->startSector
, RF_REMAP
);
973 /* this routine allocates read buffers and generates stripe maps for the
974 * regions of the array from the start of the stripe to the start of the
975 * access, and from the end of the access to the end of the stripe. It also
976 * computes and returns the number of DAG nodes needed to read all this data.
977 * Note that this routine does the wrong thing if the access is fully
978 * contained within one stripe unit, so we RF_ASSERT against this case at the
981 * layoutPtr - in: layout information
982 * asmap - in: access stripe map
983 * dag_h - in: header of the dag to create
984 * new_asm_h - in: ptr to array of 2 headers. to be filled in
985 * nRodNodes - out: num nodes to be generated to read unaccessed data
986 * sosBuffer, eosBuffer - out: pointers to newly allocated buffer
989 rf_MapUnaccessedPortionOfStripe(RF_Raid_t
*raidPtr
,
990 RF_RaidLayout_t
*layoutPtr
,
991 RF_AccessStripeMap_t
*asmap
,
992 RF_DagHeader_t
*dag_h
,
993 RF_AccessStripeMapHeader_t
**new_asm_h
,
995 char **sosBuffer
, char **eosBuffer
,
996 RF_AllocListElem_t
*allocList
)
998 RF_RaidAddr_t sosRaidAddress
, eosRaidAddress
;
999 RF_SectorNum_t sosNumSector
, eosNumSector
;
1001 RF_ASSERT(asmap
->numStripeUnitsAccessed
> (layoutPtr
->numDataCol
/ 2));
1002 /* generate an access map for the region of the array from start of
1003 * stripe to start of access */
1004 new_asm_h
[0] = new_asm_h
[1] = NULL
;
1006 if (!rf_RaidAddressStripeAligned(layoutPtr
, asmap
->raidAddress
)) {
1007 sosRaidAddress
= rf_RaidAddressOfPrevStripeBoundary(layoutPtr
, asmap
->raidAddress
);
1008 sosNumSector
= asmap
->raidAddress
- sosRaidAddress
;
1009 *sosBuffer
= rf_AllocStripeBuffer(raidPtr
, dag_h
, rf_RaidAddressToByte(raidPtr
, sosNumSector
));
1010 new_asm_h
[0] = rf_MapAccess(raidPtr
, sosRaidAddress
, sosNumSector
, *sosBuffer
, RF_DONT_REMAP
);
1011 new_asm_h
[0]->next
= dag_h
->asmList
;
1012 dag_h
->asmList
= new_asm_h
[0];
1013 *nRodNodes
+= new_asm_h
[0]->stripeMap
->numStripeUnitsAccessed
;
1015 RF_ASSERT(new_asm_h
[0]->stripeMap
->next
== NULL
);
1016 /* we're totally within one stripe here */
1017 if (asmap
->flags
& RF_ASM_REDIR_LARGE_WRITE
)
1018 rf_redirect_asm(raidPtr
, new_asm_h
[0]->stripeMap
);
1020 /* generate an access map for the region of the array from end of
1021 * access to end of stripe */
1022 if (!rf_RaidAddressStripeAligned(layoutPtr
, asmap
->endRaidAddress
)) {
1023 eosRaidAddress
= asmap
->endRaidAddress
;
1024 eosNumSector
= rf_RaidAddressOfNextStripeBoundary(layoutPtr
, eosRaidAddress
) - eosRaidAddress
;
1025 *eosBuffer
= rf_AllocStripeBuffer(raidPtr
, dag_h
, rf_RaidAddressToByte(raidPtr
, eosNumSector
));
1026 new_asm_h
[1] = rf_MapAccess(raidPtr
, eosRaidAddress
, eosNumSector
, *eosBuffer
, RF_DONT_REMAP
);
1027 new_asm_h
[1]->next
= dag_h
->asmList
;
1028 dag_h
->asmList
= new_asm_h
[1];
1029 *nRodNodes
+= new_asm_h
[1]->stripeMap
->numStripeUnitsAccessed
;
1031 RF_ASSERT(new_asm_h
[1]->stripeMap
->next
== NULL
);
1032 /* we're totally within one stripe here */
1033 if (asmap
->flags
& RF_ASM_REDIR_LARGE_WRITE
)
1034 rf_redirect_asm(raidPtr
, new_asm_h
[1]->stripeMap
);
1040 /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
1042 rf_PDAOverlap(RF_RaidLayout_t
*layoutPtr
,
1043 RF_PhysDiskAddr_t
*src
, RF_PhysDiskAddr_t
*dest
)
1045 RF_SectorNum_t soffs
= rf_StripeUnitOffset(layoutPtr
, src
->startSector
);
1046 RF_SectorNum_t doffs
= rf_StripeUnitOffset(layoutPtr
, dest
->startSector
);
1047 /* use -1 to be sure we stay within SU */
1048 RF_SectorNum_t send
= rf_StripeUnitOffset(layoutPtr
, src
->startSector
+ src
->numSector
- 1);
1049 RF_SectorNum_t dend
= rf_StripeUnitOffset(layoutPtr
, dest
->startSector
+ dest
->numSector
- 1);
1050 return ((RF_MAX(soffs
, doffs
) <= RF_MIN(send
, dend
)) ? 1 : 0);
1054 /* GenerateFailedAccessASMs
1056 * this routine figures out what portion of the stripe needs to be read
1057 * to effect the degraded read or write operation. It's primary function
1058 * is to identify everything required to recover the data, and then
1059 * eliminate anything that is already being accessed by the user.
1061 * The main result is two new ASMs, one for the region from the start of the
1062 * stripe to the start of the access, and one for the region from the end of
1063 * the access to the end of the stripe. These ASMs describe everything that
1064 * needs to be read to effect the degraded access. Other results are:
1065 * nXorBufs -- the total number of buffers that need to be XORed together to
1066 * recover the lost data,
1067 * rpBufPtr -- ptr to a newly-allocated buffer to hold the parity. If NULL
1068 * at entry, not allocated.
1069 * overlappingPDAs --
1070 * describes which of the non-failed PDAs in the user access
1071 * overlap data that needs to be read to effect recovery.
1072 * overlappingPDAs[i]==1 if and only if, neglecting the failed
1073 * PDA, the ith pda in the input asm overlaps data that needs
1074 * to be read for recovery.
1076 /* in: asm - ASM for the actual access, one stripe only */
1077 /* in: failedPDA - which component of the access has failed */
1078 /* in: dag_h - header of the DAG we're going to create */
1079 /* out: new_asm_h - the two new ASMs */
1080 /* out: nXorBufs - the total number of xor bufs required */
1081 /* out: rpBufPtr - a buffer for the parity read */
1083 rf_GenerateFailedAccessASMs(RF_Raid_t
*raidPtr
, RF_AccessStripeMap_t
*asmap
,
1084 RF_PhysDiskAddr_t
*failedPDA
,
1085 RF_DagHeader_t
*dag_h
,
1086 RF_AccessStripeMapHeader_t
**new_asm_h
,
1087 int *nXorBufs
, char **rpBufPtr
,
1088 char *overlappingPDAs
,
1089 RF_AllocListElem_t
*allocList
)
1091 RF_RaidLayout_t
*layoutPtr
= &(raidPtr
->Layout
);
1093 /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
1094 RF_RaidAddr_t sosAddr
, sosEndAddr
, eosStartAddr
, eosAddr
;
1095 RF_PhysDiskAddr_t
*pda
;
1099 /* first compute the following raid addresses: start of stripe,
1100 * (sosAddr) MIN(start of access, start of failed SU), (sosEndAddr)
1101 * MAX(end of access, end of failed SU), (eosStartAddr) end of
1102 * stripe (i.e. start of next stripe) (eosAddr) */
1103 sosAddr
= rf_RaidAddressOfPrevStripeBoundary(layoutPtr
, asmap
->raidAddress
);
1104 sosEndAddr
= RF_MIN(asmap
->raidAddress
, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr
, failedPDA
->raidAddress
));
1105 eosStartAddr
= RF_MAX(asmap
->endRaidAddress
, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr
, failedPDA
->raidAddress
));
1106 eosAddr
= rf_RaidAddressOfNextStripeBoundary(layoutPtr
, asmap
->raidAddress
);
1108 /* now generate access stripe maps for each of the above regions of
1109 * the stripe. Use a dummy (NULL) buf ptr for now */
1111 new_asm_h
[0] = (sosAddr
!= sosEndAddr
) ? rf_MapAccess(raidPtr
, sosAddr
, sosEndAddr
- sosAddr
, NULL
, RF_DONT_REMAP
) : NULL
;
1112 new_asm_h
[1] = (eosStartAddr
!= eosAddr
) ? rf_MapAccess(raidPtr
, eosStartAddr
, eosAddr
- eosStartAddr
, NULL
, RF_DONT_REMAP
) : NULL
;
1114 /* walk through the PDAs and range-restrict each SU to the region of
1115 * the SU touched on the failed PDA. also compute total data buffer
1116 * space requirements in this step. Ignore the parity for now. */
1117 /* Also count nodes to find out how many bufs need to be xored together */
1118 (*nXorBufs
) = 1; /* in read case, 1 is for parity. In write
1119 * case, 1 is for failed data */
1122 new_asm_h
[0]->next
= dag_h
->asmList
;
1123 dag_h
->asmList
= new_asm_h
[0];
1124 for (pda
= new_asm_h
[0]->stripeMap
->physInfo
; pda
; pda
= pda
->next
) {
1125 rf_RangeRestrictPDA(raidPtr
, failedPDA
, pda
, RF_RESTRICT_NOBUFFER
, 0);
1126 pda
->bufPtr
= rf_AllocBuffer(raidPtr
, dag_h
, pda
->numSector
<< raidPtr
->logBytesPerSector
);
1128 (*nXorBufs
) += new_asm_h
[0]->stripeMap
->numStripeUnitsAccessed
;
1131 new_asm_h
[1]->next
= dag_h
->asmList
;
1132 dag_h
->asmList
= new_asm_h
[1];
1133 for (pda
= new_asm_h
[1]->stripeMap
->physInfo
; pda
; pda
= pda
->next
) {
1134 rf_RangeRestrictPDA(raidPtr
, failedPDA
, pda
, RF_RESTRICT_NOBUFFER
, 0);
1135 pda
->bufPtr
= rf_AllocBuffer(raidPtr
, dag_h
, pda
->numSector
<< raidPtr
->logBytesPerSector
);
1137 (*nXorBufs
) += new_asm_h
[1]->stripeMap
->numStripeUnitsAccessed
;
1140 /* allocate a buffer for parity */
1142 *rpBufPtr
= rf_AllocBuffer(raidPtr
, dag_h
, failedPDA
->numSector
<< raidPtr
->logBytesPerSector
);
1144 /* the last step is to figure out how many more distinct buffers need
1145 * to get xor'd to produce the missing unit. there's one for each
1146 * user-data read node that overlaps the portion of the failed unit
1149 for (foundit
= i
= 0, pda
= asmap
->physInfo
; pda
; i
++, pda
= pda
->next
) {
1150 if (pda
== failedPDA
) {
1155 if (rf_PDAOverlap(layoutPtr
, pda
, failedPDA
)) {
1156 overlappingPDAs
[i
] = 1;
1161 RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n");
1165 if (rf_degDagDebug
) {
1167 printf("First asm:\n");
1168 rf_PrintFullAccessStripeMap(new_asm_h
[0], 1);
1171 printf("Second asm:\n");
1172 rf_PrintFullAccessStripeMap(new_asm_h
[1], 1);
1179 /* adjusts the offset and number of sectors in the destination pda so that
1180 * it covers at most the region of the SU covered by the source PDA. This
1181 * is exclusively a restriction: the number of sectors indicated by the
1182 * target PDA can only shrink.
1184 * For example: s = sectors within SU indicated by source PDA
1185 * d = sectors within SU indicated by dest PDA
1186 * r = results, stored in dest PDA
1188 * |--------------- one stripe unit ---------------------|
1189 * | sssssssssssssssssssssssssssssssss |
1190 * | ddddddddddddddddddddddddddddddddddddddddddddd |
1191 * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
1195 * |--------------- one stripe unit ---------------------|
1196 * | sssssssssssssssssssssssssssssssss |
1197 * | ddddddddddddddddddddddd |
1198 * | rrrrrrrrrrrrrrrr |
1202 rf_RangeRestrictPDA(RF_Raid_t
*raidPtr
, RF_PhysDiskAddr_t
*src
,
1203 RF_PhysDiskAddr_t
*dest
, int dobuffer
, int doraidaddr
)
1205 RF_RaidLayout_t
*layoutPtr
= &raidPtr
->Layout
;
1206 RF_SectorNum_t soffs
= rf_StripeUnitOffset(layoutPtr
, src
->startSector
);
1207 RF_SectorNum_t doffs
= rf_StripeUnitOffset(layoutPtr
, dest
->startSector
);
1208 RF_SectorNum_t send
= rf_StripeUnitOffset(layoutPtr
, src
->startSector
+ src
->numSector
- 1); /* use -1 to be sure we
1210 RF_SectorNum_t dend
= rf_StripeUnitOffset(layoutPtr
, dest
->startSector
+ dest
->numSector
- 1);
1211 RF_SectorNum_t subAddr
= rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr
, dest
->startSector
); /* stripe unit boundary */
1213 dest
->startSector
= subAddr
+ RF_MAX(soffs
, doffs
);
1214 dest
->numSector
= subAddr
+ RF_MIN(send
, dend
) + 1 - dest
->startSector
;
1217 dest
->bufPtr
= (char *)(dest
->bufPtr
) + ((soffs
> doffs
) ? rf_RaidAddressToByte(raidPtr
, soffs
- doffs
) : 0);
1219 dest
->raidAddress
= rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr
, dest
->raidAddress
) +
1220 rf_StripeUnitOffset(layoutPtr
, dest
->startSector
);
1224 #if (RF_INCLUDE_CHAINDECLUSTER > 0)
1227 * Want the highest of these primes to be the largest one
1228 * less than the max expected number of columns (won't hurt
1229 * to be too small or too large, but won't be optimal, either)
1232 #define NLOWPRIMES 8
1233 static int lowprimes
[NLOWPRIMES
] = {2, 3, 5, 7, 11, 13, 17, 19};
1234 /*****************************************************************************
1235 * compute the workload shift factor. (chained declustering)
1237 * return nonzero if access should shift to secondary, otherwise,
1238 * access is to primary
1239 *****************************************************************************/
1241 rf_compute_workload_shift(RF_Raid_t
*raidPtr
, RF_PhysDiskAddr_t
*pda
)
1245 * d = column of disk containing primary
1246 * f = column of failed disk
1247 * n = number of disks in array
1248 * sd = "shift distance" (number of columns that d is to the right of f)
1249 * v = numerator of redirection ratio
1250 * k = denominator of redirection ratio
1252 RF_RowCol_t d
, f
, sd
, n
;
1255 n
= raidPtr
->numCol
;
1257 /* assign column of primary copy to d */
1260 /* assign column of dead disk to f */
1261 for (f
= 0; ((!RF_DEAD_DISK(raidPtr
->Disks
[f
].status
)) && (f
< n
)); f
++);
1266 sd
= (f
> d
) ? (n
+ d
- f
) : (d
- f
);
1270 * v of every k accesses should be redirected
1272 * v/k := (n-1-sd)/(n-1)
1282 * Now reduce the fraction, by repeatedly factoring
1283 * out primes (just like they teach in elementary school!)
1285 for (i
= 0; i
< NLOWPRIMES
; i
++) {
1286 if (lowprimes
[i
] > v
)
1288 while (((v
% lowprimes
[i
]) == 0) && ((k
% lowprimes
[i
]) == 0)) {
1295 raidPtr
->hist_diskreq
[d
]++;
1296 if (raidPtr
->hist_diskreq
[d
] > v
) {
1297 ret
= 0; /* do not redirect */
1299 ret
= 1; /* redirect */
1303 printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d
, f
, sd
, v
, k
, ret
,
1304 raidPtr
->hist_diskreq
[d
]);
1307 if (raidPtr
->hist_diskreq
[d
] >= k
) {
1309 raidPtr
->hist_diskreq
[d
] = 0;
1313 #endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */
1316 * Disk selection routines
1320 * Selects the disk with the shortest queue from a mirror pair.
1321 * Both the disk I/Os queued in RAIDframe as well as those at the physical
1322 * disk are counted as members of the "queue"
1325 rf_SelectMirrorDiskIdle(RF_DagNode_t
* node
)
1327 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->dagHdr
->raidPtr
;
1328 RF_RowCol_t colData
, colMirror
;
1329 int dataQueueLength
, mirrorQueueLength
, usemirror
;
1330 RF_PhysDiskAddr_t
*data_pda
= (RF_PhysDiskAddr_t
*) node
->params
[0].p
;
1331 RF_PhysDiskAddr_t
*mirror_pda
= (RF_PhysDiskAddr_t
*) node
->params
[4].p
;
1332 RF_PhysDiskAddr_t
*tmp_pda
;
1333 RF_RaidDisk_t
*disks
= raidPtr
->Disks
;
1334 RF_DiskQueue_t
*dqs
= raidPtr
->Queues
, *dataQueue
, *mirrorQueue
;
1336 /* return the [row col] of the disk with the shortest queue */
1337 colData
= data_pda
->col
;
1338 colMirror
= mirror_pda
->col
;
1339 dataQueue
= &(dqs
[colData
]);
1340 mirrorQueue
= &(dqs
[colMirror
]);
1342 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1343 RF_LOCK_QUEUE_MUTEX(dataQueue
, "SelectMirrorDiskIdle");
1344 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1345 dataQueueLength
= dataQueue
->queueLength
+ dataQueue
->numOutstanding
;
1346 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1347 RF_UNLOCK_QUEUE_MUTEX(dataQueue
, "SelectMirrorDiskIdle");
1348 RF_LOCK_QUEUE_MUTEX(mirrorQueue
, "SelectMirrorDiskIdle");
1349 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1350 mirrorQueueLength
= mirrorQueue
->queueLength
+ mirrorQueue
->numOutstanding
;
1351 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1352 RF_UNLOCK_QUEUE_MUTEX(mirrorQueue
, "SelectMirrorDiskIdle");
1353 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1356 if (RF_DEAD_DISK(disks
[colMirror
].status
)) {
1359 if (RF_DEAD_DISK(disks
[colData
].status
)) {
1362 if (raidPtr
->parity_good
== RF_RAID_DIRTY
) {
1363 /* Trust only the main disk */
1366 if (dataQueueLength
< mirrorQueueLength
) {
1369 if (mirrorQueueLength
< dataQueueLength
) {
1372 /* queues are equal length. attempt
1374 if (SNUM_DIFF(dataQueue
->last_deq_sector
, data_pda
->startSector
)
1375 <= SNUM_DIFF(mirrorQueue
->last_deq_sector
, mirror_pda
->startSector
)) {
1383 /* use mirror (parity) disk, swap params 0 & 4 */
1385 node
->params
[0].p
= mirror_pda
;
1386 node
->params
[4].p
= tmp_pda
;
1388 /* use data disk, leave param 0 unchanged */
1390 /* printf("dataQueueLength %d, mirrorQueueLength
1391 * %d\n",dataQueueLength, mirrorQueueLength); */
1393 #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0) || (RF_DEBUG_VALIDATE_DAG > 0)
1395 * Do simple partitioning. This assumes that
1396 * the data and parity disks are laid out identically.
1399 rf_SelectMirrorDiskPartition(RF_DagNode_t
* node
)
1401 RF_Raid_t
*raidPtr
= (RF_Raid_t
*) node
->dagHdr
->raidPtr
;
1402 RF_RowCol_t colData
, colMirror
;
1403 RF_PhysDiskAddr_t
*data_pda
= (RF_PhysDiskAddr_t
*) node
->params
[0].p
;
1404 RF_PhysDiskAddr_t
*mirror_pda
= (RF_PhysDiskAddr_t
*) node
->params
[4].p
;
1405 RF_PhysDiskAddr_t
*tmp_pda
;
1406 RF_RaidDisk_t
*disks
= raidPtr
->Disks
;
1409 /* return the [row col] of the disk with the shortest queue */
1410 colData
= data_pda
->col
;
1411 colMirror
= mirror_pda
->col
;
1414 if (RF_DEAD_DISK(disks
[colMirror
].status
)) {
1417 if (RF_DEAD_DISK(disks
[colData
].status
)) {
1420 if (raidPtr
->parity_good
== RF_RAID_DIRTY
) {
1421 /* Trust only the main disk */
1424 if (data_pda
->startSector
<
1425 (disks
[colData
].numBlocks
/ 2)) {
1432 /* use mirror (parity) disk, swap params 0 & 4 */
1434 node
->params
[0].p
= mirror_pda
;
1435 node
->params
[4].p
= tmp_pda
;
1437 /* use data disk, leave param 0 unchanged */