sys/dev/raidframe/rf_dagdegwr.c

   1 /*      $NetBSD: rf_dagdegwr.c,v 1.29 2006/10/12 01:31:50 christos Exp $        */
   2 /*
   3  * Copyright (c) 1995 Carnegie-Mellon University.
   4  * All rights reserved.
   5  *
   6  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
   7  *
   8  * Permission to use, copy, modify and distribute this software and
   9  * its documentation is hereby granted, provided that both the copyright
  10  * notice and this permission notice appear in all copies of the
  11  * software, derivative works or modified versions, and any portions
  12  * thereof, and that both notices appear in supporting documentation.
  13  *
  14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  17  *
  18  * Carnegie Mellon requests users of this software to return to
  19  *
  20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  21  *  School of Computer Science
  22  *  Carnegie Mellon University
  23  *  Pittsburgh PA 15213-3890
  24  *
  25  * any improvements or extensions that they make and grant Carnegie the
  26  * rights to redistribute these changes.
  27  */
  28
  29 /*
  30  * rf_dagdegwr.c
  31  *
  32  * code for creating degraded write DAGs
  33  *
  34  */
  35
  36 #include <sys/cdefs.h>
  37 __KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.29 2006/10/12 01:31:50 christos Exp $");
  38
  39 #include <dev/raidframe/raidframevar.h>
  40
  41 #include "rf_raid.h"
  42 #include "rf_dag.h"
  43 #include "rf_dagutils.h"
  44 #include "rf_dagfuncs.h"
  45 #include "rf_debugMem.h"
  46 #include "rf_general.h"
  47 #include "rf_dagdegwr.h"
  48 #include "rf_map.h"
  49
  50
  51 /******************************************************************************
  52  *
  53  * General comments on DAG creation:
  54  *
  55  * All DAGs in this file use roll-away error recovery.  Each DAG has a single
  56  * commit node, usually called "Cmt."  If an error occurs before the Cmt node
  57  * is reached, the execution engine will halt forward execution and work
  58  * backward through the graph, executing the undo functions.  Assuming that
   59  * each node in the graph prior to the Cmt node is undoable and atomic - or -
  60  * does not make changes to permanent state, the graph will fail atomically.
  61  * If an error occurs after the Cmt node executes, the engine will roll-forward
  62  * through the graph, blindly executing nodes until it reaches the end.
  63  * If a graph reaches the end, it is assumed to have completed successfully.
  64  *
  65  * A graph has only 1 Cmt node.
  66  *
  67  */
  68
  69
  70 /******************************************************************************
  71  *
  72  * The following wrappers map the standard DAG creation interface to the
  73  * DAG creation routines.  Additionally, these wrappers enable experimentation
  74  * with new DAG structures by providing an extra level of indirection, allowing
  75  * the DAG creation routines to be replaced at this single point.
  76  */
  77
  78 static
  79 RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
  80 {
  81         rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
  82             flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
  83 }
  84
  85 void
  86 rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
  87                           RF_DagHeader_t *dag_h, void *bp,
  88                           RF_RaidAccessFlags_t flags,
  89                           RF_AllocListElem_t *allocList)
  90 {
  91
  92         RF_ASSERT(asmap->numDataFailed == 1);
  93         dag_h->creator = "DegradedWriteDAG";
  94
  95         /*
  96          * if the access writes only a portion of the failed unit, and also
  97          * writes some portion of at least one surviving unit, we create two
  98          * DAGs, one for the failed component and one for the non-failed
  99          * component, and do them sequentially.  Note that the fact that we're
 100          * accessing only a portion of the failed unit indicates that the
 101          * access either starts or ends in the failed unit, and hence we need
 102          * create only two dags.  This is inefficient in that the same data or
 103          * parity can get read and written twice using this structure.  I need
 104          * to fix this to do the access all at once.
 105          */
 106         RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
 107                     asmap->failedPDAs[0]->numSector !=
 108                         raidPtr->Layout.sectorsPerStripeUnit));
 109         rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
 110             allocList);
 111 }
 112
 113
 114
 115 /******************************************************************************
 116  *
 117  * DAG creation code begins here
 118  */
 119
 120
 121
 122 /******************************************************************************
 123  *
 124  * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
 125  * write, which is as follows
 126  *
 127  *                                        / {Wnq} --\
 128  * hdr -> blockNode ->  Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
 129  *                  \  {Rod} /            \  Wnd ---/
 130  *                                        \ {Wnd} -/
 131  *
 132  * commit nodes: Xor, Wnd
 133  *
 134  * IMPORTANT:
 135  * This DAG generator does not work for double-degraded archs since it does not
 136  * generate Q
 137  *
 138  * This dag is essentially identical to the large-write dag, except that the
 139  * write to the failed data unit is suppressed.
 140  *
 141  * IMPORTANT:  this dag does not work in the case where the access writes only
 142  * a portion of the failed unit, and also writes some portion of at least one
 143  * surviving SU.  this case is handled in CreateDegradedWriteDAG above.
 144  *
 145  * The block & unblock nodes are leftovers from a previous version.  They
 146  * do nothing, but I haven't deleted them because it would be a tremendous
 147  * effort to put them back in.
 148  *
  149  * This dag is used whenever one of the data units in a write has failed.
 150  * If it is the parity unit that failed, the nonredundant write dag (below)
 151  * is used.
 152  *****************************************************************************/
 153
 154 void
 155 rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
 156                                       RF_AccessStripeMap_t *asmap,
 157                                       RF_DagHeader_t *dag_h, void *bp,
 158                                       RF_RaidAccessFlags_t flags,
 159                                       RF_AllocListElem_t *allocList,
 160                                       int nfaults,
 161                                       int (*redFunc) (RF_DagNode_t *),
 162                                       int allowBufferRecycle)
 163 {
 164         int     nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
 165                 rdnodesFaked;
 166         RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
 167         RF_DagNode_t *wndNodes, *rrdNodes, *xorNode, *commitNode;
 168         RF_DagNode_t *tmpNode, *tmpwndNode, *tmprrdNode;
 169         RF_SectorCount_t sectorsPerSU;
 170         RF_ReconUnitNum_t which_ru;
 171         char   *xorTargetBuf = NULL;    /* the target buffer for the XOR
 172                                          * operation */
 173         char   overlappingPDAs[RF_MAXCOL];/* a temporary array of flags */
 174         RF_AccessStripeMapHeader_t *new_asm_h[2];
 175         RF_PhysDiskAddr_t *pda, *parityPDA;
 176         RF_StripeNum_t parityStripeID;
 177         RF_PhysDiskAddr_t *failedPDA;
 178         RF_RaidLayout_t *layoutPtr;
 179
 180         layoutPtr = &(raidPtr->Layout);
 181         parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
 182             &which_ru);
 183         sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
 184         /* failedPDA points to the pda within the asm that targets the failed
 185          * disk */
 186         failedPDA = asmap->failedPDAs[0];
 187
 188 #if RF_DEBUG_DAG
 189         if (rf_dagDebug)
 190                 printf("[Creating degraded-write DAG]\n");
 191 #endif
 192
 193         RF_ASSERT(asmap->numDataFailed == 1);
 194         dag_h->creator = "SimpleDegradedWriteDAG";
 195
 196         /*
 197          * Generate two ASMs identifying the surviving data
 198          * we need in order to recover the lost data.
 199          */
 200         /* overlappingPDAs array must be zero'd */
 201         memset(overlappingPDAs, 0, RF_MAXCOL);
 202         rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
 203             &nXorBufs, NULL, overlappingPDAs, allocList);
 204
 205         /* create all the nodes at once */
 206         nWndNodes = asmap->numStripeUnitsAccessed - 1;  /* no access is
 207                                                          * generated for the
 208                                                          * failed pda */
 209
 210         nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
 211             ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
 212         /*
 213          * XXX
 214          *
 215          * There's a bug with a complete stripe overwrite- that means 0 reads
 216          * of old data, and the rest of the DAG generation code doesn't like
 217          * that. A release is coming, and I don't wanna risk breaking a critical
 218          * DAG generator, so here's what I'm gonna do- if there's no read nodes,
 219          * I'm gonna fake there being a read node, and I'm gonna swap in a
 220          * no-op node in its place (to make all the link-up code happy).
 221          * This should be fixed at some point.  --jimz
 222          */
 223         if (nRrdNodes == 0) {
 224                 nRrdNodes = 1;
 225                 rdnodesFaked = 1;
 226         } else {
 227                 rdnodesFaked = 0;
 228         }
 229         /* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
 230         nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
 231
 232         blockNode = rf_AllocDAGNode();
 233         blockNode->list_next = dag_h->nodes;
 234         dag_h->nodes = blockNode;
 235
 236         commitNode = rf_AllocDAGNode();
 237         commitNode->list_next = dag_h->nodes;
 238         dag_h->nodes = commitNode;
 239
 240         unblockNode = rf_AllocDAGNode();
 241         unblockNode->list_next = dag_h->nodes;
 242         dag_h->nodes = unblockNode;
 243
 244         termNode = rf_AllocDAGNode();
 245         termNode->list_next = dag_h->nodes;
 246         dag_h->nodes = termNode;
 247
 248         xorNode = rf_AllocDAGNode();
 249         xorNode->list_next = dag_h->nodes;
 250         dag_h->nodes = xorNode;
 251
 252         wnpNode = rf_AllocDAGNode();
 253         wnpNode->list_next = dag_h->nodes;
 254         dag_h->nodes = wnpNode;
 255
 256         for (i = 0; i < nWndNodes; i++) {
 257                 tmpNode = rf_AllocDAGNode();
 258                 tmpNode->list_next = dag_h->nodes;
 259                 dag_h->nodes = tmpNode;
 260         }
 261         wndNodes = dag_h->nodes;
 262
 263         for (i = 0; i < nRrdNodes; i++) {
 264                 tmpNode = rf_AllocDAGNode();
 265                 tmpNode->list_next = dag_h->nodes;
 266                 dag_h->nodes = tmpNode;
 267         }
 268         rrdNodes = dag_h->nodes;
 269
 270 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
 271         if (nfaults == 2) {
 272                 wnqNode = rf_AllocDAGNode();
 273                 wnqNode->list_next = dag_h->nodes;
 274                 dag_h->nodes = wnqNode;
 275         } else {
 276 #endif
 277                 wnqNode = NULL;
 278 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
 279         }
 280 #endif
 281
 282         /* this dag can not commit until all rrd and xor Nodes have completed */
 283         dag_h->numCommitNodes = 1;
 284         dag_h->numCommits = 0;
 285         dag_h->numSuccedents = 1;
 286
 287         RF_ASSERT(nRrdNodes > 0);
 288         rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
 289             NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
 290         rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
 291             NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
 292         rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
 293             NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
 294         rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
 295             NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
 296         rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
 297             nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
 298
 299         /*
 300          * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
 301          * the failed buffer, save a pointer to it so we can use it as the target
 302          * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
 303          * a buffer is the same size as the failed buffer, it must also be at the
 304          * same alignment within the SU.
 305          */
 306         i = 0;
 307         tmprrdNode = rrdNodes;
 308         if (new_asm_h[0]) {
 309                 for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
 310                     i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
 311                     i++, pda = pda->next) {
 312                         rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
 313                             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
 314                         RF_ASSERT(pda);
 315                         tmprrdNode->params[0].p = pda;
 316                         tmprrdNode->params[1].p = pda->bufPtr;
 317                         tmprrdNode->params[2].v = parityStripeID;
 318                         tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
 319                         tmprrdNode = tmprrdNode->list_next;
 320                 }
 321         }
 322         /* i now equals the number of stripe units accessed in new_asm_h[0] */
 323         /* Note that for tmprrdNode, this means a continuation from above, so no need to
 324            assign it anything.. */
 325         if (new_asm_h[1]) {
 326                 for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
 327                     j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
 328                     j++, pda = pda->next) {
 329                         rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
 330                             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
 331                         RF_ASSERT(pda);
 332                         tmprrdNode->params[0].p = pda;
 333                         tmprrdNode->params[1].p = pda->bufPtr;
 334                         tmprrdNode->params[2].v = parityStripeID;
 335                         tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
 336                         if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
 337                                 xorTargetBuf = pda->bufPtr;
 338                         tmprrdNode = tmprrdNode->list_next;
 339                 }
 340         }
 341         if (rdnodesFaked) {
 342                 /*
 343                  * This is where we'll init that fake noop read node
 344                  * (XXX should the wakeup func be different?)
 345                  */
 346                 /* node that rrdNodes will just be a single node... */
 347                 rf_InitNode(rrdNodes, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
 348                     NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
 349         }
 350         /*
 351          * Make a PDA for the parity unit.  The parity PDA should start at
 352          * the same offset into the SU as the failed PDA.
 353          */
 354         /* Danner comment: I don't think this copy is really necessary. We are
 355          * in one of two cases here. (1) The entire failed unit is written.
 356          * Then asmap->parityInfo will describe the entire parity. (2) We are
 357          * only writing a subset of the failed unit and nothing else. Then the
 358          * asmap->parityInfo describes the failed unit and the copy can also
 359          * be avoided. */
 360
 361         parityPDA = rf_AllocPhysDiskAddr();
 362         parityPDA->next = dag_h->pda_cleanup_list;
 363         dag_h->pda_cleanup_list = parityPDA;
 364         parityPDA->col = asmap->parityInfo->col;
 365         parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
 366             * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
 367         parityPDA->numSector = failedPDA->numSector;
 368
 369         if (!xorTargetBuf) {
 370                 xorTargetBuf = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
 371         }
 372         /* init the Wnp node */
 373         rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
 374             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
 375         wnpNode->params[0].p = parityPDA;
 376         wnpNode->params[1].p = xorTargetBuf;
 377         wnpNode->params[2].v = parityStripeID;
 378         wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
 379
 380 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
 381         /* fill in the Wnq Node */
 382         if (nfaults == 2) {
 383                 {
 384                         RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
 385                             (RF_PhysDiskAddr_t *), allocList);
 386                         parityPDA->col = asmap->qInfo->col;
 387                         parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
 388                             * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
 389                         parityPDA->numSector = failedPDA->numSector;
 390
 391                         rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
 392                             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
 393                         wnqNode->params[0].p = parityPDA;
 394                         RF_MallocAndAdd(xorNode->results[1],
 395                             rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
 396                         wnqNode->params[1].p = xorNode->results[1];
 397                         wnqNode->params[2].v = parityStripeID;
 398                         wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
 399                 }
 400         }
 401 #endif
 402         /* fill in the Wnd nodes */
 403         tmpwndNode = wndNodes;
 404         for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
 405                 if (pda == failedPDA) {
 406                         i--;
 407                         continue;
 408                 }
 409                 rf_InitNode(tmpwndNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
 410                     rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
 411                 RF_ASSERT(pda);
 412                 tmpwndNode->params[0].p = pda;
 413                 tmpwndNode->params[1].p = pda->bufPtr;
 414                 tmpwndNode->params[2].v = parityStripeID;
 415                 tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
 416                 tmpwndNode = tmpwndNode->list_next;
 417         }
 418
 419         /* fill in the results of the xor node */
 420         xorNode->results[0] = xorTargetBuf;
 421
 422         /* fill in the params of the xor node */
 423
 424         paramNum = 0;
 425         if (rdnodesFaked == 0) {
 426                 tmprrdNode = rrdNodes;
 427                 for (i = 0; i < nRrdNodes; i++) {
 428                         /* all the Rrd nodes need to be xored together */
 429                         xorNode->params[paramNum++] = tmprrdNode->params[0];
 430                         xorNode->params[paramNum++] = tmprrdNode->params[1];
 431                         tmprrdNode = tmprrdNode->list_next;
 432                 }
 433         }
 434         tmpwndNode = wndNodes;
 435         for (i = 0; i < nWndNodes; i++) {
 436                 /* any Wnd nodes that overlap the failed access need to be
 437                  * xored in */
 438                 if (overlappingPDAs[i]) {
 439                         pda = rf_AllocPhysDiskAddr();
 440                         memcpy((char *) pda, (char *) tmpwndNode->params[0].p, sizeof(RF_PhysDiskAddr_t));
 441                         /* add it into the pda_cleanup_list *after* the copy, TYVM */
 442                         pda->next = dag_h->pda_cleanup_list;
 443                         dag_h->pda_cleanup_list = pda;
 444                         rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
 445                         xorNode->params[paramNum++].p = pda;
 446                         xorNode->params[paramNum++].p = pda->bufPtr;
 447                 }
 448                 tmpwndNode = tmpwndNode->list_next;
 449         }
 450
 451         /*
 452          * Install the failed PDA into the xor param list so that the
 453          * new data gets xor'd in.
 454          */
 455         xorNode->params[paramNum++].p = failedPDA;
 456         xorNode->params[paramNum++].p = failedPDA->bufPtr;
 457
 458         /*
 459          * The last 2 params to the recovery xor node are always the failed
 460          * PDA and the raidPtr. install the failedPDA even though we have just
 461          * done so above. This allows us to use the same XOR function for both
 462          * degraded reads and degraded writes.
 463          */
 464         xorNode->params[paramNum++].p = failedPDA;
 465         xorNode->params[paramNum++].p = raidPtr;
 466         RF_ASSERT(paramNum == 2 * nXorBufs + 2);
 467
 468         /*
 469          * Code to link nodes begins here
 470          */
 471
 472         /* link header to block node */
 473         RF_ASSERT(blockNode->numAntecedents == 0);
 474         dag_h->succedents[0] = blockNode;
 475
 476         /* link block node to rd nodes */
 477         RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
 478         tmprrdNode = rrdNodes;
 479         for (i = 0; i < nRrdNodes; i++) {
 480                 RF_ASSERT(tmprrdNode->numAntecedents == 1);
 481                 blockNode->succedents[i] = tmprrdNode;
 482                 tmprrdNode->antecedents[0] = blockNode;
 483                 tmprrdNode->antType[0] = rf_control;
 484                 tmprrdNode = tmprrdNode->list_next;
 485         }
 486
 487         /* link read nodes to xor node */
 488         RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
 489         tmprrdNode = rrdNodes;
 490         for (i = 0; i < nRrdNodes; i++) {
 491                 RF_ASSERT(tmprrdNode->numSuccedents == 1);
 492                 tmprrdNode->succedents[0] = xorNode;
 493                 xorNode->antecedents[i] = tmprrdNode;
 494                 xorNode->antType[i] = rf_trueData;
 495                 tmprrdNode = tmprrdNode->list_next;
 496         }
 497
 498         /* link xor node to commit node */
 499         RF_ASSERT(xorNode->numSuccedents == 1);
 500         RF_ASSERT(commitNode->numAntecedents == 1);
 501         xorNode->succedents[0] = commitNode;
 502         commitNode->antecedents[0] = xorNode;
 503         commitNode->antType[0] = rf_control;
 504
 505         /* link commit node to wnd nodes */
 506         RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
 507         tmpwndNode = wndNodes;
 508         for (i = 0; i < nWndNodes; i++) {
 509                 RF_ASSERT(tmpwndNode->numAntecedents == 1);
 510                 commitNode->succedents[i] = tmpwndNode;
 511                 tmpwndNode->antecedents[0] = commitNode;
 512                 tmpwndNode->antType[0] = rf_control;
 513                 tmpwndNode = tmpwndNode->list_next;
 514         }
 515
 516         /* link the commit node to wnp, wnq nodes */
 517         RF_ASSERT(wnpNode->numAntecedents == 1);
 518         commitNode->succedents[nWndNodes] = wnpNode;
 519         wnpNode->antecedents[0] = commitNode;
 520         wnpNode->antType[0] = rf_control;
 521 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
 522         if (nfaults == 2) {
 523                 RF_ASSERT(wnqNode->numAntecedents == 1);
 524                 commitNode->succedents[nWndNodes + 1] = wnqNode;
 525                 wnqNode->antecedents[0] = commitNode;
 526                 wnqNode->antType[0] = rf_control;
 527         }
 528 #endif
 529         /* link write new data nodes to unblock node */
 530         RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
 531         tmpwndNode = wndNodes;
 532         for (i = 0; i < nWndNodes; i++) {
 533                 RF_ASSERT(tmpwndNode->numSuccedents == 1);
 534                 tmpwndNode->succedents[0] = unblockNode;
 535                 unblockNode->antecedents[i] = tmpwndNode;
 536                 unblockNode->antType[i] = rf_control;
 537                 tmpwndNode = tmpwndNode->list_next;
 538         }
 539
 540         /* link write new parity node to unblock node */
 541         RF_ASSERT(wnpNode->numSuccedents == 1);
 542         wnpNode->succedents[0] = unblockNode;
 543         unblockNode->antecedents[nWndNodes] = wnpNode;
 544         unblockNode->antType[nWndNodes] = rf_control;
 545
 546 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
 547         /* link write new q node to unblock node */
 548         if (nfaults == 2) {
 549                 RF_ASSERT(wnqNode->numSuccedents == 1);
 550                 wnqNode->succedents[0] = unblockNode;
 551                 unblockNode->antecedents[nWndNodes + 1] = wnqNode;
 552                 unblockNode->antType[nWndNodes + 1] = rf_control;
 553         }
 554 #endif
 555         /* link unblock node to term node */
 556         RF_ASSERT(unblockNode->numSuccedents == 1);
 557         RF_ASSERT(termNode->numAntecedents == 1);
 558         RF_ASSERT(termNode->numSuccedents == 0);
 559         unblockNode->succedents[0] = termNode;
 560         termNode->antecedents[0] = unblockNode;
 561         termNode->antType[0] = rf_control;
 562 }
 563 #define CONS_PDA(if,start,num) \
 564   pda_p->col = asmap->if->col; \
 565   pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
 566   pda_p->numSector = num; \
 567   pda_p->next = NULL; \
 568   RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
 569 #if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
 570 void
 571 rf_WriteGenerateFailedAccessASMs(
 572     RF_Raid_t * raidPtr,
 573     RF_AccessStripeMap_t * asmap,
 574     RF_PhysDiskAddr_t ** pdap,
 575     int *nNodep,
 576     RF_PhysDiskAddr_t ** pqpdap,
 577     int *nPQNodep,
 578     RF_AllocListElem_t * allocList)
 579 {
 580         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
 581         int     PDAPerDisk, i;
 582         RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
 583         int     numDataCol = layoutPtr->numDataCol;
 584         int     state;
 585         unsigned napdas;
 586         RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
 587         RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
 588         RF_PhysDiskAddr_t *pda_p;
 589         RF_RaidAddr_t sosAddr;
 590
  591         /* determine how many pda's we will have to generate per unaccessed
  592          * stripe. If there is only one failed data unit, it is one; if two,
  593          * possibly two, depending on whether they overlap. */
 594
 595         fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
 596         fone_end = fone_start + fone->numSector;
 597
 598         if (asmap->numDataFailed == 1) {
 599                 PDAPerDisk = 1;
 600                 state = 1;
 601                 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
 602                 pda_p = *pqpdap;
 603                 /* build p */
 604                 CONS_PDA(parityInfo, fone_start, fone->numSector);
 605                 pda_p->type = RF_PDA_TYPE_PARITY;
 606                 pda_p++;
 607                 /* build q */
 608                 CONS_PDA(qInfo, fone_start, fone->numSector);
 609                 pda_p->type = RF_PDA_TYPE_Q;
 610         } else {
 611                 ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
 612                 ftwo_end = ftwo_start + ftwo->numSector;
 613                 if (fone->numSector + ftwo->numSector > secPerSU) {
 614                         PDAPerDisk = 1;
 615                         state = 2;
 616                         RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
 617                         pda_p = *pqpdap;
 618                         CONS_PDA(parityInfo, 0, secPerSU);
 619                         pda_p->type = RF_PDA_TYPE_PARITY;
 620                         pda_p++;
 621                         CONS_PDA(qInfo, 0, secPerSU);
 622                         pda_p->type = RF_PDA_TYPE_Q;
 623                 } else {
 624                         PDAPerDisk = 2;
 625                         state = 3;
 626                         /* four of them, fone, then ftwo */
 627                         RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
 628                         pda_p = *pqpdap;
 629                         CONS_PDA(parityInfo, fone_start, fone->numSector);
 630                         pda_p->type = RF_PDA_TYPE_PARITY;
 631                         pda_p++;
 632                         CONS_PDA(qInfo, fone_start, fone->numSector);
 633                         pda_p->type = RF_PDA_TYPE_Q;
 634                         pda_p++;
 635                         CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
 636                         pda_p->type = RF_PDA_TYPE_PARITY;
 637                         pda_p++;
 638                         CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
 639                         pda_p->type = RF_PDA_TYPE_Q;
 640                 }
 641         }
 642         /* figure out number of nonaccessed pda */
 643         napdas = PDAPerDisk * (numDataCol - 2);
 644         *nPQNodep = PDAPerDisk;
 645
 646         *nNodep = napdas;
 647         if (napdas == 0)
 648                 return;         /* short circuit */
 649
 650         /* allocate up our list of pda's */
 651
 652         RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t),
 653                         (RF_PhysDiskAddr_t *), allocList);
 654         *pdap = pda_p;
 655
 656         /* linkem together */
 657         for (i = 0; i < (napdas - 1); i++)
 658                 pda_p[i].next = pda_p + (i + 1);
 659
 660         sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
 661         for (i = 0; i < numDataCol; i++) {
 662                 if ((pda_p - (*pdap)) == napdas)
 663                         continue;
 664                 pda_p->type = RF_PDA_TYPE_DATA;
 665                 pda_p->raidAddress = sosAddr + (i * secPerSU);
 666                 (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
 667                 /* skip over dead disks */
 668                 if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
 669                         continue;
 670                 switch (state) {
 671                 case 1: /* fone */
 672                         pda_p->numSector = fone->numSector;
 673                         pda_p->raidAddress += fone_start;
 674                         pda_p->startSector += fone_start;
 675                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
 676                         break;
 677                 case 2: /* full stripe */
 678                         pda_p->numSector = secPerSU;
 679                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
 680                         break;
 681                 case 3: /* two slabs */
 682                         pda_p->numSector = fone->numSector;
 683                         pda_p->raidAddress += fone_start;
 684                         pda_p->startSector += fone_start;
 685                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
 686                         pda_p++;
 687                         pda_p->type = RF_PDA_TYPE_DATA;
 688                         pda_p->raidAddress = sosAddr + (i * secPerSU);
 689                         (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
 690                         pda_p->numSector = ftwo->numSector;
 691                         pda_p->raidAddress += ftwo_start;
 692                         pda_p->startSector += ftwo_start;
 693                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
 694                         break;
 695                 default:
 696                         RF_PANIC();
 697                 }
 698                 pda_p++;
 699         }
 700
 701         RF_ASSERT(pda_p - *pdap == napdas);
 702         return;
 703 }
 704 #define DISK_NODE_PDA(node)  ((node)->params[0].p)
 705
 706 #define DISK_NODE_PARAMS(_node_,_p_) \
 707   (_node_).params[0].p = _p_ ; \
 708   (_node_).params[1].p = (_p_)->bufPtr; \
 709   (_node_).params[2].v = parityStripeID; \
 710   (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)
 711
     /*
      * rf_DoubleDegSmallWrite --
      *
      * Populate dag_h with the "PQ_DDSimpleSmallWrite" DAG: a block node fans
      * out to every read node (recovery-data reads plus the P and Q reads),
      * all reads feed a single recovery node that runs recovFunc, the
      * recovery node fans out to every write node (user data, P and Q), and
      * the writes join at an unblock node followed by the terminator.
      *
      * raidPtr, asmap          array and access being mapped; asmap is
      *                         asserted to have exactly one failed data unit
      * dag_h                   DAG header to fill in
      * bp, flags               not referenced in this function
      * allocList               allocation list that receives the node array
      *                         and is passed to every rf_InitNode call
      * redundantReadNodeName   name given to the Q read node(s)
      * redundantWriteNodeName  not referenced; Q write nodes are named "Wq"
      * recoveryNodeName        name given to the recovery node
      * recovFunc               function executed by the recovery node
      */
 712 void
 713 rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
 714                        RF_DagHeader_t *dag_h, void *bp,
 715                        RF_RaidAccessFlags_t flags,
 716                        RF_AllocListElem_t *allocList,
 717                        const char *redundantReadNodeName,
 718                        const char *redundantWriteNodeName,
 719                        const char *recoveryNodeName,
 720                        int (*recovFunc) (RF_DagNode_t *))
 721 {
 722         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
 723         RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
 724                *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
 725         RF_PhysDiskAddr_t *pda, *pqPDAs;
 726         RF_PhysDiskAddr_t *npdas;
 727         int     nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
 728         RF_ReconUnitNum_t which_ru;
 729         int     nPQNodes;
             /* parityStripeID and which_ru are consumed by DISK_NODE_PARAMS */
 730         RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
 731
 732         /* simple small write case - First part looks like a reconstruct-read
 733          * of the failed data units. Then a write of all data units not
 734          * failed. */
 735
 736
 737         /*
 738          *                 Hdr
 739          *                  |
 740          *     ----------Block----------
 741          *      |     |         |    |    |
 742          *     Rrd   Rrd  ...  Rrd   Rp   Rq
 743          *      |     |         |    |    |
 744          *     ------------PQ-----------
 745          *          |      |      |
              *         Wud     Wp     Wq
              *          |      |      |
              *     ---------Unblock---------
              *                  |
              *                  T
              *
              * Rrd = read recovery data (potentially none)
              * Wud = write user data (not incl. failed disks)
              * Wp  = write P (could be two)
              * Wq  = write Q (could be two)
              * PQ  = the recovery node (runs recovFunc)
              */
 746
             /* Collect the PDAs to read for recovery (npdas, a linked list of
              * nRrdNodes entries) and the P/Q PDAs (pqPDAs, walked below as an
              * array in P, Q [, P, Q] order).  This function handles nPQNodes
              * values of 1 and 2. */
 747         rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);
 748
 749         RF_ASSERT(asmap->numDataFailed == 1);
 750
             /* One Wud node per accessed stripe unit except the failed one; both
              * the read side and the write side get nPQNodes P nodes plus
              * nPQNodes Q nodes. */
 751         nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
 752         nReadNodes = nRrdNodes + 2 * nPQNodes;
 753         nWriteNodes = nWudNodes + 2 * nPQNodes;
             /* 4 = block, unblock, terminator and recovery nodes */
 754         nNodes = 4 + nReadNodes + nWriteNodes;
 755
             /* All nodes live in one array, carved up in the order below.  Later
              * loops rely on the read nodes (rrd, rp, rq) being contiguous and on
              * the write nodes (wud, wp, wq) being contiguous. */
 756         RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
 757         blockNode = nodes;
 758         unblockNode = blockNode + 1;
 759         termNode = unblockNode + 1;
 760         recoveryNode = termNode + 1;
 761         rrdNodes = recoveryNode + 1;
 762         rpNodes = rrdNodes + nRrdNodes;
 763         rqNodes = rpNodes + nPQNodes;
 764         wudNodes = rqNodes + nPQNodes;
 765         wpNodes = wudNodes + nWudNodes;
 766         wqNodes = wpNodes + nPQNodes;
 767
 768         dag_h->creator = "PQ_DDSimpleSmallWrite";
 769         dag_h->numSuccedents = 1;
 770         dag_h->succedents[0] = blockNode;
             /* terminator: no successors, one antecedent (the unblock node) */
 771         rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
 772         termNode->antecedents[0] = unblockNode;
 773         termNode->antType[0] = rf_control;
 774
 775         /* init the block and unblock nodes */
 776         /* The block node has all the read nodes as successors */
 777         rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
             /* rrdNodes + i runs on into rpNodes and rqNodes */
 778         for (i = 0; i < nReadNodes; i++)
 779                 blockNode->succedents[i] = rrdNodes + i;
 780
 781         /* The unblock node has all the writes as successors */
             /* (as wired here: one successor, the terminator, and every write
              * node as an antecedent; wudNodes + i runs on into wpNodes and
              * wqNodes) */
 782         rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
 783         for (i = 0; i < nWriteNodes; i++) {
 784                 unblockNode->antecedents[i] = wudNodes + i;
 785                 unblockNode->antType[i] = rf_control;
 786         }
 787         unblockNode->succedents[0] = termNode;
 788
     /* Every read node: one successor (recoveryNode), one antecedent
      * (blockNode), four params, no results.  Expands to several statements,
      * so it must not be used as the body of an unbraced if/for. */
 789 #define INIT_READ_NODE(node,name) \
 790   rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
 791   (node)->succedents[0] = recoveryNode; \
 792   (node)->antecedents[0] = blockNode; \
 793   (node)->antType[0] = rf_control;
 794
 795         /* build the read nodes */
 796         pda = npdas;
 797         for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
 798                 INIT_READ_NODE(rrdNodes + i, "rrd");
 799                 DISK_NODE_PARAMS(rrdNodes[i], pda);
 800         }
 801
 802         /* read redundancy pdas */
             /* pqPDAs is stepped through as an array: entry 0 -> Rp[0],
              * entry 1 -> Rq[0], and when nPQNodes == 2, entry 2 -> Rp[1] and
              * entry 3 -> Rq[1] */
 803         pda = pqPDAs;
 804         INIT_READ_NODE(rpNodes, "Rp");
 805         RF_ASSERT(pda);
 806         DISK_NODE_PARAMS(rpNodes[0], pda);
 807         pda++;
 808         INIT_READ_NODE(rqNodes, redundantReadNodeName);
 809         RF_ASSERT(pda);
 810         DISK_NODE_PARAMS(rqNodes[0], pda);
 811         if (nPQNodes == 2) {
 812                 pda++;
 813                 INIT_READ_NODE(rpNodes + 1, "Rp");
 814                 RF_ASSERT(pda);
 815                 DISK_NODE_PARAMS(rpNodes[1], pda);
 816                 pda++;
 817                 INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
 818                 RF_ASSERT(pda);
 819                 DISK_NODE_PARAMS(rqNodes[1], pda);
 820         }
 821         /* the recovery node has all reads as predecessors and all writes as
 822          * successors. It generates a result for every write P or write Q
 823          * node. As parameters, it takes a pda per read and a pda per stripe
 824          * of user data written. It also takes as the last params the raidPtr
 825          * and asm. For results, it takes PDA for P & Q. */
 826
 827
 828         rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
 829             nWriteNodes,        /* successors */
 830             nReadNodes,         /* preds */
 831             nReadNodes + nWudNodes + 3, /* params */
 832             2 * nPQNodes,       /* results */
 833             dag_h, recoveryNodeName, allocList);
 834
 835
 836
             /* params[0 .. nReadNodes-1]: the PDA each read node was given above,
              * so the read nodes must already have had DISK_NODE_PARAMS applied */
 837         for (i = 0; i < nReadNodes; i++) {
 838                 recoveryNode->antecedents[i] = rrdNodes + i;
 839                 recoveryNode->antType[i] = rf_control;
 840                 recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
 841         }
 842         for (i = 0; i < nWudNodes; i++) {
 843                 recoveryNode->succedents[i] = wudNodes + i;
 844         }
             /* Trailing three params: the failed PDA, raidPtr and asmap.  The
              * nWudNodes slots just before them are filled in by the user-data
              * write loop further down. */
 845         recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
 846         recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
 847         recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
 848
             /* i continues from nWudNodes: add the Wp/Wq nodes as successors */
 849         for (; i < nWriteNodes; i++)
 850                 recoveryNode->succedents[i] = wudNodes + i;
 851
             /* results are the P/Q PDAs, in pqPDAs order */
 852         pda = pqPDAs;
 853         recoveryNode->results[0] = pda;
 854         pda++;
 855         recoveryNode->results[1] = pda;
 856         if (nPQNodes == 2) {
 857                 pda++;
 858                 recoveryNode->results[2] = pda;
 859                 pda++;
 860                 recoveryNode->results[3] = pda;
 861         }
 862         /* fill writes */
     /* Every write node: one successor (unblockNode), one antecedent
      * (recoveryNode), four params, no results.  Expands to several
      * statements, so it must not be used as the body of an unbraced if/for. */
 863 #define INIT_WRITE_NODE(node,name) \
 864   rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
 865     (node)->succedents[0] = unblockNode; \
 866     (node)->antecedents[0] = recoveryNode; \
 867     (node)->antType[0] = rf_control;
 868
             /* User-data writes take the first nWudNodes PDAs on asmap->physInfo
              * and also become recovery-node params.
              * NOTE(review): the failed PDA is not skipped here; this presumably
              * relies on it not being among those first nWudNodes entries --
              * confirm against how physInfo is built. */
 869         pda = asmap->physInfo;
 870         for (i = 0; i < nWudNodes; i++) {
 871                 INIT_WRITE_NODE(wudNodes + i, "Wd");
 872                 DISK_NODE_PARAMS(wudNodes[i], pda);
 873                 recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
 874                 pda = pda->next;
 875         }
 876         /* write redundancy pdas */
             /* same pqPDAs walk as for the reads: P, Q [, P, Q] */
 877         pda = pqPDAs;
 878         INIT_WRITE_NODE(wpNodes, "Wp");
 879         RF_ASSERT(pda);
 880         DISK_NODE_PARAMS(wpNodes[0], pda);
 881         pda++;
 882         INIT_WRITE_NODE(wqNodes, "Wq");
 883         RF_ASSERT(pda);
 884         DISK_NODE_PARAMS(wqNodes[0], pda);
 885         if (nPQNodes == 2) {
 886                 pda++;
 887                 INIT_WRITE_NODE(wpNodes + 1, "Wp");
 888                 RF_ASSERT(pda);
 889                 DISK_NODE_PARAMS(wpNodes[1], pda);
 890                 pda++;
 891                 INIT_WRITE_NODE(wqNodes + 1, "Wq");
 892                 RF_ASSERT(pda);
 893                 DISK_NODE_PARAMS(wqNodes[1], pda);
 894         }
 895 }
 896 #endif   /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */