sys/dev/raidframe/rf_pqdegdags.c

   1 /*      $NetBSD: rf_pqdegdags.c,v 1.11 2005/12/11 12:23:37 christos Exp $       */
   2 /*
   3  * Copyright (c) 1995 Carnegie-Mellon University.
   4  * All rights reserved.
   5  *
   6  * Author: Daniel Stodolsky
   7  *
   8  * Permission to use, copy, modify and distribute this software and
   9  * its documentation is hereby granted, provided that both the copyright
  10  * notice and this permission notice appear in all copies of the
  11  * software, derivative works or modified versions, and any portions
  12  * thereof, and that both notices appear in supporting documentation.
  13  *
  14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  17  *
  18  * Carnegie Mellon requests users of this software to return to
  19  *
  20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  21  *  School of Computer Science
  22  *  Carnegie Mellon University
  23  *  Pittsburgh PA 15213-3890
  24  *
  25  * any improvements or extensions that they make and grant Carnegie the
  26  * rights to redistribute these changes.
  27  */
  28
  29 /*
  30  * rf_pqdegdags.c
  31  * Degraded mode dags for double fault cases.
  32 */
  33
  34
  35 #include <sys/cdefs.h>
  36 __KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.11 2005/12/11 12:23:37 christos Exp $");
  37
  38 #include "rf_archs.h"
  39
  40 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
  41
  42 #include <dev/raidframe/raidframevar.h>
  43
  44 #include "rf_raid.h"
  45 #include "rf_dag.h"
  46 #include "rf_dagdegrd.h"
  47 #include "rf_dagdegwr.h"
  48 #include "rf_dagfuncs.h"
  49 #include "rf_dagutils.h"
  50 #include "rf_etimer.h"
  51 #include "rf_acctrace.h"
  52 #include "rf_general.h"
  53 #include "rf_pqdegdags.h"
  54 #include "rf_pq.h"
  55 /* Forward declaration: applyPDA is defined further down and used by both recovery functions. */
  56 static void
  57 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
  58     RF_PhysDiskAddr_t * qpda, void *bp);
  59
  60 /*
  61    Two data drives have failed, and we are doing a read that covers one of them.
  62    We may also be reading some of the surviving drives.
  63
  64
  65  *****************************************************************************************
  66  *
  67  * creates a DAG to perform a degraded-mode read of data within one stripe.
  68  * This DAG is as follows:
  69  *
  70  *                                      Hdr
  71  *                                       |
  72  *                                     Block
  73  *                       /         /           \         \     \   \
  74  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
  75  *                      | \       | \         | \       | \    | \ | \
  76  *
  77  *                                 |                 |
  78  *                              Unblock              X
  79  *                                  \               /
  80  *                                   ------ T ------
  81  *
  82  * Each R node is a successor of the L node
  83  * One successor arc from each R node goes to U, and the other to X
  84  * There is one Rud for each chunk of surviving user data requested by the user,
  85  * and one Rrd for each chunk of surviving user data _not_ being read by the user
  86  * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
  87  * X = pq recovery node, T = terminate
  88  *
  89  * The block & unblock nodes are leftovers from a previous version.  They
  90  * do nothing, but I haven't deleted them because it would be a tremendous
  91  * effort to put them back in.
  92  *
  93  * Note:  The target buffer for the XOR node is set to the actual user buffer where the
  94  * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
  95  * if you create a degraded read dag, use it, and then re-use, you have to be sure to
  96  * zero the target buffer prior to the re-use.
  97  *
  98  * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
  99  * needed and what's not.
 100  ****************************************************************************************/
 101 /*
      * INIT_DISK_NODE(node, name): initialise `node' as a disk-read node via
      * rf_InitNode (do/undo/wakeup = rf_DiskReadFunc / rf_DiskReadUndoFunc /
      * rf_GenericWakeupFunc), then wire its two succedents to unblockNode and
      * recoveryNode and its single antecedent to blockNode as a control
      * dependency.  The literal "2,1,4,0" presumably gives the succedent,
      * antecedent, param and result counts -- confirm against rf_InitNode.
      *
      * The macro relies on dag_h, allocList, unblockNode, recoveryNode and
      * blockNode being in scope where it is expanded.  It expands to several
      * statements with no do { } while (0) wrapper, so it must not be used as
      * the body of an unbraced if/else/loop.
      */
 102 #define INIT_DISK_NODE(node,name) \
 103 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
 104 (node)->succedents[0] = unblockNode; \
 105 (node)->succedents[1] = recoveryNode; \
 106 (node)->antecedents[0] = blockNode; \
 107 (node)->antType[0] = rf_control
 108 /*
      * DISK_NODE_PARAMS(_node_, _p_): fill the four params of a node *object*
      * (note the '.' access, not '->'): [0] the pda pointer _p_, [1] its
      * bufPtr, [2] parityStripeID, [3] a priority/reconstruction-unit word
      * built by RF_CREATE_PARAM3.  parityStripeID and which_ru are taken from
      * the expansion site.  Also a multi-statement macro; _p_ is not
      * parenthesised in the first assignment.
      */
 109 #define DISK_NODE_PARAMS(_node_,_p_) \
 110   (_node_).params[0].p = _p_ ; \
 111   (_node_).params[1].p = (_p_)->bufPtr; \
 112   (_node_).params[2].v = parityStripeID; \
 113   (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)
 114 /* DISK_NODE_PDA(node): the params[0].p slot of a node *pointer*, i.e. where DISK_NODE_PARAMS stores the pda. */
 115 #define DISK_NODE_PDA(node)  ((node)->params[0].p)
 116 /*
      * DAG creation function for a degraded read under P+Q.  It only forwards
      * to rf_DoubleDegRead, supplying "Rq" and "PQ Recovery" as node names and
      * rf_PQDoubleRecoveryFunc as the recovery function.
      *
      * raidPtr, asmap, dag_h, bp, flags and allocList are not declared here;
      * presumably they are the parameters introduced by
      * RF_CREATE_DAG_FUNC_DECL (defined elsewhere) -- confirm there.
      */
 117 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
 118 {
 119         rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
 120             "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
 121 }
 122 /*
      * applyPDA -- fold one data pda into a P/Q buffer pair.
      *
      * If the sector range that `pda' occupies within its stripe unit overlaps
      * the range covered by `ppda', the overlapping part of pda's buffer is
      * XORed into ppda's buffer (rf_bxor) and accumulated into qpda's buffer
      * (rf_IncQ), using the pda's data-column index (stripe unit ID modulo
      * numDataCol) as the Q coefficient.  Nothing is done when the ranges do
      * not overlap.
      *
      * Only ppda's startSector/numSector are consulted for the range; qpda
      * contributes just its buffer, so it is assumed to cover the same range.
      * bp is handed through to rf_bxor unchanged.
      */
 123 static void
 124 applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp)
 125 {
 126         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
 127         RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
 128         RF_SectorCount_t s0len = ppda->numSector, len;
 129         RF_SectorNum_t suoffset;
 130         unsigned coeff;
 131         char   *pbuf = ppda->bufPtr;
 132         char   *qbuf = qpda->bufPtr;
 133         char   *buf;
 134         int     delta;
 135
 136         suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
 137         len = pda->numSector;
 138         /* see if pda intersects a recovery pda */
 139         if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
 140                 buf = pda->bufPtr;
 141                 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
 142                 coeff = (coeff % raidPtr->Layout.numDataCol);
 143
                     /*
                      * delta below is a sector count inside one stripe unit.
                      * The buffers are byte pointers, so they have to be
                      * advanced by the matching byte count (the same
                      * conversion used for the lengths passed to rf_bxor and
                      * rf_IncQ), not by a stripe-unit index.
                      */
 144                 if (suoffset < s0off) {
                             /* pda starts before the P/Q range: skip its leading sectors */
 145                         delta = s0off - suoffset;
 146                         buf += rf_RaidAddressToByte(raidPtr, delta);
 147                         suoffset = s0off;
 148                         len -= delta;
 149                 }
 150                 if (suoffset > s0off) {
                             /* pda starts inside the P/Q range: skip into P and Q */
 151                         delta = suoffset - s0off;
 152                         pbuf += rf_RaidAddressToByte(raidPtr, delta);
 153                         qbuf += rf_RaidAddressToByte(raidPtr, delta);
 154                 }
                     /* clip the tail so we never run past the end of the P/Q range */
 155                 if ((suoffset + len) > (s0len + s0off))
 156                         len = s0len + s0off - suoffset;
 157
 158                 /* src, dest, len */
 159                 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
 160
 161                 /* dest, src, len, coeff */
 162                 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
 163         }
 164 }
 165 /*
 166    Recover data in the case of a double failure. There can be two
 167    result buffers, one for each chunk of data trying to be recovered.
 168    The params are pda's that have not been range restricted or otherwise
 169    politely massaged - this should be done here. The last params are the
 170    pdas of P and Q, followed by the raidPtr and the asmap. The list can look like
 171
 172    pda, pda, ... , p pda, q pda, raidptr, asm
 173
 174    or
 175
 176    pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
 177
 178    depending on whether two chunks of recovery data were required.
 179
 180    The second condition only arises if there are two failed buffers
 181    whose lengths do not add up to a stripe unit.

        Current limitations visible in the code: the two-chunk layout trips
        RF_ASSERT(0), and any access with a second failed pda
        (asmap->failedPDAs[1] != NULL) ends in RF_PANIC().  Only the
        single-failed-stripe-unit case is actually recovered.

        The elapsed time is added to the trace record's q_us (when a trace
        record exists), the node is completed through
        rf_GenericWakeupFunc(node, 0), and 0 is returned.
 182 */
 183
 184
 185 int
 186 rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
 187 {
 188         int     np = node->numParams;
 189         RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
 190         RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
 191         RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
 192         int     d, i;
 193         unsigned coeff;
 194         RF_RaidAddr_t sosAddr;
 195         RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
 196         int     two = 0;
 197         RF_PhysDiskAddr_t *ppda, *ppda2 = NULL, *qpda, *qpda2 = NULL, *pda, npda;
 199         int     numDataCol = layoutPtr->numDataCol;
 200         RF_Etimer_t timer;
 201         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
 202
 203         RF_ETIMER_START(timer);
 204
             /* Pick the parameter layout: d is the number of leading data pdas. */
 205         if (asmap->failedPDAs[1] &&
 206             (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
                     /* two separate P/Q chunks: not supported yet */
 207                 RF_ASSERT(0);
 208                 ppda = node->params[np - 6].p;
 209                 ppda2 = node->params[np - 5].p;
 210                 qpda = node->params[np - 4].p;
 211                 qpda2 = node->params[np - 3].p;
 212                 d = (np - 6);
 213                 two = 1;
 214         } else {
 215                 ppda = node->params[np - 4].p;
 216                 qpda = node->params[np - 3].p;
 217                 d = (np - 4);
 218         }
 219
             /*
              * Fold every data pda into P and Q.  applyPDA derives the offset,
              * length and coefficient of each pda itself, so nothing needs to
              * be precomputed here.
              */
 220         for (i = 0; i < d; i++) {
 221                 pda = node->params[i].p;
 228                 /* see if pda intersects a recovery pda */
 229                 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
                     /* the second chunk has its own P/Q pair (p_2, q_2) */
 230                 if (two)
 231                         applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
 232         }
 233
 234         /* ok, we got the parity back to the point where we can recover. We
 235          * now need to determine the coeff of the columns that need to be
 236          * recovered. We can also only need to recover a single stripe unit. */
 237
 238         if (asmap->failedPDAs[1] == NULL) {     /* only a single stripe unit
 239                                                  * to recover. */
 240                 pda = asmap->failedPDAs[0];
 241                 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
 242                 /* need to determine the column of the other failed disk */
 243                 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
 244                 /* compute the data unit offset within the column */
 245                 coeff = (coeff % raidPtr->Layout.numDataCol);
                     /* scan the data columns for a dead disk other than column coeff */
 246                 for (i = 0; i < numDataCol; i++) {
 247                         npda.raidAddress = sosAddr + (i * secPerSU);
 248                         (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
 249                         /* skip over dead disks */
 250                         if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
 251                                 if (i != coeff)
 252                                         break;
 253                 }
                     /* i is now the column index of the other failed data unit */
 254                 RF_ASSERT(i < numDataCol);
 255                 RF_ASSERT(two == 0);
 256                 /* recover the data. Since we need only want to recover one
 257                  * column, we overwrite the parity with the other one. */
 258                 if (coeff < i)  /* recovering 'a' */
 259                         rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
 260                 else            /* recovering 'b' */
 261                         rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
 262         } else
 263                 RF_PANIC();
 264
 265         RF_ETIMER_STOP(timer);
 266         RF_ETIMER_EVAL(timer);
 267         if (tracerec)
 268                 tracerec->q_us += RF_ETIMER_VAL_US(timer);
 269         rf_GenericWakeupFunc(node, 0);
 270         return (0);
 271 }
 272 /*
      * Recovery node function for a degraded small write under P+Q.  It works
      * in place on the P and Q buffers taken from node->results[0] and
      * node->results[1], adds the elapsed time to the trace record's q_us
      * (when one exists), completes the node via rf_GenericWakeupFunc(node, 0)
      * and returns 0.  See the comment at the top of the body for the strategy
      * and the expected parameter list.
      */
 273 int
 274 rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
 275 {
 276         /* The situation:
 277          *
 278          * We are doing a write that hits only one failed data unit. The other
 279          * failed data unit is not being overwritten, so we need to generate
 280          * it.
 281          *
 282          * For the moment, we assume all the nonfailed data being written is in
 283          * the shadow of the failed data unit. (i.e., either a single data
 284          * unit write or the entire failed stripe unit is being overwritten. )
 285          *
 286          * Recovery strategy: apply the recovery data to the parity and q. Use P
 287          * & Q to recover the second failed data unit in P. Zero fill Q, then
 288          * apply the recovered data to p. Then apply the data being written to
 289          * the failed drive. Then walk through the surviving drives, applying
 290          * new data when it exists, otherwise the recovery data. Quite a mess.
 291          *
 292          *
 293          * The params
 294          *
 295          * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
 296          * write pda (numStripeUnitAccess - numDataFailed), failed pda,
 297          * raidPtr, asmap */
 298
 299         int     np = node->numParams;
 300         RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
 301         RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
 302         RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
 303         int     i;
 304         RF_RaidAddr_t sosAddr;
 305         unsigned coeff;
             /* NOTE(review): rf_PQDoubleRecoveryFunc holds this same field in an
              * RF_SectorCount_t; RF_StripeCount_t here looks unintended -- confirm. */
 306         RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
 307         RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
 308         int     numDataCol = layoutPtr->numDataCol;
 309         RF_Etimer_t timer;
 310         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
 311
 312         RF_ASSERT(node->numResults == 2);
 313         RF_ASSERT(asmap->failedPDAs[1] == NULL);
 314         RF_ETIMER_START(timer);
 315         ppda = node->results[0];
 316         qpda = node->results[1];
 317         /* apply the recovery data */
             /* the first numDataCol - 2 params are the read (surviving data) pdas */
 318         for (i = 0; i < numDataCol - 2; i++)
 319                 applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
 320
 321         /* determine the other failed data unit */
 322         pda = asmap->failedPDAs[0];
 323         sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
 324         /* need to determine the column of the other failed disk */
 325         coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
 326         /* compute the data unit offset within the column */
 327         coeff = (coeff % raidPtr->Layout.numDataCol);
             /* scan the data columns for a dead disk other than column coeff */
 328         for (i = 0; i < numDataCol; i++) {
 329                 npda.raidAddress = sosAddr + (i * secPerSU);
 330                 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
 331                 /* skip over dead disks */
 332                 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
 333                         if (i != coeff)
 334                                 break;
 335         }
             /* i is now the column index of the other failed data unit */
 336         RF_ASSERT(i < numDataCol);
 337         /* recover the data. The column we want to recover we write over the
 338          * parity. The column we don't care about we dump in q. */
 339         if (coeff < i)          /* recovering 'a' */
 340                 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
 341         else                    /* recovering 'b' */
 342                 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
 343
 344         /* OK. The valid data is in P. Zero fill Q, then inc it into it. */
 345         memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
             /* coefficient i: the column of the unit that was just recovered into P */
 346         rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
 347
 348         /* now apply all the write data to the buffer */
 349         /* single stripe unit write case: the failed data is only thing we are
 350          * writing. */
 351         RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
 352         /* dest, src, len, coeff */
 353         rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
             /* src, dest, len: fold the new data for the failed unit into P */
 354         rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
 355
 356         /* now apply all the recovery data */
             /* same read pdas as the first loop, folded into the rebuilt P and Q */
 357         for (i = 0; i < numDataCol - 2; i++)
 358                 applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
 359
 360         RF_ETIMER_STOP(timer);
 361         RF_ETIMER_EVAL(timer);
 362         if (tracerec)
 363                 tracerec->q_us += RF_ETIMER_VAL_US(timer);
 364
 365         rf_GenericWakeupFunc(node, 0);
 366         return (0);
 367 }
 368 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
 369 {
             /* Large-write DAG for the two-failed-data-unit case: not
              * implemented, so reaching this function panics. */
 370         RF_PANIC();
 371 }
 372 /*
 373    Two lost data unit write case.
 374
 375    There are really two cases here:
 376
 377    (1) The write completely covers the two lost data units.
 378        In that case, a reconstruct write that doesn't write the
 379        failed data units will do the correct thing. So in this case,
 380        the dag looks like
 381
 382             full stripe read of surviving data units (not being overwritten)
 383             write new data (ignoring failed units)   compute P&Q
 384                                                      write P&Q
 385
 386
 387    (2) The write does not completely cover both failed data units
 388        (but touches at least one of them). Then we need to do the
 389        equivalent of a reconstruct read to recover the missing data
 390        unit from the other stripe.
 391
 392        For any data we are writing that is not in the "shadow"
 393        of the failed units, we need to do a four cycle update.
 394        PANIC on this case, for now.
 395
 396 */
 397
 398 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
 399 {
 400         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
 401         RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
             /* total sectors of this access that land on failed data units */
 402         int     sum;
 403         int     nf = asmap->numDataFailed;
 404
             /* NOTE(review): failedPDAs[0] is dereferenced unconditionally, so
              * the access is assumed to touch at least one failed unit. */
 405         sum = asmap->failedPDAs[0]->numSector;
 406         if (nf == 2)
 407                 sum += asmap->failedPDAs[1]->numSector;
 408
             /* case (1): both failed units are written in full */
 409         if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
 410                 /* large write case */
 411                 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
 412                 return;
 413         }
             /* case (2) without out-of-shadow data: every accessed stripe unit is
              * a failed one, or the failed sectors total at least one stripe unit */
 414         if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
 415                 /* small write case, no user data not in shadow */
 416                 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
 417                 return;
 418         }
             /* anything else would need the four cycle update: unsupported */
 419         RF_PANIC();
 420 }
 421 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
 422 {
             /* Small-write DAG: forwards to rf_DoubleDegSmallWrite with the
              * node names "Rq", "Wq" and "PQ Recovery", and
              * rf_PQWriteDoubleRecoveryFunc as the recovery function. */
 423         rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
 424 }
 425 #endif                          /* (RF_INCLUDE_DECL_PQ > 0) ||
 426                                  * (RF_INCLUDE_RAID6 > 0) */