1 /* $NetBSD: rf_decluster.c,v 1.21 2006/11/16 01:33:23 christos Exp $ */
3 * Copyright (c) 1995 Carnegie-Mellon University.
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 * Carnegie Mellon requests users of this software to return to
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
29 /*----------------------------------------------------------------------
31 * rf_decluster.c -- code related to the declustered layout
33 * Created 10-21-92 (MCH)
35 * Nov 93: adding support for distributed sparing. This code is a little
36 * complex: the basic layout used is as follows:
37 * let F = (v-1)/GCD(r,v-1). The spare space for each set of
38 * F consecutive fulltables is grouped together and placed after
40 * +------------------------------+
46 * +------------------------------+
48 *--------------------------------------------------------------------*/
50 #include <sys/cdefs.h>
51 __KERNEL_RCSID(0, "$NetBSD: rf_decluster.c,v 1.21 2006/11/16 01:33:23 christos Exp $");
53 #include <dev/raidframe/raidframevar.h>
57 #include "rf_decluster.h"
58 #include "rf_debugMem.h"
60 #include "rf_alloclist.h"
61 #include "rf_general.h"
63 #include "rf_shutdown.h"
64 #include "rf_copyback.h"
66 #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0)
68 /* configuration code */
/*
 * rf_ConfigureDeclustered: build the layout-specific state for the
 * parity-declustered layout from the block design packed into the
 * layout-specific portion of the user's configuration.
 *
 * NOTE(review): this block is a damaged extraction of the original file.
 * The return type, the trailing parameter (the body reads "cfgPtr" --
 * presumably "RF_Config_t *cfgPtr"), the opening/closing braces, the
 * error-path return statements after each RF_ERRORMSG*, and the
 * declarations of extraPUsPerDisk, i and j did not survive.  Only
 * comments have been added below; surviving code tokens are unchanged.
 */
rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int b, v, k, r, lambda;	/* block design params */
	RF_RowCol_t *first_avail_slot;
	RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
	RF_DeclusteredConfigInfo_t *info;
	/* NOTE(review): the declarator list below is truncated --
	 * extraPUsPerDisk is used later but its declaration is missing. */
	RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk,
	RF_StripeCount_t totSparePUsPerDisk;
	RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
	RF_SectorCount_t SpareSpaceInSUs;
	char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
	RF_StripeNum_t l, SUID;

	numCompleteSpareRegionsPerDisk = 0;

	/* 1. create layout specific structure */
	RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
	layoutPtr->layoutSpecificInfo = (void *) info;
	info->SpareTable = NULL;

	/* 2. extract parameters from the config structure */
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		/* with distributed sparing the sparemap file name leads the buffer */
		(void)memcpy(info->sparemap_fname, cfgBuf, RF_SPAREMAP_NAME_LEN);
		cfgBuf += RF_SPAREMAP_NAME_LEN;
	/* NOTE(review): closing brace of the if above is missing here. */

	/* block-design parameters are packed as consecutive ints */
	b = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	v = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	k = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	r = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	lambda = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	raidPtr->noRotate = *((int *) cfgBuf);
	cfgBuf += sizeof(int);

	/* the sparemaps are generated assuming that parity is rotated, so we
	 * issue a warning if both distributed sparing and no-rotate are on at
	 * the same time (remainder of comment lost in extraction) */
	if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
		RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
	/* NOTE(review): closing brace missing. */
	if (raidPtr->numCol != v) {
		RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
	/* NOTE(review): the error return and closing brace are missing. */

	/* 3. set up the values used in the mapping code */
	info->BlocksPerTable = b;
	info->Lambda = lambda;
	info->NumParityReps = info->groupSize = k;
	info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU;	/* b blks, k-1 SUs each */
	info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
	info->PUsPerBlock = k - 1;
	info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
	info->TableDepthInPUs = (b * k) / v;
	info->FullTableDepthInPUs = info->TableDepthInPUs * k;	/* k repetitions */

	/* used only in distributed sparing case */
	info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);	/* (v-1)/gcd fulltables */
	info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
	info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU;

	/* check to make sure the block design is sufficiently small */
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
			    (int) info->FullTableDepthInPUs,
			    (int) info->SpareSpaceDepthPerRegionInSUs,
			    (int) layoutPtr->stripeUnitsPerDisk);
		/* NOTE(review): error return, closing braces, and the matching
		 * "else" arm separating the non-distributed check are missing. */
		if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
			    (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU), \
			    (int) layoutPtr->stripeUnitsPerDisk);
		/* NOTE(review): error return and closing braces missing. */

	/* compute the size of each disk, and the number of tables in the last
	 * fulltable (which need not be complete) */
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {

		PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
		spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
		    (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1));
		info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;

		numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
		info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
		extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;

		/* assume conservatively that we need the full amount of spare
		 * space in one region in order to provide spares for the
		 * partial spare region at the end of the array. We set "i"
		 * to the number of tables in the partial spare region. This
		 * may actually include some fulltables. */
		extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		if (extraPUsPerDisk <= 0)
		/* NOTE(review): the then-arm (presumably "i = 0;") and the
		 * "else" of this conditional are missing from the extraction;
		 * only the else-side assignment below survives. */
			i = extraPUsPerDisk / info->TableDepthInPUs;

		complete_FT_count = (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k);
		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk = i % k;

		/* note that in the last spare region, the spare space is
		 * complete even though data/parity space is not */
		totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		info->TotSparePUsPerDisk = totSparePUsPerDisk;

		layoutPtr->stripeUnitsPerDisk =
		    ((complete_FT_count) * info->FullTableDepthInPUs +	/* data & parity space */
		    info->ExtraTablesPerDisk * info->TableDepthInPUs +
		    totSparePUsPerDisk	/* spare space */
		    ) * layoutPtr->SUsPerPU;
		layoutPtr->dataStripeUnitsPerDisk =
		    (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
		    * layoutPtr->SUsPerPU * (k - 1) / k;
	/* NOTE(review): the "} else {" separating the distributed-sparing and
	 * non-distributed cases is missing here. */

		/* non-dist spare case: force each disk to contain an
		 * integral number of tables */
		layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
		layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);

		/* compute the number of tables in the last fulltable, which
		 * need not be complete */
		/* NOTE(review): the left-hand side of the assignment below
		 * (presumably "complete_FT_count =") is missing. */
		((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs);

		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk =
		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;

	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;

	/* find the disk offset of the stripe unit where the last fulltable
	 * starts (comment truncated in extraction) */
	numCompleteFullTablesPerDisk = complete_FT_count;
	diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
		diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
		info->DiskOffsetOfLastSpareSpaceChunkInSUs =
		    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
	/* NOTE(review): closing brace missing. */
	info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
	info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;

	/* 4. create and initialize the lookup tables */
	info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->LayoutTable == NULL)
	/* NOTE(review): the allocation-failure returns are missing from this
	 * and the three checks below. */
	info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->OffsetTable == NULL)
	info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
	if (info->BlockTable == NULL)
	first_avail_slot = rf_make_1d_array(v, NULL);
	if (first_avail_slot == NULL)

	/* the layout table itself follows the packed parameters in cfgBuf */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++)
			info->LayoutTable[i][j] = *cfgBuf++;

	/* initialize offset table */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++) {
			info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]];
			first_avail_slot[info->LayoutTable[i][j]]++;

	/* initialize block table */
	for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
		for (i = 0; i < b; i++) {
			for (j = 0; j < k; j++) {
				info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l]
				    [info->LayoutTable[i][j]] = SUID;
	/* NOTE(review): the SUID increment and the loop-closing braces are
	 * missing from the extraction. */

	rf_free_1d_array(first_avail_slot, v);

	/* 5. set up the remaining redundant-but-useful parameters */
	raidPtr->totalSectors = (k * complete_FT_count + info->ExtraTablesPerDisk) *
	    info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 1);

	/* strange evaluation order below to try and minimize overflow
	 * (remainder of comment lost in extraction) */
	layoutPtr->dataSectorsPerStripe = (k - 1) * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numDataCol = k - 1;
	layoutPtr->numParityCol = 1;
289 /* declustering with distributed sparing */
290 static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t
);
292 rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg
)
294 RF_DeclusteredConfigInfo_t
*info
;
297 raidPtr
= (RF_Raid_t
*) arg
;
298 info
= (RF_DeclusteredConfigInfo_t
*) raidPtr
->Layout
.layoutSpecificInfo
;
299 if (info
->SpareTable
)
300 rf_FreeSpareTable(raidPtr
);
/*
 * Configure the declustered layout with distributed sparing: run the plain
 * declustered configuration, then register a shutdown hook that frees the
 * spare table.
 *
 * NOTE(review): damaged extraction -- the return type, the trailing
 * parameter (cfgPtr is passed below), the "rc" declaration, the braces and
 * the return statements are missing.  Comments only added.
 */
rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
	rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
	rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
/*
 * Map a RAID address (in sectors) to the column and physical disk sector
 * holding the corresponding DATA unit in the declustered layout.
 *
 * NOTE(review): damaged extraction -- the return type, the "RF_RowCol_t
 * *col" output parameter (written below), the braces, and the
 * "if (remap) { ... } else {" wrapper around the spare-space remapping
 * are missing.  Comments only added; surviving code tokens unchanged.
 */
rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
    RF_SectorNum_t *diskSector, int remap)
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

	/* account for the (possibly partial) last fulltable */
	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

	FullTableID = SUID / sus_per_fulltable;	/* fulltable ID within array
						 * (comment truncated) */
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	/* NOTE(review): closing brace missing. */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	BlockID = TableOffset / info->PUsPerBlock;
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;
	RepIndex = info->PUsPerBlock - TableID;
	if (!raidPtr->noRotate)
		BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);	/* step over the parity position */
	*col = info->LayoutTable[BlockID][BlockOffset];

	/* remap to distributed spare space if indicated */
	/* NOTE(review): the "if (remap) {" guard around the assertion and the
	 * remap call is missing from the extraction, as is its "} else {". */
	RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared ||
	    (rf_copyback_in_progress && raidPtr->Disks[*col].status == rf_ds_optimal));
	rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);

	outSU += FullTableID * fulltable_depth;	/* offs to strt of FT */
	outSU += SpareSpace;	/* skip rsvd spare space */
	outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;	/* offs to strt of tble */
	outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU;	/* offs to the PU */
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);	/* offs to the SU within
										 * (comment truncated) */

	/* convert SUs to sectors, and, if not aligned to SU boundary, add in
	 * offset to sector. */
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT(*col != -1);
/* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
/*
 * Map a RAID address to the column and disk sector of the PARITY unit of
 * its stripe.  Mirrors rf_MapSectorDeclustered, but indexes the layout and
 * offset tables with the parity position RepIndex instead of the data
 * position BlockOffset.
 *
 * NOTE(review): damaged extraction -- the return type, the "RF_RowCol_t
 * *col" output parameter, the braces, and the "if (remap) { ... } else {"
 * wrapper are missing.  Comments only added; surviving code tokens
 * unchanged.
 */
rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
    RF_SectorNum_t *diskSector, int remap)
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

	/* compute row & (possibly) spare space exactly as before */
	FullTableID = SUID / sus_per_fulltable;
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	/* NOTE(review): closing brace missing. */
	/* compute BlockID and RepIndex exactly as before */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	/* TableOffset = FullTableOffset % info->SUsPerTable; */
	/* BlockID = (TableOffset / info->PUsPerBlock) %
	 * info->BlocksPerTable; */
	BlockID = TableOffset / info->PUsPerBlock;
	/* BlockOffset = TableOffset % info->PUsPerBlock; */
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;

	/* the parity block is in the position indicated by RepIndex */
	RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
	*col = info->LayoutTable[BlockID][RepIndex];

	/* NOTE(review): the "if (remap) {" guard around the assertion and
	 * remap call is missing from the extraction, as is its "} else {". */
	RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared ||
	    (rf_copyback_in_progress && raidPtr->Disks[*col].status == rf_ds_optimal));
	rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);

	/* compute sector as before, except use RepIndex instead of
	 * (remainder of comment lost in extraction) */
	outSU += FullTableID * fulltable_depth;
	outSU += SpareSpace;	/* skip rsvd spare space */
	outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
	outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;

	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT(*col != -1);
435 /* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
436 * the caller must _never_ attempt to modify this array.
439 rf_IdentifyStripeDeclustered(RF_Raid_t
*raidPtr
, RF_RaidAddr_t addr
,
440 RF_RowCol_t
**diskids
)
442 RF_RaidLayout_t
*layoutPtr
= &(raidPtr
->Layout
);
443 RF_DeclusteredConfigInfo_t
*info
= (RF_DeclusteredConfigInfo_t
*) layoutPtr
->layoutSpecificInfo
;
444 RF_StripeCount_t sus_per_fulltable
= info
->SUsPerFullTable
;
445 RF_StripeCount_t fulltable_depth
= info
->FullTableDepthInPUs
* layoutPtr
->SUsPerPU
;
446 RF_StripeNum_t base_suid
= 0;
447 RF_StripeNum_t SUID
= rf_RaidAddressToStripeUnitID(layoutPtr
, addr
);
448 RF_StripeNum_t stripeID
, FullTableID
;
451 rf_decluster_adjust_params(layoutPtr
, &SUID
, &sus_per_fulltable
, &fulltable_depth
, &base_suid
);
452 FullTableID
= SUID
/ sus_per_fulltable
; /* fulltable ID within array
454 stripeID
= rf_StripeUnitIDToStripeID(layoutPtr
, SUID
); /* find stripe offset
456 tableOffset
= (stripeID
% info
->BlocksPerTable
); /* find offset into
457 * block design table */
458 *diskids
= info
->LayoutTable
[tableOffset
];
460 /* This returns the default head-separation limit, which is measured
461 * in "required units for reconstruction". Each time a disk fetches
462 * a unit, it bumps a counter. The head-sep code prohibits any disk
463 * from getting more than headSepLimit counter values ahead of any
466 * We assume here that the number of floating recon buffers is already
467 * set. There are r stripes to be reconstructed in each table, and so
468 * if we have a total of B buffers, we can have at most B/r tables
469 * under recon at any one time. In each table, lambda units are required
470 * from each disk, so given B buffers, the head sep limit has to be
471 * (lambda*B)/r units. We subtract one to avoid weird boundary cases.
473 * for example, suppose were given 50 buffers, r=19, and lambda=4 as in
474 * the 20.5 design. There are 19 stripes/table to be reconstructed, so
475 * we can have 50/19 tables concurrently under reconstruction, which means
476 * we can allow the fastest disk to get 50/19 tables ahead of the slower
477 * disk. There are lambda "required units" for each disk, so the fastest
478 * disk can get 4*50/19 = 10 counter values ahead of the slowest.
480 * If numBufsToAccumulate is not 1, we need to limit the head sep further
481 * because multiple bufs will be required for each stripe under recon.
484 rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t
*raidPtr
)
486 RF_DeclusteredConfigInfo_t
*info
= (RF_DeclusteredConfigInfo_t
*) raidPtr
->Layout
.layoutSpecificInfo
;
488 return (info
->Lambda
* raidPtr
->numFloatingReconBufs
/ info
->TableDepthInPUs
/ rf_numBufsToAccumulate
);
490 /* returns the default number of recon buffers to use. The value
491 * is somewhat arbitrary...it's intended to be large enough to allow
492 * for a reasonably large head-sep limit, but small enough that you
493 * don't use up all your system memory with buffers.
496 rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t
* raidPtr
)
498 return (100 * rf_numBufsToAccumulate
);
500 /* sectors in the last fulltable of the array need to be handled
501 * specially since this fulltable can be incomplete. this function
502 * changes the values of certain params to handle this.
504 * the idea here is that MapSector et. al. figure out which disk the
505 * addressed unit lives on by computing the modulos of the unit number
506 * with the number of units per fulltable, table, etc. In the last
507 * fulltable, there are fewer units per fulltable, so we need to adjust
508 * the number of user data units per fulltable to reflect this.
510 * so, we (1) convert the fulltable size and depth parameters to
511 * the size of the partial fulltable at the end, (2) compute the
512 * disk sector offset where this fulltable starts, and (3) convert
513 * the users stripe unit number from an offset into the array to
514 * an offset into the last fulltable.
517 rf_decluster_adjust_params(RF_RaidLayout_t
*layoutPtr
,
518 RF_StripeNum_t
*SUID
,
519 RF_StripeCount_t
*sus_per_fulltable
,
520 RF_StripeCount_t
*fulltable_depth
,
521 RF_StripeNum_t
*base_suid
)
523 RF_DeclusteredConfigInfo_t
*info
= (RF_DeclusteredConfigInfo_t
*) layoutPtr
->layoutSpecificInfo
;
525 if (*SUID
>= info
->FullTableLimitSUID
) {
526 /* new full table size is size of last full table on disk */
527 *sus_per_fulltable
= info
->ExtraTablesPerDisk
* info
->SUsPerTable
;
529 /* new full table depth is corresponding depth */
530 *fulltable_depth
= info
->ExtraTablesPerDisk
* info
->TableDepthInPUs
* layoutPtr
->SUsPerPU
;
532 /* set up the new base offset */
533 *base_suid
= info
->DiskOffsetOfLastFullTableInSUs
;
535 /* convert users array address to an offset into the last
537 *SUID
-= info
->FullTableLimitSUID
;
541 * map a stripe ID to a parity stripe ID.
542 * See comment above RaidAddressToParityStripeID in layout.c.
545 rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t
*layoutPtr
,
546 RF_StripeNum_t stripeID
,
547 RF_StripeNum_t
*psID
,
548 RF_ReconUnitNum_t
*which_ru
)
550 RF_DeclusteredConfigInfo_t
*info
;
552 info
= (RF_DeclusteredConfigInfo_t
*) layoutPtr
->layoutSpecificInfo
;
554 *psID
= (stripeID
/ (layoutPtr
->SUsPerPU
* info
->BlocksPerTable
))
555 * info
->BlocksPerTable
+ (stripeID
% info
->BlocksPerTable
);
556 *which_ru
= (stripeID
% (info
->BlocksPerTable
* layoutPtr
->SUsPerPU
))
557 / info
->BlocksPerTable
;
558 RF_ASSERT((*which_ru
) < layoutPtr
->SUsPerPU
/ layoutPtr
->SUsPerRU
);
/*
 * Called from MapSector and MapParity to retarget an access at the spare unit.
 * Modifies the "col" and "outSU" parameters only.
 *
 * NOTE(review): damaged extraction -- the return type, the "RF_RowCol_t
 * *outCol" parameter (written below), the "which_ft" declarator, the
 * then-arm and "} else {" of the base_suid test, and the trailing braces
 * are missing.  Comments only added; surviving code tokens unchanged.
 */
rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr,
			RF_DeclusteredConfigInfo_t *info,
			RF_StripeNum_t FullTableID,
			RF_StripeNum_t TableID,
			RF_SectorNum_t BlockID,
			RF_StripeNum_t base_suid,
			RF_StripeNum_t SpareRegion,
			RF_StripeNum_t *outSU)
	RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset,

	/*
	 * note that FullTableID and hence SpareRegion may have gotten
	 * tweaked by rf_decluster_adjust_params. We detect this by
	 * noticing that base_suid is not 0.
	 */
	if (base_suid == 0) {
	/* NOTE(review): the then-arm of this test and its "} else {" are
	 * missing; the statements below belong to the nonzero-base case. */
		/*
		 * There may be > 1.0 full tables in the last (i.e. partial)
		 * spare region. find out which of these we're in.
		 */
		lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs;
		which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);

		/* compute the actual full table ID */
		ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft;
		SpareRegion = info->NumCompleteSRs;

	TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion;

	/* the spare table names the replacement disk and its SU offset */
	*outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
	RF_ASSERT(*outCol != -1);

	spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
	    info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU :
	    (SpareRegion + 1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs;
	*outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
	if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
		printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n", (long) *outSU);
	/* NOTE(review): the remainder of the function was lost in the
	 * extraction. */
611 #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */
613 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * Request a spare table from the user-level daemon and install it: packs a
 * description of this layout into an RF_SparetWait_t request and blocks in
 * rf_GetSpareTableFromDaemon until the table arrives.
 *
 * NOTE(review): damaged extraction -- the return type, the rest of the
 * parameter list after "frow", the "retcode" declaration, the braces and
 * the final return are missing.  Comments only added; surviving code
 * tokens unchanged.
 */
rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow,
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	RF_SparetWait_t *req;

	/* describe the layout so the daemon can build a matching table */
	RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
	req->C = raidPtr->numCol;
	req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;

	req->SUsPerPU = raidPtr->Layout.SUsPerPU;
	req->TablesPerSpareRegion = info->TablesPerSpareRegion;
	req->BlocksPerTable = info->BlocksPerTable;
	req->TableDepthInPUs = info->TableDepthInPUs;
	req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs;

	retcode = rf_GetSpareTableFromDaemon(req);
	RF_ASSERT(!retcode);	/* XXX -- fix this to recover gracefully --
				 * (remainder of comment lost in extraction) */
638 #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0)
/*
 * Invoked via ioctl to install a spare table in the kernel.
 *
 * NOTE(review): damaged extraction -- the return type, the braces, the
 * "retcode"/"i" declarations, the copyin error-handling branches, and the
 * final return are missing.  Comments only added; surviving code tokens
 * unchanged.
 */
rf_SetSpareTable(RF_Raid_t *raidPtr, void *data)
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	RF_SpareTableEntry_t **ptrs;

	/* what we need to copyin is a 2-d array, so first copyin the user
	 * pointers to the rows in the table */
	RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
	retcode = copyin((void *) data, (void *) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
	/* NOTE(review): the check of this copyin's retcode is missing. */

	/* now allocate kernel space for the row pointers */
	RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));

	/* now allocate kernel space for each row in the table, and copy it in
	 * (remainder of comment lost in extraction) */
	for (i = 0; i < info->TablesPerSpareRegion; i++) {
		RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
		retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
		/* NOTE(review): the copyin-failure branch that frees the
		 * partially built table before the reset below is missing. */
		info->SpareTable = NULL;	/* blow off the memory
						 * (comment truncated) */

	/* free up the temporary array we used */
	RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
679 rf_GetNumSpareRUsDeclustered(RF_Raid_t
*raidPtr
)
681 RF_RaidLayout_t
*layoutPtr
= &raidPtr
->Layout
;
683 return (((RF_DeclusteredConfigInfo_t
*) layoutPtr
->layoutSpecificInfo
)->TotSparePUsPerDisk
);
685 #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */
688 rf_FreeSpareTable(RF_Raid_t
*raidPtr
)
691 RF_RaidLayout_t
*layoutPtr
= &raidPtr
->Layout
;
692 RF_DeclusteredConfigInfo_t
*info
= (RF_DeclusteredConfigInfo_t
*) layoutPtr
->layoutSpecificInfo
;
693 RF_SpareTableEntry_t
**table
= info
->SpareTable
;
695 for (i
= 0; i
< info
->TablesPerSpareRegion
; i
++) {
696 RF_Free(table
[i
], info
->BlocksPerTable
* sizeof(RF_SpareTableEntry_t
));
698 RF_Free(table
, info
->TablesPerSpareRegion
* sizeof(RF_SpareTableEntry_t
*));
699 info
->SpareTable
= (RF_SpareTableEntry_t
**) NULL
;