/* $NetBSD: rf_paritymap.c,v 1.2 2009/11/26 01:23:56 kenh Exp $ */

/*-
 * Copyright (c) 2009 Jed Davis.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.2 2009/11/26 01:23:56 kenh Exp $");

#include <sys/param.h>
#include <sys/callout.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/systm.h>
#include <sys/types.h>

#include <dev/raidframe/rf_paritymap.h>
#include <dev/raidframe/rf_stripelocks.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_parityscan.h>
#include <dev/raidframe/rf_kintf.h>

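/*
 * The parity map divides the RAID set's sector space into regions and
 * tracks, in a bitmap kept on the components, which regions may have
 * parity that is out of sync with the data.  A region's bit is set on
 * disk before any write to it is issued and cleared only after the
 * region has been idle for a cooldown period, so that after an unclean
 * shutdown only the regions marked dirty need their parity rewritten.
 * Three structures are involved: "disk_boot" holds the map as read when
 * the set was configured (regions still needing rewrite), "disk_now"
 * holds the map as currently written to disk, and "current" holds the
 * in-core per-region activity counters.
 */
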
/* Important parameters: */
#define REGION_MINSIZE (25ULL << 20)	/* minimum region size: 25 MiB */
#define DFL_TICKMS	40000
#define DFL_COOLDOWN	8	/* 7-8 intervals of 40s = 5min +/- 20s */

/* Internal-use flag bits. */
#define TICKING 1
#define TICKED 2

static void rf_paritymap_write_locked(struct rf_paritymap *);
static void rf_paritymap_tick(void *);
static u_int rf_paritymap_nreg(RF_Raid_t *);

/* Extract the current status of the parity map. */
void
rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
{
        memset(ps, 0, sizeof(*ps));
        if (pm == NULL) {
                ps->enabled = 0;
        } else {
                ps->enabled = 1;
                ps->region_size = pm->region_size;
                mutex_enter(&pm->lock);
                memcpy(&ps->params, &pm->params, sizeof(ps->params));
                memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
                memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
                mutex_exit(&pm->lock);
        }
}

/*
 * Test whether parity in a given sector is suspected of being inconsistent
 * on disk (assuming that any pending I/O to it is allowed to complete).
 * This may be of interest to future work on parity scrubbing.
 */
int
rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
{
        unsigned region = sector / pm->region_size;
        int retval;

        mutex_enter(&pm->lock);
        retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
        mutex_exit(&pm->lock);
        return retval;
}

/* To be called before a write to the RAID is submitted. */
void
rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
{
        unsigned i, b, e;

        b = offset / pm->region_size;
        e = (offset + size - 1) / pm->region_size;

        for (i = b; i <= e; i++)
                rf_paritymap_begin_region(pm, i);
}

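/*
 * Illustration (hypothetical numbers): with a region_size of 1000 sectors,
 * a 200-sector write starting at sector 2900 spans regions 2900/1000 = 2
 * through (2900 + 200 - 1)/1000 = 3, so rf_paritymap_begin above and
 * rf_paritymap_end below each touch regions 2 and 3.
 */
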
/* To be called after a write to the RAID completes. */
void
rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
{
        unsigned i, b, e;

        b = offset / pm->region_size;
        e = (offset + size - 1) / pm->region_size;

        for (i = b; i <= e; i++)
                rf_paritymap_end_region(pm, i);
}

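/*
 * Per-region state, as manipulated by the two functions below: a positive
 * value counts writes currently outstanding in the region, a negative value
 * is the number of cooldown ticks remaining before the region may be marked
 * clean, and zero means the region is quiescent.
 */
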
void
rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region)
{
        int needs_write;

        KASSERT(region < RF_PARITYMAP_NREG);
        pm->ctrs.nwrite++;

        /* If it was being kept warm, deal with that. */
        mutex_enter(&pm->lock);
        if (pm->current->state[region] < 0)
                pm->current->state[region] = 0;

        /* This shouldn't happen unless RAIDOUTSTANDING is set too high. */
        KASSERT(pm->current->state[region] < 127);
        pm->current->state[region]++;

        /* If the region isn't already marked dirty on disk, mark it now. */
        needs_write = isclr(pm->disk_now->bits, region);
        if (needs_write) {
                KASSERT(pm->current->state[region] == 1);
                rf_paritymap_write_locked(pm);
        }

        mutex_exit(&pm->lock);
}

void
rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region)
{
        KASSERT(region < RF_PARITYMAP_NREG);

        mutex_enter(&pm->lock);
        KASSERT(pm->current->state[region] > 0);
        --pm->current->state[region];

        if (pm->current->state[region] <= 0) {
                pm->current->state[region] = -pm->params.cooldown;
                KASSERT(pm->current->state[region] <= 0);
                mutex_enter(&pm->lk_flags);
                if (!(pm->flags & TICKING)) {
                        pm->flags |= TICKING;
                        mutex_exit(&pm->lk_flags);
                        callout_schedule(&pm->ticker,
                            mstohz(pm->params.tickms));
                } else
                        mutex_exit(&pm->lk_flags);
        }
        mutex_exit(&pm->lock);
}

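/*
 * Note that the callout is armed above only if the TICKING flag was not
 * already set; while regions are still cooling down, rearming after each
 * tick is handled by rf_paritymap_checkwork() below.
 */
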
/*
 * Updates the parity map to account for any changes in current activity
 * and/or an ongoing parity scan, then writes it to disk with appropriate
 * synchronization.
 */
void
rf_paritymap_write(struct rf_paritymap *pm)
{
        mutex_enter(&pm->lock);
        rf_paritymap_write_locked(pm);
        mutex_exit(&pm->lock);
}

/* As above, but to be used when pm->lock is already held. */
static void
rf_paritymap_write_locked(struct rf_paritymap *pm)
{
        char w, w0;
        int i, j, setting, clearing;

        setting = clearing = 0;
        for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
                w0 = pm->disk_now->bits[i];
                w = pm->disk_boot->bits[i];

                /* Regions with any current activity must be marked dirty. */
                for (j = 0; j < NBBY; j++)
                        if (pm->current->state[i * NBBY + j] != 0)
                                w |= 1 << j;

                if (w & ~w0)
                        setting = 1;
                if (w0 & ~w)
                        clearing = 1;

                pm->disk_now->bits[i] = w;
        }
        pm->ctrs.ncachesync += setting + clearing;
        pm->ctrs.nclearing += clearing;

        /*
         * If bits are being set in the parity map, then a sync is
         * required afterwards, so that the regions are marked dirty
         * on disk before any writes to them take place.  If bits are
         * being cleared, then a sync is required before the write, so
         * that any writes to those regions are processed before the
         * region is marked clean.  (Synchronization is somewhat
         * overkill; a write ordering barrier would suffice, but we
         * currently have no way to express that directly.)
         */
        if (clearing)
                rf_sync_component_caches(pm->raid);
        rf_paritymap_kern_write(pm->raid, pm->disk_now);
        if (setting)
                rf_sync_component_caches(pm->raid);
}

/* Mark all parity as being in need of rewrite. */
void
rf_paritymap_invalidate(struct rf_paritymap *pm)
{
        mutex_enter(&pm->lock);
        memset(pm->disk_boot, ~(unsigned char)0,
            sizeof(struct rf_paritymap_ondisk));
        mutex_exit(&pm->lock);
}

/* Mark all parity as being correct. */
void
rf_paritymap_forceclean(struct rf_paritymap *pm)
{
        mutex_enter(&pm->lock);
        memset(pm->disk_boot, (unsigned char)0,
            sizeof(struct rf_paritymap_ondisk));
        mutex_exit(&pm->lock);
}

/*
 * The cooldown callout routine just defers its work to a thread; it can't do
 * the parity map write itself as it would block, and although mutex-induced
 * blocking is permitted it seems wise to avoid tying up the softint.
 */
static void
rf_paritymap_tick(void *arg)
{
        struct rf_paritymap *pm = arg;

        mutex_enter(&pm->lk_flags);
        pm->flags |= TICKED;
        mutex_exit(&pm->lk_flags);
        wakeup(&(pm->raid->iodone)); /* XXX */
}

/*
 * This is where the parity cooling work (and rearming the callout if needed)
 * is done; the raidio thread calls it when woken up, as by the above.
 */
void
rf_paritymap_checkwork(struct rf_paritymap *pm)
{
        int i, zerop, progressp;

        mutex_enter(&pm->lk_flags);
        if (pm->flags & TICKED) {
                zerop = progressp = 0;

                pm->flags &= ~TICKED;
                mutex_exit(&pm->lk_flags);

                mutex_enter(&pm->lock);
                for (i = 0; i < RF_PARITYMAP_NREG; i++) {
                        if (pm->current->state[i] < 0) {
                                progressp = 1;
                                pm->current->state[i]++;
                                if (pm->current->state[i] == 0)
                                        zerop = 1;
                        }
                }

                /* If any region is still cooling down, keep ticking. */
                if (progressp)
                        callout_schedule(&pm->ticker,
                            mstohz(pm->params.tickms));
                else {
                        mutex_enter(&pm->lk_flags);
                        pm->flags &= ~TICKING;
                        mutex_exit(&pm->lk_flags);
                }

                /* If some region finished cooling, write out the cleaner map. */
                if (zerop)
                        rf_paritymap_write_locked(pm);
                mutex_exit(&pm->lock);
        } else
                mutex_exit(&pm->lk_flags);
}

/*
 * Set parity map parameters; used both to alter parameters on the fly and to
 * establish their initial values.  Note that setting a parameter to 0 means
 * to leave the previous setting unchanged, and that if this is done for the
 * initial setting of "regions", then a default value will be computed based
 * on the RAID component size.
 */
int
rf_paritymap_set_params(struct rf_paritymap *pm,
    const struct rf_pmparams *params, int todisk)
{
        int cooldown, tickms;
        u_int regions;
        RF_RowCol_t col;
        RF_ComponentLabel_t *clabel;
        RF_Raid_t *raidPtr;

        /* A zero parameter means "keep the current setting". */
        cooldown = params->cooldown != 0
            ? params->cooldown : pm->params.cooldown;
        tickms = params->tickms != 0
            ? params->tickms : pm->params.tickms;
        regions = params->regions != 0
            ? params->regions : pm->params.regions;

        if (cooldown < 1 || cooldown > 128) {
                printf("raid%d: cooldown %d out of range\n", pm->raid->raidid,
                    cooldown);
                return -1;
        }
        if (tickms < 10) {
                printf("raid%d: tick time %dms out of range\n",
                    pm->raid->raidid, tickms);
                return -1;
        }
        if (regions == 0) {
                regions = rf_paritymap_nreg(pm->raid);
        } else if (regions > RF_PARITYMAP_NREG) {
                printf("raid%d: region count %u too large (more than %u)\n",
                    pm->raid->raidid, regions, RF_PARITYMAP_NREG);
                return -1;
        }

        /* XXX any currently warm parity will be used with the new tickms! */
        pm->params.cooldown = cooldown;
        pm->params.tickms = tickms;
        /* Apply the initial region count, but do not change it after that. */
        if (pm->params.regions == 0)
                pm->params.regions = regions;

        /* So that the newly set parameters can be tested: */
        pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0;

        if (todisk) {
                raidPtr = pm->raid;
                for (col = 0; col < raidPtr->numCol; col++) {
                        clabel = raidget_component_label(raidPtr, col);
                        clabel->parity_map_ntick = cooldown;
                        clabel->parity_map_tickms = tickms;
                        clabel->parity_map_regions = regions;
                        raidflush_component_label(raidPtr, col);
                }
        }
        return 0;
}

/*
 * The number of regions may not be as many as can fit into the map, because
 * when regions are too small, the overhead of setting parity map bits
 * becomes significant in comparison to the actual I/O, while the
 * corresponding gains in parity verification time become negligible.  Thus,
 * a minimum region size (defined above) is imposed.
 *
 * Note that, if the number of regions is less than the maximum, then some of
 * the regions will be "fictional", corresponding to no actual disk; some
 * parts of the code may process them as normal, but they can not ever be
 * written to.
 */

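/*
 * For illustration (hypothetical component size): a 100 GiB component
 * yields 100 GiB / 25 MiB = 4096 regions, and the result is capped at
 * RF_PARITYMAP_NREG, the most regions the on-disk map can describe.
 */
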
static u_int
rf_paritymap_nreg(RF_Raid_t *raid)
{
        daddr_t bytes_per_disk, nreg;

        bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector;
        nreg = bytes_per_disk / REGION_MINSIZE;
        if (nreg > RF_PARITYMAP_NREG)
                nreg = RF_PARITYMAP_NREG;

        return nreg;
}

/*
 * Initialize a parity map given specific parameters.  This neither reads nor
 * writes the parity map config in the component labels; for that, see below.
 */
int
rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid,
    const struct rf_pmparams *params)
{
        daddr_t rstripes;
        struct rf_pmparams safe;

        pm->raid = raid;
        pm->params.regions = 0;
        if (0 != rf_paritymap_set_params(pm, params, 0)) {
                /*
                 * If the parameters are out-of-range, then bring the
                 * parity map up with something reasonable, so that
                 * the admin can at least go and fix it (or ignore it
                 * entirely).
                 */
                safe.cooldown = DFL_COOLDOWN;
                safe.tickms = DFL_TICKMS;
                safe.regions = 0;

                if (0 != rf_paritymap_set_params(pm, &safe, 0))
                        return -1;
        }

        rstripes = howmany(raid->Layout.numStripe, pm->params.regions);
        pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe;

        callout_init(&pm->ticker, CALLOUT_MPSAFE);
        callout_setfunc(&pm->ticker, rf_paritymap_tick, pm);
        pm->flags = 0;

        pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
            KM_SLEEP);
        pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
            KM_SLEEP);
        pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current),
            KM_SLEEP);

        rf_paritymap_kern_read(pm->raid, pm->disk_boot);
        memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now));

        mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE);
        mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK);

        return 0;
}

/*
 * Destroys a parity map; unless "force" is set, also cleans parity for any
 * regions which were still in cooldown (but are not dirty on disk).
 */
void
rf_paritymap_destroy(struct rf_paritymap *pm, int force)
{
        int i;

        callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */
        callout_destroy(&pm->ticker);

        if (!force) {
                mutex_enter(&pm->lock);
                /* Regions still in cooldown are clean; reflect that. */
                for (i = 0; i < RF_PARITYMAP_NREG; i++) {
                        /* XXX check for > 0 ? */
                        if (pm->current->state[i] < 0)
                                pm->current->state[i] = 0;
                }

                rf_paritymap_write_locked(pm);
                mutex_exit(&pm->lock);
        }

        mutex_destroy(&pm->lock);
        mutex_destroy(&pm->lk_flags);

        kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk));
        kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk));
        kmem_free(pm->current, sizeof(struct rf_paritymap_current));
}

/*
 * Rewrite parity, taking parity map into account; this is the equivalent of
 * the old rf_RewriteParity, and is likewise to be called from a suitable
 * thread and shouldn't have multiple copies running in parallel and so on.
 *
 * Note that the fictional regions are "cleaned" in one shot, so that very
 * small RAIDs (useful for testing) will not experience potentially severe
 * regressions in rewrite time.
 */
int
rf_paritymap_rewrite(struct rf_paritymap *pm)
{
        int i, ret_val = 0;
        daddr_t reg_b, reg_e;

        /* Process only the actual regions. */
        for (i = 0; i < pm->params.regions; i++) {
                mutex_enter(&pm->lock);
                if (isset(pm->disk_boot->bits, i)) {
                        mutex_exit(&pm->lock);

                        reg_b = i * pm->region_size;
                        reg_e = reg_b + pm->region_size;
                        if (reg_e > pm->raid->totalSectors)
                                reg_e = pm->raid->totalSectors;

                        if (rf_RewriteParityRange(pm->raid, reg_b,
                            reg_e - reg_b)) {
                                ret_val = 1;
                                if (pm->raid->waitShutdown)
                                        return ret_val;
                        } else {
                                mutex_enter(&pm->lock);
                                clrbit(pm->disk_boot->bits, i);
                                rf_paritymap_write_locked(pm);
                                mutex_exit(&pm->lock);
                        }
                } else {
                        mutex_exit(&pm->lock);
                }
        }

        /* Now, clear the fictional regions, if any. */
        rf_paritymap_forceclean(pm);
        rf_paritymap_write(pm);

        return ret_val;
}

/*
 * How to merge the on-disk parity maps when reading them in from the
 * various components; returns whether they differ.  In the case that
 * they do differ, sets *dst to the union of *dst and *src.
 *
 * In theory, it should be safe to take the intersection (or just pick
 * a single component arbitrarily), but the paranoid approach costs
 * little.
 *
 * Appropriate locking, if any, is the responsibility of the caller.
 */
int
rf_paritymap_merge(struct rf_paritymap_ondisk *dst,
    struct rf_paritymap_ondisk *src)
{
        int i, discrepancy = 0;

        for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
                if (dst->bits[i] != src->bits[i])
                        discrepancy = 1;
                dst->bits[i] |= src->bits[i];
        }

        return discrepancy;
}

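/*
 * For example (hypothetical values): if one component's map byte is 0x0f
 * and another's is 0x30, the merged byte becomes 0x3f and the maps are
 * reported as differing.
 */
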
/*
 * Detach a parity map from its RAID.  This is not meant to be applied except
 * when unconfiguring the RAID after all I/O has been resolved, as otherwise
 * an out-of-date parity map could be treated as current.
 */
void
rf_paritymap_detach(RF_Raid_t *raidPtr)
{
        if (raidPtr->parity_map == NULL)
                return;

        simple_lock(&(raidPtr->iodone_lock));
        struct rf_paritymap *pm = raidPtr->parity_map;
        raidPtr->parity_map = NULL;
        simple_unlock(&(raidPtr->iodone_lock));
        /* XXXjld is that enough locking?  Or too much? */
        rf_paritymap_destroy(pm, 0);
        kmem_free(pm, sizeof(*pm));
}

/*
 * Attach a parity map to a RAID set if appropriate.  Includes
 * configure-time processing of parity-map fields of component label.
 */
void
rf_paritymap_attach(RF_Raid_t *raidPtr, int force)
{
        RF_RowCol_t col;
        int pm_use, pm_zap;
        int g_tickms, g_ntick, g_regions;
        int good;
        RF_ComponentLabel_t *clabel;
        u_int flags, regions;
        struct rf_pmparams params;

        if (raidPtr->Layout.map->faultsTolerated == 0) {
                /* There isn't any parity. */
                return;
        }

        pm_use = 1;
        pm_zap = 0;
        g_tickms = DFL_TICKMS;
        g_ntick = DFL_COOLDOWN;
        g_regions = 0;

        /*
         * Collect opinions on the set config.  If this is the initial
         * config (raidctl -C), treat all labels as invalid, since
         * there may be random data present.
         */
        if (!force) {
                for (col = 0; col < raidPtr->numCol; col++) {
                        clabel = raidget_component_label(raidPtr, col);
                        flags = clabel->parity_map_flags;
                        /* Check for use by non-parity-map kernel. */
                        if (clabel->parity_map_modcount
                            != clabel->mod_counter) {
                                flags &= ~RF_PMLABEL_WASUSED;
                        }

                        if (flags & RF_PMLABEL_VALID) {
                                g_tickms = clabel->parity_map_tickms;
                                g_ntick = clabel->parity_map_ntick;
                                regions = clabel->parity_map_regions;
                                if (g_regions == 0)
                                        g_regions = regions;
                                else if (g_regions != regions) {
                                        pm_zap = 1; /* important! */
                                }

                                if (flags & RF_PMLABEL_DISABLE) {
                                        pm_use = 0;
                                }
                                if (!(flags & RF_PMLABEL_WASUSED)) {
                                        pm_zap = 1;
                                }
                        } else {
                                pm_zap = 1;
                        }
                }
        } else {
                pm_zap = 1;
        }

        /* Finally, create and attach the parity map. */
        if (pm_use) {
                params.cooldown = g_ntick;
                params.tickms = g_tickms;
                params.regions = g_regions;

                raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap),
                    KM_SLEEP);
                if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr,
                    &params)) {
                        /* It failed; do without. */
                        kmem_free(raidPtr->parity_map,
                            sizeof(struct rf_paritymap));
                        raidPtr->parity_map = NULL;
                        return;
                }

                /* Pick up the autoconfigured region count. */
                g_regions = raidPtr->parity_map->params.regions;

                if (pm_zap) {
                        good = raidPtr->parity_good && !force;

                        if (good)
                                rf_paritymap_forceclean(raidPtr->parity_map);
                        else
                                rf_paritymap_invalidate(raidPtr->parity_map);
                        /* This needs to be on disk before WASUSED is set. */
                        rf_paritymap_write(raidPtr->parity_map);
                }
        }

        /* Alter labels in-core to reflect the current view of things. */
        for (col = 0; col < raidPtr->numCol; col++) {
                clabel = raidget_component_label(raidPtr, col);

                if (pm_use)
                        flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
                else
                        flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE;

                clabel->parity_map_flags = flags;
                clabel->parity_map_tickms = g_tickms;
                clabel->parity_map_ntick = g_ntick;
                clabel->parity_map_regions = g_regions;
                raidflush_component_label(raidPtr, col);
        }
}

/*
 * For initializing the parity-map fields of a component label, both on
 * initial creation and on reconstruct/copyback/etc.
 */
void
rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel)
{
        if (pm != NULL) {
                clabel->parity_map_flags =
                    RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
                clabel->parity_map_tickms = pm->params.tickms;
                clabel->parity_map_ntick = pm->params.cooldown;
                /*
                 * XXXjld: If the number of regions is changed on disk, and
                 * then a new component is labeled before the next configure,
                 * then it will get the old value and they will conflict on
                 * the next boot (and the default will be used instead).
                 */
                clabel->parity_map_regions = pm->params.regions;
        } else {
                /*
                 * XXXjld: if the map is disabled, and all the components are
                 * replaced without an intervening unconfigure/reconfigure,
                 * then it will become enabled on the next unconfig/reconfig.
                 */
        }
}

/* Will the parity map be disabled next time? */
int
rf_paritymap_get_disable(RF_Raid_t *raidPtr)
{
        RF_ComponentLabel_t *clabel;
        RF_RowCol_t col;
        int dis;

        dis = 0;
        for (col = 0; col < raidPtr->numCol; col++) {
                clabel = raidget_component_label(raidPtr, col);
                if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
                        dis = 1;
        }

        return dis;
}

/* Set whether the parity map will be disabled next time. */
void
rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
{
        RF_ComponentLabel_t *clabel;
        RF_RowCol_t col;

        for (col = 0; col < raidPtr->numCol; col++) {
                clabel = raidget_component_label(raidPtr, col);
                if (dis)
                        clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
                else
                        clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
                raidflush_component_label(raidPtr, col);
        }
}