/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */
#define	LUFS_DEFAULT_NUM_ROLL_BUFS	16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS	4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS	64
/*
 * Macros
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)
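/*
 * In other words: logmap_need_roll() fires once the logmap holds more
 * map entries than the logmap_maxnme tunable below allows, and
 * ldl_empty() is true when the log head and tail coincide, i.e. there
 * is nothing left in the on-disk log to roll.
 */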
/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
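/*
 * These can typically be overridden at boot via /etc/system (module
 * name assumed here; the logging code is built into the ufs module),
 * for example:
 *
 *	set ufs:lufs_num_roll_bufs = 32
 */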
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;
/*
 * externs
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}
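/*
 * Note on the handshake above: code that needs the log rolled (the
 * signalling side lives in the logmap code) sets MTM_FORCE_ROLL,
 * signals mtm_to_roll_cv and then blocks on mtm_from_roll_cv; the
 * roll thread clears MTM_ROLLING for the duration of its sleep and
 * re-asserts it on wakeup, so the flag word always reflects whether
 * a roll pass is in progress.
 */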
/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers()
{
	/*
	 * sanity validate the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}
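/*
 * Example: with the defaults above, lufs_num_roll_bufs (16) already
 * lies within [lufs_min_roll_bufs, lufs_max_roll_bufs] = [4, 64], so
 * sixteen MAPBLOCKSIZE (8K) buffers are used, 128K in all; a tuned
 * value of, say, 128 would be clamped down to 64.
 */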
/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;
	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}
	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);
		/*
		 * Check for the case of a new delta to a set up buffer
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}
		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));
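	/*
	 * At this point nbuf counts the buffer headers staged above,
	 * each either backed by a cached roll buffer or with an
	 * asynchronous read of the master still in flight.
	 */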
	/*
	 * If there was nothing to roll cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}
		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}
/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t	*crb = rbp->rb_crb;
	buf_t	*bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;
	/* if snapshots are enabled, call it */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}
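/*
 * A cached roll buffer already holds the fully resolved contents for
 * its part of the master, so it can be written out directly in one
 * I/O; only the non cached case below has to consult a sector map
 * and possibly split the MAPBLOCK into several writes.
 */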
/*
 * Write out a set of non cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;
	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);
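	/*
	 * Worked example of the sector map scan below: each of the 16
	 * bits in secmap marks one DEV_BSIZE (512 byte) sector of this
	 * MAPBLOCK that carries deltas.  For secmap = 0x003c
	 * (...0011 1100), the first loop skips two clear bits (j == 2),
	 * the second counts four set bits (k == 4), and one 2K write is
	 * issued starting two sectors into the block.
	 */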
	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;
		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			k++;
			secmap >>= 1;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;
		/* if snapshots are enabled, call it */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}
/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;
	/*
	 * Order the buffers by blkno
	 */
	prev = rbs;
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}
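	/*
	 * The insertion above is an O(nbuf^2) sort into a singly linked
	 * list; with nbuf capped at lufs_max_roll_bufs (64) that is
	 * cheap, and it lets the writes below go out in ascending
	 * block order.
	 */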
	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}
	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}
		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}
	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");
	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */
	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);
	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();
	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);
	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags &
	    MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}
	/*
	 * MT_SCAN debug mode
	 *	don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);
	/*
	 * If we've finished a force roll cycle then wakeup any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}
	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;
rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
	/* default case is success */
	}
	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;
	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}
	/*
	 * free up log space; if possible
	 */
	logmap_sethead(logmap, ul);

	goto again;
}