/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/atomic.h>
extern uint_t bypass_snapshot_throttle_key;

extern struct kmem_cache *lufs_sv;
extern struct kmem_cache *lufs_bp;
static void
makebusy(ml_unit_t *ul, buf_t *bp)
{
        sema_p(&bp->b_sem);
        if ((bp->b_flags & B_ERROR) == 0)
                return;
        if (bp->b_flags & B_READ)
                ldl_seterror(ul, "Error reading ufs log");
        else
                ldl_seterror(ul, "Error writing ufs log");
}

static int
logdone(buf_t *bp)
{
        bp->b_flags |= B_DONE;

        if (bp->b_flags & B_WRITE)
                sema_v(&bp->b_sem);
        else
                /* wakeup the thread waiting on this buf */
                sema_v(&bp->b_io);
        return (0);
}
static int
ldl_strategy_done(buf_t *cb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *bp;

        ASSERT(SEMA_HELD(&cb->b_sem));
        ASSERT((cb->b_flags & B_DONE) == 0);

        /*
         * Compute address of the ``save'' struct
         */
        lbp = (lufs_buf_t *)cb;
        sv = (lufs_save_t *)lbp->lb_ptr;

        if (cb->b_flags & B_ERROR)
                sv->sv_error = 1;

        /*
         * If this is the last request, release the resources and
         * ``done'' the original buffer header.
         */
        if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
                kmem_cache_free(lufs_bp, lbp);
                return (1);
        }
        /* Propagate any errors back to the original buffer header */
        bp = sv->sv_bp;
        if (sv->sv_error)
                bp->b_flags |= B_ERROR;
        kmem_cache_free(lufs_bp, lbp);
        kmem_cache_free(lufs_sv, sv);

        biodone(bp);
        return (0);
}
/*
 * Map the log logical block number to a physical disk block number
 */
static int
map_frag(
        ml_unit_t       *ul,
        daddr_t         lblkno,
        size_t          bcount,
        daddr_t         *pblkno,
        size_t          *pbcount)
{
        ic_extent_t     *ext = ul->un_ebp->ic_extents;
        uint32_t        e = ul->un_ebp->ic_nextents;
        uint32_t        s = 0;
        uint32_t        i = e >> 1;
        uint32_t        bno_off;

        /* binary search the extent list for the extent containing lblkno */
again:
        if (ext[i].ic_lbno <= lblkno) {
                if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
                        /* found it; clamp the count to the extent's end */
                        bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
                        *pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
                        *pblkno = ext[i].ic_pbno + bno_off;
                        return (0);
                }
                s = i;
        } else
                e = i;
        i = s + ((e - s) >> 1);
        goto again;
}
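
/*
 * Illustrative sketch (not part of the original source): how a logical log
 * block is resolved against an extent table like the one map_frag() searches
 * above.  The simple_map_frag() name and the linear scan are hypothetical;
 * the real routine binary-searches ic_extents and also clamps the byte count
 * to the end of the extent.  Guarded by a hypothetical LUFS_LOG_EXAMPLES
 * define so it is never built into the module.
 */
#ifdef LUFS_LOG_EXAMPLES
static int
simple_map_frag(ic_extent_t *ext, uint32_t nextents, daddr_t lblkno,
    daddr_t *pblkno)
{
        uint32_t i;

        for (i = 0; i < nextents; i++) {
                /* does this extent's logical range cover lblkno? */
                if (ext[i].ic_lbno <= lblkno &&
                    (ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
                        /* same offset into the physical extent */
                        *pblkno = ext[i].ic_pbno + (lblkno - ext[i].ic_lbno);
                        return (0);
                }
        }
        return (ENOENT);
}
#endif /* LUFS_LOG_EXAMPLES */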
/*
 * The log is a set of extents (which typically will be only one, but
 * may be more if the disk was close to full when the log was created)
 * and hence the logical offsets into the log
 * have to be translated into their real device locations before
 * calling the device's strategy routine.  The translation may result
 * in several IO requests if this request spans extents.
 */
void
ldl_strategy(ml_unit_t *ul, buf_t *pb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *cb;
        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        daddr_t         lblkno, pblkno;
        size_t          nb_left, pbcount;
        off_t           offset;
        dev_t           dev = ul->un_dev;
        int             error;
        int             read = pb->b_flags & B_READ;

        /*
         * Allocate and initialise the save structure,
         */
        sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
        sv->sv_error = 0;
        sv->sv_bp = pb;
        nb_left = pb->b_bcount;
        sv->sv_nb_left = nb_left;

        lblkno = pb->b_blkno;
        offset = 0;

        do {
                error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);

                lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
                bioinit(&lbp->lb_buf);
                lbp->lb_ptr = sv;

                cb = bioclone(pb, offset, pbcount, dev,
                    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);

                offset += pbcount;
                lblkno += btodb(pbcount);
                nb_left -= pbcount;

                if (error) {
                        cb->b_flags |= B_ERROR;
                        cb->b_resid = cb->b_bcount;
                        biodone(cb);
                } else {
                        if (read) {
                                logstats.ls_ldlreads.value.ui64++;
                                ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
                                lwp_stat_update(LWP_STAT_INBLK, 1);
                        } else {
                                logstats.ls_ldlwrites.value.ui64++;
                                lwp_stat_update(LWP_STAT_OUBLK, 1);
                        }

                        /*
                         * write through the snapshot driver if necessary
                         * We do not want this write to be throttled because
                         * we are holding the un_log mutex here. If we
                         * are throttled in fssnap_translate, the fssnap_taskq
                         * thread which can wake us up can get blocked on
                         * the un_log mutex resulting in a deadlock.
                         */
                        if (ufsvfsp->vfs_snapshot) {
                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)1);
                                fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)0);
                        } else {
                                (void) bdev_strategy(cb);
                        }
                }
        } while (nb_left);
}
static void
writelog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * This is really a B_ASYNC write but we want Presto to
         * cache this write.  The iodone routine, logdone, processes
         * the buf correctly.
         */
        bp->b_flags = B_WRITE;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /*
         * return EIO for every IO if in hard error state
         */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                (void) logdone(bp);
                return;
        }

        ldl_strategy(ul, bp);
}
static void
readlog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        ASSERT(bp->b_bcount);

        bp->b_flags = B_READ;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /* all IO returns errors when in error state */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                (void) logdone(bp);
                (void) trans_wait(bp);
                return;
        }

        ldl_strategy(ul, bp);

        if (trans_wait(bp))
                ldl_seterror(ul, "Error reading ufs log");
}
/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static void
push_dirty_bp(ml_unit_t *ul, buf_t *bp)
{
        buf_t           *newbp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
        ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);

        /*
         * async write the buf
         */
        writelog(ul, bp);

        /*
         * no longer filling any buf
         */
        cb->cb_dirty = NULL;

        /*
         * no extra buffer space; all done
         */
        if (bp->b_bcount == bp->b_bufsize)
                return;

        /*
         * give extra buffer space to a new bp
         *      try to take buf off of free list
         */
        if ((newbp = cb->cb_free) != NULL) {
                cb->cb_free = newbp->b_forw;
        } else {
                newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
        }
        newbp->b_flags = 0;
        newbp->b_bcount = 0;
        newbp->b_file = NULL;
        newbp->b_offset = -1;
        newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
        newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
        bp->b_bufsize = bp->b_bcount;

        /*
         * lock out readers and put new buf at LRU position
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        newbp->b_forw = bp->b_forw;
        newbp->b_back = bp;
        bp->b_forw->b_back = newbp;
        bp->b_forw = newbp;
        rw_exit(&cb->cb_rwlock);
}
static void
inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
{
        buf_t   *bp;
        off_t   elof    = lof + nb;
        off_t   buflof;
        off_t   bufelof;

        /*
         * discard all bufs that overlap the range (lof, lof + nb)
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp;
        do {
                if (bp == cb->cb_dirty || bp->b_bcount == 0) {
                        bp = bp->b_forw;
                        continue;
                }
                buflof = dbtob(bp->b_blkno);
                bufelof = buflof + bp->b_bcount;
                if ((buflof < lof && bufelof <= lof) ||
                    (buflof >= elof && bufelof > elof)) {
                        bp = bp->b_forw;
                        continue;
                }
                makebusy(ul, bp);
                bp->b_flags = 0;
                bp->b_bcount = 0;
                sema_v(&bp->b_sem);
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}
/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static buf_t *
get_write_bp(ml_unit_t *ul)
{
        cirbuf_t        *cb = &ul->un_wrbuf;
        buf_t           *bp;

        /*
         * cb_dirty is the buffer we are currently filling; if any
         */
        if ((bp = cb->cb_dirty) != NULL) {
                makebusy(ul, bp);
                return (bp);
        }
        /*
         * discard any bp that overlaps the current tail since we are
         * about to overwrite it.
         */
        inval_range(ul, cb, ul->un_tail_lof, 1);

        /*
         * steal LRU buf
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);

        cb->cb_dirty = bp;
        cb->cb_bp = bp;

        bp->b_bcount = 0;
        bp->b_blkno = btodb(ul->un_tail_lof);
        ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
        rw_exit(&cb->cb_rwlock);

        /*
         * NOTE:
         *      1. un_tail_lof never addresses >= un_eol_lof
         *      2. b_blkno + btodb(b_bufsize) may > un_eol_lof
         *         this case is handled in storebuf
         */
        return (bp);
}
void
alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
{
        int     i;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * preallocate 3 bp's and put them on the free list.
         */
        for (i = 0; i < 3; ++i) {
                bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_offset = -1;
                bp->b_forw = cb->cb_free;
                cb->cb_free = bp;
        }

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * first bp claims entire write buffer
         */
        bp = cb->cb_free;
        cb->cb_free = bp->b_forw;

        bp->b_forw = bp;
        bp->b_back = bp;
        cb->cb_bp = bp;
        bp->b_un.b_addr = cb->cb_va;
        bp->b_bufsize = cb->cb_nb;

        rw_exit(&cb->cb_rwlock);
}
void
alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
{
        caddr_t va;
        size_t  nb;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * preallocate N bufs that are hard-sized to blksize
         *      in other words, the read buffer pool is a linked list
         *      of statically sized bufs.
         */
        va = cb->cb_va;
        while ((nb = bufsize) != 0) {
                if (nb > blksize)
                        nb = blksize;
                bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_un.b_addr = va;
                bp->b_bufsize = nb;
                if (cb->cb_bp) {
                        bp->b_forw = cb->cb_bp->b_forw;
                        bp->b_back = cb->cb_bp;
                        cb->cb_bp->b_forw->b_back = bp;
                        cb->cb_bp->b_forw = bp;
                } else
                        bp->b_forw = bp->b_back = bp;
                cb->cb_bp = bp;
                bufsize -= nb;
                va += nb;
        }

        rw_exit(&cb->cb_rwlock);
}
void
free_cirbuf(cirbuf_t *cb)
{
        buf_t   *bp;

        if (cb->cb_nb == 0)
                return;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        ASSERT(cb->cb_dirty == NULL);

        /*
         * free the active bufs
         */
        while ((bp = cb->cb_bp) != NULL) {
                if (bp == bp->b_forw)
                        cb->cb_bp = NULL;
                else
                        cb->cb_bp = bp->b_forw;
                bp->b_back->b_forw = bp->b_forw;
                bp->b_forw->b_back = bp->b_back;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }

        /*
         * free the free bufs
         */
        while ((bp = cb->cb_free) != NULL) {
                cb->cb_free = bp->b_forw;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }
        kmem_free(cb->cb_va, cb->cb_nb);
        cb->cb_va = NULL;
        cb->cb_nb = 0;
        rw_exit(&cb->cb_rwlock);
        rw_destroy(&cb->cb_rwlock);
}
static int
within_range(off_t lof, daddr_t blkno, ulong_t bcount)
{
        off_t   blof    = dbtob(blkno);

        return ((lof >= blof) && (lof < (blof + bcount)));
}
static buf_t *
find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp;

        /*
         * find a buf that contains the offset lof
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bp = cb->cb_bp;
        do {
                if (bp->b_bcount &&
                    within_range(lof, bp->b_blkno, bp->b_bcount)) {
                        makebusy(ul, bp);
                        rw_exit(&cb->cb_rwlock);
                        return (bp);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);

        return (NULL);
}
static off_t
find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp, *bpend;
        off_t   rlof;

        /*
         * we mustn't:
         *      o read past the tail
         *      o read data that may be being written.
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bpend = bp = cb->cb_bp->b_forw;
        rlof = ul->un_tail_lof;
        do {
                if (bp->b_bcount) {
                        rlof = dbtob(bp->b_blkno);
                        break;
                }
                bp = bp->b_forw;
        } while (bp != bpend);
        rw_exit(&cb->cb_rwlock);

        if (lof <= rlof)
                /* lof is prior to the range represented by the write buf */
                return (rlof);
        else
                /* lof follows the range represented by the write buf */
                return ((off_t)ul->un_eol_lof);
}
static buf_t *
get_read_bp(ml_unit_t *ul, off_t lof)
{
        cirbuf_t        *cb;
        buf_t           *bp;
        off_t           rlof;

        /*
         * retrieve as much data as possible from the incore buffers
         */
        if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }
        if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }

        /*
         * steal the LRU buf
         */
        cb = &ul->un_rdbuf;
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);
        bp->b_flags = 0;
        bp->b_bcount = 0;
        cb->cb_bp = bp;
        rw_exit(&cb->cb_rwlock);

        /*
         * don't read past the tail or the end-of-log
         */
        bp->b_blkno = btodb(lof);
        lof = dbtob(bp->b_blkno);
        rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
        bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
        readlog(ul, bp);
        return (bp);
}
/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static int
extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
{
        buf_t   *bpforw = bp->b_forw;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);

        /*
         * there is no `next' bp; do nothing
         */
        if (bpforw == bp)
                return (0);

        /*
         * buffer space is not adjacent; do nothing
         */
        if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
                return (0);

        /*
         * locking protocol requires giving up any bp locks before
         * acquiring cb_rwlock.  This is okay because we hold
         * un_log_mutex.
         */
        sema_v(&bp->b_sem);

        /*
         * lock out ldl_read
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * wait for current IO to finish w/next bp; if necessary
         */
        makebusy(ul, bpforw);

        /*
         * free the next bp and steal its space
         */
        bp->b_forw = bpforw->b_forw;
        bpforw->b_forw->b_back = bp;
        bp->b_bufsize += bpforw->b_bufsize;
        sema_v(&bpforw->b_sem);
        bpforw->b_forw = cb->cb_free;
        cb->cb_free = bpforw;

        makebusy(ul, bp);
        rw_exit(&cb->cb_rwlock);

        return (1);
}
static size_t
storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
{
        size_t          copy_nb;
        size_t          nb_in_sec;
        sect_trailer_t  *st;
        size_t          nb_left = nb;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        while (nb_left) {
                nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
                copy_nb = MIN(nb_left, nb_in_sec);

                bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
                bp->b_bcount += copy_nb;
                va += copy_nb;
                nb_left -= copy_nb;
                ul->un_tail_lof += copy_nb;

                if ((nb_in_sec -= copy_nb) == 0) {
                        st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);

                        st->st_tid = ul->un_logmap->mtm_tid;
                        st->st_ident = ul->un_tail_ident++;
                        bp->b_bcount += sizeof (sect_trailer_t);
                        ul->un_tail_lof += sizeof (sect_trailer_t);
                        /*
                         * log wrapped; async write this bp
                         */
                        if (ul->un_tail_lof == ul->un_eol_lof) {
                                ul->un_tail_lof = ul->un_bol_lof;
                                push_dirty_bp(ul, bp);
                                return (nb - nb_left);
                        }
                        /*
                         * out of bp space; get more or async write buf
                         */
                        if (bp->b_bcount == bp->b_bufsize) {
                                if (!extend_write_bp(ul, cb, bp)) {
                                        push_dirty_bp(ul, bp);
                                        return (nb - nb_left);
                                }
                        }
                }
        }
        return (nb);
}
static void
fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
{
        offset_t        src_mof = me->me_mof;
        size_t          src_nb  = me->me_nb;

        /*
         * adjust dst_va/dst_nb so they describe only the part of the
         * caller's buffer that this all-zero delta overlaps
         */
        if (src_mof > dst_mof) {
                ASSERT(src_mof < (dst_mof + dst_nb));
                dst_va += (src_mof - dst_mof);
                dst_nb -= (src_mof - dst_mof);
        } else {
                ASSERT(dst_mof < (src_mof + src_nb));
                src_nb -= (dst_mof - src_mof);
        }

        src_nb = MIN(src_nb, dst_nb);
        bzero(dst_va, src_nb);
}
/*
 * dst_va == NULL means don't copy anything
 */
static ulong_t
fetchbuf(
        ml_unit_t *ul,
        buf_t *bp,
        caddr_t dst_va,
        size_t dst_nb,
        off_t *dst_lofp)
{
        caddr_t copy_va;
        size_t  copy_nb;
        size_t  nb_sec;
        off_t   dst_lof         = *dst_lofp;
        ulong_t sav_dst_nb      = dst_nb;
        ulong_t src_nb          = bp->b_bcount;
        off_t   src_lof         = dbtob(bp->b_blkno);
        off_t   src_elof        = src_lof + src_nb;
        caddr_t src_va          = bp->b_un.b_addr;

        /*
         * copy from bp to dst_va
         */
        while (dst_nb) {
                /*
                 * compute address within bp
                 */
                copy_va = src_va + (dst_lof - src_lof);

                /*
                 * adjust copy size to amount of data in bp
                 */
                copy_nb = MIN(dst_nb, src_elof - dst_lof);

                /*
                 * adjust copy size to amount of data in sector
                 */
                nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
                copy_nb = MIN(copy_nb, nb_sec);

                /*
                 * dst_va == NULL means don't do copy (see logseek())
                 */
                if (dst_va) {
                        bcopy(copy_va, dst_va, copy_nb);
                        dst_va += copy_nb;
                }
                dst_lof += copy_nb;
                dst_nb -= copy_nb;
                nb_sec -= copy_nb;

                /*
                 * advance over sector trailer
                 */
                if (nb_sec == 0)
                        dst_lof += sizeof (sect_trailer_t);

                /*
                 * exhausted this bp; return current lof for next read
                 */
                if (dst_lof == src_elof) {
                        sema_v(&bp->b_sem);
                        if (dst_lof == ul->un_eol_lof)
                                dst_lof = ul->un_bol_lof;
                        *dst_lofp = dst_lof;
                        return (sav_dst_nb - dst_nb);
                }
        }

        /*
         * copy complete - return current lof
         */
        sema_v(&bp->b_sem);
        *dst_lofp = dst_lof;
        return (sav_dst_nb);
}
void
ldl_round_commit(ml_unit_t *ul)
{
        int             wrapped = 0;
        buf_t           *bp;
        sect_trailer_t  *st;
        size_t          bcount;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);

        /*
         * round up to sector boundary and set new tail
         *      don't readjust st_ident if buf is already rounded
         */
        bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
        if (bcount == bp->b_bcount) {
                sema_v(&bp->b_sem);
                return;
        }
        bp->b_bcount = bcount;
        ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;

        if (ul->un_tail_lof == ul->un_eol_lof) {
                ul->un_tail_lof = ul->un_bol_lof;
                ++wrapped;
        }
        ASSERT(ul->un_tail_lof != ul->un_head_lof);

        /*
         * fix up the sector trailer
         */
        st = (sect_trailer_t *)
            ((bp->b_un.b_addr + bcount) - sizeof (*st));
        st->st_tid = ul->un_logmap->mtm_tid;
        st->st_ident = ul->un_tail_ident++;

        /*
         * if tail wrapped or we have exhausted this buffer
         *      async write the buffer
         */
        if (wrapped || bcount == bp->b_bufsize)
                push_dirty_bp(ul, bp);
        else
                sema_v(&bp->b_sem);
}
void
ldl_push_commit(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);
        push_dirty_bp(ul, bp);
}
int
ldl_need_commit(ml_unit_t *ul)
{
        return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv >> 2)));
}
int
ldl_has_space(ml_unit_t *ul, mapentry_t *me)
{
        off_t   nfb;
        off_t   nb;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /*
         * Add up the size used by the deltas
         * round nb up to a sector length plus an extra sector
         *      w/o the extra sector we couldn't distinguish
         *      a full log (head == tail) from an empty log (head == tail)
         */
        for (nb = DEV_BSIZE; me; me = me->me_hash) {
                nb += sizeof (struct delta);
                if (me->me_dt != DT_CANCEL)
                        nb += me->me_nb;
        }
        nb = P2ROUNDUP(nb, DEV_BSIZE);

        if (ul->un_head_lof <= ul->un_tail_lof)
                nfb = (ul->un_head_lof - ul->un_bol_lof) +
                    (ul->un_eol_lof - ul->un_tail_lof);
        else
                nfb = ul->un_head_lof - ul->un_tail_lof;

        return (nb < nfb);
}
void
ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
{
        buf_t           *bp;
        caddr_t         va;
        size_t          nb;
        size_t          actual;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /* Write the delta */

        nb = sizeof (struct delta);
        va = (caddr_t)&me->me_delta;
        bp = get_write_bp(ul);

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }

        /* If a commit, cancel, or 0's; we're almost done */
        switch (me->me_dt) {
        case DT_COMMIT:
        case DT_CANCEL:
        case DT_ABZERO:
                /* roll needs to know where the next delta will go */
                me->me_lof = ul->un_tail_lof;
                return;
        default:
                break;
        }

        /* Now write the data */

        ASSERT(me->me_nb != 0);

        nb = me->me_nb;
        va = (me->me_mof - bufmof) + bufp;
        bp = get_write_bp(ul);

        /* Save where we will put the data */
        me->me_lof = ul->un_tail_lof;

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }
}
void
ldl_waito(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        /*
         * wait on the bufs that have outstanding writes
         */
        bp = cb->cb_bp;
        do {
                if ((bp->b_flags & B_DONE) == 0) {
                        makebusy(ul, bp);
                        sema_v(&bp->b_sem);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}
/*
 * seek nb bytes from location lof
 */
static int
logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
{
        buf_t   *bp;
        ulong_t actual;

        while (nb) {
                bp = get_read_bp(ul, lof);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                actual = fetchbuf(ul, bp, NULL, nb, &lof);
                nb -= actual;
        }
        *lofp = lof;
        return (0);
}
int
ldl_read(
        ml_unit_t *ul,          /* Log unit */
        caddr_t va,             /* address of buffer to read into */
        offset_t mof,           /* mof of buffer */
        off_t nb,               /* length of buffer */
        mapentry_t *me)         /* Map entry list */
{
        buf_t   *bp;
        crb_t   *crb;
        caddr_t rva;            /* address to read into */
        size_t  rnb;            /* # of bytes to read */
        off_t   lof;            /* log device offset to read from */
        off_t   skip;
        ulong_t actual;
        int     error;
        caddr_t eva     = va + nb;      /* end of buffer */

        for (; me; me = me->me_agenext) {
                ASSERT(me->me_dt != DT_CANCEL);

                /*
                 * check for a cached roll buffer
                 */
                crb = me->me_crb;
                if (crb) {
                        if (mof > crb->c_mof) {
                                /*
                                 * This mapentry overlaps with the beginning of
                                 * the supplied buffer
                                 */
                                skip = mof - crb->c_mof;
                                bcopy(crb->c_buf + skip, va,
                                    MIN(nb, crb->c_nb - skip));
                        } else {
                                /*
                                 * This mapentry starts at or after
                                 * the supplied buffer.
                                 */
                                skip = crb->c_mof - mof;
                                bcopy(crb->c_buf, va + skip,
                                    MIN(crb->c_nb, nb - skip));
                        }
                        logstats.ls_lreadsinmem.value.ui64++;
                        continue;
                }

                /*
                 * check for a delta full of zeroes - there's no log data
                 */
                if (me->me_dt == DT_ABZERO) {
                        fetchzeroes(va, mof, nb, me);
                        continue;
                }

                if (mof > me->me_mof) {
                        rnb = (size_t)(mof - me->me_mof);
                        error = logseek(ul, me->me_lof, rnb, &lof);
                        if (error)
                                return (EIO);
                        rva = va;
                        rnb = me->me_nb - rnb;
                        rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
                } else {
                        lof = me->me_lof;
                        rva = (me->me_mof - mof) + va;
                        rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
                }

                while (rnb) {
                        bp = get_read_bp(ul, lof);
                        if (bp->b_flags & B_ERROR) {
                                sema_v(&bp->b_sem);
                                return (EIO);
                        }
                        ASSERT(((me->me_flags & ME_ROLL) == 0) ||
                            (bp != ul->un_wrbuf.cb_dirty));
                        actual = fetchbuf(ul, bp, rva, rnb, &lof);
                        rva += actual;
                        rnb -= actual;
                }
        }
        return (0);
}
void
ldl_savestate(ml_unit_t *ul)
{
        int             error;
        buf_t           *bp     = ul->un_bp;
        ml_odunit_t     *ud     = (void *)bp->b_un.b_addr;
        ml_odunit_t     *ud2    = (void *)(bp->b_un.b_addr + DEV_BSIZE);

        /*
         * Scan test is running; don't update intermediate state
         */
        if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
                return;

        mutex_enter(&ul->un_state_mutex);
        bcopy(&ul->un_ondisk, ud, sizeof (*ud));
        ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
        bcopy(ud, ud2, sizeof (*ud));

        /* If a snapshot is enabled write through the snapshot driver. */
        if (ul->un_ufsvfs->vfs_snapshot)
                UFS_BWRITE2(ul->un_ufsvfs, bp);
        else
                UFS_BWRITE2(NULL, bp);
        logstats.ls_ldlwrites.value.ui64++;
        error = bp->b_flags & B_ERROR;
        mutex_exit(&ul->un_state_mutex);
        if (error)
                ldl_seterror(ul, "Error writing ufs log state");
}
/*
 * The head will be set to (new_lof - header) since ldl_sethead is
 * called with the new_lof of the data portion of a delta.
 */
void
ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
{
        off_t           nb;
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (data_lof == -1) {
                /* log is empty */
                new_ident = lufs_hd_genid(ul);
                new_lof = ul->un_tail_lof;

        } else {
                /* compute header's lof */
                new_ident = ul->un_head_ident;
                new_lof = data_lof - sizeof (struct delta);

                /* whoops, header spans sectors; subtract out sector trailer */
                if (btodb(new_lof) != btodb(data_lof))
                        new_lof -= sizeof (sect_trailer_t);

                /* whoops, header wrapped the log; go to last sector */
                if (new_lof < ul->un_bol_lof) {
                        /* byte offset within the sector */
                        new_lof -= dbtob(btodb(new_lof));
                        /* add to last sector's lof */
                        new_lof += (ul->un_eol_lof - DEV_BSIZE);
                }
                ul->un_head_tid = tid;
        }

        /*
         * check for nop
         */
        if (new_lof == ul->un_head_lof)
                return;

        /*
         * invalidate the affected bufs and calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                nb = new_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);
        } else {
                nb = ul->un_eol_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);

                nb = new_lof - ul->un_bol_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }
        /*
         * don't update the head if there has been an error
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        /* Fix up the head and ident */
        ASSERT(new_lof >= ul->un_bol_lof);
        ul->un_head_lof = new_lof;
        ul->un_head_ident = new_ident;
        if (data_lof == -1) {
                ul->un_tail_ident = ul->un_head_ident;
        }

        /* Commit to the database */
        ldl_savestate(ul);

        ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
            ldl_sethead_debug(ul));
}
/*
 * The tail will be set to the sector following lof+nb
 *      lof + nb == size of the last delta + commit record
 *      this function is called once after the log scan has completed.
 */
void
ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
{
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (lof == -1) {
                ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
                ul->un_head_lof = ul->un_tail_lof;
                ul->un_head_ident = lufs_hd_genid(ul);
                ul->un_tail_ident = ul->un_head_ident;

                /* Commit to the database */
                ldl_savestate(ul);

                return;
        }

        /*
         * new_lof is the offset of the sector following the last commit
         */
        (void) logseek(ul, lof, nb, &new_lof);
        ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));

        /*
         * calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
        } else {
                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }

        /* Fix up the tail and ident */
        ul->un_tail_lof = new_lof;
        ul->un_tail_ident = new_ident;

        /* Commit to the database */
        ldl_savestate(ul);
}
static int
ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
{
        uint32_t        ident;
        size_t          nblk, i;
        sect_trailer_t  *st;

        /*
         * compute ident for first sector in the buffer
         */
        ident = ul->un_head_ident;
        if (bp->b_blkno >= btodb(ul->un_head_lof)) {
                ident += (bp->b_blkno - btodb(ul->un_head_lof));
        } else {
                ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
                ident += (bp->b_blkno - btodb(ul->un_bol_lof));
        }
        /*
         * truncate the buffer down to the last valid sector
         */
        nblk = btodb(bp->b_bcount);
        bp->b_bcount = 0;
        st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
        for (i = 0; i < nblk; ++i) {
                if (st->st_ident != ident)
                        break;

                /* remember last valid tid for ldl_logscan_error() */
                ul->un_tid = st->st_tid;

                st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
                ++ident;
                bp->b_bcount += DEV_BSIZE;
        }
        /*
         * make sure that lof is still within range
         */
        return (within_range(lof, bp->b_blkno, bp->b_bcount));
}
ulong_t
ldl_logscan_nbcommit(off_t lof)
{
        /*
         * lof is the offset following the commit header.  However,
         * if the commit header fell on the end-of-sector, then lof
         * has already been advanced to the beginning of the next
         * sector.  So do nothing.  Otherwise, return the remaining
         * bytes in the sector.
         */
        if ((lof & (DEV_BSIZE - 1)) == 0)
                return (0);
        return (NB_LEFT_IN_SECTOR(lof));
}
int
ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
{
        buf_t   *bp;
        ulong_t actual;

        ASSERT(ul->un_head_lof != ul->un_tail_lof);

        /*
         * Check the log data doesn't go out of bounds
         */
        if (ul->un_head_lof < ul->un_tail_lof) {
                if (!WITHIN(*lofp, nb, ul->un_head_lof,
                    (ul->un_tail_lof - ul->un_head_lof))) {
                        return (EIO);
                }
        } else {
                if (OVERLAP(*lofp, nb, ul->un_tail_lof,
                    (ul->un_head_lof - ul->un_tail_lof))) {
                        return (EIO);
                }
        }

        while (nb) {
                bp = get_read_bp(ul, *lofp);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * out-of-seq idents means partial transaction
                 *      panic, non-corrupting powerfail, ...
                 */
                if (!ldl_logscan_ident(ul, bp, *lofp)) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * copy the header into the caller's buf
                 */
                actual = fetchbuf(ul, bp, va, nb, lofp);
                if (va)
                        va += actual;
                nb -= actual;
        }
        return (0);
}
void
ldl_logscan_begin(ml_unit_t *ul)
{
        size_t  bufsize;

        ASSERT(ul->un_wrbuf.cb_dirty == NULL);

        /*
         * logscan has begun
         */
        ul->un_flags |= LDL_SCAN;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * set the tail to reflect a full log
         */
        ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;

        if (ul->un_tail_lof < ul->un_bol_lof)
                ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
        if (ul->un_tail_lof >= ul->un_eol_lof)
                ul->un_tail_lof = ul->un_bol_lof;

        /*
         * un_tid is used during error processing; it is initialized to
         * the tid of the delta at un_head_lof;
         */
        ul->un_tid = ul->un_head_tid;
}
void
ldl_logscan_end(ml_unit_t *ul)
{
        size_t  bufsize;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * done with logscan
         */
        ul->un_flags &= ~LDL_SCAN;
}
int
ldl_need_roll(ml_unit_t *ul)
{
        off_t   busybytes;
        off_t   head;
        off_t   tail;
        off_t   bol;
        off_t   eol;
        off_t   nb;

        /*
         * snapshot the log state
         */
        head = ul->un_head_lof;
        tail = ul->un_tail_lof;
        bol = ul->un_bol_lof;
        eol = ul->un_eol_lof;
        nb = ul->un_logsize;

        /*
         * compute number of busy (inuse) bytes
         */
        if (head <= tail)
                busybytes = tail - head;
        else
                busybytes = (eol - head) + (tail - bol);

        /*
         * return TRUE if > 75% full
         */
        return (busybytes > (nb - (nb >> 2)));
}
void
ldl_seterror(ml_unit_t *ul, char *why)
{
        /*
         * already in error state; do nothing
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        ul->un_flags |= LDL_ERROR;      /* incore */
        ul->un_badlog = 1;              /* ondisk (cleared by fsck) */

        /*
         * Commit to state sectors
         */
        uniqtime(&ul->un_timestamp);
        ldl_savestate(ul);

        cmn_err(CE_WARN, "%s", why);
        cmn_err(CE_WARN, "ufs log for %s changed state to Error",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);
        cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);

        /*
         * If we aren't in the middle of scan (aka snarf); tell ufs
         * to hard lock itself.
         */
        if ((ul->un_flags & LDL_SCAN) == 0)
                ufs_trans_onerror();
}
size_t
ldl_bufsize(ml_unit_t *ul)
{
        size_t          bufsize;
        extern uint32_t ldl_minbufsize;

        /*
         * initial guess is the maxtransfer value for this log device
         *      increase if too small
         *      decrease if too large
         */
        bufsize = dbtob(btod(ul->un_maxtransfer));
        if (bufsize < ldl_minbufsize)
                bufsize = ldl_minbufsize;
        if (bufsize > maxphys)
                bufsize = maxphys;
        if (bufsize > ul->un_maxtransfer)
                bufsize = ul->un_maxtransfer;
        return (bufsize);
}