dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / fs / ufs / lufs_log.c
blob8a70d0a646c6e9ed3881651fa696a9fae8b757af
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/vnode.h>
29 #include <sys/errno.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/kmem.h>
33 #include <sys/conf.h>
34 #include <sys/proc.h>
35 #include <sys/cmn_err.h>
36 #include <sys/fssnap_if.h>
37 #include <sys/fs/ufs_inode.h>
38 #include <sys/fs/ufs_filio.h>
39 #include <sys/fs/ufs_log.h>
40 #include <sys/fs/ufs_bio.h>
41 #include <sys/atomic.h>
43 extern int maxphys;
44 extern uint_t bypass_snapshot_throttle_key;
46 extern struct kmem_cache *lufs_sv;
47 extern struct kmem_cache *lufs_bp;
49 static void
50 makebusy(ml_unit_t *ul, buf_t *bp)
52 sema_p(&bp->b_sem);
53 if ((bp->b_flags & B_ERROR) == 0)
54 return;
55 if (bp->b_flags & B_READ)
56 ldl_seterror(ul, "Error reading ufs log");
57 else
58 ldl_seterror(ul, "Error writing ufs log");
61 static int
62 logdone(buf_t *bp)
64 bp->b_flags |= B_DONE;
66 if (bp->b_flags & B_WRITE)
67 sema_v(&bp->b_sem);
68 else
69 /* wakeup the thread waiting on this buf */
70 sema_v(&bp->b_io);
71 return (0);
74 static int
75 ldl_strategy_done(buf_t *cb)
77 lufs_save_t *sv;
78 lufs_buf_t *lbp;
79 buf_t *bp;
81 ASSERT(SEMA_HELD(&cb->b_sem));
82 ASSERT((cb->b_flags & B_DONE) == 0);
85 * Compute address of the ``save'' struct
87 lbp = (lufs_buf_t *)cb;
88 sv = (lufs_save_t *)lbp->lb_ptr;
90 if (cb->b_flags & B_ERROR)
91 sv->sv_error = 1;
94 * If this is the last request, release the resources and
95 * ``done'' the original buffer header.
97 if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
98 kmem_cache_free(lufs_bp, lbp);
99 return (1);
101 /* Propagate any errors back to the original buffer header */
102 bp = sv->sv_bp;
103 if (sv->sv_error)
104 bp->b_flags |= B_ERROR;
105 kmem_cache_free(lufs_bp, lbp);
106 kmem_cache_free(lufs_sv, sv);
108 biodone(bp);
109 return (0);
113 * Map the log logical block number to a physical disk block number
115 static int
116 map_frag(
117 ml_unit_t *ul,
118 daddr_t lblkno,
119 size_t bcount,
120 daddr_t *pblkno,
121 size_t *pbcount)
123 ic_extent_t *ext = ul->un_ebp->ic_extents;
124 uint32_t e = ul->un_ebp->ic_nextents;
125 uint32_t s = 0;
126 uint32_t i = e >> 1;
127 uint32_t lasti = i;
128 uint32_t bno_off;
130 again:
131 if (ext[i].ic_lbno <= lblkno) {
132 if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
133 /* FOUND IT */
134 bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
135 *pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
136 *pblkno = ext[i].ic_pbno + bno_off;
137 return (0);
138 } else
139 s = i;
140 } else
141 e = i;
142 i = s + ((e - s) >> 1);
144 if (i == lasti) {
145 *pbcount = bcount;
146 return (ENOENT);
148 lasti = i;
150 goto again;
154 * The log is a set of extents (which typically will be only one, but
155 * may be more if the disk was close to full when the log was created)
156 * and hence the logical offsets into the log
157 * have to be translated into their real device locations before
158 * calling the device's strategy routine. The translation may result
159 * in several IO requests if this request spans extents.
161 void
162 ldl_strategy(ml_unit_t *ul, buf_t *pb)
164 lufs_save_t *sv;
165 lufs_buf_t *lbp;
166 buf_t *cb;
167 ufsvfs_t *ufsvfsp = ul->un_ufsvfs;
168 daddr_t lblkno, pblkno;
169 size_t nb_left, pbcount;
170 off_t offset;
171 dev_t dev = ul->un_dev;
172 int error;
173 int read = pb->b_flags & B_READ;
176 * Allocate and initialise the save stucture,
178 sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
179 sv->sv_error = 0;
180 sv->sv_bp = pb;
181 nb_left = pb->b_bcount;
182 sv->sv_nb_left = nb_left;
184 lblkno = pb->b_blkno;
185 offset = 0;
187 do {
188 error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);
190 lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
191 bioinit(&lbp->lb_buf);
192 lbp->lb_ptr = sv;
194 cb = bioclone(pb, offset, pbcount, dev,
195 pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);
197 offset += pbcount;
198 lblkno += btodb(pbcount);
199 nb_left -= pbcount;
201 if (error) {
202 cb->b_flags |= B_ERROR;
203 cb->b_resid = cb->b_bcount;
204 biodone(cb);
205 } else {
206 if (read) {
207 logstats.ls_ldlreads.value.ui64++;
208 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
209 lwp_stat_update(LWP_STAT_INBLK, 1);
210 } else {
211 logstats.ls_ldlwrites.value.ui64++;
212 lwp_stat_update(LWP_STAT_OUBLK, 1);
216 * write through the snapshot driver if necessary
217 * We do not want this write to be throttled because
218 * we are holding the un_log mutex here. If we
219 * are throttled in fssnap_translate, the fssnap_taskq
220 * thread which can wake us up can get blocked on
221 * the un_log mutex resulting in a deadlock.
223 if (ufsvfsp->vfs_snapshot) {
224 (void) tsd_set(bypass_snapshot_throttle_key,
225 (void *)1);
226 fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
228 (void) tsd_set(bypass_snapshot_throttle_key,
229 NULL);
230 } else {
231 (void) bdev_strategy(cb);
235 } while (nb_left);
238 static void
239 writelog(ml_unit_t *ul, buf_t *bp)
241 ASSERT(SEMA_HELD(&bp->b_sem));
244 * This is really an B_ASYNC write but we want Presto to
245 * cache this write. The iodone routine, logdone, processes
246 * the buf correctly.
248 bp->b_flags = B_WRITE;
249 bp->b_edev = ul->un_dev;
250 bp->b_iodone = logdone;
253 * return EIO for every IO if in hard error state
255 if (ul->un_flags & LDL_ERROR) {
256 bp->b_flags |= B_ERROR;
257 bp->b_error = EIO;
258 biodone(bp);
259 return;
262 ldl_strategy(ul, bp);
265 static void
266 readlog(ml_unit_t *ul, buf_t *bp)
268 ASSERT(SEMA_HELD(&bp->b_sem));
269 ASSERT(bp->b_bcount);
271 bp->b_flags = B_READ;
272 bp->b_edev = ul->un_dev;
273 bp->b_iodone = logdone;
275 /* all IO returns errors when in error state */
276 if (ul->un_flags & LDL_ERROR) {
277 bp->b_flags |= B_ERROR;
278 bp->b_error = EIO;
279 biodone(bp);
280 (void) trans_wait(bp);
281 return;
284 ldl_strategy(ul, bp);
286 if (trans_wait(bp))
287 ldl_seterror(ul, "Error reading ufs log");
291 * NOTE: writers are single threaded thru the log layer.
292 * This means we can safely reference and change the cb and bp fields
293 * that ldl_read does not reference w/o holding the cb_rwlock or
294 * the bp makebusy lock.
296 static void
297 push_dirty_bp(ml_unit_t *ul, buf_t *bp)
299 buf_t *newbp;
300 cirbuf_t *cb = &ul->un_wrbuf;
302 ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
303 ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);
306 * async write the buf
308 writelog(ul, bp);
311 * no longer filling any buf
313 cb->cb_dirty = NULL;
316 * no extra buffer space; all done
318 if (bp->b_bcount == bp->b_bufsize)
319 return;
322 * give extra buffer space to a new bp
323 * try to take buf off of free list
325 if ((newbp = cb->cb_free) != NULL) {
326 cb->cb_free = newbp->b_forw;
327 } else {
328 newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
329 sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
330 sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
332 newbp->b_flags = 0;
333 newbp->b_bcount = 0;
334 newbp->b_file = NULL;
335 newbp->b_offset = -1;
336 newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
337 newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
338 bp->b_bufsize = bp->b_bcount;
341 * lock out readers and put new buf at LRU position
343 rw_enter(&cb->cb_rwlock, RW_WRITER);
344 newbp->b_forw = bp->b_forw;
345 newbp->b_back = bp;
346 bp->b_forw->b_back = newbp;
347 bp->b_forw = newbp;
348 rw_exit(&cb->cb_rwlock);
351 static void
352 inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
354 buf_t *bp;
355 off_t elof = lof + nb;
356 off_t buflof;
357 off_t bufelof;
360 * discard all bufs that overlap the range (lof, lof + nb)
362 rw_enter(&cb->cb_rwlock, RW_WRITER);
363 bp = cb->cb_bp;
364 do {
365 if (bp == cb->cb_dirty || bp->b_bcount == 0) {
366 bp = bp->b_forw;
367 continue;
369 buflof = dbtob(bp->b_blkno);
370 bufelof = buflof + bp->b_bcount;
371 if ((buflof < lof && bufelof <= lof) ||
372 (buflof >= elof && bufelof > elof)) {
373 bp = bp->b_forw;
374 continue;
376 makebusy(ul, bp);
377 bp->b_flags = 0;
378 bp->b_bcount = 0;
379 sema_v(&bp->b_sem);
380 bp = bp->b_forw;
381 } while (bp != cb->cb_bp);
382 rw_exit(&cb->cb_rwlock);
386 * NOTE: writers are single threaded thru the log layer.
387 * This means we can safely reference and change the cb and bp fields
388 * that ldl_read does not reference w/o holding the cb_rwlock or
389 * the bp makebusy lock.
391 static buf_t *
392 get_write_bp(ml_unit_t *ul)
394 cirbuf_t *cb = &ul->un_wrbuf;
395 buf_t *bp;
398 * cb_dirty is the buffer we are currently filling; if any
400 if ((bp = cb->cb_dirty) != NULL) {
401 makebusy(ul, bp);
402 return (bp);
405 * discard any bp that overlaps the current tail since we are
406 * about to overwrite it.
408 inval_range(ul, cb, ul->un_tail_lof, 1);
411 * steal LRU buf
413 rw_enter(&cb->cb_rwlock, RW_WRITER);
414 bp = cb->cb_bp->b_forw;
415 makebusy(ul, bp);
417 cb->cb_dirty = bp;
418 cb->cb_bp = bp;
420 bp->b_flags = 0;
421 bp->b_bcount = 0;
422 bp->b_blkno = btodb(ul->un_tail_lof);
423 ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
424 rw_exit(&cb->cb_rwlock);
427 * NOTE:
428 * 1. un_tail_lof never addresses >= un_eol_lof
429 * 2. b_blkno + btodb(b_bufsize) may > un_eol_lof
430 * this case is handled in storebuf
432 return (bp);
435 void
436 alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
438 int i;
439 buf_t *bp;
442 * Clear previous allocation
444 if (cb->cb_nb)
445 free_cirbuf(cb);
447 bzero(cb, sizeof (*cb));
448 rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
450 rw_enter(&cb->cb_rwlock, RW_WRITER);
453 * preallocate 3 bp's and put them on the free list.
455 for (i = 0; i < 3; ++i) {
456 bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
457 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
458 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
459 bp->b_offset = -1;
460 bp->b_forw = cb->cb_free;
461 cb->cb_free = bp;
464 cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
465 cb->cb_nb = bufsize;
468 * first bp claims entire write buffer
470 bp = cb->cb_free;
471 cb->cb_free = bp->b_forw;
473 bp->b_forw = bp;
474 bp->b_back = bp;
475 cb->cb_bp = bp;
476 bp->b_un.b_addr = cb->cb_va;
477 bp->b_bufsize = cb->cb_nb;
479 rw_exit(&cb->cb_rwlock);
482 void
483 alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
485 caddr_t va;
486 size_t nb;
487 buf_t *bp;
490 * Clear previous allocation
492 if (cb->cb_nb)
493 free_cirbuf(cb);
495 bzero(cb, sizeof (*cb));
496 rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
498 rw_enter(&cb->cb_rwlock, RW_WRITER);
500 cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
501 cb->cb_nb = bufsize;
504 * preallocate N bufs that are hard-sized to blksize
505 * in other words, the read buffer pool is a linked list
506 * of statically sized bufs.
508 va = cb->cb_va;
509 while ((nb = bufsize) != 0) {
510 if (nb > blksize)
511 nb = blksize;
512 bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
513 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
514 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
515 bp->b_un.b_addr = va;
516 bp->b_bufsize = nb;
517 if (cb->cb_bp) {
518 bp->b_forw = cb->cb_bp->b_forw;
519 bp->b_back = cb->cb_bp;
520 cb->cb_bp->b_forw->b_back = bp;
521 cb->cb_bp->b_forw = bp;
522 } else
523 bp->b_forw = bp->b_back = bp;
524 cb->cb_bp = bp;
525 bufsize -= nb;
526 va += nb;
529 rw_exit(&cb->cb_rwlock);
532 void
533 free_cirbuf(cirbuf_t *cb)
535 buf_t *bp;
537 if (cb->cb_nb == 0)
538 return;
540 rw_enter(&cb->cb_rwlock, RW_WRITER);
541 ASSERT(cb->cb_dirty == NULL);
544 * free the active bufs
546 while ((bp = cb->cb_bp) != NULL) {
547 if (bp == bp->b_forw)
548 cb->cb_bp = NULL;
549 else
550 cb->cb_bp = bp->b_forw;
551 bp->b_back->b_forw = bp->b_forw;
552 bp->b_forw->b_back = bp->b_back;
553 sema_destroy(&bp->b_sem);
554 sema_destroy(&bp->b_io);
555 kmem_free(bp, sizeof (buf_t));
559 * free the free bufs
561 while ((bp = cb->cb_free) != NULL) {
562 cb->cb_free = bp->b_forw;
563 sema_destroy(&bp->b_sem);
564 sema_destroy(&bp->b_io);
565 kmem_free(bp, sizeof (buf_t));
567 kmem_free(cb->cb_va, cb->cb_nb);
568 cb->cb_va = NULL;
569 cb->cb_nb = 0;
570 rw_exit(&cb->cb_rwlock);
571 rw_destroy(&cb->cb_rwlock);
574 static int
575 within_range(off_t lof, daddr_t blkno, ulong_t bcount)
577 off_t blof = dbtob(blkno);
579 return ((lof >= blof) && (lof < (blof + bcount)));
582 static buf_t *
583 find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
585 buf_t *bp;
588 * find a buf that contains the offset lof
590 rw_enter(&cb->cb_rwlock, RW_READER);
591 bp = cb->cb_bp;
592 do {
593 if (bp->b_bcount &&
594 within_range(lof, bp->b_blkno, bp->b_bcount)) {
595 makebusy(ul, bp);
596 rw_exit(&cb->cb_rwlock);
597 return (bp);
599 bp = bp->b_forw;
600 } while (bp != cb->cb_bp);
601 rw_exit(&cb->cb_rwlock);
603 return (NULL);
606 static off_t
607 find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
609 buf_t *bp, *bpend;
610 off_t rlof;
613 * we mustn't:
614 * o read past eol
615 * o read past the tail
616 * o read data that may be being written.
618 rw_enter(&cb->cb_rwlock, RW_READER);
619 bpend = bp = cb->cb_bp->b_forw;
620 rlof = ul->un_tail_lof;
621 do {
622 if (bp->b_bcount) {
623 rlof = dbtob(bp->b_blkno);
624 break;
626 bp = bp->b_forw;
627 } while (bp != bpend);
628 rw_exit(&cb->cb_rwlock);
630 if (lof <= rlof)
631 /* lof is prior to the range represented by the write buf */
632 return (rlof);
633 else
634 /* lof follows the range represented by the write buf */
635 return ((off_t)ul->un_eol_lof);
638 static buf_t *
639 get_read_bp(ml_unit_t *ul, off_t lof)
641 cirbuf_t *cb;
642 buf_t *bp;
643 off_t rlof;
646 * retrieve as much data as possible from the incore buffers
648 if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
649 logstats.ls_lreadsinmem.value.ui64++;
650 return (bp);
652 if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
653 logstats.ls_lreadsinmem.value.ui64++;
654 return (bp);
658 * steal the LRU buf
660 cb = &ul->un_rdbuf;
661 rw_enter(&cb->cb_rwlock, RW_WRITER);
662 bp = cb->cb_bp->b_forw;
663 makebusy(ul, bp);
664 bp->b_flags = 0;
665 bp->b_bcount = 0;
666 cb->cb_bp = bp;
667 rw_exit(&cb->cb_rwlock);
670 * don't read past the tail or the end-of-log
672 bp->b_blkno = btodb(lof);
673 lof = dbtob(bp->b_blkno);
674 rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
675 bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
676 readlog(ul, bp);
677 return (bp);
681 * NOTE: writers are single threaded thru the log layer.
682 * This means we can safely reference and change the cb and bp fields
683 * that ldl_read does not reference w/o holding the cb_rwlock or
684 * the bp makebusy lock.
686 static int
687 extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
689 buf_t *bpforw = bp->b_forw;
691 ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
694 * there is no `next' bp; do nothing
696 if (bpforw == bp)
697 return (0);
700 * buffer space is not adjacent; do nothing
702 if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
703 return (0);
706 * locking protocol requires giving up any bp locks before
707 * acquiring cb_rwlock. This is okay because we hold
708 * un_log_mutex.
710 sema_v(&bp->b_sem);
713 * lock out ldl_read
715 rw_enter(&cb->cb_rwlock, RW_WRITER);
718 * wait for current IO to finish w/next bp; if necessary
720 makebusy(ul, bpforw);
723 * free the next bp and steal its space
725 bp->b_forw = bpforw->b_forw;
726 bpforw->b_forw->b_back = bp;
727 bp->b_bufsize += bpforw->b_bufsize;
728 sema_v(&bpforw->b_sem);
729 bpforw->b_forw = cb->cb_free;
730 cb->cb_free = bpforw;
731 makebusy(ul, bp);
732 rw_exit(&cb->cb_rwlock);
734 return (1);
737 static size_t
738 storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
740 size_t copy_nb;
741 size_t nb_in_sec;
742 sect_trailer_t *st;
743 size_t nb_left = nb;
744 cirbuf_t *cb = &ul->un_wrbuf;
746 again:
747 nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
748 copy_nb = MIN(nb_left, nb_in_sec);
750 ASSERT(copy_nb);
752 bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
753 bp->b_bcount += copy_nb;
754 va += copy_nb;
755 nb_left -= copy_nb;
756 ul->un_tail_lof += copy_nb;
758 if ((nb_in_sec -= copy_nb) == 0) {
759 st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);
761 st->st_tid = ul->un_logmap->mtm_tid;
762 st->st_ident = ul->un_tail_ident++;
763 bp->b_bcount += sizeof (sect_trailer_t);
764 ul->un_tail_lof += sizeof (sect_trailer_t);
766 * log wrapped; async write this bp
768 if (ul->un_tail_lof == ul->un_eol_lof) {
769 ul->un_tail_lof = ul->un_bol_lof;
770 push_dirty_bp(ul, bp);
771 return (nb - nb_left);
774 * out of bp space; get more or async write buf
776 if (bp->b_bcount == bp->b_bufsize) {
777 if (!extend_write_bp(ul, cb, bp)) {
778 push_dirty_bp(ul, bp);
779 return (nb - nb_left);
783 if (nb_left)
784 goto again;
786 sema_v(&bp->b_sem);
787 return (nb);
790 static void
791 fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
793 offset_t src_mof = me->me_mof;
794 size_t src_nb = me->me_nb;
796 if (src_mof > dst_mof) {
797 ASSERT(src_mof < (dst_mof + dst_nb));
798 dst_va += (src_mof - dst_mof);
799 dst_nb -= (src_mof - dst_mof);
800 } else {
801 ASSERT(dst_mof < (src_mof + src_nb));
802 src_nb -= (dst_mof - src_mof);
805 src_nb = MIN(src_nb, dst_nb);
806 ASSERT(src_nb);
807 bzero(dst_va, src_nb);
811 * dst_va == NULL means don't copy anything
813 static ulong_t
814 fetchbuf(
815 ml_unit_t *ul,
816 buf_t *bp,
817 caddr_t dst_va,
818 size_t dst_nb,
819 off_t *dst_lofp)
821 caddr_t copy_va;
822 size_t copy_nb;
823 size_t nb_sec;
824 off_t dst_lof = *dst_lofp;
825 ulong_t sav_dst_nb = dst_nb;
826 ulong_t src_nb = bp->b_bcount;
827 off_t src_lof = dbtob(bp->b_blkno);
828 off_t src_elof = src_lof + src_nb;
829 caddr_t src_va = bp->b_un.b_addr;
832 * copy from bp to dst_va
834 while (dst_nb) {
836 * compute address within bp
838 copy_va = src_va + (dst_lof - src_lof);
841 * adjust copy size to amount of data in bp
843 copy_nb = MIN(dst_nb, src_elof - dst_lof);
846 * adjust copy size to amount of data in sector
848 nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
849 copy_nb = MIN(copy_nb, nb_sec);
852 * dst_va == NULL means don't do copy (see logseek())
854 if (dst_va) {
855 bcopy(copy_va, dst_va, copy_nb);
856 dst_va += copy_nb;
858 dst_lof += copy_nb;
859 dst_nb -= copy_nb;
860 nb_sec -= copy_nb;
863 * advance over sector trailer
865 if (nb_sec == 0)
866 dst_lof += sizeof (sect_trailer_t);
869 * exhausted buffer
870 * return current lof for next read
872 if (dst_lof == src_elof) {
873 sema_v(&bp->b_sem);
874 if (dst_lof == ul->un_eol_lof)
875 dst_lof = ul->un_bol_lof;
876 *dst_lofp = dst_lof;
877 return (sav_dst_nb - dst_nb);
882 * copy complete - return current lof
884 sema_v(&bp->b_sem);
885 *dst_lofp = dst_lof;
886 return (sav_dst_nb);
889 void
890 ldl_round_commit(ml_unit_t *ul)
892 int wrapped;
893 buf_t *bp;
894 sect_trailer_t *st;
895 size_t bcount;
896 cirbuf_t *cb = &ul->un_wrbuf;
899 * if nothing to write; then do nothing
901 if ((bp = cb->cb_dirty) == NULL)
902 return;
903 makebusy(ul, bp);
906 * round up to sector boundary and set new tail
907 * don't readjust st_ident if buf is already rounded
909 bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
910 if (bcount == bp->b_bcount) {
911 sema_v(&bp->b_sem);
912 return;
914 bp->b_bcount = bcount;
915 ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
916 wrapped = 0;
917 if (ul->un_tail_lof == ul->un_eol_lof) {
918 ul->un_tail_lof = ul->un_bol_lof;
919 ++wrapped;
921 ASSERT(ul->un_tail_lof != ul->un_head_lof);
924 * fix up the sector trailer
926 /* LINTED */
927 st = (sect_trailer_t *)
928 ((bp->b_un.b_addr + bcount) - sizeof (*st));
929 st->st_tid = ul->un_logmap->mtm_tid;
930 st->st_ident = ul->un_tail_ident++;
933 * if tail wrapped or we have exhausted this buffer
934 * async write the buffer
936 if (wrapped || bcount == bp->b_bufsize)
937 push_dirty_bp(ul, bp);
938 else
939 sema_v(&bp->b_sem);
942 void
943 ldl_push_commit(ml_unit_t *ul)
945 buf_t *bp;
946 cirbuf_t *cb = &ul->un_wrbuf;
949 * if nothing to write; then do nothing
951 if ((bp = cb->cb_dirty) == NULL)
952 return;
953 makebusy(ul, bp);
954 push_dirty_bp(ul, bp);
958 ldl_need_commit(ml_unit_t *ul)
960 return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
964 ldl_has_space(ml_unit_t *ul, mapentry_t *me)
966 off_t nfb;
967 off_t nb;
969 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
972 * Add up the size used by the deltas
973 * round nb up to a sector length plus an extra sector
974 * w/o the extra sector we couldn't distinguish
975 * a full log (head == tail) from an empty log (head == tail)
977 for (nb = DEV_BSIZE; me; me = me->me_hash) {
978 nb += sizeof (struct delta);
979 if (me->me_dt != DT_CANCEL)
980 nb += me->me_nb;
982 nb = P2ROUNDUP(nb, DEV_BSIZE);
984 if (ul->un_head_lof <= ul->un_tail_lof)
985 nfb = (ul->un_head_lof - ul->un_bol_lof) +
986 (ul->un_eol_lof - ul->un_tail_lof);
987 else
988 nfb = ul->un_head_lof - ul->un_tail_lof;
990 return (nb < nfb);
993 void
994 ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
996 buf_t *bp;
997 caddr_t va;
998 size_t nb;
999 size_t actual;
1001 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1003 /* Write the delta */
1005 nb = sizeof (struct delta);
1006 va = (caddr_t)&me->me_delta;
1007 bp = get_write_bp(ul);
1009 while (nb) {
1010 if (ul->un_flags & LDL_ERROR) {
1011 sema_v(&bp->b_sem);
1012 return;
1014 actual = storebuf(ul, bp, va, nb);
1015 ASSERT(actual);
1016 va += actual;
1017 nb -= actual;
1018 if (nb)
1019 bp = get_write_bp(ul);
1022 /* If a commit, cancel, or 0's; we're almost done */
1023 switch (me->me_dt) {
1024 case DT_COMMIT:
1025 case DT_CANCEL:
1026 case DT_ABZERO:
1027 /* roll needs to know where the next delta will go */
1028 me->me_lof = ul->un_tail_lof;
1029 return;
1030 default:
1031 break;
1034 /* Now write the data */
1036 ASSERT(me->me_nb != 0);
1038 nb = me->me_nb;
1039 va = (me->me_mof - bufmof) + bufp;
1040 bp = get_write_bp(ul);
1042 /* Save where we will put the data */
1043 me->me_lof = ul->un_tail_lof;
1045 while (nb) {
1046 if (ul->un_flags & LDL_ERROR) {
1047 sema_v(&bp->b_sem);
1048 return;
1050 actual = storebuf(ul, bp, va, nb);
1051 ASSERT(actual);
1052 va += actual;
1053 nb -= actual;
1054 if (nb)
1055 bp = get_write_bp(ul);
1059 void
1060 ldl_waito(ml_unit_t *ul)
1062 buf_t *bp;
1063 cirbuf_t *cb = &ul->un_wrbuf;
1065 rw_enter(&cb->cb_rwlock, RW_WRITER);
1067 * wait on them
1069 bp = cb->cb_bp;
1070 do {
1071 if ((bp->b_flags & B_DONE) == 0) {
1072 makebusy(ul, bp);
1073 sema_v(&bp->b_sem);
1075 bp = bp->b_forw;
1076 } while (bp != cb->cb_bp);
1077 rw_exit(&cb->cb_rwlock);
1081 * seek nb bytes from location lof
1083 static int
1084 logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
1086 buf_t *bp;
1087 ulong_t actual;
1089 while (nb) {
1090 bp = get_read_bp(ul, lof);
1091 if (bp->b_flags & B_ERROR) {
1092 sema_v(&bp->b_sem);
1093 return (EIO);
1095 actual = fetchbuf(ul, bp, NULL, nb, &lof);
1096 ASSERT(actual);
1097 nb -= actual;
1099 *lofp = lof;
1100 ASSERT(nb == 0);
1101 return (0);
1105 ldl_read(
1106 ml_unit_t *ul, /* Log unit */
1107 caddr_t va, /* address of buffer to read into */
1108 offset_t mof, /* mof of buffer */
1109 off_t nb, /* length of buffer */
1110 mapentry_t *me) /* Map entry list */
1112 buf_t *bp;
1113 crb_t *crb;
1114 caddr_t rva; /* address to read into */
1115 size_t rnb; /* # of bytes to read */
1116 off_t lof; /* log device offset to read from */
1117 off_t skip;
1118 ulong_t actual;
1119 int error;
1120 caddr_t eva = va + nb; /* end of buffer */
1122 for (; me; me = me->me_agenext) {
1123 ASSERT(me->me_dt != DT_CANCEL);
1126 * check for an cached roll buffer
1128 crb = me->me_crb;
1129 if (crb) {
1130 if (mof > crb->c_mof) {
1132 * This mapentry overlaps with the beginning of
1133 * the supplied buffer
1135 skip = mof - crb->c_mof;
1136 bcopy(crb->c_buf + skip, va,
1137 MIN(nb, crb->c_nb - skip));
1138 } else {
1140 * This mapentry starts at or after
1141 * the supplied buffer.
1143 skip = crb->c_mof - mof;
1144 bcopy(crb->c_buf, va + skip,
1145 MIN(crb->c_nb, nb - skip));
1147 logstats.ls_lreadsinmem.value.ui64++;
1148 continue;
1152 * check for a delta full of zeroes - there's no log data
1154 if (me->me_dt == DT_ABZERO) {
1155 fetchzeroes(va, mof, nb, me);
1156 continue;
1159 if (mof > me->me_mof) {
1160 rnb = (size_t)(mof - me->me_mof);
1161 error = logseek(ul, me->me_lof, rnb, &lof);
1162 if (error)
1163 return (EIO);
1164 rva = va;
1165 rnb = me->me_nb - rnb;
1166 rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
1167 } else {
1168 lof = me->me_lof;
1169 rva = (me->me_mof - mof) + va;
1170 rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
1173 while (rnb) {
1174 bp = get_read_bp(ul, lof);
1175 if (bp->b_flags & B_ERROR) {
1176 sema_v(&bp->b_sem);
1177 return (EIO);
1179 ASSERT(((me->me_flags & ME_ROLL) == 0) ||
1180 (bp != ul->un_wrbuf.cb_dirty));
1181 actual = fetchbuf(ul, bp, rva, rnb, &lof);
1182 ASSERT(actual);
1183 rva += actual;
1184 rnb -= actual;
1187 return (0);
1190 void
1191 ldl_savestate(ml_unit_t *ul)
1193 int error;
1194 buf_t *bp = ul->un_bp;
1195 ml_odunit_t *ud = (void *)bp->b_un.b_addr;
1196 ml_odunit_t *ud2 = (void *)(bp->b_un.b_addr + DEV_BSIZE);
1198 #if DEBUG
1200 * Scan test is running; don't update intermediate state
1202 if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
1203 return;
1204 #endif /* DEBUG */
1206 mutex_enter(&ul->un_state_mutex);
1207 bcopy(&ul->un_ondisk, ud, sizeof (*ud));
1208 ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
1209 bcopy(ud, ud2, sizeof (*ud));
1211 /* If a snapshot is enabled write through the shapshot driver. */
1212 if (ul->un_ufsvfs->vfs_snapshot)
1213 UFS_BWRITE2(ul->un_ufsvfs, bp);
1214 else
1215 BWRITE2(bp);
1216 logstats.ls_ldlwrites.value.ui64++;
1217 error = bp->b_flags & B_ERROR;
1218 mutex_exit(&ul->un_state_mutex);
1219 if (error)
1220 ldl_seterror(ul, "Error writing ufs log state");
1224 * The head will be set to (new_lof - header) since ldl_sethead is
1225 * called with the new_lof of the data portion of a delta.
1227 void
1228 ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
1230 off_t nb;
1231 off_t new_lof;
1232 uint32_t new_ident;
1233 daddr_t beg_blkno;
1234 daddr_t end_blkno;
1236 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1238 if (data_lof == -1) {
1239 /* log is empty */
1240 new_ident = lufs_hd_genid(ul);
1241 new_lof = ul->un_tail_lof;
1243 } else {
1244 /* compute header's lof */
1245 new_ident = ul->un_head_ident;
1246 new_lof = data_lof - sizeof (struct delta);
1248 /* whoops, header spans sectors; subtract out sector trailer */
1249 if (btodb(new_lof) != btodb(data_lof))
1250 new_lof -= sizeof (sect_trailer_t);
1252 /* whoops, header wrapped the log; go to last sector */
1253 if (new_lof < ul->un_bol_lof) {
1254 /* sector offset */
1255 new_lof -= dbtob(btodb(new_lof));
1256 /* add to last sector's lof */
1257 new_lof += (ul->un_eol_lof - DEV_BSIZE);
1259 ul->un_head_tid = tid;
1263 * check for nop
1265 if (new_lof == ul->un_head_lof)
1266 return;
1269 * invalidate the affected bufs and calculate new ident
1271 if (new_lof > ul->un_head_lof) {
1272 nb = new_lof - ul->un_head_lof;
1273 inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1274 inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1276 end_blkno = btodb(new_lof);
1277 beg_blkno = btodb(ul->un_head_lof);
1278 new_ident += (end_blkno - beg_blkno);
1279 } else {
1280 nb = ul->un_eol_lof - ul->un_head_lof;
1281 inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1282 inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1284 end_blkno = btodb(ul->un_eol_lof);
1285 beg_blkno = btodb(ul->un_head_lof);
1286 new_ident += (end_blkno - beg_blkno);
1288 nb = new_lof - ul->un_bol_lof;
1289 inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
1290 inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);
1292 end_blkno = btodb(new_lof);
1293 beg_blkno = btodb(ul->un_bol_lof);
1294 new_ident += (end_blkno - beg_blkno);
1297 * don't update the head if there has been an error
1299 if (ul->un_flags & LDL_ERROR)
1300 return;
1302 /* Fix up the head and ident */
1303 ASSERT(new_lof >= ul->un_bol_lof);
1304 ul->un_head_lof = new_lof;
1305 ul->un_head_ident = new_ident;
1306 if (data_lof == -1) {
1307 ul->un_tail_ident = ul->un_head_ident;
1311 /* Commit to the database */
1312 ldl_savestate(ul);
1314 ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
1315 ldl_sethead_debug(ul));
1319 * The tail will be set to the sector following lof+nb
1320 * lof + nb == size of the last delta + commit record
1321 * this function is called once after the log scan has completed.
1323 void
1324 ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
1326 off_t new_lof;
1327 uint32_t new_ident;
1328 daddr_t beg_blkno;
1329 daddr_t end_blkno;
1331 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1333 if (lof == -1) {
1334 ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
1335 ul->un_head_lof = ul->un_tail_lof;
1336 ul->un_head_ident = lufs_hd_genid(ul);
1337 ul->un_tail_ident = ul->un_head_ident;
1339 /* Commit to the database */
1340 ldl_savestate(ul);
1342 return;
1346 * new_lof is the offset of the sector following the last commit
1348 (void) logseek(ul, lof, nb, &new_lof);
1349 ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));
1352 * calculate new ident
1354 if (new_lof > ul->un_head_lof) {
1355 end_blkno = btodb(new_lof);
1356 beg_blkno = btodb(ul->un_head_lof);
1357 new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1358 } else {
1359 end_blkno = btodb(ul->un_eol_lof);
1360 beg_blkno = btodb(ul->un_head_lof);
1361 new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1363 end_blkno = btodb(new_lof);
1364 beg_blkno = btodb(ul->un_bol_lof);
1365 new_ident += (end_blkno - beg_blkno);
1368 /* Fix up the tail and ident */
1369 ul->un_tail_lof = new_lof;
1370 ul->un_tail_ident = new_ident;
1372 /* Commit to the database */
1373 ldl_savestate(ul);
1377 * LOGSCAN STUFF
1379 static int
1380 ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
1382 ulong_t ident;
1383 size_t nblk, i;
1384 sect_trailer_t *st;
1387 * compute ident for first sector in the buffer
1389 ident = ul->un_head_ident;
1390 if (bp->b_blkno >= btodb(ul->un_head_lof)) {
1391 ident += (bp->b_blkno - btodb(ul->un_head_lof));
1392 } else {
1393 ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
1394 ident += (bp->b_blkno - btodb(ul->un_bol_lof));
1397 * truncate the buffer down to the last valid sector
1399 nblk = btodb(bp->b_bcount);
1400 bp->b_bcount = 0;
1401 /* LINTED */
1402 st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
1403 for (i = 0; i < nblk; ++i) {
1404 if (st->st_ident != ident)
1405 break;
1407 /* remember last valid tid for ldl_logscan_error() */
1408 ul->un_tid = st->st_tid;
1410 /* LINTED */
1411 st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
1412 ++ident;
1413 bp->b_bcount += DEV_BSIZE;
1416 * make sure that lof is still within range
1418 return (within_range(lof, bp->b_blkno, bp->b_bcount));
1421 ulong_t
1422 ldl_logscan_nbcommit(off_t lof)
1425 * lof is the offset following the commit header. However,
1426 * if the commit header fell on the end-of-sector, then lof
1427 * has already been advanced to the beginning of the next
1428 * sector. So do nothing. Otherwise, return the remaining
1429 * bytes in the sector.
1431 if ((lof & (DEV_BSIZE - 1)) == 0)
1432 return (0);
1433 return (NB_LEFT_IN_SECTOR(lof));
1437 ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
1439 buf_t *bp;
1440 ulong_t actual;
1442 ASSERT(ul->un_head_lof != ul->un_tail_lof);
1445 * Check the log data doesn't go out of bounds
1447 if (ul->un_head_lof < ul->un_tail_lof) {
1448 if (!WITHIN(*lofp, nb, ul->un_head_lof,
1449 (ul->un_tail_lof - ul->un_head_lof))) {
1450 return (EIO);
1452 } else {
1453 if (OVERLAP(*lofp, nb, ul->un_tail_lof,
1454 (ul->un_head_lof - ul->un_tail_lof))) {
1455 return (EIO);
1459 while (nb) {
1460 bp = get_read_bp(ul, *lofp);
1461 if (bp->b_flags & B_ERROR) {
1462 sema_v(&bp->b_sem);
1463 return (EIO);
1466 * out-of-seq idents means partial transaction
1467 * panic, non-corrupting powerfail, ...
1469 if (!ldl_logscan_ident(ul, bp, *lofp)) {
1470 sema_v(&bp->b_sem);
1471 return (EIO);
1474 * copy the header into the caller's buf
1476 actual = fetchbuf(ul, bp, va, nb, lofp);
1477 if (va)
1478 va += actual;
1479 nb -= actual;
1481 return (0);
1484 void
1485 ldl_logscan_begin(ml_unit_t *ul)
1487 size_t bufsize;
1489 ASSERT(ul->un_wrbuf.cb_dirty == NULL);
1492 * logscan has begun
1494 ul->un_flags |= LDL_SCAN;
1497 * reset the circular bufs
1499 bufsize = ldl_bufsize(ul);
1500 alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
1501 alloc_wrbuf(&ul->un_wrbuf, bufsize);
1504 * set the tail to reflect a full log
1506 ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;
1508 if (ul->un_tail_lof < ul->un_bol_lof)
1509 ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
1510 if (ul->un_tail_lof >= ul->un_eol_lof)
1511 ul->un_tail_lof = ul->un_bol_lof;
1514 * un_tid is used during error processing; it is initialized to
1515 * the tid of the delta at un_head_lof;
1517 ul->un_tid = ul->un_head_tid;
1520 void
1521 ldl_logscan_end(ml_unit_t *ul)
1523 size_t bufsize;
1526 * reset the circular bufs
1528 bufsize = ldl_bufsize(ul);
1529 alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
1530 alloc_wrbuf(&ul->un_wrbuf, bufsize);
1533 * Done w/scan
1535 ul->un_flags &= ~LDL_SCAN;
1539 ldl_need_roll(ml_unit_t *ul)
1541 off_t busybytes;
1542 off_t head;
1543 off_t tail;
1544 off_t bol;
1545 off_t eol;
1546 off_t nb;
1549 * snapshot the log state
1551 head = ul->un_head_lof;
1552 tail = ul->un_tail_lof;
1553 bol = ul->un_bol_lof;
1554 eol = ul->un_eol_lof;
1555 nb = ul->un_logsize;
1558 * compute number of busy (inuse) bytes
1560 if (head <= tail)
1561 busybytes = tail - head;
1562 else
1563 busybytes = (eol - head) + (tail - bol);
1566 * return TRUE if > 75% full
1568 return (busybytes > (nb - (nb >> 2)));
1571 void
1572 ldl_seterror(ml_unit_t *ul, char *why)
1575 * already in error state; do nothing
1577 if (ul->un_flags & LDL_ERROR)
1578 return;
1580 ul->un_flags |= LDL_ERROR; /* incore */
1581 ul->un_badlog = 1; /* ondisk (cleared by fsck) */
1584 * Commit to state sectors
1586 uniqtime(&ul->un_timestamp);
1587 ldl_savestate(ul);
1589 /* Pretty print */
1590 cmn_err(CE_WARN, "%s", why);
1591 cmn_err(CE_WARN, "ufs log for %s changed state to Error",
1592 ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1593 cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
1594 ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1597 * If we aren't in the middle of scan (aka snarf); tell ufs
1598 * to hard lock itself.
1600 if ((ul->un_flags & LDL_SCAN) == 0)
1601 ufs_trans_onerror();
1604 size_t
1605 ldl_bufsize(ml_unit_t *ul)
1607 size_t bufsize;
1608 extern uint32_t ldl_minbufsize;
1611 * initial guess is the maxtransfer value for this log device
1612 * increase if too small
1613 * decrease if too large
1615 bufsize = dbtob(btod(ul->un_maxtransfer));
1616 if (bufsize < ldl_minbufsize)
1617 bufsize = ldl_minbufsize;
1618 if (bufsize > maxphys)
1619 bufsize = maxphys;
1620 if (bufsize > ul->un_maxtransfer)
1621 bufsize = ul->un_maxtransfer;
1622 return (bufsize);