1 /* $NetBSD: vfs_wapbl.c,v 1.28 2009/10/01 12:28:34 pooka Exp $ */
4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * This implements file system independent write ahead filesystem logging.
36 #define WAPBL_INTERNAL
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.28 2009/10/01 12:28:34 pooka Exp $");
41 #include <sys/param.h>
44 #include <sys/param.h>
45 #include <sys/namei.h>
48 #include <sys/vnode.h>
50 #include <sys/malloc.h>
51 #include <sys/resourcevar.h>
53 #include <sys/mount.h>
54 #include <sys/kernel.h>
55 #include <sys/kauth.h>
56 #include <sys/mutex.h>
57 #include <sys/atomic.h>
58 #include <sys/wapbl.h>
59 #include <sys/wapbl_replay.h>
61 #include <miscfs/specfs/specdev.h>
64 #define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
65 #define wapbl_free(a, s) kmem_free((a), (s))
66 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
68 MALLOC_JUSTDEFINE(M_WAPBL
, "wapbl", "write-ahead physical block logging");
69 #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
70 #define wapbl_free(a, s) free((a), M_WAPBL)
71 #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
83 #include <sys/wapbl.h>
84 #include <sys/wapbl_replay.h>
86 #define KDASSERT(x) assert(x)
87 #define KASSERT(x) assert(x)
88 #define wapbl_malloc(s) malloc(s)
89 #define wapbl_free(a, s) free(a)
90 #define wapbl_calloc(n, s) calloc((n), (s))
95 * INTERNAL DATA STRUCTURES
99 * This structure holds per-mount log information.
101 * Legend: a = atomic access only
102 * r = read-only after init
105 * u = unlocked access ok
106 * b = bufcache_lock held
109 struct vnode
*wl_logvp
; /* r: log here */
110 struct vnode
*wl_devvp
; /* r: log on this device */
111 struct mount
*wl_mount
; /* r: mountpoint wl is associated with */
112 daddr_t wl_logpbn
; /* r: Physical block number of start of log */
113 int wl_log_dev_bshift
; /* r: logarithm of device block size of log
115 int wl_fs_dev_bshift
; /* r: logarithm of device block size of
118 unsigned wl_lock_count
; /* m: Count of transactions in progress */
120 size_t wl_circ_size
; /* r: Number of bytes in buffer of log */
121 size_t wl_circ_off
; /* r: Number of bytes reserved at start */
123 size_t wl_bufcount_max
; /* r: Number of buffers reserved for log */
124 size_t wl_bufbytes_max
; /* r: Number of buf bytes reserved for log */
126 off_t wl_head
; /* l: Byte offset of log head */
127 off_t wl_tail
; /* l: Byte offset of log tail */
129 * head == tail == 0 means log is empty
130 * head == tail != 0 means log is full
131 * see assertions in wapbl_advance() for other boundary conditions.
132 * only truncate moves the tail, except when flush sets it to
133 * wl_header_size only flush moves the head, except when truncate
137 struct wapbl_wc_header
*wl_wc_header
; /* l */
138 void *wl_wc_scratch
; /* l: scratch space (XXX: por que?!?) */
140 kmutex_t wl_mtx
; /* u: short-term lock */
141 krwlock_t wl_rwlock
; /* u: File system transaction lock */
144 * Must be held while accessing
145 * wl_count or wl_bufs or head or tail
149 * Callback called from within the flush routine to flush any extra
150 * bits. Note that flush may be skipped without calling this if
151 * there are no outstanding buffers in the transaction.
154 wapbl_flush_fn_t wl_flush
; /* r */
155 wapbl_flush_fn_t wl_flush_abort
;/* r */
158 size_t wl_bufbytes
; /* m: Byte count of pages in wl_bufs */
159 size_t wl_bufcount
; /* m: Count of buffers in wl_bufs */
160 size_t wl_bcount
; /* m: Total bcount of wl_bufs */
162 LIST_HEAD(, buf
) wl_bufs
; /* m: Buffers in current transaction */
164 kcondvar_t wl_reclaimable_cv
; /* m (obviously) */
165 size_t wl_reclaimable_bytes
; /* m: Amount of space available for
166 reclamation by truncate */
167 int wl_error_count
; /* m: # of wl_entries with errors */
168 size_t wl_reserved_bytes
; /* never truncate log smaller than this */
170 #ifdef WAPBL_DEBUG_BUFBYTES
171 size_t wl_unsynced_bufbytes
; /* Byte count of unsynced buffers */
174 daddr_t
*wl_deallocblks
;/* l: address of block */
175 int *wl_dealloclens
; /* l: size of block */
176 int wl_dealloccnt
; /* l: total count */
177 int wl_dealloclim
; /* l: max count */
179 /* hashtable of inode numbers for allocated but unlinked inodes */
181 LIST_HEAD(wapbl_ino_head
, wapbl_ino
) *wl_inohash
;
182 u_long wl_inohashmask
;
185 SIMPLEQ_HEAD(, wapbl_entry
) wl_entries
; /* On disk transaction
189 #ifdef WAPBL_DEBUG_PRINT
190 int wapbl_debug_print
= WAPBL_DEBUG_PRINT
;
193 /****************************************************************/
197 struct wapbl
*wapbl_debug_wl
;
200 static int wapbl_write_commit(struct wapbl
*wl
, off_t head
, off_t tail
);
201 static int wapbl_write_blocks(struct wapbl
*wl
, off_t
*offp
);
202 static int wapbl_write_revocations(struct wapbl
*wl
, off_t
*offp
);
203 static int wapbl_write_inodes(struct wapbl
*wl
, off_t
*offp
);
206 static int wapbl_replay_process(struct wapbl_replay
*wr
, off_t
, off_t
);
208 static __inline
size_t wapbl_space_free(size_t avail
, off_t head
,
210 static __inline
size_t wapbl_space_used(size_t avail
, off_t head
,
215 #define WAPBL_INODETRK_SIZE 83
216 static int wapbl_ino_pool_refcount
;
217 static struct pool wapbl_ino_pool
;
219 LIST_ENTRY(wapbl_ino
) wi_hash
;
224 static void wapbl_inodetrk_init(struct wapbl
*wl
, u_int size
);
225 static void wapbl_inodetrk_free(struct wapbl
*wl
);
226 static struct wapbl_ino
*wapbl_inodetrk_get(struct wapbl
*wl
, ino_t ino
);
228 static size_t wapbl_transaction_len(struct wapbl
*wl
);
229 static __inline
size_t wapbl_transaction_inodes_len(struct wapbl
*wl
);
232 int wapbl_replay_verify(struct wapbl_replay
*, struct vnode
*);
235 static int wapbl_replay_isopen1(struct wapbl_replay
*);
238 * This is useful for debugging. If set, the log will
239 * only be truncated when necessary.
241 int wapbl_lazy_truncate
= 0;
243 struct wapbl_ops wapbl_ops
= {
244 .wo_wapbl_discard
= wapbl_discard
,
245 .wo_wapbl_replay_isopen
= wapbl_replay_isopen1
,
246 .wo_wapbl_replay_can_read
= wapbl_replay_can_read
,
247 .wo_wapbl_replay_read
= wapbl_replay_read
,
248 .wo_wapbl_add_buf
= wapbl_add_buf
,
249 .wo_wapbl_remove_buf
= wapbl_remove_buf
,
250 .wo_wapbl_resize_buf
= wapbl_resize_buf
,
251 .wo_wapbl_begin
= wapbl_begin
,
252 .wo_wapbl_end
= wapbl_end
,
253 .wo_wapbl_junlock_assert
= wapbl_junlock_assert
,
255 /* XXX: the following is only used to say "this is a wapbl buf" */
256 .wo_wapbl_biodone
= wapbl_biodone
,
263 malloc_type_attach(M_WAPBL
);
267 wapbl_start_flush_inodes(struct wapbl
*wl
, struct wapbl_replay
*wr
)
271 WAPBL_PRINTF(WAPBL_PRINT_REPLAY
,
272 ("wapbl_start: reusing log with %d inodes\n", wr
->wr_inodescnt
));
275 * Its only valid to reuse the replay log if its
276 * the same as the new log we just opened.
278 KDASSERT(!wapbl_replay_isopen(wr
));
279 KASSERT(wl
->wl_devvp
->v_rdev
== wr
->wr_devvp
->v_rdev
);
280 KASSERT(wl
->wl_logpbn
== wr
->wr_logpbn
);
281 KASSERT(wl
->wl_circ_size
== wr
->wr_circ_size
);
282 KASSERT(wl
->wl_circ_off
== wr
->wr_circ_off
);
283 KASSERT(wl
->wl_log_dev_bshift
== wr
->wr_log_dev_bshift
);
284 KASSERT(wl
->wl_fs_dev_bshift
== wr
->wr_fs_dev_bshift
);
286 wl
->wl_wc_header
->wc_generation
= wr
->wr_generation
+ 1;
288 for (i
= 0; i
< wr
->wr_inodescnt
; i
++)
289 wapbl_register_inode(wl
, wr
->wr_inodes
[i
].wr_inumber
,
290 wr
->wr_inodes
[i
].wr_imode
);
292 /* Make sure new transaction won't overwrite old inodes list */
293 KDASSERT(wapbl_transaction_len(wl
) <=
294 wapbl_space_free(wl
->wl_circ_size
, wr
->wr_inodeshead
,
297 wl
->wl_head
= wl
->wl_tail
= wr
->wr_inodeshead
;
298 wl
->wl_reclaimable_bytes
= wl
->wl_reserved_bytes
=
299 wapbl_transaction_len(wl
);
301 error
= wapbl_write_inodes(wl
, &wl
->wl_head
);
305 KASSERT(wl
->wl_head
!= wl
->wl_tail
);
306 KASSERT(wl
->wl_head
!= 0);
312 wapbl_start(struct wapbl
** wlp
, struct mount
*mp
, struct vnode
*vp
,
313 daddr_t off
, size_t count
, size_t blksize
, struct wapbl_replay
*wr
,
314 wapbl_flush_fn_t flushfn
, wapbl_flush_fn_t flushabortfn
)
320 int log_dev_bshift
= DEV_BSHIFT
;
321 int fs_dev_bshift
= DEV_BSHIFT
;
324 WAPBL_PRINTF(WAPBL_PRINT_OPEN
, ("wapbl_start: vp=%p off=%" PRId64
325 " count=%zu blksize=%zu\n", vp
, off
, count
, blksize
));
327 if (log_dev_bshift
> fs_dev_bshift
) {
328 WAPBL_PRINTF(WAPBL_PRINT_OPEN
,
329 ("wapbl: log device's block size cannot be larger "
330 "than filesystem's\n"));
332 * Not currently implemented, although it could be if
341 if (blksize
< DEV_BSIZE
)
343 if (blksize
% DEV_BSIZE
)
346 /* XXXTODO: verify that the full load is writable */
349 * XXX check for minimum log size
350 * minimum is governed by minimum amount of space
351 * to complete a transaction. (probably truncate)
353 /* XXX for now pick something minimal */
354 if ((count
* blksize
) < MAXPHYS
) {
358 if ((error
= VOP_BMAP(vp
, off
, &devvp
, &logpbn
, &run
)) != 0) {
362 wl
= wapbl_calloc(1, sizeof(*wl
));
363 rw_init(&wl
->wl_rwlock
);
364 mutex_init(&wl
->wl_mtx
, MUTEX_DEFAULT
, IPL_NONE
);
365 cv_init(&wl
->wl_reclaimable_cv
, "wapblrec");
366 LIST_INIT(&wl
->wl_bufs
);
367 SIMPLEQ_INIT(&wl
->wl_entries
);
370 wl
->wl_devvp
= devvp
;
372 wl
->wl_logpbn
= logpbn
;
373 wl
->wl_log_dev_bshift
= log_dev_bshift
;
374 wl
->wl_fs_dev_bshift
= fs_dev_bshift
;
376 wl
->wl_flush
= flushfn
;
377 wl
->wl_flush_abort
= flushabortfn
;
379 /* Reserve two log device blocks for the commit headers */
380 wl
->wl_circ_off
= 2<<wl
->wl_log_dev_bshift
;
381 wl
->wl_circ_size
= ((count
* blksize
) - wl
->wl_circ_off
);
382 /* truncate the log usage to a multiple of log_dev_bshift */
383 wl
->wl_circ_size
>>= wl
->wl_log_dev_bshift
;
384 wl
->wl_circ_size
<<= wl
->wl_log_dev_bshift
;
387 * wl_bufbytes_max limits the size of the in memory transaction space.
388 * - Since buffers are allocated and accounted for in units of
389 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
390 * (i.e. 1<<PAGE_SHIFT)
391 * - Since the log device has to be written in units of
392 * 1<<wl_log_dev_bshift it is required to be a mulitple of
393 * 1<<wl_log_dev_bshift.
394 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
395 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
396 * Therefore it must be multiple of the least common multiple of those
397 * three quantities. Fortunately, all of those quantities are
398 * guaranteed to be a power of two, and the least common multiple of
399 * a set of numbers which are all powers of two is simply the maximum
400 * of those numbers. Finally, the maximum logarithm of a power of two
401 * is the same as the log of the maximum power of two. So we can do
402 * the following operations to size wl_bufbytes_max:
405 /* XXX fix actual number of pages reserved per filesystem. */
406 wl
->wl_bufbytes_max
= MIN(wl
->wl_circ_size
, buf_memcalc() / 2);
408 /* Round wl_bufbytes_max to the largest power of two constraint */
409 wl
->wl_bufbytes_max
>>= PAGE_SHIFT
;
410 wl
->wl_bufbytes_max
<<= PAGE_SHIFT
;
411 wl
->wl_bufbytes_max
>>= wl
->wl_log_dev_bshift
;
412 wl
->wl_bufbytes_max
<<= wl
->wl_log_dev_bshift
;
413 wl
->wl_bufbytes_max
>>= wl
->wl_fs_dev_bshift
;
414 wl
->wl_bufbytes_max
<<= wl
->wl_fs_dev_bshift
;
416 /* XXX maybe use filesystem fragment size instead of 1024 */
417 /* XXX fix actual number of buffers reserved per filesystem. */
418 wl
->wl_bufcount_max
= (nbuf
/ 2) * 1024;
420 /* XXX tie this into resource estimation */
421 wl
->wl_dealloclim
= 2 * btodb(wl
->wl_bufbytes_max
);
423 wl
->wl_deallocblks
= wapbl_malloc(sizeof(*wl
->wl_deallocblks
) *
425 wl
->wl_dealloclens
= wapbl_malloc(sizeof(*wl
->wl_dealloclens
) *
428 wapbl_inodetrk_init(wl
, WAPBL_INODETRK_SIZE
);
430 /* Initialize the commit header */
432 struct wapbl_wc_header
*wc
;
433 size_t len
= 1 << wl
->wl_log_dev_bshift
;
434 wc
= wapbl_calloc(1, len
);
435 wc
->wc_type
= WAPBL_WC_HEADER
;
437 wc
->wc_circ_off
= wl
->wl_circ_off
;
438 wc
->wc_circ_size
= wl
->wl_circ_size
;
439 /* XXX wc->wc_fsid */
440 wc
->wc_log_dev_bshift
= wl
->wl_log_dev_bshift
;
441 wc
->wc_fs_dev_bshift
= wl
->wl_fs_dev_bshift
;
442 wl
->wl_wc_header
= wc
;
443 wl
->wl_wc_scratch
= wapbl_malloc(len
);
447 * if there was an existing set of unlinked but
448 * allocated inodes, preserve it in the new
451 if (wr
&& wr
->wr_inodescnt
) {
452 error
= wapbl_start_flush_inodes(wl
, wr
);
457 error
= wapbl_write_commit(wl
, wl
->wl_head
, wl
->wl_tail
);
463 #if defined(WAPBL_DEBUG)
470 wapbl_free(wl
->wl_wc_scratch
, wl
->wl_wc_header
->wc_len
);
471 wapbl_free(wl
->wl_wc_header
, wl
->wl_wc_header
->wc_len
);
472 wapbl_free(wl
->wl_deallocblks
,
473 sizeof(*wl
->wl_deallocblks
) * wl
->wl_dealloclim
);
474 wapbl_free(wl
->wl_dealloclens
,
475 sizeof(*wl
->wl_dealloclens
) * wl
->wl_dealloclim
);
476 wapbl_inodetrk_free(wl
);
477 wapbl_free(wl
, sizeof(*wl
));
483 * Like wapbl_flush, only discards the transaction
488 wapbl_discard(struct wapbl
*wl
)
490 struct wapbl_entry
*we
;
495 * XXX we may consider using upgrade here
496 * if we want to call flush from inside a transaction
498 rw_enter(&wl
->wl_rwlock
, RW_WRITER
);
499 wl
->wl_flush(wl
->wl_mount
, wl
->wl_deallocblks
, wl
->wl_dealloclens
,
502 #ifdef WAPBL_DEBUG_PRINT
507 pid
= curproc
->p_pid
;
510 #ifdef WAPBL_DEBUG_BUFBYTES
511 WAPBL_PRINTF(WAPBL_PRINT_DISCARD
,
512 ("wapbl_discard: thread %d.%d discarding "
514 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
515 "deallocs=%d inodes=%d\n"
516 "\terrcnt = %u, reclaimable=%zu reserved=%zu "
518 pid
, lid
, wl
->wl_bufcount
, wl
->wl_bufbytes
,
519 wl
->wl_bcount
, wl
->wl_dealloccnt
,
520 wl
->wl_inohashcnt
, wl
->wl_error_count
,
521 wl
->wl_reclaimable_bytes
, wl
->wl_reserved_bytes
,
522 wl
->wl_unsynced_bufbytes
));
523 SIMPLEQ_FOREACH(we
, &wl
->wl_entries
, we_entries
) {
524 WAPBL_PRINTF(WAPBL_PRINT_DISCARD
,
525 ("\tentry: bufcount = %zu, reclaimable = %zu, "
526 "error = %d, unsynced = %zu\n",
527 we
->we_bufcount
, we
->we_reclaimable_bytes
,
528 we
->we_error
, we
->we_unsynced_bufbytes
));
530 #else /* !WAPBL_DEBUG_BUFBYTES */
531 WAPBL_PRINTF(WAPBL_PRINT_DISCARD
,
532 ("wapbl_discard: thread %d.%d discarding transaction\n"
533 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
534 "deallocs=%d inodes=%d\n"
535 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
536 pid
, lid
, wl
->wl_bufcount
, wl
->wl_bufbytes
,
537 wl
->wl_bcount
, wl
->wl_dealloccnt
,
538 wl
->wl_inohashcnt
, wl
->wl_error_count
,
539 wl
->wl_reclaimable_bytes
, wl
->wl_reserved_bytes
));
540 SIMPLEQ_FOREACH(we
, &wl
->wl_entries
, we_entries
) {
541 WAPBL_PRINTF(WAPBL_PRINT_DISCARD
,
542 ("\tentry: bufcount = %zu, reclaimable = %zu, "
544 we
->we_bufcount
, we
->we_reclaimable_bytes
,
547 #endif /* !WAPBL_DEBUG_BUFBYTES */
549 #endif /* WAPBL_DEBUG_PRINT */
551 for (i
= 0; i
<= wl
->wl_inohashmask
; i
++) {
552 struct wapbl_ino_head
*wih
;
553 struct wapbl_ino
*wi
;
555 wih
= &wl
->wl_inohash
[i
];
556 while ((wi
= LIST_FIRST(wih
)) != NULL
) {
557 LIST_REMOVE(wi
, wi_hash
);
558 pool_put(&wapbl_ino_pool
, wi
);
559 KASSERT(wl
->wl_inohashcnt
> 0);
567 mutex_enter(&bufcache_lock
);
568 mutex_enter(&wl
->wl_mtx
);
569 while ((bp
= LIST_FIRST(&wl
->wl_bufs
)) != NULL
) {
570 if (bbusy(bp
, 0, 0, &wl
->wl_mtx
) == 0) {
572 * The buffer will be unlocked and
573 * removed from the transaction in brelse
575 mutex_exit(&wl
->wl_mtx
);
577 mutex_enter(&wl
->wl_mtx
);
580 mutex_exit(&wl
->wl_mtx
);
581 mutex_exit(&bufcache_lock
);
584 * Remove references to this wl from wl_entries, free any which
585 * no longer have buffers, others will be freed in wapbl_biodone
586 * when they no longer have any buffers.
588 while ((we
= SIMPLEQ_FIRST(&wl
->wl_entries
)) != NULL
) {
589 SIMPLEQ_REMOVE_HEAD(&wl
->wl_entries
, we_entries
);
590 /* XXX should we be accumulating wl_error_count
591 * and increasing reclaimable bytes ? */
593 if (we
->we_bufcount
== 0) {
594 #ifdef WAPBL_DEBUG_BUFBYTES
595 KASSERT(we
->we_unsynced_bufbytes
== 0);
597 wapbl_free(we
, sizeof(*we
));
601 /* Discard list of deallocs */
602 wl
->wl_dealloccnt
= 0;
603 /* XXX should we clear wl_reserved_bytes? */
605 KASSERT(wl
->wl_bufbytes
== 0);
606 KASSERT(wl
->wl_bcount
== 0);
607 KASSERT(wl
->wl_bufcount
== 0);
608 KASSERT(LIST_EMPTY(&wl
->wl_bufs
));
609 KASSERT(SIMPLEQ_EMPTY(&wl
->wl_entries
));
610 KASSERT(wl
->wl_inohashcnt
== 0);
612 rw_exit(&wl
->wl_rwlock
);
616 wapbl_stop(struct wapbl
*wl
, int force
)
621 WAPBL_PRINTF(WAPBL_PRINT_OPEN
, ("wapbl_stop called\n"));
622 error
= wapbl_flush(wl
, 1);
630 /* Unlinked inodes persist after a flush */
631 if (wl
->wl_inohashcnt
) {
639 KASSERT(wl
->wl_bufbytes
== 0);
640 KASSERT(wl
->wl_bcount
== 0);
641 KASSERT(wl
->wl_bufcount
== 0);
642 KASSERT(LIST_EMPTY(&wl
->wl_bufs
));
643 KASSERT(wl
->wl_dealloccnt
== 0);
644 KASSERT(SIMPLEQ_EMPTY(&wl
->wl_entries
));
645 KASSERT(wl
->wl_inohashcnt
== 0);
649 wapbl_free(wl
->wl_wc_scratch
, wl
->wl_wc_header
->wc_len
);
650 wapbl_free(wl
->wl_wc_header
, wl
->wl_wc_header
->wc_len
);
651 wapbl_free(wl
->wl_deallocblks
,
652 sizeof(*wl
->wl_deallocblks
) * wl
->wl_dealloclim
);
653 wapbl_free(wl
->wl_dealloclens
,
654 sizeof(*wl
->wl_dealloclens
) * wl
->wl_dealloclim
);
655 wapbl_inodetrk_free(wl
);
657 cv_destroy(&wl
->wl_reclaimable_cv
);
658 mutex_destroy(&wl
->wl_mtx
);
659 rw_destroy(&wl
->wl_rwlock
);
660 wapbl_free(wl
, sizeof(*wl
));
666 wapbl_doio(void *data
, size_t len
, struct vnode
*devvp
, daddr_t pbn
, int flags
)
668 struct pstats
*pstats
= curlwp
->l_proc
->p_stats
;
672 KASSERT((flags
& ~(B_WRITE
| B_READ
)) == 0);
673 KASSERT(devvp
->v_type
== VBLK
);
675 if ((flags
& (B_WRITE
| B_READ
)) == B_WRITE
) {
676 mutex_enter(&devvp
->v_interlock
);
677 devvp
->v_numoutput
++;
678 mutex_exit(&devvp
->v_interlock
);
679 pstats
->p_ru
.ru_oublock
++;
681 pstats
->p_ru
.ru_inblock
++;
684 bp
= getiobuf(devvp
, true);
686 bp
->b_cflags
= BC_BUSY
; /* silly & dubious */
687 bp
->b_dev
= devvp
->v_rdev
;
689 bp
->b_bufsize
= bp
->b_resid
= bp
->b_bcount
= len
;
692 WAPBL_PRINTF(WAPBL_PRINT_IO
,
693 ("wapbl_doio: %s %d bytes at block %"PRId64
" on dev 0x%"PRIx64
"\n",
694 BUF_ISWRITE(bp
) ? "write" : "read", bp
->b_bcount
,
695 bp
->b_blkno
, bp
->b_dev
));
697 VOP_STRATEGY(devvp
, bp
);
703 WAPBL_PRINTF(WAPBL_PRINT_ERROR
,
704 ("wapbl_doio: %s %zu bytes at block %" PRId64
705 " on dev 0x%"PRIx64
" failed with error %d\n",
706 (((flags
& (B_WRITE
| B_READ
)) == B_WRITE
) ?
708 len
, pbn
, devvp
->v_rdev
, error
));
715 wapbl_write(void *data
, size_t len
, struct vnode
*devvp
, daddr_t pbn
)
718 return wapbl_doio(data
, len
, devvp
, pbn
, B_WRITE
);
722 wapbl_read(void *data
, size_t len
, struct vnode
*devvp
, daddr_t pbn
)
725 return wapbl_doio(data
, len
, devvp
, pbn
, B_READ
);
729 * Off is byte offset returns new offset for next write
730 * handles log wraparound
733 wapbl_circ_write(struct wapbl
*wl
, void *data
, size_t len
, off_t
*offp
)
739 KDASSERT(((len
>> wl
->wl_log_dev_bshift
) <<
740 wl
->wl_log_dev_bshift
) == len
);
742 if (off
< wl
->wl_circ_off
)
743 off
= wl
->wl_circ_off
;
744 slen
= wl
->wl_circ_off
+ wl
->wl_circ_size
- off
;
746 error
= wapbl_write(data
, slen
, wl
->wl_devvp
,
747 wl
->wl_logpbn
+ (off
>> wl
->wl_log_dev_bshift
));
750 data
= (uint8_t *)data
+ slen
;
752 off
= wl
->wl_circ_off
;
754 error
= wapbl_write(data
, len
, wl
->wl_devvp
,
755 wl
->wl_logpbn
+ (off
>> wl
->wl_log_dev_bshift
));
759 if (off
>= wl
->wl_circ_off
+ wl
->wl_circ_size
)
760 off
= wl
->wl_circ_off
;
765 /****************************************************************/
768 wapbl_begin(struct wapbl
*wl
, const char *file
, int line
)
776 * XXX this needs to be made much more sophisticated.
777 * perhaps each wapbl_begin could reserve a specified
778 * number of buffers and bytes.
780 mutex_enter(&wl
->wl_mtx
);
781 lockcount
= wl
->wl_lock_count
;
782 doflush
= ((wl
->wl_bufbytes
+ (lockcount
* MAXPHYS
)) >
783 wl
->wl_bufbytes_max
/ 2) ||
784 ((wl
->wl_bufcount
+ (lockcount
* 10)) >
785 wl
->wl_bufcount_max
/ 2) ||
786 (wapbl_transaction_len(wl
) > wl
->wl_circ_size
/ 2) ||
787 (wl
->wl_dealloccnt
>=
788 (wl
->wl_dealloclim
- (wl
->wl_dealloclim
>> 8)));
789 mutex_exit(&wl
->wl_mtx
);
792 WAPBL_PRINTF(WAPBL_PRINT_FLUSH
,
793 ("force flush lockcnt=%d bufbytes=%zu "
794 "(max=%zu) bufcount=%zu (max=%zu) "
795 "dealloccnt %d (lim=%d)\n",
796 lockcount
, wl
->wl_bufbytes
,
797 wl
->wl_bufbytes_max
, wl
->wl_bufcount
,
799 wl
->wl_dealloccnt
, wl
->wl_dealloclim
));
803 int error
= wapbl_flush(wl
, 0);
808 rw_enter(&wl
->wl_rwlock
, RW_READER
);
809 mutex_enter(&wl
->wl_mtx
);
811 mutex_exit(&wl
->wl_mtx
);
813 #if defined(WAPBL_DEBUG_PRINT)
814 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION
,
815 ("wapbl_begin thread %d.%d with bufcount=%zu "
816 "bufbytes=%zu bcount=%zu at %s:%d\n",
817 curproc
->p_pid
, curlwp
->l_lid
, wl
->wl_bufcount
,
818 wl
->wl_bufbytes
, wl
->wl_bcount
, file
, line
));
825 wapbl_end(struct wapbl
*wl
)
828 #if defined(WAPBL_DEBUG_PRINT)
829 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION
,
830 ("wapbl_end thread %d.%d with bufcount=%zu "
831 "bufbytes=%zu bcount=%zu\n",
832 curproc
->p_pid
, curlwp
->l_lid
, wl
->wl_bufcount
,
833 wl
->wl_bufbytes
, wl
->wl_bcount
));
836 mutex_enter(&wl
->wl_mtx
);
837 KASSERT(wl
->wl_lock_count
> 0);
839 mutex_exit(&wl
->wl_mtx
);
841 rw_exit(&wl
->wl_rwlock
);
845 wapbl_add_buf(struct wapbl
*wl
, struct buf
* bp
)
848 KASSERT(bp
->b_cflags
& BC_BUSY
);
851 wapbl_jlock_assert(wl
);
855 * XXX this might be an issue for swapfiles.
856 * see uvm_swap.c:1702
858 * XXX2 why require it then? leap of semantics?
860 KASSERT((bp
->b_cflags
& BC_NOCACHE
) == 0);
863 mutex_enter(&wl
->wl_mtx
);
864 if (bp
->b_flags
& B_LOCKED
) {
865 LIST_REMOVE(bp
, b_wapbllist
);
866 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2
,
867 ("wapbl_add_buf thread %d.%d re-adding buf %p "
868 "with %d bytes %d bcount\n",
869 curproc
->p_pid
, curlwp
->l_lid
, bp
, bp
->b_bufsize
,
872 /* unlocked by dirty buffers shouldn't exist */
873 KASSERT(!(bp
->b_oflags
& BO_DELWRI
));
874 wl
->wl_bufbytes
+= bp
->b_bufsize
;
875 wl
->wl_bcount
+= bp
->b_bcount
;
877 WAPBL_PRINTF(WAPBL_PRINT_BUFFER
,
878 ("wapbl_add_buf thread %d.%d adding buf %p "
879 "with %d bytes %d bcount\n",
880 curproc
->p_pid
, curlwp
->l_lid
, bp
, bp
->b_bufsize
,
883 LIST_INSERT_HEAD(&wl
->wl_bufs
, bp
, b_wapbllist
);
884 mutex_exit(&wl
->wl_mtx
);
886 bp
->b_flags
|= B_LOCKED
;
890 wapbl_remove_buf_locked(struct wapbl
* wl
, struct buf
*bp
)
893 KASSERT(mutex_owned(&wl
->wl_mtx
));
894 KASSERT(bp
->b_cflags
& BC_BUSY
);
895 wapbl_jlock_assert(wl
);
899 * XXX this might be an issue for swapfiles.
900 * see uvm_swap.c:1725
904 KASSERT((bp
->b_flags
& BC_NOCACHE
) == 0);
906 KASSERT(bp
->b_flags
& B_LOCKED
);
908 WAPBL_PRINTF(WAPBL_PRINT_BUFFER
,
909 ("wapbl_remove_buf thread %d.%d removing buf %p with "
910 "%d bytes %d bcount\n",
911 curproc
->p_pid
, curlwp
->l_lid
, bp
, bp
->b_bufsize
, bp
->b_bcount
));
913 KASSERT(wl
->wl_bufbytes
>= bp
->b_bufsize
);
914 wl
->wl_bufbytes
-= bp
->b_bufsize
;
915 KASSERT(wl
->wl_bcount
>= bp
->b_bcount
);
916 wl
->wl_bcount
-= bp
->b_bcount
;
917 KASSERT(wl
->wl_bufcount
> 0);
919 KASSERT((wl
->wl_bufcount
== 0) == (wl
->wl_bufbytes
== 0));
920 KASSERT((wl
->wl_bufcount
== 0) == (wl
->wl_bcount
== 0));
921 LIST_REMOVE(bp
, b_wapbllist
);
923 bp
->b_flags
&= ~B_LOCKED
;
926 /* called from brelsel() in vfs_bio among other places */
928 wapbl_remove_buf(struct wapbl
* wl
, struct buf
*bp
)
931 mutex_enter(&wl
->wl_mtx
);
932 wapbl_remove_buf_locked(wl
, bp
);
933 mutex_exit(&wl
->wl_mtx
);
937 wapbl_resize_buf(struct wapbl
*wl
, struct buf
*bp
, long oldsz
, long oldcnt
)
940 KASSERT(bp
->b_cflags
& BC_BUSY
);
943 * XXX: why does this depend on B_LOCKED? otherwise the buf
944 * is not for a transaction? if so, why is this called in the
947 if (bp
->b_flags
& B_LOCKED
) {
948 mutex_enter(&wl
->wl_mtx
);
949 wl
->wl_bufbytes
+= bp
->b_bufsize
- oldsz
;
950 wl
->wl_bcount
+= bp
->b_bcount
- oldcnt
;
951 mutex_exit(&wl
->wl_mtx
);
957 /****************************************************************/
958 /* Some utility inlines */
960 /* This is used to advance the pointer at old to new value at old+delta */
961 static __inline off_t
962 wapbl_advance(size_t size
, size_t off
, off_t old
, size_t delta
)
966 /* Define acceptable ranges for inputs. */
967 KASSERT(delta
<= size
);
968 KASSERT((old
== 0) || (old
>= off
));
969 KASSERT(old
< (size
+ off
));
971 if ((old
== 0) && (delta
!= 0))
973 else if ((old
+ delta
) < (size
+ off
))
976 new = (old
+ delta
) - size
;
978 /* Note some interesting axioms */
979 KASSERT((delta
!= 0) || (new == old
));
980 KASSERT((delta
== 0) || (new != 0));
981 KASSERT((delta
!= (size
)) || (new == old
));
983 /* Define acceptable ranges for output. */
984 KASSERT((new == 0) || (new >= off
));
985 KASSERT(new < (size
+ off
));
989 static __inline
size_t
990 wapbl_space_used(size_t avail
, off_t head
, off_t tail
)
997 return ((head
+ (avail
- 1) - tail
) % avail
) + 1;
1000 static __inline
size_t
1001 wapbl_space_free(size_t avail
, off_t head
, off_t tail
)
1004 return avail
- wapbl_space_used(avail
, head
, tail
);
1007 static __inline
void
1008 wapbl_advance_head(size_t size
, size_t off
, size_t delta
, off_t
*headp
,
1011 off_t head
= *headp
;
1012 off_t tail
= *tailp
;
1014 KASSERT(delta
<= wapbl_space_free(size
, head
, tail
));
1015 head
= wapbl_advance(size
, off
, head
, delta
);
1016 if ((tail
== 0) && (head
!= 0))
1022 static __inline
void
1023 wapbl_advance_tail(size_t size
, size_t off
, size_t delta
, off_t
*headp
,
1026 off_t head
= *headp
;
1027 off_t tail
= *tailp
;
1029 KASSERT(delta
<= wapbl_space_used(size
, head
, tail
));
1030 tail
= wapbl_advance(size
, off
, tail
, delta
);
1040 /****************************************************************/
1043 * Remove transactions whose buffers are completely flushed to disk.
1044 * Will block until at least minfree space is available.
1045 * only intended to be called from inside wapbl_flush and therefore
1046 * does not protect against commit races with itself or with flush.
1049 wapbl_truncate(struct wapbl
*wl
, size_t minfree
, int waitonly
)
1057 KASSERT(minfree
<= (wl
->wl_circ_size
- wl
->wl_reserved_bytes
));
1058 KASSERT(rw_write_held(&wl
->wl_rwlock
));
1060 mutex_enter(&wl
->wl_mtx
);
1063 * First check to see if we have to do a commit
1066 avail
= wapbl_space_free(wl
->wl_circ_size
, wl
->wl_head
, wl
->wl_tail
);
1067 if (minfree
< avail
) {
1068 mutex_exit(&wl
->wl_mtx
);
1072 while ((wl
->wl_error_count
== 0) &&
1073 (wl
->wl_reclaimable_bytes
< minfree
)) {
1074 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE
,
1075 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1077 &wl
->wl_reclaimable_bytes
, wl
, wl
->wl_reclaimable_bytes
,
1080 cv_wait(&wl
->wl_reclaimable_cv
, &wl
->wl_mtx
);
1082 if (wl
->wl_reclaimable_bytes
< minfree
) {
1083 KASSERT(wl
->wl_error_count
);
1084 /* XXX maybe get actual error from buffer instead someday? */
1089 delta
= wl
->wl_reclaimable_bytes
;
1091 /* If all of of the entries are flushed, then be sure to keep
1092 * the reserved bytes reserved. Watch out for discarded transactions,
1093 * which could leave more bytes reserved than are reclaimable.
1095 if (SIMPLEQ_EMPTY(&wl
->wl_entries
) &&
1096 (delta
>= wl
->wl_reserved_bytes
)) {
1097 delta
-= wl
->wl_reserved_bytes
;
1099 wapbl_advance_tail(wl
->wl_circ_size
, wl
->wl_circ_off
, delta
, &head
,
1101 KDASSERT(wl
->wl_reserved_bytes
<=
1102 wapbl_space_used(wl
->wl_circ_size
, head
, tail
));
1103 mutex_exit(&wl
->wl_mtx
);
1112 * This is where head, tail and delta are unprotected
1113 * from races against itself or flush. This is ok since
1114 * we only call this routine from inside flush itself.
1116 * XXX: how can it race against itself when accessed only
1117 * from behind the write-locked rwlock?
1119 error
= wapbl_write_commit(wl
, head
, tail
);
1126 mutex_enter(&wl
->wl_mtx
);
1127 KASSERT(wl
->wl_reclaimable_bytes
>= delta
);
1128 wl
->wl_reclaimable_bytes
-= delta
;
1129 mutex_exit(&wl
->wl_mtx
);
1130 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE
,
1131 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1132 curproc
->p_pid
, curlwp
->l_lid
, delta
));
1137 /****************************************************************/
1140 wapbl_biodone(struct buf
*bp
)
1142 struct wapbl_entry
*we
= bp
->b_private
;
1143 struct wapbl
*wl
= we
->we_wapbl
;
1146 * Handle possible flushing of buffers after log has been
1150 KASSERT(we
->we_bufcount
> 0);
1152 #ifdef WAPBL_DEBUG_BUFBYTES
1153 KASSERT(we
->we_unsynced_bufbytes
>= bp
->b_bufsize
);
1154 we
->we_unsynced_bufbytes
-= bp
->b_bufsize
;
1157 if (we
->we_bufcount
== 0) {
1158 #ifdef WAPBL_DEBUG_BUFBYTES
1159 KASSERT(we
->we_unsynced_bufbytes
== 0);
1161 wapbl_free(we
, sizeof(*we
));
1169 KDASSERT(bp
->b_flags
& B_DONE
);
1170 KDASSERT(!(bp
->b_flags
& B_DELWRI
));
1171 KDASSERT(bp
->b_flags
& B_ASYNC
);
1172 KDASSERT(bp
->b_flags
& B_BUSY
);
1173 KDASSERT(!(bp
->b_flags
& B_LOCKED
));
1174 KDASSERT(!(bp
->b_flags
& B_READ
));
1175 KDASSERT(!(bp
->b_flags
& B_INVAL
));
1176 KDASSERT(!(bp
->b_flags
& B_NOCACHE
));
1180 #ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1182 * XXXpooka: interfaces not fully updated
1183 * Note: this was not enabled in the original patch
1184 * against netbsd4 either. I don't know if comment
1185 * above is true or not.
1189 * If an error occurs, report the error and leave the
1190 * buffer as a delayed write on the LRU queue.
1191 * restarting the write would likely result in
1192 * an error spinloop, so let it be done harmlessly
1195 bp
->b_flags
&= ~(B_DONE
);
1196 simple_unlock(&bp
->b_interlock
);
1198 if (we
->we_error
== 0) {
1199 mutex_enter(&wl
->wl_mtx
);
1200 wl
->wl_error_count
++;
1201 mutex_exit(&wl
->wl_mtx
);
1202 cv_broadcast(&wl
->wl_reclaimable_cv
);
1204 we
->we_error
= bp
->b_error
;
1209 /* For now, just mark the log permanently errored out */
1211 mutex_enter(&wl
->wl_mtx
);
1212 if (wl
->wl_error_count
== 0) {
1213 wl
->wl_error_count
++;
1214 cv_broadcast(&wl
->wl_reclaimable_cv
);
1216 mutex_exit(&wl
->wl_mtx
);
1220 mutex_enter(&wl
->wl_mtx
);
1222 KASSERT(we
->we_bufcount
> 0);
1224 #ifdef WAPBL_DEBUG_BUFBYTES
1225 KASSERT(we
->we_unsynced_bufbytes
>= bp
->b_bufsize
);
1226 we
->we_unsynced_bufbytes
-= bp
->b_bufsize
;
1227 KASSERT(wl
->wl_unsynced_bufbytes
>= bp
->b_bufsize
);
1228 wl
->wl_unsynced_bufbytes
-= bp
->b_bufsize
;
1232 * If the current transaction can be reclaimed, start
1233 * at the beginning and reclaim any consecutive reclaimable
1234 * transactions. If we successfully reclaim anything,
1235 * then wakeup anyone waiting for the reclaim.
1237 if (we
->we_bufcount
== 0) {
1240 #ifdef WAPBL_DEBUG_BUFBYTES
1241 KDASSERT(we
->we_unsynced_bufbytes
== 0);
1244 * clear any posted error, since the buffer it came from
1245 * has successfully flushed by now
1247 while ((we
= SIMPLEQ_FIRST(&wl
->wl_entries
)) &&
1248 (we
->we_bufcount
== 0)) {
1249 delta
+= we
->we_reclaimable_bytes
;
1252 SIMPLEQ_REMOVE_HEAD(&wl
->wl_entries
, we_entries
);
1253 wapbl_free(we
, sizeof(*we
));
1257 wl
->wl_reclaimable_bytes
+= delta
;
1258 KASSERT(wl
->wl_error_count
>= errcnt
);
1259 wl
->wl_error_count
-= errcnt
;
1260 cv_broadcast(&wl
->wl_reclaimable_cv
);
1264 mutex_exit(&wl
->wl_mtx
);
1269 * Write transactions to disk + start I/O for contents
1272 wapbl_flush(struct wapbl
*wl
, int waitfor
)
1275 struct wapbl_entry
*we
;
1285 * Do a quick check to see if a full flush can be skipped
1286 * This assumes that the flush callback does not need to be called
1287 * unless there are other outstanding bufs.
1291 mutex_enter(&wl
->wl_mtx
); /* XXX need mutex here to
1292 protect the KASSERTS */
1293 nbufs
= wl
->wl_bufcount
;
1294 KASSERT((wl
->wl_bufcount
== 0) == (wl
->wl_bufbytes
== 0));
1295 KASSERT((wl
->wl_bufcount
== 0) == (wl
->wl_bcount
== 0));
1296 mutex_exit(&wl
->wl_mtx
);
1302 * XXX we may consider using LK_UPGRADE here
1303 * if we want to call flush from inside a transaction
1305 rw_enter(&wl
->wl_rwlock
, RW_WRITER
);
1306 wl
->wl_flush(wl
->wl_mount
, wl
->wl_deallocblks
, wl
->wl_dealloclens
,
1310 * Now that we are fully locked and flushed,
1311 * do another check for nothing to do.
1313 if (wl
->wl_bufcount
== 0) {
1318 WAPBL_PRINTF(WAPBL_PRINT_FLUSH
,
1319 ("wapbl_flush thread %d.%d flushing entries with "
1320 "bufcount=%zu bufbytes=%zu\n",
1321 curproc
->p_pid
, curlwp
->l_lid
, wl
->wl_bufcount
,
1325 /* Calculate amount of space needed to flush */
1326 flushsize
= wapbl_transaction_len(wl
);
1328 if (flushsize
> (wl
->wl_circ_size
- wl
->wl_reserved_bytes
)) {
1330 * XXX this could be handled more gracefully, perhaps place
1331 * only a partial transaction in the log and allow the
1332 * remaining to flush without the protection of the journal.
1334 panic("wapbl_flush: current transaction too big to flush\n");
1337 error
= wapbl_truncate(wl
, flushsize
, 0);
1342 KASSERT((off
== 0) || ((off
>= wl
->wl_circ_off
) &&
1343 (off
< wl
->wl_circ_off
+ wl
->wl_circ_size
)));
1344 error
= wapbl_write_blocks(wl
, &off
);
1347 error
= wapbl_write_revocations(wl
, &off
);
1350 error
= wapbl_write_inodes(wl
, &off
);
1355 if (wl
->wl_inohashcnt
)
1356 reserved
= wapbl_transaction_inodes_len(wl
);
1361 wapbl_advance_head(wl
->wl_circ_size
, wl
->wl_circ_off
, flushsize
,
1365 panic("lost head! head=%"PRIdMAX
" tail=%" PRIdMAX
1366 " off=%"PRIdMAX
" flush=%zu\n",
1367 (intmax_t)head
, (intmax_t)tail
, (intmax_t)off
,
1371 KASSERT(head
== off
);
1374 /* Opportunistically move the tail forward if we can */
1375 if (!wapbl_lazy_truncate
) {
1376 mutex_enter(&wl
->wl_mtx
);
1377 delta
= wl
->wl_reclaimable_bytes
;
1378 mutex_exit(&wl
->wl_mtx
);
1379 wapbl_advance_tail(wl
->wl_circ_size
, wl
->wl_circ_off
, delta
,
1383 error
= wapbl_write_commit(wl
, head
, tail
);
1387 we
= wapbl_calloc(1, sizeof(*we
));
1389 #ifdef WAPBL_DEBUG_BUFBYTES
1390 WAPBL_PRINTF(WAPBL_PRINT_FLUSH
,
1391 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1393 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1395 curproc
->p_pid
, curlwp
->l_lid
, flushsize
, delta
,
1396 wapbl_space_used(wl
->wl_circ_size
, head
, tail
),
1397 wl
->wl_unsynced_bufbytes
, wl
->wl_bufcount
,
1398 wl
->wl_bufbytes
, wl
->wl_bcount
, wl
->wl_dealloccnt
,
1399 wl
->wl_inohashcnt
));
1401 WAPBL_PRINTF(WAPBL_PRINT_FLUSH
,
1402 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1403 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1405 curproc
->p_pid
, curlwp
->l_lid
, flushsize
, delta
,
1406 wapbl_space_used(wl
->wl_circ_size
, head
, tail
),
1407 wl
->wl_bufcount
, wl
->wl_bufbytes
, wl
->wl_bcount
,
1408 wl
->wl_dealloccnt
, wl
->wl_inohashcnt
));
1412 mutex_enter(&bufcache_lock
);
1413 mutex_enter(&wl
->wl_mtx
);
1415 wl
->wl_reserved_bytes
= reserved
;
1418 KASSERT(wl
->wl_reclaimable_bytes
>= delta
);
1419 wl
->wl_reclaimable_bytes
-= delta
;
1420 wl
->wl_dealloccnt
= 0;
1421 #ifdef WAPBL_DEBUG_BUFBYTES
1422 wl
->wl_unsynced_bufbytes
+= wl
->wl_bufbytes
;
1426 we
->we_bufcount
= wl
->wl_bufcount
;
1427 #ifdef WAPBL_DEBUG_BUFBYTES
1428 we
->we_unsynced_bufbytes
= wl
->wl_bufbytes
;
1430 we
->we_reclaimable_bytes
= flushsize
;
1432 SIMPLEQ_INSERT_TAIL(&wl
->wl_entries
, we
, we_entries
);
1435 * this flushes bufs in reverse order than they were queued
1436 * it shouldn't matter, but if we care we could use TAILQ instead.
1437 * XXX Note they will get put on the lru queue when they flush
1438 * so we might actually want to change this to preserve order.
1440 while ((bp
= LIST_FIRST(&wl
->wl_bufs
)) != NULL
) {
1441 if (bbusy(bp
, 0, 0, &wl
->wl_mtx
)) {
1444 bp
->b_iodone
= wapbl_biodone
;
1447 wapbl_remove_buf_locked(wl
, bp
);
1448 mutex_exit(&wl
->wl_mtx
);
1449 mutex_exit(&bufcache_lock
);
1451 mutex_enter(&bufcache_lock
);
1452 mutex_enter(&wl
->wl_mtx
);
1454 mutex_exit(&wl
->wl_mtx
);
1455 mutex_exit(&bufcache_lock
);
1458 WAPBL_PRINTF(WAPBL_PRINT_FLUSH
,
1459 ("wapbl_flush thread %d.%d done flushing entries...\n",
1460 curproc
->p_pid
, curlwp
->l_lid
));
1466 * If the waitfor flag is set, don't return until everything is
1467 * fully flushed and the on disk log is empty.
1470 error
= wapbl_truncate(wl
, wl
->wl_circ_size
-
1471 wl
->wl_reserved_bytes
, wapbl_lazy_truncate
);
1476 wl
->wl_flush_abort(wl
->wl_mount
, wl
->wl_deallocblks
,
1477 wl
->wl_dealloclens
, wl
->wl_dealloccnt
);
1480 #ifdef WAPBL_DEBUG_PRINT
1485 pid
= curproc
->p_pid
;
1487 lid
= curlwp
->l_lid
;
1488 mutex_enter(&wl
->wl_mtx
);
1489 #ifdef WAPBL_DEBUG_BUFBYTES
1490 WAPBL_PRINTF(WAPBL_PRINT_ERROR
,
1491 ("wapbl_flush: thread %d.%d aborted flush: "
1493 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1494 "deallocs=%d inodes=%d\n"
1495 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1497 pid
, lid
, error
, wl
->wl_bufcount
,
1498 wl
->wl_bufbytes
, wl
->wl_bcount
,
1499 wl
->wl_dealloccnt
, wl
->wl_inohashcnt
,
1500 wl
->wl_error_count
, wl
->wl_reclaimable_bytes
,
1501 wl
->wl_reserved_bytes
, wl
->wl_unsynced_bufbytes
));
1502 SIMPLEQ_FOREACH(we
, &wl
->wl_entries
, we_entries
) {
1503 WAPBL_PRINTF(WAPBL_PRINT_ERROR
,
1504 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1505 "error = %d, unsynced = %zu\n",
1506 we
->we_bufcount
, we
->we_reclaimable_bytes
,
1507 we
->we_error
, we
->we_unsynced_bufbytes
));
1510 WAPBL_PRINTF(WAPBL_PRINT_ERROR
,
1511 ("wapbl_flush: thread %d.%d aborted flush: "
1513 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1514 "deallocs=%d inodes=%d\n"
1515 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1516 pid
, lid
, error
, wl
->wl_bufcount
,
1517 wl
->wl_bufbytes
, wl
->wl_bcount
,
1518 wl
->wl_dealloccnt
, wl
->wl_inohashcnt
,
1519 wl
->wl_error_count
, wl
->wl_reclaimable_bytes
,
1520 wl
->wl_reserved_bytes
));
1521 SIMPLEQ_FOREACH(we
, &wl
->wl_entries
, we_entries
) {
1522 WAPBL_PRINTF(WAPBL_PRINT_ERROR
,
1523 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1524 "error = %d\n", we
->we_bufcount
,
1525 we
->we_reclaimable_bytes
, we
->we_error
));
1528 mutex_exit(&wl
->wl_mtx
);
1532 rw_exit(&wl
->wl_rwlock
);
1536 /****************************************************************/
1539 wapbl_jlock_assert(struct wapbl
*wl
)
1542 KASSERT(rw_lock_held(&wl
->wl_rwlock
));
1546 wapbl_junlock_assert(struct wapbl
*wl
)
1549 KASSERT(!rw_write_held(&wl
->wl_rwlock
));
1552 /****************************************************************/
1556 wapbl_print(struct wapbl
*wl
,
1558 void (*pr
)(const char *, ...))
1561 struct wapbl_entry
*we
;
1562 (*pr
)("wapbl %p", wl
);
1563 (*pr
)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64
"\n",
1564 wl
->wl_logvp
, wl
->wl_devvp
, wl
->wl_logpbn
);
1565 (*pr
)("circ = %zu, header = %zu, head = %"PRIdMAX
" tail = %"PRIdMAX
"\n",
1566 wl
->wl_circ_size
, wl
->wl_circ_off
,
1567 (intmax_t)wl
->wl_head
, (intmax_t)wl
->wl_tail
);
1568 (*pr
)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1569 wl
->wl_log_dev_bshift
, wl
->wl_fs_dev_bshift
);
1570 #ifdef WAPBL_DEBUG_BUFBYTES
1571 (*pr
)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1572 "reserved = %zu errcnt = %d unsynced = %zu\n",
1573 wl
->wl_bufcount
, wl
->wl_bufbytes
, wl
->wl_bcount
,
1574 wl
->wl_reclaimable_bytes
, wl
->wl_reserved_bytes
,
1575 wl
->wl_error_count
, wl
->wl_unsynced_bufbytes
);
1577 (*pr
)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1578 "reserved = %zu errcnt = %d\n", wl
->wl_bufcount
, wl
->wl_bufbytes
,
1579 wl
->wl_bcount
, wl
->wl_reclaimable_bytes
, wl
->wl_reserved_bytes
,
1580 wl
->wl_error_count
);
1582 (*pr
)("\tdealloccnt = %d, dealloclim = %d\n",
1583 wl
->wl_dealloccnt
, wl
->wl_dealloclim
);
1584 (*pr
)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1585 wl
->wl_inohashcnt
, wl
->wl_inohashmask
);
1586 (*pr
)("entries:\n");
1587 SIMPLEQ_FOREACH(we
, &wl
->wl_entries
, we_entries
) {
1588 #ifdef WAPBL_DEBUG_BUFBYTES
1589 (*pr
)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1591 we
->we_bufcount
, we
->we_reclaimable_bytes
,
1592 we
->we_error
, we
->we_unsynced_bufbytes
);
1594 (*pr
)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1595 we
->we_bufcount
, we
->we_reclaimable_bytes
, we
->we_error
);
1601 LIST_FOREACH(bp
, &wl
->wl_bufs
, b_wapbllist
) {
1602 if (!LIST_NEXT(bp
, b_wapbllist
)) {
1604 } else if ((++cnt
% 6) == 0) {
1605 (*pr
)(" %p,\n\t", bp
);
1612 (*pr
)("dealloced blks = ");
1616 for (i
= 0; i
< wl
->wl_dealloccnt
; i
++) {
1617 (*pr
)(" %"PRId64
":%d,",
1618 wl
->wl_deallocblks
[i
],
1619 wl
->wl_dealloclens
[i
]);
1620 if ((++cnt
% 4) == 0) {
1627 (*pr
)("registered inodes = ");
1631 for (i
= 0; i
<= wl
->wl_inohashmask
; i
++) {
1632 struct wapbl_ino_head
*wih
;
1633 struct wapbl_ino
*wi
;
1635 wih
= &wl
->wl_inohash
[i
];
1636 LIST_FOREACH(wi
, wih
, wi_hash
) {
1637 if (wi
->wi_ino
== 0)
1639 (*pr
)(" %"PRId32
"/0%06"PRIo32
",",
1640 wi
->wi_ino
, wi
->wi_mode
);
1641 if ((++cnt
% 4) == 0) {
1651 #if defined(WAPBL_DEBUG) || defined(DDB)
1653 wapbl_dump(struct wapbl
*wl
)
1655 #if defined(WAPBL_DEBUG)
1657 wl
= wapbl_debug_wl
;
1661 wapbl_print(wl
, 1, printf
);
1665 /****************************************************************/
1668 wapbl_register_deallocation(struct wapbl
*wl
, daddr_t blk
, int len
)
1671 wapbl_jlock_assert(wl
);
1673 /* XXX should eventually instead tie this into resource estimation */
1675 * XXX this panic needs locking/mutex analysis and the
1676 * ability to cope with the failure.
1678 /* XXX this XXX doesn't have enough XXX */
1679 if (__predict_false(wl
->wl_dealloccnt
>= wl
->wl_dealloclim
))
1680 panic("wapbl_register_deallocation: out of resources");
1682 wl
->wl_deallocblks
[wl
->wl_dealloccnt
] = blk
;
1683 wl
->wl_dealloclens
[wl
->wl_dealloccnt
] = len
;
1684 wl
->wl_dealloccnt
++;
1685 WAPBL_PRINTF(WAPBL_PRINT_ALLOC
,
1686 ("wapbl_register_deallocation: blk=%"PRId64
" len=%d\n", blk
, len
));
1689 /****************************************************************/
1692 wapbl_inodetrk_init(struct wapbl
*wl
, u_int size
)
1695 wl
->wl_inohash
= hashinit(size
, HASH_LIST
, true, &wl
->wl_inohashmask
);
1696 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount
) == 1) {
1697 pool_init(&wapbl_ino_pool
, sizeof(struct wapbl_ino
), 0, 0, 0,
1698 "wapblinopl", &pool_allocator_nointr
, IPL_NONE
);
1703 wapbl_inodetrk_free(struct wapbl
*wl
)
1706 /* XXX this KASSERT needs locking/mutex analysis */
1707 KASSERT(wl
->wl_inohashcnt
== 0);
1708 hashdone(wl
->wl_inohash
, HASH_LIST
, wl
->wl_inohashmask
);
1709 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount
) == 0) {
1710 pool_destroy(&wapbl_ino_pool
);
1714 static struct wapbl_ino
*
1715 wapbl_inodetrk_get(struct wapbl
*wl
, ino_t ino
)
1717 struct wapbl_ino_head
*wih
;
1718 struct wapbl_ino
*wi
;
1720 KASSERT(mutex_owned(&wl
->wl_mtx
));
1722 wih
= &wl
->wl_inohash
[ino
& wl
->wl_inohashmask
];
1723 LIST_FOREACH(wi
, wih
, wi_hash
) {
1724 if (ino
== wi
->wi_ino
)
1731 wapbl_register_inode(struct wapbl
*wl
, ino_t ino
, mode_t mode
)
1733 struct wapbl_ino_head
*wih
;
1734 struct wapbl_ino
*wi
;
1736 wi
= pool_get(&wapbl_ino_pool
, PR_WAITOK
);
1738 mutex_enter(&wl
->wl_mtx
);
1739 if (wapbl_inodetrk_get(wl
, ino
) == NULL
) {
1742 wih
= &wl
->wl_inohash
[ino
& wl
->wl_inohashmask
];
1743 LIST_INSERT_HEAD(wih
, wi
, wi_hash
);
1744 wl
->wl_inohashcnt
++;
1745 WAPBL_PRINTF(WAPBL_PRINT_INODE
,
1746 ("wapbl_register_inode: ino=%"PRId64
"\n", ino
));
1747 mutex_exit(&wl
->wl_mtx
);
1749 mutex_exit(&wl
->wl_mtx
);
1750 pool_put(&wapbl_ino_pool
, wi
);
1755 wapbl_unregister_inode(struct wapbl
*wl
, ino_t ino
, mode_t mode
)
1757 struct wapbl_ino
*wi
;
1759 mutex_enter(&wl
->wl_mtx
);
1760 wi
= wapbl_inodetrk_get(wl
, ino
);
1762 WAPBL_PRINTF(WAPBL_PRINT_INODE
,
1763 ("wapbl_unregister_inode: ino=%"PRId64
"\n", ino
));
1764 KASSERT(wl
->wl_inohashcnt
> 0);
1765 wl
->wl_inohashcnt
--;
1766 LIST_REMOVE(wi
, wi_hash
);
1767 mutex_exit(&wl
->wl_mtx
);
1769 pool_put(&wapbl_ino_pool
, wi
);
1771 mutex_exit(&wl
->wl_mtx
);
1775 /****************************************************************/
1777 static __inline
size_t
1778 wapbl_transaction_inodes_len(struct wapbl
*wl
)
1780 int blocklen
= 1<<wl
->wl_log_dev_bshift
;
1783 /* Calculate number of inodes described in a inodelist header */
1784 iph
= (blocklen
- offsetof(struct wapbl_wc_inodelist
, wc_inodes
)) /
1785 sizeof(((struct wapbl_wc_inodelist
*)0)->wc_inodes
[0]);
1789 return MAX(1, howmany(wl
->wl_inohashcnt
, iph
))*blocklen
;
1793 /* Calculate amount of space a transaction will take on disk */
1795 wapbl_transaction_len(struct wapbl
*wl
)
1797 int blocklen
= 1<<wl
->wl_log_dev_bshift
;
1801 /* Calculate number of blocks described in a blocklist header */
1802 bph
= (blocklen
- offsetof(struct wapbl_wc_blocklist
, wc_blocks
)) /
1803 sizeof(((struct wapbl_wc_blocklist
*)0)->wc_blocks
[0]);
1807 len
= wl
->wl_bcount
;
1808 len
+= howmany(wl
->wl_bufcount
, bph
)*blocklen
;
1809 len
+= howmany(wl
->wl_dealloccnt
, bph
)*blocklen
;
1810 len
+= wapbl_transaction_inodes_len(wl
);
1816 * Perform commit operation
1818 * Note that generation number incrementation needs to
1819 * be protected against racing with other invocations
1820 * of wapbl_commit. This is ok since this routine
1821 * is only invoked from wapbl_flush
1824 wapbl_write_commit(struct wapbl
*wl
, off_t head
, off_t tail
)
1826 struct wapbl_wc_header
*wc
= wl
->wl_wc_header
;
1831 /* XXX Calc checksum here, instead we do this for now */
1832 error
= VOP_IOCTL(wl
->wl_devvp
, DIOCCACHESYNC
, &force
, FWRITE
, FSCRED
);
1834 WAPBL_PRINTF(WAPBL_PRINT_ERROR
,
1835 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
1836 " returned %d\n", wl
->wl_devvp
->v_rdev
, error
));
1841 wc
->wc_checksum
= 0;
1844 wc
->wc_time
= ts
.tv_sec
;
1845 wc
->wc_timensec
= ts
.tv_nsec
;
1847 WAPBL_PRINTF(WAPBL_PRINT_WRITE
,
1848 ("wapbl_write_commit: head = %"PRIdMAX
"tail = %"PRIdMAX
"\n",
1849 (intmax_t)head
, (intmax_t)tail
));
1852 * XXX if generation will rollover, then first zero
1853 * over second commit header before trying to write both headers.
1856 error
= wapbl_write(wc
, wc
->wc_len
, wl
->wl_devvp
,
1857 wl
->wl_logpbn
+ wc
->wc_generation
% 2);
1861 error
= VOP_IOCTL(wl
->wl_devvp
, DIOCCACHESYNC
, &force
, FWRITE
, FSCRED
);
1863 WAPBL_PRINTF(WAPBL_PRINT_ERROR
,
1864 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
1865 " returned %d\n", wl
->wl_devvp
->v_rdev
, error
));
1869 * If the generation number was zero, write it out a second time.
1870 * This handles initialization and generation number rollover
1872 if (wc
->wc_generation
++ == 0) {
1873 error
= wapbl_write_commit(wl
, head
, tail
);
1875 * This panic should be able to be removed if we do the
1876 * zero'ing mentioned above, and we are certain to roll
1877 * back generation number on failure.
1880 panic("wapbl_write_commit: error writing duplicate "
1881 "log header: %d\n", error
);
1886 /* Returns new offset value */
1888 wapbl_write_blocks(struct wapbl
*wl
, off_t
*offp
)
1890 struct wapbl_wc_blocklist
*wc
=
1891 (struct wapbl_wc_blocklist
*)wl
->wl_wc_scratch
;
1892 int blocklen
= 1<<wl
->wl_log_dev_bshift
;
1899 KASSERT(rw_write_held(&wl
->wl_rwlock
));
1901 bph
= (blocklen
- offsetof(struct wapbl_wc_blocklist
, wc_blocks
)) /
1902 sizeof(((struct wapbl_wc_blocklist
*)0)->wc_blocks
[0]);
1904 bp
= LIST_FIRST(&wl
->wl_bufs
);
1908 struct buf
*obp
= bp
;
1910 KASSERT(bp
->b_flags
& B_LOCKED
);
1912 wc
->wc_type
= WAPBL_WC_BLOCKS
;
1913 wc
->wc_len
= blocklen
;
1914 wc
->wc_blkcount
= 0;
1915 while (bp
&& (wc
->wc_blkcount
< bph
)) {
1917 * Make sure all the physical block numbers are up to
1918 * date. If this is not always true on a given
1919 * filesystem, then VOP_BMAP must be called. We
1920 * could call VOP_BMAP here, or else in the filesystem
1921 * specific flush callback, although neither of those
1922 * solutions allow us to take the vnode lock. If a
1923 * filesystem requires that we must take the vnode lock
1924 * to call VOP_BMAP, then we can probably do it in
1925 * bwrite when the vnode lock should already be held
1926 * by the invoking code.
1928 KASSERT((bp
->b_vp
->v_type
== VBLK
) ||
1929 (bp
->b_blkno
!= bp
->b_lblkno
));
1930 KASSERT(bp
->b_blkno
> 0);
1932 wc
->wc_blocks
[wc
->wc_blkcount
].wc_daddr
= bp
->b_blkno
;
1933 wc
->wc_blocks
[wc
->wc_blkcount
].wc_dlen
= bp
->b_bcount
;
1934 wc
->wc_len
+= bp
->b_bcount
;
1936 bp
= LIST_NEXT(bp
, b_wapbllist
);
1938 if (wc
->wc_len
% blocklen
!= 0) {
1939 padding
= blocklen
- wc
->wc_len
% blocklen
;
1940 wc
->wc_len
+= padding
;
1945 WAPBL_PRINTF(WAPBL_PRINT_WRITE
,
1946 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX
"\n",
1947 wc
->wc_len
, padding
, (intmax_t)off
));
1949 error
= wapbl_circ_write(wl
, wc
, blocklen
, &off
);
1954 while (bp
&& (cnt
++ < bph
)) {
1955 error
= wapbl_circ_write(wl
, bp
->b_data
,
1956 bp
->b_bcount
, &off
);
1959 bp
= LIST_NEXT(bp
, b_wapbllist
);
1964 zero
= wapbl_malloc(padding
);
1965 memset(zero
, 0, padding
);
1966 error
= wapbl_circ_write(wl
, zero
, padding
, &off
);
1967 wapbl_free(zero
, padding
);
1977 wapbl_write_revocations(struct wapbl
*wl
, off_t
*offp
)
1979 struct wapbl_wc_blocklist
*wc
=
1980 (struct wapbl_wc_blocklist
*)wl
->wl_wc_scratch
;
1982 int blocklen
= 1<<wl
->wl_log_dev_bshift
;
1987 if (wl
->wl_dealloccnt
== 0)
1990 bph
= (blocklen
- offsetof(struct wapbl_wc_blocklist
, wc_blocks
)) /
1991 sizeof(((struct wapbl_wc_blocklist
*)0)->wc_blocks
[0]);
1994 while (i
< wl
->wl_dealloccnt
) {
1995 wc
->wc_type
= WAPBL_WC_REVOCATIONS
;
1996 wc
->wc_len
= blocklen
;
1997 wc
->wc_blkcount
= 0;
1998 while ((i
< wl
->wl_dealloccnt
) && (wc
->wc_blkcount
< bph
)) {
1999 wc
->wc_blocks
[wc
->wc_blkcount
].wc_daddr
=
2000 wl
->wl_deallocblks
[i
];
2001 wc
->wc_blocks
[wc
->wc_blkcount
].wc_dlen
=
2002 wl
->wl_dealloclens
[i
];
2006 WAPBL_PRINTF(WAPBL_PRINT_WRITE
,
2007 ("wapbl_write_revocations: len = %u off = %"PRIdMAX
"\n",
2008 wc
->wc_len
, (intmax_t)off
));
2009 error
= wapbl_circ_write(wl
, wc
, blocklen
, &off
);
2018 wapbl_write_inodes(struct wapbl
*wl
, off_t
*offp
)
2020 struct wapbl_wc_inodelist
*wc
=
2021 (struct wapbl_wc_inodelist
*)wl
->wl_wc_scratch
;
2023 int blocklen
= 1 << wl
->wl_log_dev_bshift
;
2027 struct wapbl_ino_head
*wih
;
2028 struct wapbl_ino
*wi
;
2031 iph
= (blocklen
- offsetof(struct wapbl_wc_inodelist
, wc_inodes
)) /
2032 sizeof(((struct wapbl_wc_inodelist
*)0)->wc_inodes
[0]);
2035 wih
= &wl
->wl_inohash
[0];
2038 wc
->wc_type
= WAPBL_WC_INODES
;
2039 wc
->wc_len
= blocklen
;
2041 wc
->wc_clear
= (i
== 0);
2042 while ((i
< wl
->wl_inohashcnt
) && (wc
->wc_inocnt
< iph
)) {
2044 KASSERT((wih
- &wl
->wl_inohash
[0])
2045 <= wl
->wl_inohashmask
);
2046 wi
= LIST_FIRST(wih
++);
2048 wc
->wc_inodes
[wc
->wc_inocnt
].wc_inumber
= wi
->wi_ino
;
2049 wc
->wc_inodes
[wc
->wc_inocnt
].wc_imode
= wi
->wi_mode
;
2052 wi
= LIST_NEXT(wi
, wi_hash
);
2054 WAPBL_PRINTF(WAPBL_PRINT_WRITE
,
2055 ("wapbl_write_inodes: len = %u off = %"PRIdMAX
"\n",
2056 wc
->wc_len
, (intmax_t)off
));
2057 error
= wapbl_circ_write(wl
, wc
, blocklen
, &off
);
2060 } while (i
< wl
->wl_inohashcnt
);
2066 #endif /* _KERNEL */
2068 /****************************************************************/
2071 LIST_ENTRY(wapbl_blk
) wb_hash
;
2073 off_t wb_off
; /* Offset of this block in the log */
2075 #define WAPBL_BLKPOOL_MIN 83
2078 wapbl_blkhash_init(struct wapbl_replay
*wr
, u_int size
)
2080 if (size
< WAPBL_BLKPOOL_MIN
)
2081 size
= WAPBL_BLKPOOL_MIN
;
2082 KASSERT(wr
->wr_blkhash
== 0);
2084 wr
->wr_blkhash
= hashinit(size
, HASH_LIST
, true, &wr
->wr_blkhashmask
);
2085 #else /* ! _KERNEL */
2086 /* Manually implement hashinit */
2088 unsigned long i
, hashsize
;
2089 for (hashsize
= 1; hashsize
< size
; hashsize
<<= 1)
2091 wr
->wr_blkhash
= wapbl_malloc(hashsize
* sizeof(*wr
->wr_blkhash
));
2092 for (i
= 0; i
< wr
->wr_blkhashmask
; i
++)
2093 LIST_INIT(&wr
->wr_blkhash
[i
]);
2094 wr
->wr_blkhashmask
= hashsize
- 1;
2096 #endif /* ! _KERNEL */
2100 wapbl_blkhash_free(struct wapbl_replay
*wr
)
2102 KASSERT(wr
->wr_blkhashcnt
== 0);
2104 hashdone(wr
->wr_blkhash
, HASH_LIST
, wr
->wr_blkhashmask
);
2105 #else /* ! _KERNEL */
2106 wapbl_free(wr
->wr_blkhash
,
2107 (wr
->wr_blkhashmask
+ 1) * sizeof(*wr
->wr_blkhash
));
2108 #endif /* ! _KERNEL */
2111 static struct wapbl_blk
*
2112 wapbl_blkhash_get(struct wapbl_replay
*wr
, daddr_t blk
)
2114 struct wapbl_blk_head
*wbh
;
2115 struct wapbl_blk
*wb
;
2116 wbh
= &wr
->wr_blkhash
[blk
& wr
->wr_blkhashmask
];
2117 LIST_FOREACH(wb
, wbh
, wb_hash
) {
2118 if (blk
== wb
->wb_blk
)
2125 wapbl_blkhash_ins(struct wapbl_replay
*wr
, daddr_t blk
, off_t off
)
2127 struct wapbl_blk_head
*wbh
;
2128 struct wapbl_blk
*wb
;
2129 wb
= wapbl_blkhash_get(wr
, blk
);
2131 KASSERT(wb
->wb_blk
== blk
);
2134 wb
= wapbl_malloc(sizeof(*wb
));
2137 wbh
= &wr
->wr_blkhash
[blk
& wr
->wr_blkhashmask
];
2138 LIST_INSERT_HEAD(wbh
, wb
, wb_hash
);
2139 wr
->wr_blkhashcnt
++;
2144 wapbl_blkhash_rem(struct wapbl_replay
*wr
, daddr_t blk
)
2146 struct wapbl_blk
*wb
= wapbl_blkhash_get(wr
, blk
);
2148 KASSERT(wr
->wr_blkhashcnt
> 0);
2149 wr
->wr_blkhashcnt
--;
2150 LIST_REMOVE(wb
, wb_hash
);
2151 wapbl_free(wb
, sizeof(*wb
));
2156 wapbl_blkhash_clear(struct wapbl_replay
*wr
)
2159 for (i
= 0; i
<= wr
->wr_blkhashmask
; i
++) {
2160 struct wapbl_blk
*wb
;
2162 while ((wb
= LIST_FIRST(&wr
->wr_blkhash
[i
]))) {
2163 KASSERT(wr
->wr_blkhashcnt
> 0);
2164 wr
->wr_blkhashcnt
--;
2165 LIST_REMOVE(wb
, wb_hash
);
2166 wapbl_free(wb
, sizeof(*wb
));
2169 KASSERT(wr
->wr_blkhashcnt
== 0);
2172 /****************************************************************/
2175 wapbl_circ_read(struct wapbl_replay
*wr
, void *data
, size_t len
, off_t
*offp
)
2181 KASSERT(((len
>> wr
->wr_log_dev_bshift
) <<
2182 wr
->wr_log_dev_bshift
) == len
);
2183 if (off
< wr
->wr_circ_off
)
2184 off
= wr
->wr_circ_off
;
2185 slen
= wr
->wr_circ_off
+ wr
->wr_circ_size
- off
;
2187 error
= wapbl_read(data
, slen
, wr
->wr_devvp
,
2188 wr
->wr_logpbn
+ (off
>> wr
->wr_log_dev_bshift
));
2191 data
= (uint8_t *)data
+ slen
;
2193 off
= wr
->wr_circ_off
;
2195 error
= wapbl_read(data
, len
, wr
->wr_devvp
,
2196 wr
->wr_logpbn
+ (off
>> wr
->wr_log_dev_bshift
));
2200 if (off
>= wr
->wr_circ_off
+ wr
->wr_circ_size
)
2201 off
= wr
->wr_circ_off
;
2207 wapbl_circ_advance(struct wapbl_replay
*wr
, size_t len
, off_t
*offp
)
2212 KASSERT(((len
>> wr
->wr_log_dev_bshift
) <<
2213 wr
->wr_log_dev_bshift
) == len
);
2215 if (off
< wr
->wr_circ_off
)
2216 off
= wr
->wr_circ_off
;
2217 slen
= wr
->wr_circ_off
+ wr
->wr_circ_size
- off
;
2220 off
= wr
->wr_circ_off
;
2223 if (off
>= wr
->wr_circ_off
+ wr
->wr_circ_size
)
2224 off
= wr
->wr_circ_off
;
2228 /****************************************************************/

int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = DEV_BSHIFT;
	size_t used;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64" count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	/* The log must be device-block aligned. */
	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_malloc(MAXBSIZE);

	/* Read both copies of the log header. */
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 = (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

 errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}

void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}

void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}

#ifdef _KERNEL
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif

static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j,
			    *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}

static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j);
	}
}

static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so the location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/* Grow the inode array and append the new entries. */
	new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}
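
/*
 * Every log record begins with a struct wapbl_wc_null header carrying
 * wc_type and wc_len, so wapbl_replay_process() below can read one log
 * block, dispatch on wc_type to the handlers above, and then
 * cross-check that advancing by wc_len lands exactly where the
 * per-type handler left off.
 */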

static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

 errout:
	wapbl_blkhash_clear(wr);
	return error;
}
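
/*
 * wapbl_replay_verify() is a consistency check: for each journalled
 * block it compares the log copy against the current on-disk contents
 * of the filesystem device, reports mismatches, and prunes hash
 * entries whose whole record is already clean on disk.
 */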

#if 0
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_malloc(MAXBSIZE);
	void *scratch2 = wapbl_malloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;

		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
		    {
			struct wapbl_wc_blocklist *wc =
			    (struct wapbl_wc_blocklist *)wr->wr_scratch;
			int i;

			for (i = 0; i < wc->wc_blkcount; i++) {
				int foundcnt = 0;
				int dirtycnt = 0;
				int j, n;
				/*
				 * Check each physical block in the
				 * hashtable independently.
				 */
				n = wc->wc_blocks[i].wc_dlen >>
				    wch->wc_fs_dev_bshift;
				for (j = 0; j < n; j++) {
					struct wapbl_blk *wb =
					    wapbl_blkhash_get(wr,
					    wc->wc_blocks[i].wc_daddr + j);
					if (wb && (wb->wb_off == off)) {
						foundcnt++;
						error = wapbl_circ_read(wr,
						    scratch1, fsblklen, &off);
						if (error)
							goto out;
						error = wapbl_read(scratch2,
						    fsblklen, fsdevvp,
						    wb->wb_blk);
						if (error)
							goto out;
						if (memcmp(scratch1, scratch2,
						    fsblklen)) {
							printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
							    wb->wb_blk,
							    (intmax_t)off);
							dirtycnt++;
							mismatchcnt++;
						}
					} else {
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
				/*
				 * If all of the blocks in an entry
				 * are clean, then remove all of its
				 * blocks from the hashtable since they
				 * never will need replay.
				 */
				if ((foundcnt != 0) &&
				    (dirtycnt == 0)) {
					off = saveoff;
					wapbl_circ_advance(wr,
					    logblklen, &off);
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wc->wc_blocks[i].wc_daddr + j);
						if (wb &&
						    (wb->wb_off == off)) {
							wapbl_blkhash_rem(wr,
							    wb->wb_blk);
						}
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
			}
		    }
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
	}
 out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif

int
wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	struct wapbl_blk *wb;
	size_t i;
	off_t off;
	void *scratch;
	int error = 0;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	scratch = wapbl_malloc(MAXBSIZE);

	/*
	 * Walk every hash bucket (<= so the last bucket is included)
	 * and push each journalled block to the filesystem device.
	 */
	for (i = 0; i <= wr->wr_blkhashmask; ++i) {
		LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
			off = wb->wb_off;
			error = wapbl_circ_read(wr, scratch, fsblklen, &off);
			if (error)
				break;
			error = wapbl_write(scratch, fsblklen, fsdevvp,
			    wb->wb_blk);
			if (error)
				break;
		}
	}

	wapbl_free(scratch, MAXBSIZE);
	return error;
}
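
/*
 * The two routines below let a filesystem fetch the journalled copy of
 * a block while replay is open: wapbl_replay_can_read() reports
 * whether the log holds data for the given block range, and
 * wapbl_replay_read() fills the caller's buffer block by block,
 * substituting log contents wherever the hash has an entry.
 */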

int
wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));
	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
	return 0;
}

int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;