4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright (c) 2015 by Delphix. All rights reserved.
30 #include <sys/systm.h>
31 #include <sys/types.h>
32 #include <sys/vnode.h>
33 #include <sys/errno.h>
34 #include <sys/sysmacros.h>
35 #include <sys/debug.h>
39 #include <sys/taskq.h>
40 #include <sys/cmn_err.h>
41 #include <sys/fs/ufs_inode.h>
42 #include <sys/fs/ufs_filio.h>
43 #include <sys/fs/ufs_log.h>
44 #include <sys/fs/ufs_bio.h>
47 * FILE SYSTEM INTERFACE TO TRANSACTION OPERATIONS (TOP; like VOP)
/*
 * Thread-specific-data key holding each thread's threadtrans_t
 * (allocated lazily in top_begin_async, freed by
 * top_threadtrans_destroy; created via tsd_create at file end).
 */
50 uint_t topkey
; /* tsd transaction key */
/*
 * NOTE(review): this is the body of top_delta(); the function signature
 * (expected parameters: ufsvfsp, mof, nb, dtyp, func, arg) is elided
 * from this extract -- confirm against the full source.  Records a
 * delta of `nb' bytes at master-file offset `mof' in the deltamap.
 */
64 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
65 threadtrans_t
*tp
= tsd_get(topkey
);
/* the log unit handed to us must belong to this file system */
67 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
/* debug builds: cross-check the delta against the metadata map */
69 ASSERT(((ul
->un_debug
& (MT_TRANSACT
|MT_MATAMAP
)) == 0) ||
70 top_delta_debug(ul
, mof
, nb
, dtyp
));
/* record the delta in the deltamap, tagged with this thread's tp */
72 deltamap_add(ul
->un_deltamap
, mof
, nb
, dtyp
, func
, arg
, tp
);
74 ul
->un_logmap
->mtm_ref
= 1; /* for roll thread's heuristic */
/*
 * top_cancel: cancel deltas covering [mof, mof+nb).  `flags' carries
 * the inode-type bits; only the metadata classes below are considered.
 * NOTE(review): braces and the function's return type line are elided
 * from this extract.
 */
84 top_cancel(ufsvfs_t
*ufsvfsp
, offset_t mof
, off_t nb
, int flags
)
86 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
/* nonzero iff range is metadata (dir / indirect blk / shadow / quota) */
87 int metadata
= flags
& (I_DIR
|I_IBLK
|I_SHAD
|I_QUOTA
);
89 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
/* debug builds: a metadata cancel must pass the DT_CANCEL check */
91 ASSERT(((ul
->un_debug
& (MT_TRANSACT
|MT_MATAMAP
)) == 0) ||
92 (!(flags
& metadata
) ||
93 top_delta_debug(ul
, mof
, nb
, DT_CANCEL
)));
/* drop pending deltas for the range, then cancel it in the logmap */
96 deltamap_del(ul
->un_deltamap
, mof
, nb
);
98 logmap_cancel(ul
, mof
, nb
, metadata
);
101 * needed for the roll thread's heuristic
103 ul
->un_logmap
->mtm_ref
= 1;
107 * check if this delta has been canceled (metadata -> userdata)
110 top_iscancel(ufsvfs_t
*ufsvfsp
, offset_t mof
, off_t nb
)
112 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
114 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
/*
 * NOTE(review): the return statements for both tests below are elided
 * from this extract -- presumably each returns nonzero (canceled /
 * device in error state) with a final return 0; confirm against the
 * full source.
 */
116 if (logmap_iscancel(ul
->un_logmap
, mof
, nb
))
118 if (ul
->un_flags
& LDL_ERROR
)
124 * put device into error state
127 top_seterror(ufsvfs_t
*ufsvfsp
)
129 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
131 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
132 ldl_seterror(ul
, "ufs is forcing a ufs log error");
136 * issue an empty sync op to help empty the delta/log map or the log
/*
 * NOTE(review): the declaration of `error', the enclosing braces, and
 * the guard between top_begin_sync and top_end_sync (presumably
 * `if (!error)') are elided from this extract.
 */
139 top_issue_sync(ufsvfs_t
*ufsvfsp
)
/* set T_DONTBLOCK for the duration if it is not already set */
143 if ((curthread
->t_flag
& T_DONTBLOCK
) == 0)
144 curthread
->t_flag
|= T_DONTBLOCK
;
/* open and immediately close a zero-size async-commit transaction */
145 top_begin_sync(ufsvfsp
, TOP_COMMIT_ASYNC
, 0, &error
);
147 top_end_sync(ufsvfsp
, &error
, TOP_COMMIT_ASYNC
, 0);
/*
 * Taskq callback dispatched from top_begin_async(): issues an empty
 * sync transaction, then decrements mtm_taskq_sync_count under
 * mtm_lock, signalling mtm_cv when the count reaches zero so a forced
 * umount can wait for all pending dispatches.
 */
152 top_issue_from_taskq(void *arg
)
154 ufsvfs_t
*ufsvfsp
= arg
;
155 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
156 mt_map_t
*mtm
= ul
->un_logmap
;
158 top_issue_sync(ufsvfsp
);
161 * We were called from the taskq_dispatch() in top_begin_async(), so
162 * decrement mtm_taskq_sync_count and wake up the thread waiting
163 * on the mtm_cv if the mtm_taskq_sync_count hits zero.
165 ASSERT(taskq_member(system_taskq
, curthread
));
167 mutex_enter(&mtm
->mtm_lock
);
168 mtm
->mtm_taskq_sync_count
--;
169 if (mtm
->mtm_taskq_sync_count
== 0) {
170 cv_signal(&mtm
->mtm_cv
);
172 mutex_exit(&mtm
->mtm_lock
);
176 * MOBY TRANSACTION ROUTINES
177 * begin a moby transaction
178 * sync ops enter until first sync op finishes
179 * async ops enter until last sync op finishes
180 * end a moby transaction
181 * outstanding deltas are pushed thru log
182 * log buffer is committed (incore only)
183 * next trans is open to async ops
184 * log buffer is committed on the log
185 * next trans is open to sync ops
/*
 * top_begin_sync: enter the current moby transaction as a synchronous
 * operation, reserving `size' bytes of log space.  Blocks on
 * mtm_cv_commit until admitted; presumably sets *error on the early
 * fsync-failure paths -- confirm against the full source.
 * NOTE(review): many interior lines (declarations of tp/seq/resv,
 * braces, do/retry labels, returns) are elided from this extract.
 */
190 top_begin_sync(ufsvfs_t
*ufsvfsp
, top_t topid
, ulong_t size
, int *error
)
192 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
193 mt_map_t
*mtm
= ul
->un_logmap
;
197 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
198 ASSERT(error
!= NULL
);
/* admission to the transaction is arbitrated under the logmap lock */
201 mutex_enter(&mtm
->mtm_lock
);
202 if (topid
== TOP_FSYNC
) {
204 * Error the fsync immediately if this is an nfs thread
205 * and its last transaction has already been committed.
206 * The only transactions outstanding are those
207 * where no commit has even started
208 * (last_async_tid == mtm->mtm_tid)
209 * or those where a commit is in progress
210 * (last_async_tid == mtm->mtm_committid)
/* nfs threads run with T_DONTPEND set (see comment above) */
212 if (curthread
->t_flag
& T_DONTPEND
) {
213 tp
= tsd_get(topkey
);
214 if (tp
&& (tp
->last_async_tid
!= mtm
->mtm_tid
) &&
215 (tp
->last_async_tid
!= mtm
->mtm_committid
)) {
216 mutex_exit(&mtm
->mtm_lock
);
223 * If there's already other synchronous transactions
224 * and we haven't allowed async ones to start yet
225 * then just wait for the commit to complete.
227 if (((mtm
->mtm_closed
& (TOP_SYNC
| TOP_ASYNC
)) ==
228 (TOP_SYNC
| TOP_ASYNC
)) || mtm
->mtm_activesync
) {
231 cv_wait(&mtm
->mtm_cv_commit
, &mtm
->mtm_lock
);
232 } while (seq
== mtm
->mtm_seq
);
233 mutex_exit(&mtm
->mtm_lock
);
237 if (mtm
->mtm_closed
& TOP_SYNC
) {
239 * We know we're in the window where a thread is
240 * committing a transaction in top_end_sync() and
241 * has allowed async threads to start but hasn't
242 * got the completion on the commit write to
243 * allow sync threads to start.
244 * So wait for that commit completion then retest
245 * for the quick nfs check and if that fails
246 * go on to start a transaction
250 cv_wait(&mtm
->mtm_cv_commit
, &mtm
->mtm_lock
);
251 } while (seq
== mtm
->mtm_seq
);
253 /* tp is set above if T_DONTPEND */
254 if ((curthread
->t_flag
& T_DONTPEND
) && tp
&&
255 (tp
->last_async_tid
!= mtm
->mtm_tid
) &&
256 (tp
->last_async_tid
!= mtm
->mtm_committid
)) {
257 mutex_exit(&mtm
->mtm_lock
);
266 * current transaction closed to sync ops; try for next transaction
268 if ((mtm
->mtm_closed
& TOP_SYNC
) && !panicstr
) {
272 * We know a commit is in progress, if we are trying to
273 * commit and we haven't allowed async ones to start yet,
274 * then just wait for the commit completion
276 if ((size
== TOP_COMMIT_SIZE
) &&
277 (((mtm
->mtm_closed
& (TOP_SYNC
| TOP_ASYNC
)) ==
278 (TOP_SYNC
| TOP_ASYNC
)) || (mtm
->mtm_activesync
))) {
281 cv_wait(&mtm
->mtm_cv_commit
, &mtm
->mtm_lock
);
282 } while (seq
== mtm
->mtm_seq
);
283 mutex_exit(&mtm
->mtm_lock
);
289 * next transaction is full; try for next transaction
/* would our reservation over-commit the next transaction too? */
291 resv
= size
+ ul
->un_resv_wantin
+ ul
->un_resv
;
292 if (resv
> ul
->un_maxresv
) {
293 cv_wait(&mtm
->mtm_cv_commit
, &mtm
->mtm_lock
);
297 * we are in the next transaction; wait for it to start
300 ul
->un_resv_wantin
+= size
;
302 * The corresponding cv_broadcast wakes up
303 * all threads that have been validated to go into
304 * the next transaction. However, because spurious
305 * cv_wait wakeups are possible we use a sequence
306 * number to check that the commit and cv_broadcast
307 * has really occurred. We couldn't use mtm_tid
308 * because on error that doesn't get incremented.
312 cv_wait(&mtm
->mtm_cv_commit
, &mtm
->mtm_lock
);
313 } while (seq
== mtm
->mtm_seq
);
316 * if the current transaction is full; try the next one
318 if (size
&& (ul
->un_resv
&& ((size
+ ul
->un_resv
) >
319 ul
->un_maxresv
)) && !panicstr
) {
321 * log is over reserved and no one will unresv the space
322 * so generate empty sync op to unresv the space
324 if (mtm
->mtm_activesync
== 0) {
325 mutex_exit(&mtm
->mtm_lock
);
326 top_issue_sync(ufsvfsp
);
327 mutex_enter(&mtm
->mtm_lock
);
330 cv_wait(&mtm
->mtm_cv_commit
, &mtm
->mtm_lock
);
334 * we are in the current transaction
/* admitted: count ourselves among the active sync ops */
337 mtm
->mtm_activesync
++;
341 ASSERT(mtm
->mtm_active
> 0);
342 ASSERT(mtm
->mtm_activesync
> 0);
343 mutex_exit(&mtm
->mtm_lock
);
345 ASSERT(((ul
->un_debug
& MT_TRANSACT
) == 0) ||
346 top_begin_debug(ul
, topid
, size
));
/*
 * top_begin_async: enter the current moby transaction as an
 * asynchronous operation, reserving `size' bytes of log space.  The
 * EWOULDBLOCK returns below are presumably guarded by `tryasync' --
 * confirm against the full source.
 * NOTE(review): interior lines (the NULL check before kmem_zalloc,
 * loop braces, the final return) are elided from this extract.
 */
352 top_begin_async(ufsvfs_t
*ufsvfsp
, top_t topid
, ulong_t size
, int tryasync
)
354 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
355 mt_map_t
*mtm
= ul
->un_logmap
;
358 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
/* lazily create this thread's threadtrans_t in its TSD slot */
360 tp
= tsd_get(topkey
);
362 tp
= kmem_zalloc(sizeof (threadtrans_t
), KM_SLEEP
);
363 (void) tsd_set(topkey
, tp
);
368 mutex_enter(&mtm
->mtm_lock
);
372 * current transaction closed to async ops; try for next transaction
374 if ((mtm
->mtm_closed
& TOP_ASYNC
) && !panicstr
) {
376 mutex_exit(&mtm
->mtm_lock
);
378 return (EWOULDBLOCK
);
380 cv_wait(&mtm
->mtm_cv_next
, &mtm
->mtm_lock
);
385 * if the current transaction is full; try the next one
387 if (((size
+ ul
->un_resv
+ ul
->un_resv_wantin
) > ul
->un_maxresv
) &&
390 * log is overreserved and no one will unresv the space
391 * so generate empty sync op to unresv the space
392 * We need TOP_SYNC_FORCED because we want to know when
393 * a top_end_sync is completed.
394 * mtm_taskq_sync_count is needed because we want to keep track
395 * of the pending top_issue_sync dispatches so that during
396 * forced umount we can wait for these to complete.
397 * mtm_taskq_sync_count is decremented in top_issue_sync and
398 * can remain set even after top_end_sync completes.
399 * We have a window between the clearing of TOP_SYNC_FORCED
400 * flag and the decrementing of mtm_taskq_sync_count.
401 * If in this window new async transactions start consuming
402 * log space, the log can get overreserved.
403 * Subsequently a new async transaction would fail to generate
404 * an empty sync transaction via the taskq, since it finds
405 * the mtm_taskq_sync_count set. This can cause a hang.
406 * Hence we do not test for mtm_taskq_sync_count being zero.
407 * Instead, the TOP_SYNC_FORCED flag is tested here.
409 if ((mtm
->mtm_activesync
== 0) &&
410 (!(mtm
->mtm_closed
& TOP_SYNC_FORCED
))) {
412 * Set flag to stop multiple forced empty
413 * sync transactions. Increment mtm_taskq_sync_count.
415 mtm
->mtm_closed
|= TOP_SYNC_FORCED
;
416 mtm
->mtm_taskq_sync_count
++;
417 mutex_exit(&mtm
->mtm_lock
);
418 (void) taskq_dispatch(system_taskq
,
419 top_issue_from_taskq
, ufsvfsp
, TQ_SLEEP
);
422 return (EWOULDBLOCK
);
424 mutex_enter(&mtm
->mtm_lock
);
428 mutex_exit(&mtm
->mtm_lock
);
430 return (EWOULDBLOCK
);
432 cv_wait(&mtm
->mtm_cv_next
, &mtm
->mtm_lock
);
436 * we are in the current transaction
441 ASSERT(mtm
->mtm_active
> 0);
442 mutex_exit(&mtm
->mtm_lock
);
444 ASSERT(((ul
->un_debug
& MT_TRANSACT
) == 0) ||
445 top_begin_debug(ul
, topid
, size
));
/*
 * top_end_sync: leave the current moby transaction as a sync op.  The
 * last sync op out closes the transaction to all ops, waits for the
 * last async op, pushes the deltas and commit record through the log,
 * then opens the next transaction (first to async ops, then to sync
 * ops).  Presumably *ep reports the log error state at the end (the
 * LDL_ERROR branch body is elided) -- confirm against the full source.
 * NOTE(review): interior lines (declarations of tid/seq, the deltamap
 * push, logmap_roll_dev/ldl waits, braces, returns) are elided from
 * this extract.
 */
451 top_end_sync(ufsvfs_t
*ufsvfsp
, int *ep
, top_t topid
, ulong_t size
)
453 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
454 mt_map_t
*mtm
= ul
->un_logmap
;
455 mapentry_t
*cancellist
;
458 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
459 ASSERT(((ul
->un_debug
& MT_TRANSACT
) == 0) ||
460 top_end_debug(ul
, mtm
, topid
, size
));
462 mutex_enter(&mtm
->mtm_lock
);
/* this sync op is done */
465 mtm
->mtm_activesync
--;
471 * wait for last syncop to complete
473 if (mtm
->mtm_activesync
|| panicstr
) {
474 ushort_t seq
= mtm
->mtm_seq
;
476 mtm
->mtm_closed
= TOP_SYNC
;
479 cv_wait(&mtm
->mtm_cv_commit
, &mtm
->mtm_lock
);
480 } while (seq
== mtm
->mtm_seq
);
481 mutex_exit(&mtm
->mtm_lock
);
485 * last syncop; close current transaction to all ops
487 mtm
->mtm_closed
= TOP_SYNC
|TOP_ASYNC
;
490 * wait for last asyncop to finish
492 while (mtm
->mtm_active
) {
493 cv_wait(&mtm
->mtm_cv_eot
, &mtm
->mtm_lock
);
497 * push dirty metadata thru the log
501 ASSERT(((ul
->un_debug
& MT_FORCEROLL
) == 0) ||
504 mtm
->mtm_tid
= tid
+ 1; /* can overflow to 0 */
507 * Empty the cancellist, but save it for logmap_free_cancel
509 mutex_enter(&mtm
->mtm_mutex
);
510 cancellist
= mtm
->mtm_cancel
;
511 mtm
->mtm_cancel
= NULL
;
512 mutex_exit(&mtm
->mtm_mutex
);
517 ASSERT(mtm
->mtm_active
== 0);
518 ul
->un_resv
= 0; /* unreserve the log space */
519 mtm
->mtm_closed
= TOP_SYNC
;
521 * Hold the un_log_mutex here until we are done writing
522 * the commit record to prevent any more deltas from being written
523 * to the log after we allow async operations.
525 mutex_enter(&ul
->un_log_mutex
);
526 mutex_exit(&mtm
->mtm_lock
);
/* next transaction is now open to async ops (see MOBY comment) */
527 cv_broadcast(&mtm
->mtm_cv_next
);
530 * asynchronously write the commit record,
532 logmap_commit(ul
, tid
);
535 * wait for outstanding log writes (e.g., commits) to finish
540 * Now that we are sure the commit has been written to the log
541 * we can free any canceled deltas. If we free them before
542 * guaranteeing that the commit was written, we could panic before
543 * the commit, but after an async thread has allocated and written
544 * to canceled freed block.
547 logmap_free_cancel(mtm
, &cancellist
);
548 mutex_exit(&ul
->un_log_mutex
);
/* carry the waiting ("wantin") ops and reservations into the next trans */
553 mutex_enter(&mtm
->mtm_lock
);
554 mtm
->mtm_active
+= mtm
->mtm_wantin
;
555 ul
->un_resv
+= ul
->un_resv_wantin
;
556 mtm
->mtm_activesync
= mtm
->mtm_wantin
;
559 ul
->un_resv_wantin
= 0;
560 mtm
->mtm_committid
= mtm
->mtm_tid
;
562 mutex_exit(&mtm
->mtm_lock
);
565 * Finish any other synchronous transactions and
566 * start any waiting new synchronous transactions
568 cv_broadcast(&mtm
->mtm_cv_commit
);
571 * if the logmap is getting full; roll something
573 if (logmap_need_roll_sync(mtm
)) {
574 logmap_forceroll_nowait(mtm
);
578 if (ul
->un_flags
& LDL_ERROR
)
/*
 * top_end_async: leave the current moby transaction as an async op,
 * returning any unused portion of this op's log-space reservation and
 * waking the committing sync thread when the last active op leaves.
 * NOTE(review): interior lines (the wakeup_needed assignment/use,
 * braces, the function's return) are elided from this extract.
 */
584 top_end_async(ufsvfs_t
*ufsvfsp
, top_t topid
, ulong_t size
)
586 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
587 mt_map_t
*mtm
= ul
->un_logmap
;
588 threadtrans_t
*tp
= tsd_get(topkey
);
589 int wakeup_needed
= 0;
592 ASSERT(ufsvfsp
->vfs_dev
== ul
->un_dev
);
593 ASSERT(((ul
->un_debug
& MT_TRANSACT
) == 0) ||
594 top_end_debug(ul
, mtm
, topid
, size
));
596 mutex_enter(&mtm
->mtm_lock
);
/* give back reserved log space this op did not actually use */
598 if (size
> tp
->deltas_size
) {
599 ul
->un_resv
-= (size
- tp
->deltas_size
);
601 if (tp
->any_deltas
) {
602 tp
->last_async_tid
= mtm
->mtm_tid
;
/* last op out of a fully-closed transaction wakes the committer */
607 if ((mtm
->mtm_active
== 0) &&
608 (mtm
->mtm_closed
== (TOP_SYNC
|TOP_ASYNC
))) {
611 mutex_exit(&mtm
->mtm_lock
);
613 cv_signal(&mtm
->mtm_cv_eot
);
616 * Generate a sync op if the log, logmap, or deltamap are heavily used.
617 * Unless we are possibly holding any VM locks, since if we are holding
618 * any VM locks and we issue a top_end_sync(), we could deadlock.
620 if ((mtm
->mtm_activesync
== 0) &&
621 !(mtm
->mtm_closed
& TOP_SYNC
) &&
622 (deltamap_need_commit(ul
->un_deltamap
) ||
623 logmap_need_commit(mtm
) ||
624 ldl_need_commit(ul
)) &&
625 (topid
!= TOP_GETPAGE
)) {
626 top_issue_sync(ufsvfsp
);
629 * roll something from the log if the logmap is too full
631 if (logmap_need_roll_async(mtm
))
632 logmap_forceroll_nowait(mtm
);
636 * Called from roll thread;
637 * buffer set for reading master
639 * 0 - success, can continue with next buffer
640 * 1 - failure due to logmap deltas being in use
/*
 * NOTE(review): the return statements, the crb (cached roll buffer)
 * test, and several braces are elided from this extract.
 */
643 top_read_roll(rollbuf_t
*rbp
, ml_unit_t
*ul
)
645 buf_t
*bp
= &rbp
->rb_bh
;
646 offset_t mof
= ldbtob(bp
->b_blkno
);
649 * get a list of deltas
651 if (logmap_list_get_roll(ul
->un_logmap
, mof
, rbp
)) {
652 /* logmap deltas are in use */
657 * no deltas were found, nothing to roll
659 if (rbp
->rb_age
== NULL
) {
660 bp
->b_flags
|= B_INVAL
;
665 * If there is one cached roll buffer that covers all the deltas then
666 * we can use that instead of copying to a separate roll buffer.
669 rbp
->rb_bh
.b_blkno
= lbtodb(rbp
->rb_crb
->c_mof
);
675 * If no read is needed logmap_setup_read() returns 0.
677 if (logmap_setup_read(rbp
->rb_age
, rbp
)) {
679 * async read the data from master
681 logstats
.ls_rreads
.value
.ui64
++;
682 bp
->b_bcount
= MAPBLOCKSIZE
;
683 (void) bdev_strategy(bp
);
684 lwp_stat_update(LWP_STAT_INBLK
, 1);
686 sema_v(&bp
->b_io
); /* mark read as complete */
/*
 * Tunable: when nonzero, top_log() takes the buffer path
 * (logmap_add_buf) for callers that supply a data buffer.
 */
691 int ufs_crb_enable
= 1;
694 * move deltas from deltamap into the log
/*
 * top_log: move any deltas covering [vamof, vamof+nb) out of the
 * deltamap and into the logmap; when the caller supplies a data
 * buffer (and ufs_crb_enable is set) the data travels with the map
 * entries via logmap_add_buf, split on MAPBLOCKSIZE boundaries.
 * NOTE(review): declarations of me/hmof/hnb/nb1, the else branch
 * structure, and several braces are elided from this extract.
 */
697 top_log(ufsvfs_t
*ufsvfsp
, char *va
, offset_t vamof
, off_t nb
,
698 caddr_t buf
, uint32_t bufsz
)
700 ml_unit_t
*ul
= ufsvfsp
->vfs_log
;
706 * needed for the roll thread's heuristic
708 ul
->un_logmap
->mtm_ref
= 1;
710 if (buf
&& ufs_crb_enable
) {
/* buffer must be a whole number of device blocks */
711 ASSERT((bufsz
& DEV_BMASK
) == 0);
713 * Move any deltas to the logmap. Split requests that
714 * straddle MAPBLOCKSIZE hash boundaries (i.e. summary info).
716 for (hmof
= vamof
- (va
- buf
), nb1
= nb
; bufsz
;
717 bufsz
-= hnb
, hmof
+= hnb
, buf
+= hnb
, nb1
-= hnb
) {
718 hnb
= MAPBLOCKSIZE
- (hmof
& MAPBLOCKOFF
);
721 me
= deltamap_remove(ul
->un_deltamap
,
722 MAX(hmof
, vamof
), MIN(hnb
, nb1
));
724 logmap_add_buf(ul
, va
, hmof
, me
, buf
, hnb
);
729 * if there are deltas
731 me
= deltamap_remove(ul
->un_deltamap
, vamof
, nb
);
736 logmap_add(ul
, va
, vamof
, me
);
/* debug builds: metadata ranges must lie within the metadata map */
740 ASSERT((ul
->un_matamap
== NULL
) ||
741 matamap_within(ul
->un_matamap
, vamof
, nb
));
746 top_threadtrans_destroy(void *tp
)
748 kmem_free(tp
, sizeof (threadtrans_t
));
754 ASSERT(top_init_debug());
757 * set up the delta layer
762 * Initialise the thread specific data transaction key
764 tsd_create(&topkey
, top_threadtrans_destroy
);