kernel/fs/ufs/lufs_top.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015 by Delphix. All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>

/*
 * FILE SYSTEM INTERFACE TO TRANSACTION OPERATIONS (TOP; like VOP)
 */

uint_t topkey; /* tsd transaction key */

/*
 * declare a delta
 */
void
top_delta(
	ufsvfs_t *ufsvfsp,
	offset_t mof,
	off_t nb,
	delta_t dtyp,
	int (*func)(),
	ulong_t arg)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;
	threadtrans_t *tp = tsd_get(topkey);

	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
	ASSERT(nb);
	ASSERT(((ul->un_debug & (MT_TRANSACT|MT_MATAMAP)) == 0) ||
	    top_delta_debug(ul, mof, nb, dtyp));

	deltamap_add(ul->un_deltamap, mof, nb, dtyp, func, arg, tp);

	ul->un_logmap->mtm_ref = 1; /* for roll thread's heuristic */
	if (tp) {
		tp->any_deltas = 1;
	}
}
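
/*
 * Callers usually declare deltas through the TRANS_DELTA() wrapper in
 * sys/fs/ufs_trans.h rather than calling top_delta() directly. A
 * sketch of the pattern (the constants and push callback below are
 * illustrative only; see ufs_trans.c for the real callers):
 *
 *	TRANS_DELTA(ufsvfsp, ldbtob(fsbtodb(fs, bno)),
 *	    sizeof (struct dinode), DT_INODE, ufs_trans_push_inode,
 *	    ip->i_number);
 *
 * The (func, arg) pair is kept with the delta so the logging code can
 * call back to capture the delta's bytes when the deltamap is pushed.
 */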

/*
 * cancel a delta
 */
void
top_cancel(ufsvfs_t *ufsvfsp, offset_t mof, off_t nb, int flags)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;
	int metadata = flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA);

	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
	ASSERT(nb);
	ASSERT(((ul->un_debug & (MT_TRANSACT|MT_MATAMAP)) == 0) ||
	    (!(flags & metadata) ||
	    top_delta_debug(ul, mof, nb, DT_CANCEL)));

	if (metadata)
		deltamap_del(ul->un_deltamap, mof, nb);

	logmap_cancel(ul, mof, nb, metadata);

	/*
	 * needed for the roll thread's heuristic
	 */
	ul->un_logmap->mtm_ref = 1;
}

/*
 * check if this delta has been canceled (metadata -> userdata)
 */
int
top_iscancel(ufsvfs_t *ufsvfsp, offset_t mof, off_t nb)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;

	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
	ASSERT(nb);
	if (logmap_iscancel(ul->un_logmap, mof, nb))
		return (1);
	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}

/*
 * put device into error state
 */
void
top_seterror(ufsvfs_t *ufsvfsp)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;

	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
	ldl_seterror(ul, "ufs is forcing a ufs log error");
}

/*
 * issue an empty sync op to help empty the delta/log map or the log
 */
static void
top_issue_sync(ufsvfs_t *ufsvfsp)
{
	int error = 0;

	if ((curthread->t_flag & T_DONTBLOCK) == 0)
		curthread->t_flag |= T_DONTBLOCK;
	top_begin_sync(ufsvfsp, TOP_COMMIT_ASYNC, 0, &error);
	if (!error) {
		top_end_sync(ufsvfsp, &error, TOP_COMMIT_ASYNC, 0);
	}
}

static void
top_issue_from_taskq(void *arg)
{
	ufsvfs_t *ufsvfsp = arg;
	ml_unit_t *ul = ufsvfsp->vfs_log;
	mt_map_t *mtm = ul->un_logmap;

	top_issue_sync(ufsvfsp);

	/*
	 * We were called from the taskq_dispatch() in top_begin_async(), so
	 * decrement mtm_taskq_sync_count and wake up the thread waiting
	 * on the mtm_cv if the mtm_taskq_sync_count hits zero.
	 */
	ASSERT(taskq_member(system_taskq, curthread));

	mutex_enter(&mtm->mtm_lock);
	mtm->mtm_taskq_sync_count--;
	if (mtm->mtm_taskq_sync_count == 0) {
		cv_signal(&mtm->mtm_cv);
	}
	mutex_exit(&mtm->mtm_lock);
}

/*
 * MOBY TRANSACTION ROUTINES
 *	begin a moby transaction
 *		sync ops enter until first sync op finishes
 *		async ops enter until last sync op finishes
 *	end a moby transaction
 *		outstanding deltas are pushed thru log
 *		log buffer is committed (incore only)
 *		next trans is open to async ops
 *		log buffer is committed on the log
 *		next trans is open to sync ops
 */
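
/*
 * The expected calling pattern for a sync transaction mirrors
 * top_issue_sync() above (callers normally go through the
 * TRANS_BEGIN_SYNC()/TRANS_END_SYNC() wrappers in sys/fs/ufs_trans.h,
 * which add the TRANS_ISTRANS() check):
 *
 *	int error = 0;
 *
 *	curthread->t_flag |= T_DONTBLOCK;
 *	top_begin_sync(ufsvfsp, topid, resv, &error);
 *	if (!error) {
 *		... declare/log the deltas for this operation ...
 *		top_end_sync(ufsvfsp, &error, topid, resv);
 *	}
 *
 * where resv is the caller's worst-case estimate of the log space the
 * operation may consume.
 */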

/*ARGSUSED*/
void
top_begin_sync(ufsvfs_t *ufsvfsp, top_t topid, ulong_t size, int *error)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;
	mt_map_t *mtm = ul->un_logmap;
	threadtrans_t *tp;
	ushort_t seq;

	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
	ASSERT(error != NULL);
	ASSERT(*error == 0);

	mutex_enter(&mtm->mtm_lock);
	if (topid == TOP_FSYNC) {
		/*
		 * Error the fsync immediately if this is an nfs thread
		 * and its last transaction has already been committed.
		 * The only transactions outstanding are those
		 * where no commit has even started
		 * (last_async_tid == mtm->mtm_tid)
		 * or those where a commit is in progress
		 * (last_async_tid == mtm->mtm_committid)
		 */
		if (curthread->t_flag & T_DONTPEND) {
			tp = tsd_get(topkey);
			if (tp && (tp->last_async_tid != mtm->mtm_tid) &&
			    (tp->last_async_tid != mtm->mtm_committid)) {
				mutex_exit(&mtm->mtm_lock);
				*error = 1;
				return;
			}
		}

		/*
		 * If there are already other synchronous transactions
		 * and we haven't allowed async ones to start yet,
		 * then just wait for the commit to complete.
		 */
		if (((mtm->mtm_closed & (TOP_SYNC | TOP_ASYNC)) ==
		    (TOP_SYNC | TOP_ASYNC)) || mtm->mtm_activesync) {
			seq = mtm->mtm_seq;
			do {
				cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
			} while (seq == mtm->mtm_seq);
			mutex_exit(&mtm->mtm_lock);
			*error = 1;
			return;
		}
		if (mtm->mtm_closed & TOP_SYNC) {
			/*
			 * We know we're in the window where a thread is
			 * committing a transaction in top_end_sync() and
			 * has allowed async threads to start but hasn't
			 * got the completion on the commit write to
			 * allow sync threads to start.
			 * So wait for that commit completion, then retest
			 * the quick nfs check and, if that fails,
			 * go on to start a transaction.
			 */
			seq = mtm->mtm_seq;
			do {
				cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
			} while (seq == mtm->mtm_seq);

			/* tp is set above if T_DONTPEND */
			if ((curthread->t_flag & T_DONTPEND) && tp &&
			    (tp->last_async_tid != mtm->mtm_tid) &&
			    (tp->last_async_tid != mtm->mtm_committid)) {
				mutex_exit(&mtm->mtm_lock);
				*error = 1;
				return;
			}
		}
	}
retry:
	mtm->mtm_ref = 1;
	/*
	 * current transaction closed to sync ops; try for next transaction
	 */
	if ((mtm->mtm_closed & TOP_SYNC) && !panicstr) {
		ulong_t resv;

		/*
		 * We know a commit is in progress. If we are trying to
		 * commit and we haven't allowed async ones to start yet,
		 * then just wait for the commit completion.
		 */
		if ((size == TOP_COMMIT_SIZE) &&
		    (((mtm->mtm_closed & (TOP_SYNC | TOP_ASYNC)) ==
		    (TOP_SYNC | TOP_ASYNC)) || (mtm->mtm_activesync))) {
			seq = mtm->mtm_seq;
			do {
				cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
			} while (seq == mtm->mtm_seq);
			mutex_exit(&mtm->mtm_lock);
			*error = 1;
			return;
		}

		/*
		 * next transaction is full; try for next transaction
		 */
		resv = size + ul->un_resv_wantin + ul->un_resv;
		if (resv > ul->un_maxresv) {
			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
			goto retry;
		}
		/*
		 * we are in the next transaction; wait for it to start
		 */
		mtm->mtm_wantin++;
		ul->un_resv_wantin += size;
		/*
		 * The corresponding cv_broadcast wakes up
		 * all threads that have been validated to go into
		 * the next transaction. However, because spurious
		 * cv_wait wakeups are possible we use a sequence
		 * number to check that the commit and cv_broadcast
		 * have really occurred. We couldn't use mtm_tid
		 * because on error that doesn't get incremented.
		 */
		seq = mtm->mtm_seq;
		do {
			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
		} while (seq == mtm->mtm_seq);
	} else {
		/*
		 * if the current transaction is full; try the next one
		 */
		if (size && (ul->un_resv && ((size + ul->un_resv) >
		    ul->un_maxresv)) && !panicstr) {
			/*
			 * log is over reserved and no one will unresv the space
			 * so generate empty sync op to unresv the space
			 */
			if (mtm->mtm_activesync == 0) {
				mutex_exit(&mtm->mtm_lock);
				top_issue_sync(ufsvfsp);
				mutex_enter(&mtm->mtm_lock);
				goto retry;
			}
			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
			goto retry;
		}
		/*
		 * we are in the current transaction
		 */
		mtm->mtm_active++;
		mtm->mtm_activesync++;
		ul->un_resv += size;
	}

	ASSERT(mtm->mtm_active > 0);
	ASSERT(mtm->mtm_activesync > 0);
	mutex_exit(&mtm->mtm_lock);

	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
	    top_begin_debug(ul, topid, size));
}

int tryfail_cnt;

int
top_begin_async(ufsvfs_t *ufsvfsp, top_t topid, ulong_t size, int tryasync)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;
	mt_map_t *mtm = ul->un_logmap;
	threadtrans_t *tp;

	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);

	tp = tsd_get(topkey);
	if (tp == NULL) {
		tp = kmem_zalloc(sizeof (threadtrans_t), KM_SLEEP);
		(void) tsd_set(topkey, tp);
	}
	tp->deltas_size = 0;
	tp->any_deltas = 0;

	mutex_enter(&mtm->mtm_lock);
retry:
	mtm->mtm_ref = 1;
	/*
	 * current transaction closed to async ops; try for next transaction
	 */
	if ((mtm->mtm_closed & TOP_ASYNC) && !panicstr) {
		if (tryasync) {
			mutex_exit(&mtm->mtm_lock);
			tryfail_cnt++;
			return (EWOULDBLOCK);
		}
		cv_wait(&mtm->mtm_cv_next, &mtm->mtm_lock);
		goto retry;
	}

	/*
	 * if the current transaction is full; try the next one
	 */
	if (((size + ul->un_resv + ul->un_resv_wantin) > ul->un_maxresv) &&
	    !panicstr) {
		/*
		 * log is overreserved and no one will unresv the space
		 * so generate empty sync op to unresv the space.
		 * We need TOP_SYNC_FORCED because we want to know when
		 * a top_end_sync is completed.
		 * mtm_taskq_sync_count is needed because we want to keep track
		 * of the pending top_issue_sync dispatches so that during
		 * forced umount we can wait for these to complete.
		 * mtm_taskq_sync_count is decremented in top_issue_sync and
		 * can remain set even after top_end_sync completes.
		 * We have a window between the clearing of TOP_SYNC_FORCED
		 * flag and the decrementing of mtm_taskq_sync_count.
		 * If in this window new async transactions start consuming
		 * log space, the log can get overreserved.
		 * Subsequently a new async transaction would fail to generate
		 * an empty sync transaction via the taskq, since it finds
		 * the mtm_taskq_sync_count set. This can cause a hang.
		 * Hence we do not test for mtm_taskq_sync_count being zero.
		 * Instead, the TOP_SYNC_FORCED flag is tested here.
		 */
		if ((mtm->mtm_activesync == 0) &&
		    (!(mtm->mtm_closed & TOP_SYNC_FORCED))) {
			/*
			 * Set flag to stop multiple forced empty
			 * sync transactions. Increment mtm_taskq_sync_count.
			 */
			mtm->mtm_closed |= TOP_SYNC_FORCED;
			mtm->mtm_taskq_sync_count++;
			mutex_exit(&mtm->mtm_lock);
			(void) taskq_dispatch(system_taskq,
			    top_issue_from_taskq, ufsvfsp, TQ_SLEEP);
			if (tryasync) {
				tryfail_cnt++;
				return (EWOULDBLOCK);
			}
			mutex_enter(&mtm->mtm_lock);
			goto retry;
		}
		if (tryasync) {
			mutex_exit(&mtm->mtm_lock);
			tryfail_cnt++;
			return (EWOULDBLOCK);
		}
		cv_wait(&mtm->mtm_cv_next, &mtm->mtm_lock);
		goto retry;
	}
	/*
	 * we are in the current transaction
	 */
	mtm->mtm_active++;
	ul->un_resv += size;

	ASSERT(mtm->mtm_active > 0);
	mutex_exit(&mtm->mtm_lock);

	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
	    top_begin_debug(ul, topid, size));
	return (0);
}
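
/*
 * When tryasync is nonzero, top_begin_async() fails fast with
 * EWOULDBLOCK instead of sleeping on the transaction state, so paths
 * that must not stall here (typically via the TRANS_TRY_BEGIN_ASYNC()
 * wrapper in sys/fs/ufs_trans.h) can back off and retry. A sketch:
 *
 *	if (top_begin_async(ufsvfsp, topid, resv, 1) == EWOULDBLOCK) {
 *		... release resources, then retry with tryasync == 0 ...
 *	}
 */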

/*ARGSUSED*/
void
top_end_sync(ufsvfs_t *ufsvfsp, int *ep, top_t topid, ulong_t size)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;
	mt_map_t *mtm = ul->un_logmap;
	mapentry_t *cancellist;
	uint32_t tid;

	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
	    top_end_debug(ul, mtm, topid, size));

	mutex_enter(&mtm->mtm_lock);
	tid = mtm->mtm_tid;

	mtm->mtm_activesync--;
	mtm->mtm_active--;

	mtm->mtm_ref = 1;

	/*
	 * wait for last syncop to complete
	 */
	if (mtm->mtm_activesync || panicstr) {
		ushort_t seq = mtm->mtm_seq;

		mtm->mtm_closed = TOP_SYNC;

		do {
			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
		} while (seq == mtm->mtm_seq);
		mutex_exit(&mtm->mtm_lock);
		goto out;
	}
	/*
	 * last syncop; close current transaction to all ops
	 */
	mtm->mtm_closed = TOP_SYNC|TOP_ASYNC;

	/*
	 * wait for last asyncop to finish
	 */
	while (mtm->mtm_active) {
		cv_wait(&mtm->mtm_cv_eot, &mtm->mtm_lock);
	}

	/*
	 * push dirty metadata thru the log
	 */
	deltamap_push(ul);

	ASSERT(((ul->un_debug & MT_FORCEROLL) == 0) ||
	    top_roll_debug(ul));

	mtm->mtm_tid = tid + 1; /* can overflow to 0 */

	/*
	 * Empty the cancellist, but save it for logmap_free_cancel
	 */
	mutex_enter(&mtm->mtm_mutex);
	cancellist = mtm->mtm_cancel;
	mtm->mtm_cancel = NULL;
	mutex_exit(&mtm->mtm_mutex);

	/*
	 * allow async ops
	 */
	ASSERT(mtm->mtm_active == 0);
	ul->un_resv = 0; /* unreserve the log space */
	mtm->mtm_closed = TOP_SYNC;
	/*
	 * Hold the un_log_mutex here until we are done writing
	 * the commit record, to prevent any more deltas from being
	 * written to the log after we allow async operations.
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_exit(&mtm->mtm_lock);
	cv_broadcast(&mtm->mtm_cv_next);

	/*
	 * asynchronously write the commit record
	 */
	logmap_commit(ul, tid);

	/*
	 * wait for outstanding log writes (e.g., commits) to finish
	 */
	ldl_waito(ul);

	/*
	 * Now that we are sure the commit has been written to the log
	 * we can free any canceled deltas. If we free them before
	 * guaranteeing that the commit was written, we could panic before
	 * the commit, but after an async thread has allocated and written
	 * to a canceled, freed block.
	 */
	logmap_free_cancel(mtm, &cancellist);
	mutex_exit(&ul->un_log_mutex);

	/*
	 * now, allow all ops
	 */
	mutex_enter(&mtm->mtm_lock);
	mtm->mtm_active += mtm->mtm_wantin;
	ul->un_resv += ul->un_resv_wantin;
	mtm->mtm_activesync = mtm->mtm_wantin;
	mtm->mtm_wantin = 0;
	mtm->mtm_closed = 0;
	ul->un_resv_wantin = 0;
	mtm->mtm_committid = mtm->mtm_tid;
	mtm->mtm_seq++;
	mutex_exit(&mtm->mtm_lock);

	/*
	 * Finish any other synchronous transactions and
	 * start any waiting new synchronous transactions
	 */
	cv_broadcast(&mtm->mtm_cv_commit);

	/*
	 * if the logmap is getting full; roll something
	 */
	if (logmap_need_roll_sync(mtm)) {
		logmap_forceroll_nowait(mtm);
	}

out:
	if (ul->un_flags & LDL_ERROR)
		*ep = EIO;
}

/*ARGSUSED*/
void
top_end_async(ufsvfs_t *ufsvfsp, top_t topid, ulong_t size)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;
	mt_map_t *mtm = ul->un_logmap;
	threadtrans_t *tp = tsd_get(topkey);
	int wakeup_needed = 0;

	ASSERT(tp);
	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
	    top_end_debug(ul, mtm, topid, size));

	mutex_enter(&mtm->mtm_lock);

	if (size > tp->deltas_size) {
		ul->un_resv -= (size - tp->deltas_size);
	}
	if (tp->any_deltas) {
		tp->last_async_tid = mtm->mtm_tid;
	}
	mtm->mtm_ref = 1;

	mtm->mtm_active--;
	if ((mtm->mtm_active == 0) &&
	    (mtm->mtm_closed == (TOP_SYNC|TOP_ASYNC))) {
		wakeup_needed = 1;
	}
	mutex_exit(&mtm->mtm_lock);
	if (wakeup_needed)
		cv_signal(&mtm->mtm_cv_eot);

	/*
	 * Generate a sync op if the log, logmap, or deltamap are heavily used.
	 * Unless we are possibly holding any VM locks, since if we are holding
	 * any VM locks and we issue a top_end_sync(), we could deadlock.
	 */
	if ((mtm->mtm_activesync == 0) &&
	    !(mtm->mtm_closed & TOP_SYNC) &&
	    (deltamap_need_commit(ul->un_deltamap) ||
	    logmap_need_commit(mtm) ||
	    ldl_need_commit(ul)) &&
	    (topid != TOP_GETPAGE)) {
		top_issue_sync(ufsvfsp);
	}
	/*
	 * roll something from the log if the logmap is too full
	 */
	if (logmap_need_roll_async(mtm))
		logmap_forceroll_nowait(mtm);
}

/*
 * Called from roll thread;
 *	buffer set for reading master
 * Returns
 *	0 - success, can continue with next buffer
 *	1 - failure due to logmap deltas being in use
 */
int
top_read_roll(rollbuf_t *rbp, ml_unit_t *ul)
{
	buf_t *bp = &rbp->rb_bh;
	offset_t mof = ldbtob(bp->b_blkno);

	/*
	 * get a list of deltas
	 */
	if (logmap_list_get_roll(ul->un_logmap, mof, rbp)) {
		/* logmap deltas are in use */
		return (1);
	}

	/*
	 * no deltas were found, nothing to roll
	 */
	if (rbp->rb_age == NULL) {
		bp->b_flags |= B_INVAL;
		return (0);
	}

	/*
	 * If there is one cached roll buffer that covers all the deltas then
	 * we can use that instead of copying to a separate roll buffer.
	 */
	if (rbp->rb_crb) {
		rbp->rb_bh.b_blkno = lbtodb(rbp->rb_crb->c_mof);
		return (0);
	}

	/*
	 * Set up the read.
	 * If no read is needed logmap_setup_read() returns 0.
	 */
	if (logmap_setup_read(rbp->rb_age, rbp)) {
		/*
		 * async read the data from master
		 */
		logstats.ls_rreads.value.ui64++;
		bp->b_bcount = MAPBLOCKSIZE;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_INBLK, 1);
	} else {
		sema_v(&bp->b_io); /* mark read as complete */
	}
	return (0);
}
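
/*
 * The read issued above is asynchronous; completion is signalled
 * through bp->b_io, which is why the no-read path does the sema_v()
 * itself. Conceptually the roll thread pairs it like this (a sketch,
 * not the exact roll code; see the roll logic in lufs.c):
 *
 *	if (top_read_roll(rbp, ul) == 0 && rbp->rb_crb == NULL &&
 *	    !(bp->b_flags & B_INVAL)) {
 *		sema_p(&bp->b_io);
 *		... write the aged deltas back to the master ...
 *	}
 */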

int ufs_crb_enable = 1;

/*
 * move deltas from deltamap into the log
 */
void
top_log(ufsvfs_t *ufsvfsp, char *va, offset_t vamof, off_t nb,
    caddr_t buf, uint32_t bufsz)
{
	ml_unit_t *ul = ufsvfsp->vfs_log;
	mapentry_t *me;
	offset_t hmof;
	uint32_t hnb, nb1;

	/*
	 * needed for the roll thread's heuristic
	 */
	ul->un_logmap->mtm_ref = 1;

	if (buf && ufs_crb_enable) {
		ASSERT((bufsz & DEV_BMASK) == 0);
		/*
		 * Move any deltas to the logmap. Split requests that
		 * straddle MAPBLOCKSIZE hash boundaries (i.e. summary info).
		 */
		for (hmof = vamof - (va - buf), nb1 = nb; bufsz;
		    bufsz -= hnb, hmof += hnb, buf += hnb, nb1 -= hnb) {
			hnb = MAPBLOCKSIZE - (hmof & MAPBLOCKOFF);
			if (hnb > bufsz)
				hnb = bufsz;
			me = deltamap_remove(ul->un_deltamap,
			    MAX(hmof, vamof), MIN(hnb, nb1));
			if (me) {
				logmap_add_buf(ul, va, hmof, me, buf, hnb);
			}
		}
	} else {
		/*
		 * if there are deltas
		 */
		me = deltamap_remove(ul->un_deltamap, vamof, nb);
		if (me) {
			/*
			 * move to logmap
			 */
			logmap_add(ul, va, vamof, me);
		}
	}

	ASSERT((ul->un_matamap == NULL) ||
	    matamap_within(ul->un_matamap, vamof, nb));
}
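
/*
 * Putting it together, a delta flows through two calls: top_delta()
 * records the range in the deltamap when the change is declared, and
 * top_log() later moves it (with the data, when ufs_crb_enable is set
 * and a buffer is supplied) into the logmap for commit. A sketch
 * using the wrappers from sys/fs/ufs_trans.h (arguments illustrative):
 *
 *	TRANS_DELTA(ufsvfsp, mof, nb, DT_DIR, 0, 0);
 *	... modify the cached directory block ...
 *	TRANS_LOG(ufsvfsp, va, mof, nb, bp->b_un.b_addr, bp->b_bcount);
 */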

static void
top_threadtrans_destroy(void *tp)
{
	kmem_free(tp, sizeof (threadtrans_t));
}

void
_init_top(void)
{
	ASSERT(top_init_debug());

	/*
	 * set up the delta layer
	 */
	_init_map();

	/*
	 * Initialise the thread specific data transaction key
	 */
	tsd_create(&topkey, top_threadtrans_destroy);
}