/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */
/*
 * This file contains the code to implement file range locking in
 * ZFS, although there isn't much specific to ZFS (all that comes to mind is
 * support for growing the blocksize).
 *
 * Interface
 * ---------
 * Defined in zfs_rlock.h but essentially:
 *	lr = rangelock_enter(zp, off, len, lock_type);
 *	rangelock_reduce(lr, off, len); // optional
 *	rangelock_exit(lr);
 *
 * AVL tree
 * --------
 * An AVL tree is used to maintain the state of the existing ranges
 * that are locked for exclusive (writer) or shared (reader) use.
 * The starting range offset is used for searching and sorting the tree.
 *
 * Common case
 * -----------
 * The (hopefully) usual case is of no overlaps or contention for locks. On
 * entry to rangelock_enter(), a locked_range_t is allocated; the tree
 * searched that finds no overlap, and *this* locked_range_t is placed in the
 * tree.
 *
 * Overlaps/Reference counting/Proxy locks
 * ---------------------------------------
 * The avl code only allows one node at a particular offset. Also it's very
 * inefficient to search through all previous entries looking for overlaps
 * (because the very 1st in the ordered list might be at offset 0 but
 * cover the whole file).
 * So this implementation uses reference counts and proxy range locks.
 * Firstly, only reader locks use reference counts and proxy locks,
 * because writer locks are exclusive.
 * When a reader lock overlaps with another then a proxy lock is created
 * for that range and replaces the original lock. If the overlap
 * is exact then the reference count of the proxy is simply incremented.
 * Otherwise, the proxy lock is split into smaller lock ranges and
 * new proxy locks created for the non-overlapping ranges.
 * The reference counts are adjusted accordingly.
 * Meanwhile, the original lock is kept around (this is the caller's handle)
 * and its offset and length are used when releasing the lock.
 *
 * Thread Interaction
 * ------------------
 * In order to make wakeups efficient and to ensure multiple continuous
 * readers on a range don't starve a writer for the same range lock,
 * two condition variables are allocated in each rl_t.
 * If a writer (or reader) can't get a range it initialises the writer
 * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
 * and waits on that cv. When a thread unlocks that range it wakes up all
 * writers then all readers before destroying the lock.
 *
 * Append mode writes
 * ------------------
 * Append mode writes need to lock a range at the end of a file.
 * The offset of the end of the file is determined under the
 * range locking mutex, and the lock type converted from RL_APPEND to
 * RL_WRITER and the range locked.
 *
 * Grow block handling
 * -------------------
 * ZFS supports multiple block sizes, up to 16MB. The smallest
 * block size is used for the file which is grown as needed. During this
 * growth all other writers and readers must be excluded.
 * So if the block size needs to be grown then the whole file is
 * exclusively locked, then later the caller will reduce the lock
 * range to just the range to be written using rangelock_reduce().
 */
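/*
 * Illustrative usage (an editorial sketch, not code from this file): a
 * caller such as a write path brackets its I/O with enter/exit. The
 * zp/uio/n names below are hypothetical placeholders for the caller's
 * state.
 *
 *	zfs_locked_range_t *lr;
 *
 *	lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset, n,
 *	    RL_WRITER);
 *	... perform the write; if the whole file had to be locked in order
 *	    to grow the blocksize, call zfs_rangelock_reduce(lr, off, len) ...
 *	zfs_rangelock_exit(lr);
 */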
#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h>
/*
 * AVL comparison function used to order range locks
 * Locks are ordered on the start offset of the range.
 */
static int
zfs_rangelock_compare(const void *arg1, const void *arg2)
{
	const zfs_locked_range_t *rl1 = (const zfs_locked_range_t *)arg1;
	const zfs_locked_range_t *rl2 = (const zfs_locked_range_t *)arg2;

	return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
}
/*
 * The callback is invoked when acquiring an RL_WRITER or RL_APPEND lock.
 * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
 * and may increase the range that's locked for RL_WRITER.
 */
void
zfs_rangelock_init(zfs_rangelock_t *rl, zfs_rangelock_cb_t *cb, void *arg)
{
	mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&rl->rl_tree, zfs_rangelock_compare,
	    sizeof (zfs_locked_range_t), offsetof(zfs_locked_range_t, lr_node));
	rl->rl_cb = cb;
	rl->rl_arg = arg;
}
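/*
 * Example callback shape (an illustrative sketch only, not the ZPL's actual
 * callback; example_cb, example_node_t and n_size are hypothetical names).
 * It shows the one conversion required by the comment above: RL_APPEND
 * becomes RL_WRITER starting at the current end of file.
 *
 *	static void
 *	example_cb(zfs_locked_range_t *new, void *arg)
 *	{
 *		example_node_t *np = arg;
 *
 *		if (new->lr_type == RL_APPEND) {
 *			new->lr_offset = np->n_size;	// end of file
 *			new->lr_type = RL_WRITER;
 *		}
 *	}
 */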
void
zfs_rangelock_fini(zfs_rangelock_t *rl)
{
	mutex_destroy(&rl->rl_lock);
	avl_destroy(&rl->rl_tree);
}
/*
 * Check if a write lock can be grabbed, or wait and recheck until available.
 */
static void
zfs_rangelock_enter_writer(zfs_rangelock_t *rl, zfs_locked_range_t *new)
{
	avl_tree_t *tree = &rl->rl_tree;
	zfs_locked_range_t *lr;
	avl_index_t where;
	uint64_t orig_off = new->lr_offset;
	uint64_t orig_len = new->lr_length;
	zfs_rangelock_type_t orig_type = new->lr_type;

	for (;;) {
		/*
		 * Call callback which can modify new->lr_offset, lr_length,
		 * and lr_type. Note, the callback is used by the ZPL to
		 * handle appending and changing blocksizes. It isn't needed
		 * for zvols.
		 */
		if (rl->rl_cb != NULL) {
			rl->rl_cb(new, rl->rl_arg);
		}

		/*
		 * If the type was APPEND, the callback must convert it to
		 * WRITER.
		 */
		ASSERT3U(new->lr_type, ==, RL_WRITER);

		/*
		 * First check for the usual case of no locks
		 */
		if (avl_numnodes(tree) == 0) {
			avl_add(tree, new);
			return;
		}

		/*
		 * Look for any locks in the range.
		 */
		lr = avl_find(tree, new, &where);
		if (lr != NULL)
			goto wait; /* already locked at same offset */

		lr = avl_nearest(tree, where, AVL_AFTER);
		if (lr != NULL &&
		    lr->lr_offset < new->lr_offset + new->lr_length)
			goto wait;

		lr = avl_nearest(tree, where, AVL_BEFORE);
		if (lr != NULL &&
		    lr->lr_offset + lr->lr_length > new->lr_offset)
			goto wait;

		avl_insert(tree, new, where);
		return;
wait:
		if (!lr->lr_write_wanted) {
			cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
			lr->lr_write_wanted = B_TRUE;
		}
		cv_wait(&lr->lr_write_cv, &rl->rl_lock);

		/* reset to original */
		new->lr_offset = orig_off;
		new->lr_length = orig_len;
		new->lr_type = orig_type;
	}
}
/*
 * If this is an original (non-proxy) lock then replace it by
 * a proxy and return the proxy.
 */
static zfs_locked_range_t *
zfs_rangelock_proxify(avl_tree_t *tree, zfs_locked_range_t *lr)
{
	zfs_locked_range_t *proxy;

	if (lr->lr_proxy)
		return (lr); /* already a proxy */

	ASSERT3U(lr->lr_count, ==, 1);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);
	avl_remove(tree, lr);
	lr->lr_count = 0;

	/* create a proxy range lock */
	proxy = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	proxy->lr_offset = lr->lr_offset;
	proxy->lr_length = lr->lr_length;
	proxy->lr_count = 1;
	proxy->lr_type = RL_READER;
	proxy->lr_proxy = B_TRUE;
	proxy->lr_write_wanted = B_FALSE;
	proxy->lr_read_wanted = B_FALSE;
	avl_add(tree, proxy);

	return (proxy);
}
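/*
 * Illustration (editorial sketch) of the split performed by
 * zfs_rangelock_split() below: splitting a proxy that covers [0, 100)
 * with lr_count == 2 at off == 40 yields two proxies, each still counted
 * twice, with the front proxy returned to the caller:
 *
 *	before:	[0 .................. 100)	count = 2
 *	after:	[0 ...... 40)[40 .... 100)	count = 2, count = 2
 */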
/*
 * Split the range lock at the supplied offset
 * returning the *front* proxy.
 */
static zfs_locked_range_t *
zfs_rangelock_split(avl_tree_t *tree, zfs_locked_range_t *lr, uint64_t off)
{
	zfs_locked_range_t *rear;

	ASSERT3U(lr->lr_length, >, 1);
	ASSERT3U(off, >, lr->lr_offset);
	ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);

	/* create the rear proxy range lock */
	rear = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	rear->lr_offset = off;
	rear->lr_length = lr->lr_offset + lr->lr_length - off;
	rear->lr_count = lr->lr_count;
	rear->lr_type = RL_READER;
	rear->lr_proxy = B_TRUE;
	rear->lr_write_wanted = B_FALSE;
	rear->lr_read_wanted = B_FALSE;

	zfs_locked_range_t *front = zfs_rangelock_proxify(tree, lr);
	front->lr_length = off - lr->lr_offset;

	avl_insert_here(tree, rear, front, AVL_AFTER);

	return (front);
}
/*
 * Create and add a new proxy range lock for the supplied range.
 */
static void
zfs_rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
	zfs_locked_range_t *lr;

	ASSERT(len != 0);
	lr = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	lr->lr_offset = off;
	lr->lr_length = len;
	lr->lr_count = 1;
	lr->lr_type = RL_READER;
	lr->lr_proxy = B_TRUE;
	lr->lr_write_wanted = B_FALSE;
	lr->lr_read_wanted = B_FALSE;
	avl_add(tree, lr);
}
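/*
 * Add the supplied reader lock to the tree: either insert it directly when
 * nothing overlaps, or zero its lr_count and represent its range entirely
 * with reference-counted proxy locks, splitting existing entries and
 * filling gaps with new proxies as needed.
 */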
static void
zfs_rangelock_add_reader(avl_tree_t *tree, zfs_locked_range_t *new,
    zfs_locked_range_t *prev, avl_index_t where)
{
	zfs_locked_range_t *next;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * prev arrives either:
	 * - pointing to an entry at the same offset
	 * - pointing to the entry with the closest previous offset whose
	 *   range may overlap with the new range
	 * - null, if there were no ranges starting before the new one
	 */
	if (prev != NULL) {
		if (prev->lr_offset + prev->lr_length <= off) {
			prev = NULL;
		} else if (prev->lr_offset != off) {
			/*
			 * convert to proxy if needed then
			 * split this entry and bump ref count
			 */
			prev = zfs_rangelock_split(tree, prev, off);
			prev = AVL_NEXT(tree, prev); /* move to rear range */
		}
	}
	ASSERT((prev == NULL) || (prev->lr_offset == off));

	if (prev != NULL)
		next = prev;
	else
		next = avl_nearest(tree, where, AVL_AFTER);

	if (next == NULL || off + len <= next->lr_offset) {
		/* no overlaps, use the original new rl_t in the tree */
		avl_insert(tree, new, where);
		return;
	}

	if (off < next->lr_offset) {
		/* Add a proxy for initial range before the overlap */
		zfs_rangelock_new_proxy(tree, off, next->lr_offset - off);
	}

	new->lr_count = 0; /* will use proxies in tree */

	/*
	 * We now search forward through the ranges, until we go past the end
	 * of the new range. For each entry we make it a proxy if it
	 * isn't already, then bump its reference count. If there's any
	 * gaps between the ranges then we create a new proxy range.
	 */
	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			break;
		if (prev != NULL && prev->lr_offset + prev->lr_length <
		    next->lr_offset) {
			/* there's a gap */
			ASSERT3U(next->lr_offset, >,
			    prev->lr_offset + prev->lr_length);
			zfs_rangelock_new_proxy(tree,
			    prev->lr_offset + prev->lr_length,
			    next->lr_offset -
			    (prev->lr_offset + prev->lr_length));
		}
		if (off + len == next->lr_offset + next->lr_length) {
			/* exact overlap with end */
			next = zfs_rangelock_proxify(tree, next);
			next->lr_count++;
			return;
		}
		if (off + len < next->lr_offset + next->lr_length) {
			/* new range ends in the middle of this block */
			next = zfs_rangelock_split(tree, next, off + len);
			next->lr_count++;
			return;
		}
		ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
		next = zfs_rangelock_proxify(tree, next);
		next->lr_count++;
	}

	/* Add the remaining end range. */
	zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
	    (off + len) - (prev->lr_offset + prev->lr_length));
}
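/*
 * Worked example (editorial, not from the original source): reader A holds
 * [0, 100) and reader B then asks for [50, 150). A's entry is replaced by
 * proxies [0, 50) with count 1 and [50, 100) with count 2, a new proxy
 * [100, 150) with count 1 is added, and B's own zfs_locked_range_t
 * (lr_count == 0) is kept only as B's handle. When either reader exits,
 * the proxies covering its offset/length are decremented and freed once
 * they reach zero.
 */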
/*
 * Check if a reader lock can be grabbed, or wait and recheck until available.
 */
static void
zfs_rangelock_enter_reader(zfs_rangelock_t *rl, zfs_locked_range_t *new)
{
	avl_tree_t *tree = &rl->rl_tree;
	zfs_locked_range_t *prev, *next;
	avl_index_t where;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * Look for any writer locks in the range.
	 */
retry:
	prev = avl_find(tree, new, &where);
	if (prev == NULL)
		prev = avl_nearest(tree, where, AVL_BEFORE);

	/*
	 * Check the previous range for a writer lock overlap.
	 */
	if (prev && (off < prev->lr_offset + prev->lr_length)) {
		if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
			if (!prev->lr_read_wanted) {
				cv_init(&prev->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				prev->lr_read_wanted = B_TRUE;
			}
			cv_wait(&prev->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len < prev->lr_offset + prev->lr_length)
			goto got_lock;
	}

	/*
	 * Search through the following ranges to see if there's
	 * any write lock overlap.
	 */
	if (prev != NULL)
		next = AVL_NEXT(tree, prev);
	else
		next = avl_nearest(tree, where, AVL_AFTER);
	for (; next != NULL; next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			goto got_lock;
		if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
			if (!next->lr_read_wanted) {
				cv_init(&next->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				next->lr_read_wanted = B_TRUE;
			}
			cv_wait(&next->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len <= next->lr_offset + next->lr_length)
			goto got_lock;
	}

got_lock:
	/*
	 * Add the read lock, which may involve splitting existing
	 * locks and bumping ref counts (r_count).
	 */
	zfs_rangelock_add_reader(tree, new, prev, where);
}
/*
 * Lock a range (offset, length) as either shared (RL_READER) or exclusive
 * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
 * it to an RL_WRITER lock (with the offset at the end of the file). Returns
 * the range lock structure for later unlocking (or reduce range if the
 * entire file is locked as RL_WRITER).
 */
zfs_locked_range_t *
zfs_rangelock_enter(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
    zfs_rangelock_type_t type)
{
	zfs_locked_range_t *new;

	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

	new = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	new->lr_rangelock = rl;
	new->lr_offset = off;
	if (len + off < off)	/* overflow */
		len = UINT64_MAX - off;
	new->lr_length = len;
	new->lr_count = 1; /* assume it's going to be in the tree */
	new->lr_type = type;
	new->lr_proxy = B_FALSE;
	new->lr_write_wanted = B_FALSE;
	new->lr_read_wanted = B_FALSE;

	mutex_enter(&rl->rl_lock);
	if (type == RL_READER) {
		/*
		 * First check for the usual case of no locks
		 */
		if (avl_numnodes(&rl->rl_tree) == 0)
			avl_add(&rl->rl_tree, new);
		else
			zfs_rangelock_enter_reader(rl, new);
	} else {
		/* RL_WRITER or RL_APPEND */
		zfs_rangelock_enter_writer(rl, new);
	}
	mutex_exit(&rl->rl_lock);
	return (new);
}
/*
 * Safely free the zfs_locked_range_t.
 */
static void
zfs_rangelock_free(zfs_locked_range_t *lr)
{
	if (lr->lr_write_wanted)
		cv_destroy(&lr->lr_write_cv);

	if (lr->lr_read_wanted)
		cv_destroy(&lr->lr_read_cv);

	kmem_free(lr, sizeof (zfs_locked_range_t));
}
/*
 * Unlock a reader lock
 */
static void
zfs_rangelock_exit_reader(zfs_rangelock_t *rl, zfs_locked_range_t *remove,
    list_t *free_list)
{
	avl_tree_t *tree = &rl->rl_tree;
	uint64_t len;

	/*
	 * The common case is when the remove entry is in the tree
	 * (cnt == 1) meaning there's been no other reader locks overlapping
	 * with this one. Otherwise the remove entry will have been
	 * removed from the tree and replaced by proxies (one or
	 * more ranges mapping to the entire range).
	 */
	if (remove->lr_count == 1) {
		avl_remove(tree, remove);
		if (remove->lr_write_wanted)
			cv_broadcast(&remove->lr_write_cv);
		if (remove->lr_read_wanted)
			cv_broadcast(&remove->lr_read_cv);
		list_insert_tail(free_list, remove);
	} else {
		ASSERT0(remove->lr_count);
		ASSERT0(remove->lr_write_wanted);
		ASSERT0(remove->lr_read_wanted);
		/*
		 * Find start proxy representing this reader lock,
		 * then decrement ref count on all proxies
		 * that make up this range, freeing them as needed.
		 */
		zfs_locked_range_t *lr = avl_find(tree, remove, NULL);
		ASSERT3P(lr, !=, NULL);
		ASSERT3U(lr->lr_count, !=, 0);
		ASSERT3U(lr->lr_type, ==, RL_READER);
		zfs_locked_range_t *next = NULL;
		for (len = remove->lr_length; len != 0; lr = next) {
			len -= lr->lr_length;
			if (len != 0) {
				next = AVL_NEXT(tree, lr);
				ASSERT3P(next, !=, NULL);
				ASSERT3U(lr->lr_offset + lr->lr_length, ==,
				    next->lr_offset);
				ASSERT3U(next->lr_count, !=, 0);
				ASSERT3U(next->lr_type, ==, RL_READER);
			}
			lr->lr_count--;
			if (lr->lr_count == 0) {
				avl_remove(tree, lr);
				if (lr->lr_write_wanted)
					cv_broadcast(&lr->lr_write_cv);
				if (lr->lr_read_wanted)
					cv_broadcast(&lr->lr_read_cv);
				list_insert_tail(free_list, lr);
			}
		}
		kmem_free(remove, sizeof (zfs_locked_range_t));
	}
}
/*
 * Unlock range and destroy range lock structure.
 */
void
zfs_rangelock_exit(zfs_locked_range_t *lr)
{
	zfs_rangelock_t *rl = lr->lr_rangelock;
	list_t free_list;
	zfs_locked_range_t *free_lr;

	ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
	ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
	ASSERT(!lr->lr_proxy);

	/*
	 * The free list is used to defer the cv_destroy() and
	 * subsequent kmem_free until after the mutex is dropped.
	 */
	list_create(&free_list, sizeof (zfs_locked_range_t),
	    offsetof(zfs_locked_range_t, lr_node));

	mutex_enter(&rl->rl_lock);
	if (lr->lr_type == RL_WRITER) {
		/* writer locks can't be shared or split */
		avl_remove(&rl->rl_tree, lr);
		if (lr->lr_write_wanted)
			cv_broadcast(&lr->lr_write_cv);
		if (lr->lr_read_wanted)
			cv_broadcast(&lr->lr_read_cv);
		list_insert_tail(&free_list, lr);
	} else {
		/*
		 * lock may be shared, let rangelock_exit_reader()
		 * release the lock and free the zfs_locked_range_t.
		 */
		zfs_rangelock_exit_reader(rl, lr, &free_list);
	}
	mutex_exit(&rl->rl_lock);

	while ((free_lr = list_remove_head(&free_list)) != NULL)
		zfs_rangelock_free(free_lr);

	list_destroy(&free_list);
}
/*
 * Reduce range locked as RL_WRITER from whole file to specified range.
 * Asserts the whole file is exclusively locked and so there's only one
 * entry in the tree.
 */
void
zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len)
{
	zfs_rangelock_t *rl = lr->lr_rangelock;

	/* Ensure there are no other locks */
	ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
	ASSERT3U(lr->lr_offset, ==, 0);
	ASSERT3U(lr->lr_type, ==, RL_WRITER);
	ASSERT(!lr->lr_proxy);
	ASSERT3U(lr->lr_length, ==, UINT64_MAX);
	ASSERT3U(lr->lr_count, ==, 1);

	mutex_enter(&rl->rl_lock);
	lr->lr_offset = off;
	lr->lr_length = len;
	mutex_exit(&rl->rl_lock);
	if (lr->lr_write_wanted)
		cv_broadcast(&lr->lr_write_cv);
	if (lr->lr_read_wanted)
		cv_broadcast(&lr->lr_read_cv);
}
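/*
 * Typical use (a sketch of the pattern described under "Grow block handling"
 * at the top of this file, not a verbatim caller; zp is a hypothetical
 * caller-side name): when a write must grow the blocksize, the caller locks
 * the entire file,
 *
 *	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 *
 * grows the block, then shrinks the lock to just the range being written:
 *
 *	zfs_rangelock_reduce(lr, off, len);
 */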
EXPORT_SYMBOL(zfs_rangelock_init);
EXPORT_SYMBOL(zfs_rangelock_fini);
EXPORT_SYMBOL(zfs_rangelock_enter);
EXPORT_SYMBOL(zfs_rangelock_exit);
EXPORT_SYMBOL(zfs_rangelock_reduce);