4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
42 * VM - address spaces.
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>

clock_t deadlk_wait = 1;	/* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#define	VERIFY_SEGLIST
/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list.  A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
	struct as_callback *current_head, *cb;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))

	/* Adding a callback after as_free has been called is not allowed */

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
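
	/*
	 * Example of the rounding above, assuming 4K pages: vaddr = 0x12345
	 * and size = 0x100 yield saddr = 0x12000 and rsize = 0x1000, i.e. the
	 * callback range is widened to whole page boundaries.
	 */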
	/* check for wraparound */
	if (saddr + rsize < saddr)

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);

	cb->ascb_func = cb_func;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		cv_broadcast(&as->a_cv);
	mutex_exit(&as->a_contents);
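
/*
 * Illustrative only (the driver name is hypothetical): a driver that holds
 * pages locked long term would typically register with something like
 *
 *	(void) as_add_callback(as, xx_unlock_pages, xx_arg, AS_UNMAP_EVENT,
 *	    addr, len, KM_SLEEP);
 *
 * and later call as_delete_callback(as, xx_arg), treating an
 * AS_CALLBACK_DELETE_DEFERRED return as "the callback is still running".
 */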
/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * Return values:
 *	AS_CALLBACK_DELETED	    (callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND	    (no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *				     entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
as_delete_callback(struct as *as, void *arg)
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
	mutex_exit(&as->a_contents);
/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with more
 * locks held than the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
as_execute_callback(struct as *as, struct as_callback *cb,
	struct as_callback **prevcb;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);

	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not event of interest
 *    - not address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
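
/*
 * Callers (see as_free() below) typically hold a_contents and loop on
 * as_do_callbacks() until it returns 0, i.e. until no more callbacks remain
 * to be executed for the events of interest.
 */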
/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
as_findseg(struct as *as, caddr_t addr, int tail)
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)

	seg = avl_find(&as->a_segtree, &addr, &where);
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
as_verify(struct as *as)
	struct seg *seg, *seglast, *p, *n;

	if (do_as_verify == 0)

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));

	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
as_addseg(struct as *as, struct seg *newseg)
	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
		as->a_lastgaphl = NULL;

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;

	seg = avl_find(&as->a_segtree, &addr, &where);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

		seg = avl_last(&as->a_segtree);

		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
				return (-1);	/* overlapping segment */

	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST

as_removeseg(struct as *as, struct seg *seg)
	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */

#ifdef VERIFY_SEGLIST

/*
 * Find a segment containing addr.
 */
as_segat(struct as *as, caddr_t addr)
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)

	seg = avl_find(&as->a_segtree, &addr, NULL);

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
as_rangelock(struct as *as)
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	mutex_exit(&as->a_contents);

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
as_rangeunlock(struct as *as)
	mutex_enter(&as->a_contents);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
/*
 * compare segments (or just an address) by segment address range
 */
as_segcompar(const void *x, const void *y)
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
	if (a->s_base >= b->s_base + b->s_size)
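
/*
 * Since the comparison is done against the whole range [s_base, s_base +
 * s_size), an avl_find() keyed by a bare address (as in as_segat() and
 * as_findseg()) reports "equal" for the segment containing that address,
 * which is exactly the lookup those callers want.
 */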
as_avlinit(struct as *as)
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));

as_constructor(void *buf, void *cdrarg, int kmflags)
	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);

as_destructor(void *buf, void *cdrarg)
	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);

	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_seglast = NULL;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
as_free(struct as *as)
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;

	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))

	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	free_started = B_TRUE;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		next = AS_SEGNEXT(as, seg);
		err = segop_unmap(seg, seg->s_base, seg->s_size);
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
			mutex_exit(&as->a_contents);
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */

	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);

as_dup(struct as *as, struct proc *forkedproc)
	struct seg *seg, *newseg;
	size_t purgesize = 0;

	AS_LOCK_ENTER(as, RW_WRITER);

	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
		if ((error = segop_dup(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */

		newas->a_size += seg->s_size;

	newas->a_resvsize = as->a_resvsize - purgesize;
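
	/*
	 * Segments marked S_PURGE (transient nofault segments, see as_purge())
	 * are not duplicated into the child; purgesize accumulated their total
	 * size above so that the child's a_resvsize only reflects what was
	 * actually copied.
	 */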
	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	forkedproc->p_as = newas;

/*
 * Handle a ``fault'' at addr for size bytes.
 */
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	klwp_t *lwp = ttolwp(curthread);

	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault.  This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */

	/*
	 * same length must be used when we softlock and softunlock.  We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support.  See seg_dev.c for more
	 * comments.
	 */
		CPU_STATS_ADD_K(vm, softlock, 1);
		CPU_STATS_ADD_K(vm, prot_fault, 1);
	CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {

	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;

		res = segop_fault(hat, seg, raddr, ssize, type, rw);

		/*
		 * If we were SOFTLOCKing and encountered a failure,
		 * we must SOFTUNLOCK the range we already did. (Maybe we
		 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
		 */
		if (res != 0 && type == F_SOFTLOCK) {
			for (seg = segsav; addrsav < raddr; addrsav += ssize) {
				if (addrsav >= seg->s_base + seg->s_size)
					seg = AS_SEGNEXT(as, seg);
				/*
				 * Now call the fault routine again to perform the
				 * unlock using S_OTHER instead of the rw variable
				 * since we never got a chance to touch the pages.
				 */
				if (raddr > seg->s_base + seg->s_size)
					ssize = seg->s_base + seg->s_size - addrsav;
					ssize = raddr - addrsav;
				(void) segop_fault(hat, seg, addrsav, ssize,
				    F_SOFTUNLOCK, S_OTHER);

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {

/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
as_faulta(struct as *as, caddr_t addr, size_t size)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	klwp_t *lwp = ttolwp(curthread);

	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		res = segop_faulta(seg, raddr);

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
	struct as_callback *cb;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -

	if (raddr + rsize < raddr)		/* check for wraparound */

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we want to only lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
		AS_LOCK_ENTER(as, RW_READER);

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		error = segop_setprot(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
		if (error == IE_RETRY) {
		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
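
			/*
			 * The code below implements those strategies: if a
			 * registered callback matches this range it is run via
			 * as_execute_callback(); otherwise we broadcast to wake
			 * any waiters (e.g. aio_cleanup_thread) and block on
			 * a_cv until AS_UNMAPWAIT is cleared, then retry the
			 * failed segop_setprot().
			 */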
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
			mutex_exit(&as->a_contents);
		} else if (error != 0)

		as_setwatchprot(as, saveraddr, saversize, prot);

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -

	if (raddr + rsize < raddr)		/* check for wraparound */

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, RW_WRITER);
		AS_LOCK_ENTER(as, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		error = segop_checkprot(seg, raddr, ssize, prot);

as_unmap(struct as *as, caddr_t addr, size_t size)
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))

		err = segop_unmap(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
			mutex_exit(&as->a_contents);
		} else if (err == IE_RETRY) {

		as->a_size -= ssize;
		as->a_resvsize -= rsize;

as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);

		seg = seg_alloc(as, addr, size);
		error = (*crfp)(seg, vn_a);
		as->a_resvsize += size;

	eaddr = addr + size;
	save_szcvec = szcvec;

	if ((szcvec & 0x1) == 0) {
	pgsz = page_get_pagesize(nszc);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		seg = seg_alloc(as, addr, segsize);
		error = (*crfp)(seg, vn_a);
		as->a_size += segsize;
		as->a_resvsize += segsize;
			vn_a->offset += segsize;

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1;	/* add 8K pages */
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
			seg = seg_alloc(as, addr, segsize);
			error = (*crfp)(seg, vn_a);
			as->a_size += segsize;
			as->a_resvsize += segsize;
				vn_a->offset += segsize;
		szcvec &= ~(1 << szc);
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
	ASSERT(addr == eaddr);

as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

		seg = seg_alloc(as, addr, size);
		error = (*crfp)(seg, vn_a);
		as->a_resvsize += size;

	va.va_mask = AT_SIZE;
	if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {

	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
	if (btopr(va.va_size) < btopr(eoff)) {
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
		size = save_size - size;
/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
 */
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
			type = MAPPGSZC_PRIVM;
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));

as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */

	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment a_proc will be NULL so we
	 * fallback to curproc in that case.
	 */
	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
			(void) as_unmap(as, addr, size);
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
			(void) as_unmap(as, addr, size);

		seg = seg_alloc(as, addr, size);
		error = (*crfp)(seg, argsp);

		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
		as->a_resvsize += rsize;

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
			(void) as_unmap(as, addr, size);
		mutex_exit(&as->a_contents);
/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
as_purge(struct as *as)
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)

	AS_LOCK_ENTER(as, RW_WRITER);
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			(void) segop_unmap(seg, seg->s_base, seg->s_size);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned.  On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	size_t save_redzone;

	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += 2 * redzone;
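
	/*
	 * For example, with minlen = 64K and redzone = 8K the fast path looks
	 * for a 64K + 2 * 8K = 80K hole; only if that fails do we restore the
	 * saved minlen/redzone below and test the alignment and redzone
	 * constraints hole by hole.
	 */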
	AS_LOCK_ENTER(as, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {

	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
		lseg = AS_SEGPREV(as, hseg);

		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
		else if (lo > hibound || hi < lobound)

		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */

		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			as->a_lastgap = hseg;
				as->a_lastgaphl = hseg;
				as->a_lastgaphl = lseg;

		/*
		 * Move to the next hole.
		 */
			hseg = AS_SEGNEXT(as, hseg);
			lseg = AS_SEGPREV(as, lseg);

	if (fast_path && (align != 0 || save_redzone != 0)) {
		minlen = save_minlen;
		redzone = save_redzone;

/*
 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));

/*
 * Return the next range within [base, base + len) that is backed
 * with "real memory".  Skip holes and non-seg_vn segments.
 * We're lazy and only return one segment at a time.
 */
as_memory(struct as *as, caddr_t *basep, size_t *lenp)
	extern const struct seg_ops segspt_shmops;	/* needs a header file */
	caddr_t addr, eaddr;

	AS_LOCK_ENTER(as, RW_READER);

	eaddr = addr + *lenp;

	seg = as_findseg(as, addr, 0);
		addr = MAX(seg->s_base, addr);

	if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {

	if (seg->s_ops == &segvn_ops) {
		segend = seg->s_base + seg->s_size;

		/*
		 * We do ISM by looking into the private data
		 * to determine the real size of the segment.
		 */
		if (seg->s_ops == &segspt_shmops) {
			segend = seg->s_base + spt_realsize(seg);

		seg = AS_SEGNEXT(as, seg);

		*lenp = eaddr - addr;
		*lenp = segend - addr;

/*
 * Determine whether data from the mappings in interval [addr, addr + size)
 * are in the primary memory (core) cache.
 */
as_incore(struct as *as, caddr_t addr,
    size_t size, char *vec, size_t *sizep)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t isize;			/* iteration size */
	int error = 0;			/* result, assume success */

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -

	if (raddr + rsize < raddr)		/* check for wraparound */

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		*sizep += isize = segop_incore(seg, raddr, ssize, vec);
		if (isize != ssize) {
		vec += btopr(ssize);

as_segunlock(struct seg *seg, caddr_t addr, int attr,
    ulong_t *bitmap, size_t position, size_t npages)
	caddr_t range_start;
	size_t pos1 = position;
	size_t end_pos = npages + position;

	while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
		size = ptob((pos2 - pos1));
		range_start = (caddr_t)((uintptr_t)addr +
		    ptob(pos1 - position));

		(void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,

as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
    caddr_t raddr, size_t rsize)
	struct seg *seg = as_segat(as, raddr);

	while (rsize != 0) {
		if (raddr >= seg->s_base + seg->s_size)
			seg = AS_SEGNEXT(as, seg);

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));

/*
 * Cache control operations over the interval [addr, addr + size) in
 * address space "as".
 */
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
    uintptr_t arg, ulong_t *lock_map, size_t pos)
	struct seg *seg;		/* working segment */
	caddr_t raddr;			/* rounded down addr */
	caddr_t initraddr;		/* saved initial rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t initrsize;		/* saved initial rounded up size */
	size_t ssize;			/* size of seg */
	int error = 0;			/* result */
	size_t mlock_size;		/* size of bitmap */
	ulong_t *mlock_map;		/* pointer to bitmap used */
					/* to represent the locked */

	if (error == IE_RETRY)
		AS_LOCK_ENTER(as, RW_WRITER);
		AS_LOCK_ENTER(as, RW_READER);

	/*
	 * If these are address space lock/unlock operations, loop over
	 * all segments in the address space, as appropriate.
	 */
	if (func == MC_LOCKAS) {
		size_t rlen = 0;	/* rounded as length */

		if (arg & MCL_FUTURE) {
			mutex_enter(&as->a_contents);
			mutex_exit(&as->a_contents);
		if ((arg & MCL_CURRENT) == 0) {

		seg = AS_SEGFIRST(as);
			raddr = (caddr_t)((uintptr_t)seg->s_base &
			    (uintptr_t)PAGEMASK);
			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

		mlock_size = BT_BITOUL(btopr(rlen));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_LOCK, mlock_map, pos);
			pos += seg_pages(seg);

			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg)) {
				raddr = (caddr_t)((uintptr_t)seg->s_base &
				    (uintptr_t)PAGEMASK);
				npages = seg_pages(seg);
				as_segunlock(seg, raddr, attr, mlock_map,

		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));

	} else if (func == MC_UNLOCKAS) {
		mutex_enter(&as->a_contents);
		mutex_exit(&as->a_contents);

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_UNLOCK, NULL, 0);

	/*
	 * Normalize addresses and sizes.
	 */
	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -

	if (raddr + rsize < raddr) {		/* check for wraparound */

	/*
	 * Get initial segment.
	 */
	if ((seg = as_segat(as, raddr)) == NULL) {

	if (func == MC_LOCK) {
		mlock_size = BT_BITOUL(btopr(rsize));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {

	/*
	 * Loop over all segments.  If a hole in the address range is
	 * discovered, then fail.  For each segment, perform the appropriate
	 * control operation.
	 */
	while (rsize != 0) {

		/*
		 * Make sure there's no hole, calculate the portion
		 * of the next segment to be operated over.
		 */
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				if (func == MC_LOCK) {
					as_unlockerr(as, attr, mlock_map,
					    initraddr, initrsize - rsize);
					kmem_free(mlock_map,
					    mlock_size * sizeof (ulong_t));

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		/*
		 * Dispatch on specific function.
		 */

			/*
			 * Synchronize cached data from mappings with backing
			 */
			if (error = segop_sync(seg, raddr, ssize,
			    attr, (uint_t)arg)) {

			/*
			 * Lock pages in memory.
			 */
			if (error = segop_lockop(seg, raddr, ssize,
			    attr, func, mlock_map, pos)) {
				as_unlockerr(as, attr, mlock_map, initraddr,
				    initrsize - rsize + ssize);
				kmem_free(mlock_map, mlock_size *

			/*
			 * Unlock mapped pages.
			 */
			(void) segop_lockop(seg, raddr, ssize, attr, func,

			/*
			 * Store VM advise for mapped pages in segment layer.
			 */
			error = segop_advise(seg, raddr, ssize, (uint_t)arg);

			/*
			 * Check for regular errors and special retry error
			 */
			if (error == IE_RETRY) {
				/*
				 * Need to acquire writers lock, so
				 * have to drop readers lock and start
				 * all over again
				 */
			} else if (error == IE_REATTACH) {
				/*
				 * Find segment for current address
				 * because current segment just got
				 * split or concatenated
				 */
				seg = as_segat(as, raddr);

		case MC_INHERIT_ZERO:
			error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);

			panic("as_ctl: bad operation %d", func);

	if (func == MC_LOCK)
		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));

	/*
	 * If the lower levels returned EDEADLK for a segment lockop,
	 * it means that we should retry the operation.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (error == EDEADLK) {
fc_decode(faultcode_t fault_err)
	switch (FC_CODE(fault_err)) {
		error = FC_ERRNO(fault_err);

/*
 * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
 * lists from each segment and copy them to one contiguous shadow list (plist)
 * as expected by the caller.  Save pointers to per segment shadow lists at
 * the tail of plist so that they can be used during as_pageunlock().
 */
as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
    caddr_t addr, size_t size, enum seg_rw rw)
	caddr_t sv_addr = addr;
	size_t sv_size = size;
	struct seg *sv_seg = seg;
	pgcnt_t npages = btop(size);
	faultcode_t fault_err = 0;
	extern const struct seg_ops segspt_shmops;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));

	/*
	 * Count the number of segments covered by the range we are about to
	 * lock.  The segment count is used to size the shadow list we return
	 * back to the caller.
	 */
	for (; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || addr != seg->s_base) {
			/*
			 * Do a quick check if subsequent segments
			 * will most likely support pagelock.
			 */
			if (seg->s_ops == &segvn_ops) {
				if (segop_getvp(seg, addr, &vp) != 0 ||
			} else if (seg->s_ops != &segspt_shmops) {

		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;

	plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
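
	/*
	 * plist layout: the first npages entries form the flat shadow list
	 * handed back to the caller; the trailing segcnt entries hold the
	 * per-segment shadow-list pointers returned by segop_pagelock(), so
	 * that as_pageunlock_segs() can later return each one to its segment.
	 */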
2521 for (cnt
= 0, pl_off
= 0; size
!= 0; size
-= ssize
, addr
+= ssize
) {
2522 if (addr
>= seg
->s_base
+ seg
->s_size
) {
2523 seg
= AS_SEGNEXT(as
, seg
);
2524 ASSERT(seg
!= NULL
&& addr
== seg
->s_base
);
2526 ASSERT(cnt
< segcnt
);
2528 if (addr
+ size
> seg
->s_base
+ seg
->s_size
) {
2529 ssize
= seg
->s_base
+ seg
->s_size
- addr
;
2533 pl
= &plist
[npages
+ cnt
];
2534 error
= segop_pagelock(seg
, addr
, ssize
, (page_t
***)pl
,
2539 ASSERT(plist
[npages
+ cnt
] != NULL
);
2540 ASSERT(pl_off
+ btop(ssize
) <= npages
);
2541 bcopy(plist
[npages
+ cnt
], &plist
[pl_off
],
2542 btop(ssize
) * sizeof (page_t
*));
2543 pl_off
+= btop(ssize
);
2548 ASSERT(cnt
== segcnt
- 1);
2554 * one of pagelock calls failed. The error type is in error variable.
2555 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2556 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2557 * back to the caller.
2563 for (cnt
= 0, addr
= sv_addr
; addr
< eaddr
; addr
+= ssize
) {
2564 if (addr
>= seg
->s_base
+ seg
->s_size
) {
2565 seg
= AS_SEGNEXT(as
, seg
);
2566 ASSERT(seg
!= NULL
&& addr
== seg
->s_base
);
2568 ASSERT(cnt
< segcnt
);
2570 if (eaddr
> seg
->s_base
+ seg
->s_size
) {
2571 ssize
= seg
->s_base
+ seg
->s_size
- addr
;
2573 ssize
= eaddr
- addr
;
2575 pl
= &plist
[npages
+ cnt
];
2576 ASSERT(*pl
!= NULL
);
2577 (void) segop_pagelock(seg
, addr
, ssize
, (page_t
***)pl
,
2583 kmem_free(plist
, (npages
+ segcnt
) * sizeof (page_t
*));
2585 if (error
!= ENOTSUP
&& error
!= EFAULT
) {
2591 * If we are here because pagelock failed due to the need to cow fault
2592 * in the pages we want to lock F_SOFTLOCK will do this job and in
2593 * next as_pagelock() call for this address range pagelock will
2594 * hopefully succeed.
2596 fault_err
= as_fault(as
->a_hat
, as
, sv_addr
, sv_size
, F_SOFTLOCK
, rw
);
2597 if (fault_err
!= 0) {
2598 return (fc_decode(fault_err
));
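/*
 * Illustrative sketch, not part of the original source: layout of the
 * combined shadow list built by as_pagelock_segs() for a range that covers
 * segcnt segments and npages pages (the numbers below are made up).
 *
 *	plist[0 .. npages - 1]               page_t pointers for the whole
 *	                                     range, copied in address order;
 *	                                     this is what the caller sees
 *	                                     through *ppp
 *	plist[npages .. npages + segcnt - 1] the per-segment shadow lists
 *	                                     returned by segop_pagelock(),
 *	                                     kept so as_pageunlock_segs() can
 *	                                     hand each one back with
 *	                                     L_PAGEUNLOCK
 *
 * For a 3-segment, 16-page range the allocation is therefore
 * (16 + 3) * sizeof (page_t *) bytes.
 */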
/*
 * lock pages in a given address space.  Return shadow list.  If
 * the list is NULL, the MMU mapping is also locked.
 */
int
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
    size_t size, enum seg_rw rw)
{
	faultcode_t fault_err;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * if the request crosses two segments let
	 * as_fault handle it.
	 */
	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);

	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize > seg->s_base + seg->s_size) {
		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
	}
	if (raddr + rsize <= raddr) {

	/*
	 * try to lock pages and pass back shadow list
	 */
	err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {

	/*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
	 * to no pagelock support for this segment or pages need to be cow
	 * faulted in.  If fault is needed F_SOFTLOCK will do this job for
	 * this as_pagelock() call and in the next as_pagelock() call for the
	 * same address range pagelock will hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
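/*
 * Illustrative sketch, not part of the original source: the raddr/rsize
 * computation above rounds an arbitrary (addr, size) request out to page
 * boundaries.  Assuming PAGESIZE is 0x1000 (so PAGEOFFSET is 0xFFF and
 * PAGEMASK is ~0xFFF), a request of addr = 0x12345678, size = 0x100 gives:
 *
 *	raddr = 0x12345678 & PAGEMASK                     = 0x12345000
 *	rsize = ((0x12345678 + 0x100 + 0xFFF) & PAGEMASK) - 0x12345000
 *	      = 0x12346000 - 0x12345000                   = 0x1000
 *
 * i.e. the single page that contains all of the requested bytes.
 */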
/*
 * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
 * lists from the end of plist and call pageunlock interface for each segment.
 * Drop as lock and free plist.
 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
	caddr_t eaddr = addr + size;
	pgcnt_t npages = btop(size);

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(plist != NULL);

	for (cnt = 0; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}
/*
 * unlock pages in a given address range
 */
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
    enum seg_rw rw)
{
	/*
	 * if the shadow list is NULL, as_pagelock was
	 * falling back to as_fault
	 */
	if (pp == NULL) {
		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
		return;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	ASSERT(seg != NULL);

	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize <= seg->s_base + seg->s_size) {
		(void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
	} else {
		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
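/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * consumer that pins a user range with as_pagelock() before doing I/O and
 * releases it with as_pageunlock().  uaddr, ulen and do_io_to_pages() are
 * assumptions for the example only.  Note that pplist may come back NULL if
 * as_pagelock() fell back to F_SOFTLOCK; as_pageunlock() copes with both
 * cases.
 *
 *	struct as *as = curproc->p_as;
 *	struct page **pplist;
 *	int err;
 *
 *	err = as_pagelock(as, &pplist, uaddr, ulen, S_WRITE);
 *	if (err != 0)
 *		return (err);
 *	do_io_to_pages(pplist, uaddr, ulen);
 *	as_pageunlock(as, pplist, uaddr, ulen, S_WRITE);
 */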
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t pgsz = page_get_pagesize(szc);

	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {

	if (raddr + rsize < raddr)		/* check for wraparound */

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;

		error = segop_setpagesize(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {

		if (error == IE_RETRY) {

		if (error == ENOTSUP) {

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed.  Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory.  The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);

				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim().  In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small.  See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
			}
			mutex_exit(&as->a_contents);
		} else if (error != 0) {
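/*
 * Illustrative sketch, not part of the original source: as_setpagesize()
 * insists that both addr and size be aligned to the page size implied by
 * szc.  Assuming a hypothetical szc whose page_get_pagesize() is 2MB
 * (0x200000):
 *
 *	addr = 0x7fff00200000, size = 0x400000    passes the IS_P2ALIGNED()
 *	                                          checks (both are multiples
 *	                                          of 0x200000)
 *	addr = 0x7fff00210000, size = 0x400000    rejected by the alignment
 *	                                          checks at the top
 *
 * Callers are expected to do the rounding themselves, e.g. with
 * P2ROUNDUP()/P2ALIGN() as as_iset_default_lpsize() does below.
 */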
/*
 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
 * in its chunk where s_szc is less than the szc we want to set.
 */
static int
as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    int *retry)
{
	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset3_default_lpsize: no seg");
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset3_default_lpsize: as changed");
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;

		if (szc > seg->s_szc) {
			error = segop_setpagesize(seg, raddr, ssize, szc);
			/* Only retry on EINVAL segments that have no vnode. */
			if (error == EINVAL) {
				if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
				    (segop_getvp(seg, raddr, &vp) != 0 ||
/*
 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
 * pagesize on each segment in its range, but if any fails with EINVAL,
 * then it reduces the pagesizes to the next size in the bitmap and
 * retries as_iset3_default_lpsize().  The code retries smaller allowed
 * sizes on EINVAL because (a) the anon offset may not match the bigger
 * sizes, and (b) it's hard to get this offset (to begin with) to pass to
 * map_pgszcvec().
 */
static int
as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    uint_t szcvec)
{
	ASSERT(AS_WRITE_HELD(as));

	error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
	if (error == EINVAL && retry) {
		szcvec &= ~(1 << szc);

		szc = highbit(szcvec) - 1;
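/*
 * Illustrative sketch, not part of the original source: szcvec is a bitmap
 * of the allowed page size codes, so "try the next smaller allowed size" is
 * just a matter of clearing the current bit and taking the new high bit.
 * The example value is made up.
 *
 *	szcvec = 0x0b;			(size codes 0, 1 and 3 allowed)
 *	szc = highbit(szcvec) - 1;	(szc == 3, the largest)
 *	szcvec &= ~(1 << szc);		(szcvec == 0x03)
 *	szc = highbit(szcvec) - 1;	(szc == 1, the next size down)
 *
 * Once szcvec drops to 1 only the base page size is left and the retry
 * loop gives up.
 */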
/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than we want to set.  For each such area,
 * it calls as_iset2_default_lpsize().
 */
static int
as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    uint_t szcvec)
{
	caddr_t setaddr = raddr;

	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset1_default_lpsize: no seg");
	}
	if (seg->s_szc < szc) {

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset1_default_lpsize: as changed");
			}
		}
		if (seg->s_szc >= szc && set) {
			ASSERT(setsize != 0);
			error = as_iset2_default_lpsize(as,
			    setaddr, setsize, szc, szcvec);

		} else if (seg->s_szc < szc && !set) {

		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;

	ASSERT(setsize != 0);
	error = as_iset2_default_lpsize(as, setaddr, setsize,
	    szc, szcvec);
/*
 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
 * chunk to as_iset1_default_lpsize().
 */
static int
as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
    int type)
{
	int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));

	if (szcvec <= 1) {	/* skip if base page size */

	/* Get the pagesize of the first larger page size. */
	szc = lowbit(szcvec) - 1;
	pgsz = page_get_pagesize(szc);
	eaddr = addr + size;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

	save_szcvec = szcvec;
	szcvec >>= (szc + 1);

		if ((szcvec & 0x1) == 0) {

		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);

			error = as_iset1_default_lpsize(as, addr, segsize, szc,

	ASSERT(addr < eaddr);
	szcvec = save_szcvec;

		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

			error = as_iset1_default_lpsize(as, addr, segsize, szc,

		szcvec &= ~(1 << szc);

			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);

	ASSERT(addr == eaddr);
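/*
 * Illustrative sketch, not part of the original source, with made-up
 * numbers: the P2ROUNDUP()/P2ALIGN() walk above trims the range to
 * boundaries that are naturally aligned for the larger sizes.  For a range
 * [0x10003000, 0x10455000) and a 2MB (0x200000) candidate size:
 *
 *	P2ROUNDUP(0x10003000, 0x200000) = 0x10200000
 *	P2ALIGN(0x10455000, 0x200000)   = 0x10400000
 *
 * so roughly the unaligned head [0x10003000, 0x10200000) and tail
 * [0x10400000, 0x10455000) are handed to as_iset1_default_lpsize() at a
 * smaller size code, while the aligned middle [0x10200000, 0x10400000) is
 * eligible for the 2MB size code.
 */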
/*
 * Set the default large page size for the range.  Called via memcntl with
 * page size set to 0.  as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
	AS_LOCK_ENTER(as, RW_WRITER);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);

	if (seg->s_ops == &segvn_ops) {
		rtype = segop_gettype(seg, addr);
		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= (seg->s_base + seg->s_size)) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

			if (seg->s_ops == &segvn_ops) {
				stype = segop_gettype(seg, raddr);
				sflags = stype & (MAP_TEXT | MAP_INITDATA);
				stype &= (MAP_SHARED | MAP_PRIVATE);
				if (segvn && (rflags != sflags ||
					/*
					 * The next segment is also segvn but
					 * has different flags and/or type.
					 */
					ASSERT(setsize != 0);
					error = as_iset_default_lpsize(as,
					    setaddr, setsize, rflags, rtype);

				} else if (!segvn) {

				/* The next segment is not segvn. */
				ASSERT(setsize != 0);
				error = as_iset_default_lpsize(as,
				    setaddr, setsize, rflags, rtype);

		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;

	if (error == 0 && segvn) {
		/* The last chunk when rsize == 0. */
		ASSERT(setsize != 0);
		error = as_iset_default_lpsize(as, setaddr, setsize,
		    rflags, rtype);
	}

	if (error == IE_RETRY) {

	} else if (error == IE_NOMEM) {

	} else if (error == ENOTSUP) {

	} else if (error == EAGAIN) {
		mutex_enter(&as->a_contents);
		if (!AS_ISNOUNMAPWAIT(as)) {
			if (AS_ISUNMAPWAIT(as) == 0) {
				cv_broadcast(&as->a_cv);
			}
			AS_SETUNMAPWAIT(as);

			while (AS_ISUNMAPWAIT(as)) {
				cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			AS_LOCK_ENTER(as, RW_WRITER);
		} else {
			/*
			 * We may have raced with
			 * segvn_reclaim()/segspt_reclaim().  In this case
			 * clean nounmapwait flag and retry since softlockcnt
			 * in this segment may be already 0.  We don't drop as
			 * writer lock so our number of retries without
			 * sleeping should be very small.  See segvn_reclaim()
			 * for more comments.
			 */
			AS_CLRNOUNMAPWAIT(as);
			mutex_exit(&as->a_contents);
		}
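/*
 * Illustrative sketch, not part of the original source: how the loop above
 * carves an address range into chunks before calling
 * as_iset_default_lpsize().  The segment layout is made up.
 *
 *	[segvn PRIVATE][segvn PRIVATE][segspt][segvn SHARED]
 *	|<-------- chunk 1 --------->|skipped|<- chunk 2 ->|
 *
 * Adjacent segvn segments with identical type (MAP_SHARED/MAP_PRIVATE) and
 * flags (MAP_TEXT/MAP_INITDATA) are coalesced into a single chunk; a
 * non-segvn segment simply terminates the current chunk and is ignored.
 */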
/*
 * Set up all of the uninitialized watched pages that we can.
 */
void
as_setwatch(struct as *as)
{
	struct watched_page *pwp;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {

		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot != 0 ||	/* already set up */
		    (seg = as_segat(as, vaddr)) == NULL ||
		    segop_getprot(seg, vaddr, 0, &prot) != 0)
			continue;

		pwp->wp_oprot = prot;
		if (pwp->wp_read)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			prot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);

		pwp->wp_prot = prot;
	}
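/*
 * Illustrative sketch, not part of the original source: how a watched
 * page's effective protection is derived above from its watch flags,
 * assuming the underlying segment's protection is PROT_READ|PROT_WRITE:
 *
 *	wp_read set	strip PROT_READ, PROT_WRITE and PROT_EXEC, so any
 *			access to the page faults and can be reported
 *	wp_write set	strip only PROT_WRITE; reads still succeed
 *	wp_exec set	strip PROT_READ, PROT_WRITE and PROT_EXEC as well
 *
 * wp_oprot remembers the original protection so as_clearwatch() can restore
 * it, and WP_NOWATCH suppresses the downgrade entirely.
 */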
/*
 * Clear all of the watched pages in the address space.
 */
void
as_clearwatch(struct as *as)
{
	struct watched_page *pwp;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {

		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot == 0 ||	/* not set up */
		    (seg = as_segat(as, vaddr)) == NULL)
			continue;

		if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);
/*
 * Force a new setup for all the watched pages in the range.
 */
static void
as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct watched_page *pwp;
	struct watched_page tpw;
	caddr_t eaddr = addr + size;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {

		vaddr = pwp->wp_vaddr;

		wprot = prot;
		if (pwp->wp_read)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			wprot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
			seg = as_segat(as, vaddr);
			if (seg == NULL) {
				panic("as_setwatchprot: no seg");
			}

			err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);

		pwp->wp_oprot = prot;
		pwp->wp_prot = wprot;

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
/*
 * Clear all of the watched pages in the range.
 */
static void
as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
{
	caddr_t eaddr = addr + size;
	struct watched_page *pwp;
	struct watched_page tpw;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	ASSERT(AS_WRITE_HELD(as));

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {

		if ((prot = pwp->wp_oprot) != 0) {

			if (prot != pwp->wp_prot) {

				seg = as_segat(as, pwp->wp_vaddr);

				err = segop_setprot(seg, pwp->wp_vaddr,
				    PAGESIZE, prot);
				if (err == IE_RETRY) {
					ASSERT(retrycnt == 0);

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
void
as_signal_proc(struct as *as, k_siginfo_t *siginfo)
{
	mutex_enter(&pidlock);
	for (p = practive; p; p = p->p_next) {
		if (p->p_as == as) {
			mutex_enter(&p->p_lock);

			sigaddq(p, NULL, siginfo, KM_NOSLEEP);
			mutex_exit(&p->p_lock);
		}
	}
	mutex_exit(&pidlock);
}
/*
 * return memory object ID
 */
int
as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
{
	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, addr);

	sts = segop_getmemid(seg, addr, memidp);