Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / kernel / vm / vm_as.c
blob84a804164759472a2c1284f2c28b22f7938480f5
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2018 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
42 * VM - address spaces.
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/vtrace.h>
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_vn.h>
66 #include <vm/seg_dev.h>
67 #include <vm/seg_kmem.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_spt.h>
70 #include <vm/seg_hole.h>
71 #include <vm/page.h>
73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
75 static struct kmem_cache *as_cache;
77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
78 static void as_clearwatchprot(struct as *, caddr_t, size_t);
82 * Verifying the segment lists is very time-consuming; it may not be
83 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
85 #ifdef DEBUG
86 #define VERIFY_SEGLIST
87 int do_as_verify = 0;
88 #endif
91 * Allocate a new callback data structure entry and fill in the events of
92 * interest, the address range of interest, and the callback argument.
93 * Link the entry on the as->a_callbacks list. A callback entry for the
94 * entire address space may be specified with vaddr = 0 and size = -1.
96 * CALLERS RESPONSIBILITY: If not calling from within the process context for
97 * the specified as, the caller must guarantee persistence of the specified as
98 * for the duration of this function (eg. pages being locked within the as
99 * will guarantee persistence).
102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 caddr_t vaddr, size_t size, int sleepflag)
105 struct as_callback *current_head, *cb;
106 caddr_t saddr;
107 size_t rsize;
109 /* callback function and an event are mandatory */
110 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 return (EINVAL);
113 /* Adding a callback after as_free has been called is not allowed */
114 if (as == &kas)
115 return (ENOMEM);
118 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 * is the entire address space so no rounding is done in that case.
121 if (size != -1) {
122 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 (size_t)saddr;
125 /* check for wraparound */
126 if (saddr + rsize < saddr)
127 return (ENOMEM);
128 } else {
129 if (vaddr != 0)
130 return (EINVAL);
131 saddr = vaddr;
132 rsize = size;
135 /* Allocate and initialize a callback entry */
136 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 if (cb == NULL)
138 return (EAGAIN);
140 cb->ascb_func = cb_func;
141 cb->ascb_arg = arg;
142 cb->ascb_events = events;
143 cb->ascb_saddr = saddr;
144 cb->ascb_len = rsize;
146 /* Add the entry to the list */
147 mutex_enter(&as->a_contents);
148 current_head = as->a_callbacks;
149 as->a_callbacks = cb;
150 cb->ascb_next = current_head;
153 * The call to this function may lose in a race with
154 * a pertinent event - eg. a thread does long term memory locking
155 * but before the callback is added another thread executes as_unmap.
156 * A broadcast here resolves that.
158 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 AS_CLRUNMAPWAIT(as);
160 cv_broadcast(&as->a_cv);
163 mutex_exit(&as->a_contents);
164 return (0);
168 * Search the callback list for an entry which pertains to arg.
170 * This is called from within the client upon completion of the callback.
171 * RETURN VALUES:
172 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 * entry will be made in as_do_callbacks)
177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 * set, it indicates that as_do_callbacks is processing this entry. The
179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 * to unblock as_do_callbacks, in case it is blocked.
182 * CALLERS RESPONSIBILITY: If not calling from within the process context for
183 * the specified as, the caller must guarantee persistence of the specified as
184 * for the duration of this function (eg. pages being locked within the as
185 * will guarantee persistence).
187 uint_t
188 as_delete_callback(struct as *as, void *arg)
190 struct as_callback **prevcb = &as->a_callbacks;
191 struct as_callback *cb;
192 uint_t rc = AS_CALLBACK_NOTFOUND;
194 mutex_enter(&as->a_contents);
195 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 if (cb->ascb_arg != arg)
197 continue;
200 * If the events indicate AS_CALLBACK_CALLED, just clear
201 * AS_ALL_EVENT in the events field and wakeup the thread
202 * that may be waiting in as_do_callbacks. as_do_callbacks
203 * will take care of removing this entry from the list. In
204 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 * (AS_CALLBACK_CALLED not set), just remove it from the
206 * list, return the memory and return AS_CALLBACK_DELETED.
208 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 /* leave AS_CALLBACK_CALLED */
210 cb->ascb_events &= ~AS_ALL_EVENT;
211 rc = AS_CALLBACK_DELETE_DEFERRED;
212 cv_broadcast(&as->a_cv);
213 } else {
214 *prevcb = cb->ascb_next;
215 kmem_free(cb, sizeof (struct as_callback));
216 rc = AS_CALLBACK_DELETED;
218 break;
220 mutex_exit(&as->a_contents);
221 return (rc);
225 * Searches the as callback list for a matching entry.
226 * Returns a pointer to the first matching callback, or NULL if
227 * nothing is found.
228 * This function never sleeps so it is ok to call it with more
229 * locks held but the (required) a_contents mutex.
231 * See also comment on as_do_callbacks below.
233 static struct as_callback *
234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 size_t event_len)
237 struct as_callback *cb;
239 ASSERT(MUTEX_HELD(&as->a_contents));
240 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
242 * If the callback has not already been called, then
243 * check if events or address range pertains. An event_len
244 * of zero means do an unconditional callback.
246 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 (event_addr + event_len < cb->ascb_saddr) ||
249 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 continue;
252 break;
254 return (cb);
258 * Executes a given callback and removes it from the callback list for
259 * this address space.
260 * This function may sleep so the caller must drop all locks except
261 * a_contents before calling this func.
263 * See also comments on as_do_callbacks below.
265 static void
266 as_execute_callback(struct as *as, struct as_callback *cb,
267 uint_t events)
269 struct as_callback **prevcb;
270 void *cb_arg;
272 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 cb->ascb_events |= AS_CALLBACK_CALLED;
274 mutex_exit(&as->a_contents);
275 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 mutex_enter(&as->a_contents);
278 * the callback function is required to delete the callback
279 * when the callback function determines it is OK for
280 * this thread to continue. as_delete_callback will clear
281 * the AS_ALL_EVENT in the events field when it is deleted.
282 * If the callback function called as_delete_callback,
283 * events will already be cleared and there will be no blocking.
285 while ((cb->ascb_events & events) != 0) {
286 cv_wait(&as->a_cv, &as->a_contents);
289 * This entry needs to be taken off the list. Normally, the
290 * callback func itself does that, but unfortunately the list
291 * may have changed while the callback was running because the
292 * a_contents mutex was dropped and someone else other than the
293 * callback func itself could have called as_delete_callback,
294 * so we have to search to find this entry again. The entry
295 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
297 cb_arg = cb->ascb_arg;
298 prevcb = &as->a_callbacks;
299 for (cb = as->a_callbacks; cb != NULL;
300 prevcb = &cb->ascb_next, cb = *prevcb) {
301 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 (cb_arg != cb->ascb_arg)) {
303 continue;
305 *prevcb = cb->ascb_next;
306 kmem_free(cb, sizeof (struct as_callback));
307 break;
312 * Check the callback list for a matching event and intersection of
313 * address range. If there is a match invoke the callback. Skip an entry if:
314 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315 * - not event of interest
316 * - not address range of interest
318 * An event_len of zero indicates a request for an unconditional callback
319 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
320 * a_contents lock must be dropped before a callback, so only one callback
321 * can be done before returning. Return -1 (true) if a callback was
322 * executed and removed from the list, else return 0 (false).
324 * The logically separate parts, i.e. finding a matching callback and
325 * executing a given callback have been separated into two functions
326 * so that they can be called with different sets of locks held beyond
327 * the always-required a_contents. as_find_callback does not sleep so
328 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 * rwlock) are held. as_execute_callback on the other hand may sleep
330 * so all locks beyond a_contents must be dropped by the caller if one
331 * does not want to end comatose.
333 static int
334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 size_t event_len)
337 struct as_callback *cb;
339 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 as_execute_callback(as, cb, events);
341 return (-1);
343 return (0);
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned. If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
360 struct seg *seg = as->a_seglast;
361 avl_index_t where;
363 ASSERT(AS_LOCK_HELD(as));
365 if (seg != NULL &&
366 seg->s_base <= addr &&
367 addr < seg->s_base + seg->s_size)
368 return (seg);
370 seg = avl_find(&as->a_segtree, &addr, &where);
371 if (seg != NULL)
372 return (as->a_seglast = seg);
374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 if (seg == NULL && tail)
376 seg = avl_last(&as->a_segtree);
377 return (as->a_seglast = seg);
#ifdef VERIFY_SEGLIST
/*
 * Consistency check of the segment list of `as'; a no-op unless
 * do_as_verify is non-zero.  For every segment it asserts that the segment
 * and its predecessor belong to `as', that bases are strictly increasing
 * across neighbours, and that a segment without successor is the AVL
 * tree's last node.  Afterwards it asserts that a_seglast (if set) was
 * seen during the walk and that the walk visited exactly
 * avl_numnodes() segments.
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));

		/* Tick off the cached segment once we come across it. */
		if (seg == seglast)
			seglast = NULL;
		nsegs++;

		seg = AS_SEGNEXT(as, seg);
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif	/* VERIFY_SEGLIST */
413 * Add a new segment to the address space. The avl_find()
414 * may be expensive so we attempt to use last segment accessed
415 * in as_gap() as an insertion point.
418 as_addseg(struct as *as, struct seg *newseg)
420 struct seg *seg;
421 caddr_t addr;
422 caddr_t eaddr;
423 avl_index_t where;
425 ASSERT(AS_WRITE_HELD(as));
427 as->a_updatedir = 1; /* inform /proc */
428 gethrestime(&as->a_updatetime);
430 if (as->a_lastgaphl != NULL) {
431 struct seg *hseg = NULL;
432 struct seg *lseg = NULL;
434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 hseg = as->a_lastgaphl;
436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 } else {
438 lseg = as->a_lastgaphl;
439 hseg = AVL_NEXT(&as->a_segtree, lseg);
442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 hseg->s_base > newseg->s_base) {
444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 AVL_AFTER);
446 as->a_lastgaphl = NULL;
447 as->a_seglast = newseg;
448 return (0);
450 as->a_lastgaphl = NULL;
453 addr = newseg->s_base;
454 eaddr = addr + newseg->s_size;
455 again:
457 seg = avl_find(&as->a_segtree, &addr, &where);
459 if (seg == NULL)
460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
462 if (seg == NULL)
463 seg = avl_last(&as->a_segtree);
465 if (seg != NULL) {
466 caddr_t base = seg->s_base;
469 * If top of seg is below the requested address, then
470 * the insertion point is at the end of the linked list,
471 * and seg points to the tail of the list. Otherwise,
472 * the insertion point is immediately before seg.
474 if (base + seg->s_size > addr) {
475 if (addr >= base || eaddr > base) {
476 return (-1); /* overlapping segment */
480 as->a_seglast = newseg;
481 avl_insert(&as->a_segtree, newseg, where);
483 #ifdef VERIFY_SEGLIST
484 as_verify(as);
485 #endif
486 return (0);
489 struct seg *
490 as_removeseg(struct as *as, struct seg *seg)
492 avl_tree_t *t;
494 ASSERT(AS_WRITE_HELD(as));
496 as->a_updatedir = 1; /* inform /proc */
497 gethrestime(&as->a_updatetime);
499 if (seg == NULL)
500 return (NULL);
502 t = &as->a_segtree;
503 if (as->a_seglast == seg)
504 as->a_seglast = NULL;
505 as->a_lastgaphl = NULL;
508 * if this segment is at an address higher than
509 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
511 if (as->a_lastgap &&
512 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
513 as->a_lastgap = AVL_NEXT(t, seg);
516 * remove the segment from the seg tree
518 avl_remove(t, seg);
520 #ifdef VERIFY_SEGLIST
521 as_verify(as);
522 #endif
523 return (seg);
527 * Find a segment containing addr.
529 struct seg *
530 as_segat(struct as *as, caddr_t addr)
532 struct seg *seg = as->a_seglast;
534 ASSERT(AS_LOCK_HELD(as));
536 if (seg != NULL && seg->s_base <= addr &&
537 addr < seg->s_base + seg->s_size)
538 return (seg);
540 seg = avl_find(&as->a_segtree, &addr, NULL);
541 return (seg);
545 * Serialize all searches for holes in an address space to
546 * prevent two or more threads from allocating the same virtual
547 * address range. The address space must not be "read/write"
548 * locked by the caller since we may block.
550 void
551 as_rangelock(struct as *as)
553 mutex_enter(&as->a_contents);
554 while (AS_ISCLAIMGAP(as))
555 cv_wait(&as->a_cv, &as->a_contents);
556 AS_SETCLAIMGAP(as);
557 mutex_exit(&as->a_contents);
561 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
563 void
564 as_rangeunlock(struct as *as)
566 mutex_enter(&as->a_contents);
567 AS_CLRCLAIMGAP(as);
568 cv_signal(&as->a_cv);
569 mutex_exit(&as->a_contents);
573 * compar segments (or just an address) by segment address range
575 static int
576 as_segcompar(const void *x, const void *y)
578 struct seg *a = (struct seg *)x;
579 struct seg *b = (struct seg *)y;
581 if (a->s_base < b->s_base)
582 return (-1);
583 if (a->s_base >= b->s_base + b->s_size)
584 return (1);
585 return (0);
589 void
590 as_avlinit(struct as *as)
592 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
593 offsetof(struct seg, s_tree));
594 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
595 offsetof(struct watched_page, wp_link));
598 /*ARGSUSED*/
599 static int
600 as_constructor(void *buf, void *cdrarg, int kmflags)
602 struct as *as = buf;
604 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
605 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
606 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
607 as_avlinit(as);
608 return (0);
611 /*ARGSUSED1*/
612 static void
613 as_destructor(void *buf, void *cdrarg)
615 struct as *as = buf;
617 avl_destroy(&as->a_segtree);
618 mutex_destroy(&as->a_contents);
619 cv_destroy(&as->a_cv);
620 rw_destroy(&as->a_lock);
623 void
624 as_init(void)
626 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
627 as_constructor, as_destructor, NULL, NULL, NULL, 0);
631 * Allocate and initialize an address space data structure.
632 * We call hat_alloc to allow any machine dependent
633 * information in the hat structure to be initialized.
635 struct as *
636 as_alloc(void)
638 struct as *as;
640 as = kmem_cache_alloc(as_cache, KM_SLEEP);
642 as->a_flags = 0;
643 as->a_vbits = 0;
644 as->a_hrm = NULL;
645 as->a_seglast = NULL;
646 as->a_size = 0;
647 as->a_resvsize = 0;
648 as->a_updatedir = 0;
649 gethrestime(&as->a_updatetime);
650 as->a_objectdir = NULL;
651 as->a_sizedir = 0;
652 as->a_userlimit = (caddr_t)USERLIMIT;
653 as->a_lastgap = NULL;
654 as->a_lastgaphl = NULL;
655 as->a_callbacks = NULL;
656 as->a_proc = NULL;
658 AS_LOCK_ENTER(as, RW_WRITER);
659 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
660 AS_LOCK_EXIT(as);
662 return (as);
666 * Free an address space data structure.
667 * Need to free the hat first and then
668 * all the segments on this as and finally
669 * the space for the as struct itself.
671 void
672 as_free(struct as *as)
674 struct hat *hat = as->a_hat;
675 struct seg *seg, *next;
676 boolean_t free_started = B_FALSE;
678 top:
680 * Invoke ALL callbacks. as_do_callbacks will do one callback
681 * per call, and not return (-1) until the callback has completed.
682 * When as_do_callbacks returns zero, all callbacks have completed.
684 mutex_enter(&as->a_contents);
685 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
688 mutex_exit(&as->a_contents);
689 AS_LOCK_ENTER(as, RW_WRITER);
691 if (!free_started) {
692 free_started = B_TRUE;
693 hat_free_start(hat);
695 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
696 int err;
698 next = AS_SEGNEXT(as, seg);
699 retry:
700 err = segop_unmap(seg, seg->s_base, seg->s_size);
701 if (err == EAGAIN) {
702 mutex_enter(&as->a_contents);
703 if (as->a_callbacks) {
704 AS_LOCK_EXIT(as);
705 } else if (!AS_ISNOUNMAPWAIT(as)) {
707 * Memory is currently locked. Wait for a
708 * cv_signal that it has been unlocked, then
709 * try the operation again.
711 if (AS_ISUNMAPWAIT(as) == 0)
712 cv_broadcast(&as->a_cv);
713 AS_SETUNMAPWAIT(as);
714 AS_LOCK_EXIT(as);
715 while (AS_ISUNMAPWAIT(as))
716 cv_wait(&as->a_cv, &as->a_contents);
717 } else {
719 * We may have raced with
720 * segvn_reclaim()/segspt_reclaim(). In this
721 * case clean nounmapwait flag and retry since
722 * softlockcnt in this segment may be already
723 * 0. We don't drop as writer lock so our
724 * number of retries without sleeping should
725 * be very small. See segvn_reclaim() for
726 * more comments.
728 AS_CLRNOUNMAPWAIT(as);
729 mutex_exit(&as->a_contents);
730 goto retry;
732 mutex_exit(&as->a_contents);
733 goto top;
734 } else {
736 * We do not expect any other error return at this
737 * time. This is similar to an ASSERT in seg_unmap()
739 ASSERT(err == 0);
742 hat_free_end(hat);
743 AS_LOCK_EXIT(as);
745 /* /proc stuff */
746 ASSERT(avl_numnodes(&as->a_wpage) == 0);
747 if (as->a_objectdir) {
748 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
749 as->a_objectdir = NULL;
750 as->a_sizedir = 0;
754 * Free the struct as back to kmem. Assert it has no segments.
756 ASSERT(avl_numnodes(&as->a_segtree) == 0);
757 kmem_cache_free(as_cache, as);
761 as_dup(struct as *as, struct proc *forkedproc)
763 struct as *newas;
764 struct seg *seg, *newseg;
765 size_t purgesize = 0;
766 int error;
768 AS_LOCK_ENTER(as, RW_WRITER);
769 as_clearwatch(as);
770 newas = as_alloc();
771 newas->a_userlimit = as->a_userlimit;
772 newas->a_proc = forkedproc;
774 AS_LOCK_ENTER(newas, RW_WRITER);
776 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
778 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
780 if (seg->s_flags & S_PURGE) {
781 purgesize += seg->s_size;
782 continue;
785 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
786 if (newseg == NULL) {
787 AS_LOCK_EXIT(newas);
788 as_setwatch(as);
789 AS_LOCK_EXIT(as);
790 as_free(newas);
791 return (-1);
793 if ((error = segop_dup(seg, newseg)) != 0) {
795 * We call seg_free() on the new seg
796 * because the segment is not set up
797 * completely; i.e. it has no ops.
799 as_setwatch(as);
800 AS_LOCK_EXIT(as);
801 seg_free(newseg);
802 AS_LOCK_EXIT(newas);
803 as_free(newas);
804 return (error);
806 if ((newseg->s_flags & S_HOLE) == 0) {
807 newas->a_size += seg->s_size;
810 newas->a_resvsize = as->a_resvsize - purgesize;
812 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
814 AS_LOCK_EXIT(newas);
816 as_setwatch(as);
817 AS_LOCK_EXIT(as);
818 if (error != 0) {
819 as_free(newas);
820 return (error);
822 forkedproc->p_as = newas;
823 return (0);
827 * Handle a ``fault'' at addr for size bytes.
829 faultcode_t
830 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
831 enum fault_type type, enum seg_rw rw)
833 struct seg *seg;
834 caddr_t raddr; /* rounded down addr */
835 size_t rsize; /* rounded up size */
836 size_t ssize;
837 faultcode_t res = 0;
838 caddr_t addrsav;
839 struct seg *segsav;
840 int as_lock_held;
841 klwp_t *lwp = ttolwp(curthread);
845 retry:
847 * Indicate that the lwp is not to be stopped while waiting for a
848 * pagefault. This is to avoid deadlock while debugging a process
849 * via /proc over NFS (in particular).
851 if (lwp != NULL)
852 lwp->lwp_nostop++;
855 * same length must be used when we softlock and softunlock. We
856 * don't support softunlocking lengths less than the original length
857 * when there is largepage support. See seg_dev.c for more
858 * comments.
860 switch (type) {
862 case F_SOFTLOCK:
863 CPU_STATS_ADD_K(vm, softlock, 1);
864 break;
866 case F_SOFTUNLOCK:
867 break;
869 case F_PROT:
870 CPU_STATS_ADD_K(vm, prot_fault, 1);
871 break;
873 case F_INVAL:
874 CPU_STATS_ENTER_K();
875 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
876 if (as == &kas)
877 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
878 CPU_STATS_EXIT_K();
879 break;
882 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
883 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
884 (size_t)raddr;
887 * XXX -- Don't grab the as lock for segkmap. We should grab it for
888 * correctness, but then we could be stuck holding this lock for
889 * a LONG time if the fault needs to be resolved on a slow
890 * filesystem, and then no-one will be able to exec new commands,
891 * as exec'ing requires the write lock on the as.
893 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
894 raddr + size < segkmap->s_base + segkmap->s_size) {
895 seg = segkmap;
896 as_lock_held = 0;
897 } else {
898 AS_LOCK_ENTER(as, RW_READER);
900 seg = as_segat(as, raddr);
901 if (seg == NULL) {
902 AS_LOCK_EXIT(as);
903 if (lwp != NULL)
904 lwp->lwp_nostop--;
905 return (FC_NOMAP);
908 as_lock_held = 1;
911 addrsav = raddr;
912 segsav = seg;
914 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
915 if (raddr >= seg->s_base + seg->s_size) {
916 seg = AS_SEGNEXT(as, seg);
917 if (seg == NULL || raddr != seg->s_base) {
918 res = FC_NOMAP;
919 break;
922 if (raddr + rsize > seg->s_base + seg->s_size)
923 ssize = seg->s_base + seg->s_size - raddr;
924 else
925 ssize = rsize;
927 res = segop_fault(hat, seg, raddr, ssize, type, rw);
928 if (res != 0)
929 break;
933 * If we were SOFTLOCKing and encountered a failure,
934 * we must SOFTUNLOCK the range we already did. (Maybe we
935 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
936 * right here...)
938 if (res != 0 && type == F_SOFTLOCK) {
939 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
940 if (addrsav >= seg->s_base + seg->s_size)
941 seg = AS_SEGNEXT(as, seg);
942 ASSERT(seg != NULL);
944 * Now call the fault routine again to perform the
945 * unlock using S_OTHER instead of the rw variable
946 * since we never got a chance to touch the pages.
948 if (raddr > seg->s_base + seg->s_size)
949 ssize = seg->s_base + seg->s_size - addrsav;
950 else
951 ssize = raddr - addrsav;
952 (void) segop_fault(hat, seg, addrsav, ssize,
953 F_SOFTUNLOCK, S_OTHER);
956 if (as_lock_held)
957 AS_LOCK_EXIT(as);
958 if (lwp != NULL)
959 lwp->lwp_nostop--;
962 * If the lower levels returned EDEADLK for a fault,
963 * It means that we should retry the fault. Let's wait
964 * a bit also to let the deadlock causing condition clear.
965 * This is part of a gross hack to work around a design flaw
966 * in the ufs/sds logging code and should go away when the
967 * logging code is re-designed to fix the problem. See bug
968 * 4125102 for details of the problem.
970 if (FC_ERRNO(res) == EDEADLK) {
971 delay(deadlk_wait);
972 res = 0;
973 goto retry;
975 return (res);
981 * Asynchronous ``fault'' at addr for size bytes.
983 faultcode_t
984 as_faulta(struct as *as, caddr_t addr, size_t size)
986 struct seg *seg;
987 caddr_t raddr; /* rounded down addr */
988 size_t rsize; /* rounded up size */
989 faultcode_t res = 0;
990 klwp_t *lwp = ttolwp(curthread);
992 retry:
994 * Indicate that the lwp is not to be stopped while waiting
995 * for a pagefault. This is to avoid deadlock while debugging
996 * a process via /proc over NFS (in particular).
998 if (lwp != NULL)
999 lwp->lwp_nostop++;
1001 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1002 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1003 (size_t)raddr;
1005 AS_LOCK_ENTER(as, RW_READER);
1006 seg = as_segat(as, raddr);
1007 if (seg == NULL) {
1008 AS_LOCK_EXIT(as);
1009 if (lwp != NULL)
1010 lwp->lwp_nostop--;
1011 return (FC_NOMAP);
1014 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1015 if (raddr >= seg->s_base + seg->s_size) {
1016 seg = AS_SEGNEXT(as, seg);
1017 if (seg == NULL || raddr != seg->s_base) {
1018 res = FC_NOMAP;
1019 break;
1022 res = segop_faulta(seg, raddr);
1023 if (res != 0)
1024 break;
1026 AS_LOCK_EXIT(as);
1027 if (lwp != NULL)
1028 lwp->lwp_nostop--;
1030 * If the lower levels returned EDEADLK for a fault,
1031 * It means that we should retry the fault. Let's wait
1032 * a bit also to let the deadlock causing condition clear.
1033 * This is part of a gross hack to work around a design flaw
1034 * in the ufs/sds logging code and should go away when the
1035 * logging code is re-designed to fix the problem. See bug
1036 * 4125102 for details of the problem.
1038 if (FC_ERRNO(res) == EDEADLK) {
1039 delay(deadlk_wait);
1040 res = 0;
1041 goto retry;
1043 return (res);
1047 * Set the virtual mapping for the interval from [addr : addr + size)
1048 * in address space `as' to have the specified protection.
1049 * It is ok for the range to cross over several segments,
1050 * as long as they are contiguous.
1053 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1055 struct seg *seg;
1056 struct as_callback *cb;
1057 size_t ssize;
1058 caddr_t raddr; /* rounded down addr */
1059 size_t rsize; /* rounded up size */
1060 int error = 0, writer = 0;
1061 caddr_t saveraddr;
1062 size_t saversize;
1064 setprot_top:
1065 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1066 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1067 (size_t)raddr;
1069 if (raddr + rsize < raddr) /* check for wraparound */
1070 return (ENOMEM);
1072 saveraddr = raddr;
1073 saversize = rsize;
1076 * Normally we only lock the as as a reader. But
1077 * if due to setprot the segment driver needs to split
1078 * a segment it will return IE_RETRY. Therefore we re-acquire
1079 * the as lock as a writer so the segment driver can change
1080 * the seg list. Also the segment driver will return IE_RETRY
1081 * after it has changed the segment list so we therefore keep
1082 * locking as a writer. Since these opeartions should be rare
1083 * want to only lock as a writer when necessary.
1085 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1086 AS_LOCK_ENTER(as, RW_WRITER);
1087 } else {
1088 AS_LOCK_ENTER(as, RW_READER);
1091 as_clearwatchprot(as, raddr, rsize);
1092 seg = as_segat(as, raddr);
1093 if (seg == NULL) {
1094 as_setwatch(as);
1095 AS_LOCK_EXIT(as);
1096 return (ENOMEM);
1099 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1100 if (raddr >= seg->s_base + seg->s_size) {
1101 seg = AS_SEGNEXT(as, seg);
1102 if (seg == NULL || raddr != seg->s_base) {
1103 error = ENOMEM;
1104 break;
1107 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1108 ssize = seg->s_base + seg->s_size - raddr;
1109 else
1110 ssize = rsize;
1111 retry:
1112 error = segop_setprot(seg, raddr, ssize, prot);
1114 if (error == IE_NOMEM) {
1115 error = EAGAIN;
1116 break;
1119 if (error == IE_RETRY) {
1120 AS_LOCK_EXIT(as);
1121 writer = 1;
1122 goto setprot_top;
1125 if (error == EAGAIN) {
1127 * Make sure we have a_lock as writer.
1129 if (writer == 0) {
1130 AS_LOCK_EXIT(as);
1131 writer = 1;
1132 goto setprot_top;
1136 * Memory is currently locked. It must be unlocked
1137 * before this operation can succeed through a retry.
1138 * The possible reasons for locked memory and
1139 * corresponding strategies for unlocking are:
1140 * (1) Normal I/O
1141 * wait for a signal that the I/O operation
1142 * has completed and the memory is unlocked.
1143 * (2) Asynchronous I/O
1144 * The aio subsystem does not unlock pages when
1145 * the I/O is completed. Those pages are unlocked
1146 * when the application calls aiowait/aioerror.
1147 * So, to prevent blocking forever, cv_broadcast()
1148 * is done to wake up aio_cleanup_thread.
1149 * Subsequently, segvn_reclaim will be called, and
1150 * that will do AS_CLRUNMAPWAIT() and wake us up.
1151 * (3) Long term page locking:
1152 * Drivers intending to have pages locked for a
1153 * period considerably longer than for normal I/O
1154 * (essentially forever) may have registered for a
1155 * callback so they may unlock these pages on
1156 * request. This is needed to allow this operation
1157 * to succeed. Each entry on the callback list is
1158 * examined. If the event or address range pertains
1159 * the callback is invoked (unless it already is in
1160 * progress). The a_contents lock must be dropped
1161 * before the callback, so only one callback can
1162 * be done at a time. Go to the top and do more
1163 * until zero is returned. If zero is returned,
1164 * either there were no callbacks for this event
1165 * or they were already in progress.
1167 mutex_enter(&as->a_contents);
1168 if (as->a_callbacks &&
1169 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1170 seg->s_base, seg->s_size))) {
1171 AS_LOCK_EXIT(as);
1172 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1173 } else if (!AS_ISNOUNMAPWAIT(as)) {
1174 if (AS_ISUNMAPWAIT(as) == 0)
1175 cv_broadcast(&as->a_cv);
1176 AS_SETUNMAPWAIT(as);
1177 AS_LOCK_EXIT(as);
1178 while (AS_ISUNMAPWAIT(as))
1179 cv_wait(&as->a_cv, &as->a_contents);
1180 } else {
1182 * We may have raced with
1183 * segvn_reclaim()/segspt_reclaim(). In this
1184 * case clean nounmapwait flag and retry since
1185 * softlockcnt in this segment may be already
1186 * 0. We don't drop as writer lock so our
1187 * number of retries without sleeping should
1188 * be very small. See segvn_reclaim() for
1189 * more comments.
1191 AS_CLRNOUNMAPWAIT(as);
1192 mutex_exit(&as->a_contents);
1193 goto retry;
1195 mutex_exit(&as->a_contents);
1196 goto setprot_top;
1197 } else if (error != 0)
1198 break;
1200 if (error != 0) {
1201 as_setwatch(as);
1202 } else {
1203 as_setwatchprot(as, saveraddr, saversize, prot);
1205 AS_LOCK_EXIT(as);
1206 return (error);
1210 * Check to make sure that the interval [addr, addr + size)
1211 * in address space `as' has at least the specified protection.
1212 * It is ok for the range to cross over several segments, as long
1213 * as they are contiguous.
1216 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1218 struct seg *seg;
1219 size_t ssize;
1220 caddr_t raddr; /* rounded down addr */
1221 size_t rsize; /* rounded up size */
1222 int error = 0;
1224 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1225 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1226 (size_t)raddr;
1228 if (raddr + rsize < raddr) /* check for wraparound */
1229 return (ENOMEM);
1232 * This is ugly as sin...
1233 * Normally, we only acquire the address space readers lock.
1234 * However, if the address space has watchpoints present,
1235 * we must acquire the writer lock on the address space for
1236 * the benefit of as_clearwatchprot() and as_setwatchprot().
1238 if (avl_numnodes(&as->a_wpage) != 0)
1239 AS_LOCK_ENTER(as, RW_WRITER);
1240 else
1241 AS_LOCK_ENTER(as, RW_READER);
1242 as_clearwatchprot(as, raddr, rsize);
1243 seg = as_segat(as, raddr);
1244 if (seg == NULL) {
1245 as_setwatch(as);
1246 AS_LOCK_EXIT(as);
1247 return (ENOMEM);
1250 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1251 if (raddr >= seg->s_base + seg->s_size) {
1252 seg = AS_SEGNEXT(as, seg);
1253 if (seg == NULL || raddr != seg->s_base) {
1254 error = ENOMEM;
1255 break;
1258 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1259 ssize = seg->s_base + seg->s_size - raddr;
1260 else
1261 ssize = rsize;
1263 error = segop_checkprot(seg, raddr, ssize, prot);
1264 if (error != 0)
1265 break;
1267 as_setwatch(as);
1268 AS_LOCK_EXIT(as);
1269 return (error);
1273 as_unmap(struct as *as, caddr_t addr, size_t size)
1275 struct seg *seg, *seg_next;
1276 struct as_callback *cb;
1277 caddr_t raddr, eaddr;
1278 size_t ssize, rsize = 0;
1279 int err;
1281 top:
1282 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1283 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1284 (uintptr_t)PAGEMASK);
1286 AS_LOCK_ENTER(as, RW_WRITER);
1288 as->a_updatedir = 1; /* inform /proc */
1289 gethrestime(&as->a_updatetime);
1292 * Use as_findseg to find the first segment in the range, then
1293 * step through the segments in order, following s_next.
1295 as_clearwatchprot(as, raddr, eaddr - raddr);
1297 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1298 const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);
1300 if (eaddr <= seg->s_base)
1301 break; /* eaddr was in a gap; all done */
1303 /* this is implied by the test above */
1304 ASSERT(raddr < eaddr);
1306 if (raddr < seg->s_base)
1307 raddr = seg->s_base; /* raddr was in a gap */
1309 if (eaddr > (seg->s_base + seg->s_size))
1310 ssize = seg->s_base + seg->s_size - raddr;
1311 else
1312 ssize = eaddr - raddr;
1315 * Save next segment pointer since seg can be
1316 * destroyed during the segment unmap operation.
1318 seg_next = AS_SEGNEXT(as, seg);
1321 * We didn't count /dev/null mappings, so ignore them here.
1322 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1323 * we have to do this check here while we have seg.)
1325 rsize = 0;
1326 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1327 !SEG_IS_PARTIAL_RESV(seg))
1328 rsize = ssize;
1330 retry:
1331 err = segop_unmap(seg, raddr, ssize);
1332 if (err == EAGAIN) {
1334 * Memory is currently locked. It must be unlocked
1335 * before this operation can succeed through a retry.
1336 * The possible reasons for locked memory and
1337 * corresponding strategies for unlocking are:
1338 * (1) Normal I/O
1339 * wait for a signal that the I/O operation
1340 * has completed and the memory is unlocked.
1341 * (2) Asynchronous I/O
1342 * The aio subsystem does not unlock pages when
1343 * the I/O is completed. Those pages are unlocked
1344 * when the application calls aiowait/aioerror.
1345 * So, to prevent blocking forever, cv_broadcast()
1346 * is done to wake up aio_cleanup_thread.
1347 * Subsequently, segvn_reclaim will be called, and
1348 * that will do AS_CLRUNMAPWAIT() and wake us up.
1349 * (3) Long term page locking:
1350 * Drivers intending to have pages locked for a
1351 * period considerably longer than for normal I/O
1352 * (essentially forever) may have registered for a
1353 * callback so they may unlock these pages on
1354 * request. This is needed to allow this operation
1355 * to succeed. Each entry on the callback list is
1356 * examined. If the event or address range pertains
1357 * the callback is invoked (unless it already is in
1358 * progress). The a_contents lock must be dropped
1359 * before the callback, so only one callback can
1360 * be done at a time. Go to the top and do more
1361 * until zero is returned. If zero is returned,
1362 * either there were no callbacks for this event
1363 * or they were already in progress.
1365 mutex_enter(&as->a_contents);
1366 if (as->a_callbacks &&
1367 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1368 seg->s_base, seg->s_size))) {
1369 AS_LOCK_EXIT(as);
1370 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1371 } else if (!AS_ISNOUNMAPWAIT(as)) {
1372 if (AS_ISUNMAPWAIT(as) == 0)
1373 cv_broadcast(&as->a_cv);
1374 AS_SETUNMAPWAIT(as);
1375 AS_LOCK_EXIT(as);
1376 while (AS_ISUNMAPWAIT(as))
1377 cv_wait(&as->a_cv, &as->a_contents);
1378 } else {
1380 * We may have raced with
1381 * segvn_reclaim()/segspt_reclaim(). In this
1382 * case clean nounmapwait flag and retry since
1383 * softlockcnt in this segment may be already
1384 * 0. We don't drop as writer lock so our
1385 * number of retries without sleeping should
1386 * be very small. See segvn_reclaim() for
1387 * more comments.
1389 AS_CLRNOUNMAPWAIT(as);
1390 mutex_exit(&as->a_contents);
1391 goto retry;
1393 mutex_exit(&as->a_contents);
1394 goto top;
1395 } else if (err == IE_RETRY) {
1396 AS_LOCK_EXIT(as);
1397 goto top;
1398 } else if (err) {
1399 as_setwatch(as);
1400 AS_LOCK_EXIT(as);
1401 return (-1);
1404 if (!is_hole) {
1405 as->a_size -= ssize;
1406 if (rsize)
1407 as->a_resvsize -= rsize;
1409 raddr += ssize;
1411 AS_LOCK_EXIT(as);
1412 return (0);
1415 static int
1416 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1417 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1419 uint_t szc, nszc, save_szcvec;
1420 int error;
1421 caddr_t a, eaddr;
1422 size_t pgsz;
1423 const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1425 ASSERT(AS_WRITE_HELD(as));
1426 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1427 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1428 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1430 if (!do_off) {
1431 vn_a->offset = 0;
1434 if (szcvec <= 1) {
1435 struct seg *seg, *segref;
1437 seg = segref = seg_alloc(as, addr, size);
1438 if (seg == NULL) {
1439 return (ENOMEM);
1441 vn_a->szc = 0;
1442 error = (*crfp)(&seg, vn_a);
1443 if (error != 0) {
1444 VERIFY3P(seg, ==, segref);
1445 seg_free(seg);
1446 } else {
1447 as->a_size += size;
1448 as->a_resvsize += size;
1450 return (error);
1453 eaddr = addr + size;
1454 save_szcvec = szcvec;
1455 szcvec >>= 1;
1456 szc = 0;
1457 nszc = 0;
1458 while (szcvec) {
1459 if ((szcvec & 0x1) == 0) {
1460 nszc++;
1461 szcvec >>= 1;
1462 continue;
1464 nszc++;
1465 pgsz = page_get_pagesize(nszc);
1466 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1467 if (a != addr) {
1468 struct seg *seg, *segref;
1469 size_t segsize;
1471 ASSERT(a < eaddr);
1473 segsize = a - addr;
1474 seg = segref = seg_alloc(as, addr, segsize);
1475 if (seg == NULL) {
1476 return (ENOMEM);
1478 vn_a->szc = szc;
1479 error = (*crfp)(&seg, vn_a);
1480 if (error != 0) {
1481 VERIFY3P(seg, ==, segref);
1482 seg_free(seg);
1483 return (error);
1485 as->a_size += segsize;
1486 as->a_resvsize += segsize;
1487 *segcreated = B_TRUE;
1488 if (do_off) {
1489 vn_a->offset += segsize;
1491 addr = a;
1493 szc = nszc;
1494 szcvec >>= 1;
1497 ASSERT(addr < eaddr);
1498 szcvec = save_szcvec | 1; /* add 8K pages */
1499 while (szcvec) {
1500 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1501 ASSERT(a >= addr);
1502 if (a != addr) {
1503 struct seg *seg, *segref;
1504 size_t segsize;
1506 segsize = a - addr;
1507 seg = segref = seg_alloc(as, addr, segsize);
1508 if (seg == NULL) {
1509 return (ENOMEM);
1511 vn_a->szc = szc;
1512 error = (*crfp)(&seg, vn_a);
1513 if (error != 0) {
1514 VERIFY3P(seg, ==, segref);
1515 seg_free(seg);
1516 return (error);
1518 as->a_size += segsize;
1519 as->a_resvsize += segsize;
1520 *segcreated = B_TRUE;
1521 if (do_off) {
1522 vn_a->offset += segsize;
1524 addr = a;
1526 szcvec &= ~(1 << szc);
1527 if (szcvec) {
1528 szc = highbit(szcvec) - 1;
1529 pgsz = page_get_pagesize(szc);
1532 ASSERT(addr == eaddr);
1534 return (0);
1537 static int
1538 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1539 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1541 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1542 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1543 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1544 type, 0);
1545 int error;
1546 struct vattr va;
1547 uoff_t eoff;
1548 size_t save_size = 0;
1549 extern size_t textrepl_size_thresh;
1551 ASSERT(AS_WRITE_HELD(as));
1552 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1553 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1554 ASSERT(vn_a->vp != NULL);
1555 ASSERT(vn_a->amp == NULL);
1557 again:
1558 if (szcvec <= 1) {
1559 struct seg *seg, *segref;
1561 seg = segref = seg_alloc(as, addr, size);
1562 if (seg == NULL) {
1563 return (ENOMEM);
1565 vn_a->szc = 0;
1566 error = (*crfp)(&seg, vn_a);
1567 if (error != 0) {
1568 VERIFY3P(seg, ==, segref);
1569 seg_free(seg);
1570 } else {
1571 as->a_size += size;
1572 as->a_resvsize += size;
1574 return (error);
1577 va.va_mask = VATTR_SIZE;
1578 if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1579 szcvec = 0;
1580 goto again;
1582 eoff = vn_a->offset & PAGEMASK;
1583 if (eoff >= va.va_size) {
1584 szcvec = 0;
1585 goto again;
1587 eoff += size;
1588 if (btopr(va.va_size) < btopr(eoff)) {
1589 save_size = size;
1590 size = va.va_size - (vn_a->offset & PAGEMASK);
1591 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1592 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1593 type, 0);
1594 if (szcvec <= 1) {
1595 size = save_size;
1596 goto again;
1600 if (size > textrepl_size_thresh) {
1601 vn_a->flags |= _MAP_TEXTREPL;
1603 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1604 segcreated);
1605 if (error != 0) {
1606 return (error);
1608 if (save_size) {
1609 addr += size;
1610 size = save_size - size;
1611 szcvec = 0;
1612 goto again;
1614 return (0);
1618 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1619 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1621 static int
1622 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1623 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1625 uint_t szcvec;
1626 uchar_t type;
1628 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1629 if (vn_a->type == MAP_SHARED) {
1630 type = MAPPGSZC_SHM;
1631 } else if (vn_a->type == MAP_PRIVATE) {
1632 if (vn_a->szc == AS_MAP_HEAP) {
1633 type = MAPPGSZC_HEAP;
1634 } else if (vn_a->szc == AS_MAP_STACK) {
1635 type = MAPPGSZC_STACK;
1636 } else {
1637 type = MAPPGSZC_PRIVM;
1640 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1641 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1642 (vn_a->flags & MAP_TEXT), type, 0);
1643 ASSERT(AS_WRITE_HELD(as));
1644 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1645 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1646 ASSERT(vn_a->vp == NULL);
1648 return (as_map_segvn_segs(as, addr, size, szcvec,
1649 crfp, vn_a, segcreated));
1653 as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1654 void *argsp)
1656 AS_LOCK_ENTER(as, RW_WRITER);
1657 return (as_map_locked(as, addr, size, crfp, argsp));
1661 as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1662 void *argsp)
1664 caddr_t raddr; /* rounded down addr */
1665 size_t rsize; /* rounded up size */
1666 int error;
1667 boolean_t is_hole = B_FALSE;
1669 * The use of a_proc is preferred to handle the case where curproc is
1670 * a door_call server and is allocating memory in the client's (a_proc)
1671 * address space.
1672 * When creating a shared memory segment a_proc will be NULL so we
1673 * fallback to curproc in that case.
1675 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1676 struct segvn_crargs crargs;
1678 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1679 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1680 (size_t)raddr;
1683 * check for wrap around
1685 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1686 AS_LOCK_EXIT(as);
1687 return (ENOMEM);
1690 as->a_updatedir = 1; /* inform /proc */
1691 gethrestime(&as->a_updatetime);
1693 if (as != &kas) {
1695 * Ensure that the virtual size of the process will not exceed
1696 * the configured limit. Since seg_hole segments will later
1697 * set the S_HOLE flag indicating their status as a hole in the
1698 * AS, they are excluded from this check.
1700 if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
1701 !AS_MAP_CHECK_SEGHOLE(crfp)) {
1702 AS_LOCK_EXIT(as);
1704 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
1705 p->p_rctls, p, RCA_UNSAFE_ALL);
1706 return (ENOMEM);
1710 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1711 boolean_t do_unmap = B_FALSE;
1713 crargs = *(struct segvn_crargs *)argsp;
1714 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
1715 &do_unmap);
1716 if (error != 0) {
1717 AS_LOCK_EXIT(as);
1718 if (do_unmap) {
1719 (void) as_unmap(as, addr, size);
1721 return (error);
1723 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1724 boolean_t do_unmap = B_FALSE;
1726 crargs = *(struct segvn_crargs *)argsp;
1727 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
1728 &do_unmap);
1729 if (error != 0) {
1730 AS_LOCK_EXIT(as);
1731 if (do_unmap) {
1732 (void) as_unmap(as, addr, size);
1734 return (error);
1736 } else {
1737 struct seg *seg, *segref;
1739 seg = segref = seg_alloc(as, addr, size);
1740 if (seg == NULL) {
1741 AS_LOCK_EXIT(as);
1742 return (ENOMEM);
1746 * It is possible that the segment creation routine will free
1747 * 'seg' as part of a more advanced operation, such as when
1748 * segvn concatenates adjacent segments together. When this
1749 * occurs, the seg*_create routine must communicate the
1750 * resulting segment out via the 'struct seg **' parameter.
1752 * If segment creation fails, it must not free the passed-in
1753 * segment, nor alter the argument pointer.
1755 error = (*crfp)(&seg, argsp);
1756 if (error != 0) {
1757 VERIFY3P(seg, ==, segref);
1758 seg_free(seg);
1759 AS_LOCK_EXIT(as);
1760 return (error);
1764 * Check if the resulting segment represents a hole in the
1765 * address space, rather than contributing to the AS size.
1767 is_hole = ((seg->s_flags & S_HOLE) != 0);
1769 /* Add size now so as_unmap will work if as_ctl fails. */
1770 if (!is_hole) {
1771 as->a_size += rsize;
1772 as->a_resvsize += rsize;
1776 as_setwatch(as);
1779 * Establish memory locks for the segment if the address space is
1780 * locked, provided it's not an explicit hole in the AS.
1782 mutex_enter(&as->a_contents);
1783 if (AS_ISPGLCK(as) && !is_hole) {
1784 mutex_exit(&as->a_contents);
1785 AS_LOCK_EXIT(as);
1786 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1787 if (error != 0)
1788 (void) as_unmap(as, addr, size);
1789 } else {
1790 mutex_exit(&as->a_contents);
1791 AS_LOCK_EXIT(as);
1793 return (error);
1798 * Delete all segments in the address space marked with S_PURGE.
1799 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1800 * These segments are deleted as a first step before calls to as_gap(), so
1801 * that they don't affect mmap() or shmat().
1803 void
1804 as_purge(struct as *as)
1806 struct seg *seg;
1807 struct seg *next_seg;
1810 * the setting of NEEDSPURGE is protect by as_rangelock(), so
1811 * no need to grab a_contents mutex for this check
1813 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1814 return;
1816 AS_LOCK_ENTER(as, RW_WRITER);
1817 next_seg = NULL;
1818 seg = AS_SEGFIRST(as);
1819 while (seg != NULL) {
1820 next_seg = AS_SEGNEXT(as, seg);
1821 if (seg->s_flags & S_PURGE)
1822 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1823 seg = next_seg;
1825 AS_LOCK_EXIT(as);
1827 mutex_enter(&as->a_contents);
1828 as->a_flags &= ~AS_NEEDSPURGE;
1829 mutex_exit(&as->a_contents);
1833 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1834 * range of addresses at least "minlen" long, where the base of the range is
1835 * at "off" phase from an "align" boundary and there is space for a
1836 * "redzone"-sized redzone on eithe rside of the range. Thus,
1837 * if align was 4M and off was 16k, the user wants a hole which will start
1838 * 16k into a 4M page.
1840 * If flags specifies AH_HI, the hole will have the highest possible address
1841 * in the range. We use the as->a_lastgap field to figure out where to
1842 * start looking for a gap.
1844 * Otherwise, the gap will have the lowest possible address.
1846 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1848 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1849 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1851 * NOTE: This routine is not correct when base+len overflows caddr_t.
1854 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1855 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1857 caddr_t lobound = *basep;
1858 caddr_t hibound = lobound + *lenp;
1859 struct seg *lseg, *hseg;
1860 caddr_t lo, hi;
1861 int forward;
1862 caddr_t save_base;
1863 size_t save_len;
1864 size_t save_minlen;
1865 size_t save_redzone;
1866 int fast_path = 1;
1868 save_base = *basep;
1869 save_len = *lenp;
1870 save_minlen = minlen;
1871 save_redzone = redzone;
1874 * For the first pass/fast_path, just add align and redzone into
1875 * minlen since if we get an allocation, we can guarantee that it
1876 * will fit the alignment and redzone requested.
1877 * This increases the chance that hibound will be adjusted to
1878 * a_lastgap->s_base which will likely allow us to find an
1879 * acceptable hole in the address space quicker.
1880 * If we can't find a hole with this fast_path, then we look for
1881 * smaller holes in which the alignment and offset may allow
1882 * the allocation to fit.
1884 minlen += align;
1885 minlen += 2 * redzone;
1886 redzone = 0;
1888 AS_LOCK_ENTER(as, RW_READER);
1889 if (AS_SEGFIRST(as) == NULL) {
1890 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1891 align, redzone, off)) {
1892 AS_LOCK_EXIT(as);
1893 return (0);
1894 } else {
1895 AS_LOCK_EXIT(as);
1896 *basep = save_base;
1897 *lenp = save_len;
1898 return (-1);
1902 retry:
1904 * Set up to iterate over all the inter-segment holes in the given
1905 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1906 * NULL for the highest-addressed hole. If moving backwards, we reset
1907 * sseg to denote the highest-addressed segment.
1909 forward = (flags & AH_DIR) == AH_LO;
1910 if (forward) {
1911 hseg = as_findseg(as, lobound, 1);
1912 lseg = AS_SEGPREV(as, hseg);
1913 } else {
1916 * If allocating at least as much as the last allocation,
1917 * use a_lastgap's base as a better estimate of hibound.
1919 if (as->a_lastgap &&
1920 minlen >= as->a_lastgap->s_size &&
1921 hibound >= as->a_lastgap->s_base)
1922 hibound = as->a_lastgap->s_base;
1924 hseg = as_findseg(as, hibound, 1);
1925 if (hseg->s_base + hseg->s_size < hibound) {
1926 lseg = hseg;
1927 hseg = NULL;
1928 } else {
1929 lseg = AS_SEGPREV(as, hseg);
1933 for (;;) {
1935 * Set lo and hi to the hole's boundaries. (We should really
1936 * use MAXADDR in place of hibound in the expression below,
1937 * but can't express it easily; using hibound in its place is
1938 * harmless.)
1940 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1941 hi = (hseg == NULL) ? hibound : hseg->s_base;
1943 * If the iteration has moved past the interval from lobound
1944 * to hibound it's pointless to continue.
1946 if ((forward && lo > hibound) || (!forward && hi < lobound))
1947 break;
1948 else if (lo > hibound || hi < lobound)
1949 goto cont;
1951 * Candidate hole lies at least partially within the allowable
1952 * range. Restrict it to fall completely within that range,
1953 * i.e., to [max(lo, lobound), min(hi, hibound)].
1955 if (lo < lobound)
1956 lo = lobound;
1957 if (hi > hibound)
1958 hi = hibound;
1960 * Verify that the candidate hole is big enough and meets
1961 * hardware constraints. If the hole is too small, no need
1962 * to do the further checks since they will fail.
1964 *basep = lo;
1965 *lenp = hi - lo;
1966 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1967 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1968 ((flags & AH_CONTAIN) == 0 ||
1969 (*basep <= addr && *basep + *lenp > addr))) {
1970 if (!forward)
1971 as->a_lastgap = hseg;
1972 if (hseg != NULL)
1973 as->a_lastgaphl = hseg;
1974 else
1975 as->a_lastgaphl = lseg;
1976 AS_LOCK_EXIT(as);
1977 return (0);
1979 cont:
1981 * Move to the next hole.
1983 if (forward) {
1984 lseg = hseg;
1985 if (lseg == NULL)
1986 break;
1987 hseg = AS_SEGNEXT(as, hseg);
1988 } else {
1989 hseg = lseg;
1990 if (hseg == NULL)
1991 break;
1992 lseg = AS_SEGPREV(as, lseg);
1995 if (fast_path && (align != 0 || save_redzone != 0)) {
1996 fast_path = 0;
1997 minlen = save_minlen;
1998 redzone = save_redzone;
1999 goto retry;
2001 *basep = save_base;
2002 *lenp = save_len;
2003 AS_LOCK_EXIT(as);
2004 return (-1);
2008 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2010 * If flags specifies AH_HI, the hole will have the highest possible address
2011 * in the range. We use the as->a_lastgap field to figure out where to
2012 * start looking for a gap.
2014 * Otherwise, the gap will have the lowest possible address.
2016 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2018 * If an adequate hole is found, base and len are set to reflect the part of
2019 * the hole that is within range, and 0 is returned, otherwise,
2020 * -1 is returned.
2022 * NOTE: This routine is not correct when base+len overflows caddr_t.
2025 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2026 caddr_t addr)
2029 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2033 * Return the next range within [base, base + len) that is backed
2034 * with "real memory". Skip holes and non-seg_vn segments.
2035 * We're lazy and only return one segment at a time.
2038 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2040 extern const struct seg_ops segspt_shmops; /* needs a header file */
2041 struct seg *seg;
2042 caddr_t addr, eaddr;
2043 caddr_t segend;
2045 AS_LOCK_ENTER(as, RW_READER);
2047 addr = *basep;
2048 eaddr = addr + *lenp;
2050 seg = as_findseg(as, addr, 0);
2051 if (seg != NULL)
2052 addr = MAX(seg->s_base, addr);
2054 for (;;) {
2055 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2056 AS_LOCK_EXIT(as);
2057 return (EINVAL);
2060 if (seg->s_ops == &segvn_ops) {
2061 segend = seg->s_base + seg->s_size;
2062 break;
2066 * We do ISM by looking into the private data
2067 * to determine the real size of the segment.
2069 if (seg->s_ops == &segspt_shmops) {
2070 segend = seg->s_base + spt_realsize(seg);
2071 if (addr < segend)
2072 break;
2075 seg = AS_SEGNEXT(as, seg);
2077 if (seg != NULL)
2078 addr = seg->s_base;
2081 *basep = addr;
2083 if (segend > eaddr)
2084 *lenp = eaddr - addr;
2085 else
2086 *lenp = segend - addr;
2088 AS_LOCK_EXIT(as);
2089 return (0);
2093 * Determine whether data from the mappings in interval [addr, addr + size)
2094 * are in the primary memory (core) cache.
2097 as_incore(struct as *as, caddr_t addr,
2098 size_t size, char *vec, size_t *sizep)
2100 struct seg *seg;
2101 size_t ssize;
2102 caddr_t raddr; /* rounded down addr */
2103 size_t rsize; /* rounded up size */
2104 size_t isize; /* iteration size */
2105 int error = 0; /* result, assume success */
2107 *sizep = 0;
2108 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2109 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2110 (size_t)raddr;
2112 if (raddr + rsize < raddr) /* check for wraparound */
2113 return (ENOMEM);
2115 AS_LOCK_ENTER(as, RW_READER);
2116 seg = as_segat(as, raddr);
2117 if (seg == NULL) {
2118 AS_LOCK_EXIT(as);
2119 return (-1);
2122 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2123 if (raddr >= seg->s_base + seg->s_size) {
2124 seg = AS_SEGNEXT(as, seg);
2125 if (seg == NULL || raddr != seg->s_base) {
2126 error = -1;
2127 break;
2130 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2131 ssize = seg->s_base + seg->s_size - raddr;
2132 else
2133 ssize = rsize;
2134 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2135 if (isize != ssize) {
2136 error = -1;
2137 break;
2139 vec += btopr(ssize);
2141 AS_LOCK_EXIT(as);
2142 return (error);
2145 static void
2146 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2147 ulong_t *bitmap, size_t position, size_t npages)
2149 caddr_t range_start;
2150 size_t pos1 = position;
2151 size_t pos2;
2152 size_t size;
2153 size_t end_pos = npages + position;
2155 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2156 size = ptob((pos2 - pos1));
2157 range_start = (caddr_t)((uintptr_t)addr +
2158 ptob(pos1 - position));
2160 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2161 NULL, 0);
2162 pos1 = pos2;
2166 static void
2167 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2168 caddr_t raddr, size_t rsize)
2170 struct seg *seg = as_segat(as, raddr);
2171 size_t ssize;
2173 while (rsize != 0) {
2174 if (raddr >= seg->s_base + seg->s_size)
2175 seg = AS_SEGNEXT(as, seg);
2177 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2178 ssize = seg->s_base + seg->s_size - raddr;
2179 else
2180 ssize = rsize;
2182 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2184 rsize -= ssize;
2185 raddr += ssize;
2190 * Cache control operations over the interval [addr, addr + size) in
2191 * address space "as".
2193 /*ARGSUSED*/
2195 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2196 uintptr_t arg, ulong_t *lock_map, size_t pos)
2198 struct seg *seg; /* working segment */
2199 caddr_t raddr; /* rounded down addr */
2200 caddr_t initraddr; /* saved initial rounded down addr */
2201 size_t rsize; /* rounded up size */
2202 size_t initrsize; /* saved initial rounded up size */
2203 size_t ssize; /* size of seg */
2204 int error = 0; /* result */
2205 size_t mlock_size; /* size of bitmap */
2206 ulong_t *mlock_map; /* pointer to bitmap used */
2207 /* to represent the locked */
2208 /* pages. */
2209 retry:
2210 if (error == IE_RETRY)
2211 AS_LOCK_ENTER(as, RW_WRITER);
2212 else
2213 AS_LOCK_ENTER(as, RW_READER);
2216 * If these are address space lock/unlock operations, loop over
2217 * all segments in the address space, as appropriate.
2219 if (func == MC_LOCKAS) {
2220 size_t npages, idx;
2221 size_t rlen = 0; /* rounded as length */
2223 idx = pos;
2225 if (arg & MCL_FUTURE) {
2226 mutex_enter(&as->a_contents);
2227 AS_SETPGLCK(as);
2228 mutex_exit(&as->a_contents);
2230 if ((arg & MCL_CURRENT) == 0) {
2231 AS_LOCK_EXIT(as);
2232 return (0);
2235 seg = AS_SEGFIRST(as);
2236 if (seg == NULL) {
2237 AS_LOCK_EXIT(as);
2238 return (0);
2241 do {
2242 raddr = (caddr_t)((uintptr_t)seg->s_base &
2243 (uintptr_t)PAGEMASK);
2244 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2245 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2246 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2248 mlock_size = BT_BITOUL(btopr(rlen));
2249 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2250 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2251 AS_LOCK_EXIT(as);
2252 return (EAGAIN);
2255 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2256 if ((seg->s_flags & S_HOLE) != 0)
2257 continue;
2259 error = segop_lockop(seg, seg->s_base,
2260 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2261 if (error != 0)
2262 break;
2263 pos += seg_pages(seg);
2266 if (error) {
2267 for (seg = AS_SEGFIRST(as); seg != NULL;
2268 seg = AS_SEGNEXT(as, seg)) {
2270 raddr = (caddr_t)((uintptr_t)seg->s_base &
2271 (uintptr_t)PAGEMASK);
2272 npages = seg_pages(seg);
2273 as_segunlock(seg, raddr, attr, mlock_map,
2274 idx, npages);
2275 idx += npages;
2279 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2280 AS_LOCK_EXIT(as);
2281 goto lockerr;
2282 } else if (func == MC_UNLOCKAS) {
2283 mutex_enter(&as->a_contents);
2284 AS_CLRPGLCK(as);
2285 mutex_exit(&as->a_contents);
2287 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2288 if ((seg->s_flags & S_HOLE) != 0)
2289 continue;
2291 error = segop_lockop(seg, seg->s_base,
2292 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2293 if (error != 0)
2294 break;
2297 AS_LOCK_EXIT(as);
2298 goto lockerr;
2302 * Normalize addresses and sizes.
2304 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2305 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2306 (size_t)raddr;
2308 if (raddr + rsize < raddr) { /* check for wraparound */
2309 AS_LOCK_EXIT(as);
2310 return (ENOMEM);
2314 * Get initial segment.
2316 if ((seg = as_segat(as, raddr)) == NULL) {
2317 AS_LOCK_EXIT(as);
2318 return (ENOMEM);
2321 if (func == MC_LOCK) {
2322 mlock_size = BT_BITOUL(btopr(rsize));
2323 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2324 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2325 AS_LOCK_EXIT(as);
2326 return (EAGAIN);
2331 * Loop over all segments. If a hole in the address range is
2332 * discovered, then fail. For each segment, perform the appropriate
2333 * control operation.
2335 while (rsize != 0) {
2338 * Make sure there's no hole, calculate the portion
2339 * of the next segment to be operated over.
2341 if (raddr >= seg->s_base + seg->s_size) {
2342 seg = AS_SEGNEXT(as, seg);
2343 if (seg == NULL || raddr != seg->s_base) {
2344 if (func == MC_LOCK) {
2345 as_unlockerr(as, attr, mlock_map,
2346 initraddr, initrsize - rsize);
2347 kmem_free(mlock_map,
2348 mlock_size * sizeof (ulong_t));
2350 AS_LOCK_EXIT(as);
2351 return (ENOMEM);
2354 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2355 ssize = seg->s_base + seg->s_size - raddr;
2356 else
2357 ssize = rsize;
2360 * Dispatch on specific function.
2362 switch (func) {
2365 * Synchronize cached data from mappings with backing
2366 * objects.
2368 case MC_SYNC:
2369 if (error = segop_sync(seg, raddr, ssize,
2370 attr, (uint_t)arg)) {
2371 AS_LOCK_EXIT(as);
2372 return (error);
2374 break;
2377 * Lock pages in memory.
2379 case MC_LOCK:
2380 if (error = segop_lockop(seg, raddr, ssize,
2381 attr, func, mlock_map, pos)) {
2382 as_unlockerr(as, attr, mlock_map, initraddr,
2383 initrsize - rsize + ssize);
2384 kmem_free(mlock_map, mlock_size *
2385 sizeof (ulong_t));
2386 AS_LOCK_EXIT(as);
2387 goto lockerr;
2389 break;
2392 * Unlock mapped pages.
2394 case MC_UNLOCK:
2395 (void) segop_lockop(seg, raddr, ssize, attr, func,
2396 NULL, 0);
2397 break;
2400 * Store VM advise for mapped pages in segment layer.
2402 case MC_ADVISE:
2403 error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2406 * Check for regular errors and special retry error
2408 if (error) {
2409 if (error == IE_RETRY) {
2411 * Need to acquire writers lock, so
2412 * have to drop readers lock and start
2413 * all over again
2415 AS_LOCK_EXIT(as);
2416 goto retry;
2417 } else if (error == IE_REATTACH) {
2419 * Find segment for current address
2420 * because current segment just got
2421 * split or concatenated
2423 seg = as_segat(as, raddr);
2424 if (seg == NULL) {
2425 AS_LOCK_EXIT(as);
2426 return (ENOMEM);
2428 } else {
2430 * Regular error
2432 AS_LOCK_EXIT(as);
2433 return (error);
2436 break;
2438 case MC_INHERIT_ZERO:
2439 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2440 if (error != 0) {
2441 AS_LOCK_EXIT(as);
2442 return (error);
2444 break;
2447 * Can't happen.
2449 default:
2450 panic("as_ctl: bad operation %d", func);
2451 /*NOTREACHED*/
2454 rsize -= ssize;
2455 raddr += ssize;
2458 if (func == MC_LOCK)
2459 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2460 AS_LOCK_EXIT(as);
2461 return (0);
2462 lockerr:
2465 * If the lower levels returned EDEADLK for a segment lockop,
2466 * it means that we should retry the operation. Let's wait
2467 * a bit also to let the deadlock causing condition clear.
2468 * This is part of a gross hack to work around a design flaw
2469 * in the ufs/sds logging code and should go away when the
2470 * logging code is re-designed to fix the problem. See bug
2471 * 4125102 for details of the problem.
2473 if (error == EDEADLK) {
2474 delay(deadlk_wait);
2475 error = 0;
2476 goto retry;
2478 return (error);
2482 fc_decode(faultcode_t fault_err)
2484 int error = 0;
2486 switch (FC_CODE(fault_err)) {
2487 case FC_OBJERR:
2488 error = FC_ERRNO(fault_err);
2489 break;
2490 case FC_PROT:
2491 error = EACCES;
2492 break;
2493 default:
2494 error = EFAULT;
2495 break;
2497 return (error);
2501 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2502 * lists from each segment and copy them to one contiguous shadow list (plist)
2503 * as expected by the caller. Save pointers to per segment shadow lists at
2504 * the tail of plist so that they can be used during as_pageunlock().
2506 static int
2507 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2508 caddr_t addr, size_t size, enum seg_rw rw)
2510 caddr_t sv_addr = addr;
2511 size_t sv_size = size;
2512 struct seg *sv_seg = seg;
2513 ulong_t segcnt = 1;
2514 ulong_t cnt;
2515 size_t ssize;
2516 pgcnt_t npages = btop(size);
2517 page_t **plist;
2518 page_t **pl;
2519 int error;
2520 caddr_t eaddr;
2521 faultcode_t fault_err = 0;
2522 pgcnt_t pl_off;
2523 extern const struct seg_ops segspt_shmops;
2525 ASSERT(AS_LOCK_HELD(as));
2526 ASSERT(seg != NULL);
2527 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2528 ASSERT(addr + size > seg->s_base + seg->s_size);
2529 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2530 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2533 * Count the number of segments covered by the range we are about to
2534 * lock. The segment count is used to size the shadow list we return
2535 * back to the caller.
2537 for (; size != 0; size -= ssize, addr += ssize) {
2538 if (addr >= seg->s_base + seg->s_size) {
2540 seg = AS_SEGNEXT(as, seg);
2541 if (seg == NULL || addr != seg->s_base) {
2542 AS_LOCK_EXIT(as);
2543 return (EFAULT);
2546 * Do a quick check if subsequent segments
2547 * will most likely support pagelock.
2549 if (seg->s_ops == &segvn_ops) {
2550 vnode_t *vp;
2552 if (segop_getvp(seg, addr, &vp) != 0 ||
2553 vp != NULL) {
2554 AS_LOCK_EXIT(as);
2555 goto slow;
2557 } else if (seg->s_ops != &segspt_shmops) {
2558 AS_LOCK_EXIT(as);
2559 goto slow;
2561 segcnt++;
2563 if (addr + size > seg->s_base + seg->s_size) {
2564 ssize = seg->s_base + seg->s_size - addr;
2565 } else {
2566 ssize = size;
2569 ASSERT(segcnt > 1);
2571 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2573 addr = sv_addr;
2574 size = sv_size;
2575 seg = sv_seg;
2577 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2578 if (addr >= seg->s_base + seg->s_size) {
2579 seg = AS_SEGNEXT(as, seg);
2580 ASSERT(seg != NULL && addr == seg->s_base);
2581 cnt++;
2582 ASSERT(cnt < segcnt);
2584 if (addr + size > seg->s_base + seg->s_size) {
2585 ssize = seg->s_base + seg->s_size - addr;
2586 } else {
2587 ssize = size;
2589 pl = &plist[npages + cnt];
2590 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2591 L_PAGELOCK, rw);
2592 if (error) {
2593 break;
2595 ASSERT(plist[npages + cnt] != NULL);
2596 ASSERT(pl_off + btop(ssize) <= npages);
2597 bcopy(plist[npages + cnt], &plist[pl_off],
2598 btop(ssize) * sizeof (page_t *));
2599 pl_off += btop(ssize);
2602 if (size == 0) {
2603 AS_LOCK_EXIT(as);
2604 ASSERT(cnt == segcnt - 1);
2605 *ppp = plist;
2606 return (0);
2610 * one of pagelock calls failed. The error type is in error variable.
2611 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2612 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2613 * back to the caller.
2616 eaddr = addr;
2617 seg = sv_seg;
2619 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2620 if (addr >= seg->s_base + seg->s_size) {
2621 seg = AS_SEGNEXT(as, seg);
2622 ASSERT(seg != NULL && addr == seg->s_base);
2623 cnt++;
2624 ASSERT(cnt < segcnt);
2626 if (eaddr > seg->s_base + seg->s_size) {
2627 ssize = seg->s_base + seg->s_size - addr;
2628 } else {
2629 ssize = eaddr - addr;
2631 pl = &plist[npages + cnt];
2632 ASSERT(*pl != NULL);
2633 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2634 L_PAGEUNLOCK, rw);
2637 AS_LOCK_EXIT(as);
2639 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2641 if (error != ENOTSUP && error != EFAULT) {
2642 return (error);
2645 slow:
2647 * If we are here because pagelock failed due to the need to cow fault
2648 * in the pages we want to lock F_SOFTLOCK will do this job and in
2649 * next as_pagelock() call for this address range pagelock will
2650 * hopefully succeed.
2652 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2653 if (fault_err != 0) {
2654 return (fc_decode(fault_err));
2656 *ppp = NULL;
2658 return (0);
2662 * lock pages in a given address space. Return shadow list. If
2663 * the list is NULL, the MMU mapping is also locked.
2666 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2667 size_t size, enum seg_rw rw)
2669 size_t rsize;
2670 caddr_t raddr;
2671 faultcode_t fault_err;
2672 struct seg *seg;
2673 int err;
2675 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2676 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2677 (size_t)raddr;
2680 * if the request crosses two segments let
2681 * as_fault handle it.
2683 AS_LOCK_ENTER(as, RW_READER);
2685 seg = as_segat(as, raddr);
2686 if (seg == NULL) {
2687 AS_LOCK_EXIT(as);
2688 return (EFAULT);
2690 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2691 if (raddr + rsize > seg->s_base + seg->s_size) {
2692 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2694 if (raddr + rsize <= raddr) {
2695 AS_LOCK_EXIT(as);
2696 return (EFAULT);
2700 * try to lock pages and pass back shadow list
2702 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2704 AS_LOCK_EXIT(as);
2706 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2707 return (err);
2711 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2712 * to no pagelock support for this segment or pages need to be cow
2713 * faulted in. If fault is needed F_SOFTLOCK will do this job for
2714 * this as_pagelock() call and in the next as_pagelock() call for the
2715 * same address range pagelock call will hopefull succeed.
2717 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2718 if (fault_err != 0) {
2719 return (fc_decode(fault_err));
2721 *ppp = NULL;
2723 return (0);
2727 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2728 * lists from the end of plist and call pageunlock interface for each segment.
2729 * Drop as lock and free plist.
2731 static void
2732 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2733 struct page **plist, enum seg_rw rw)
2735 ulong_t cnt;
2736 caddr_t eaddr = addr + size;
2737 pgcnt_t npages = btop(size);
2738 size_t ssize;
2739 page_t **pl;
2741 ASSERT(AS_LOCK_HELD(as));
2742 ASSERT(seg != NULL);
2743 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2744 ASSERT(addr + size > seg->s_base + seg->s_size);
2745 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2746 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2747 ASSERT(plist != NULL);
2749 for (cnt = 0; addr < eaddr; addr += ssize) {
2750 if (addr >= seg->s_base + seg->s_size) {
2751 seg = AS_SEGNEXT(as, seg);
2752 ASSERT(seg != NULL && addr == seg->s_base);
2753 cnt++;
2755 if (eaddr > seg->s_base + seg->s_size) {
2756 ssize = seg->s_base + seg->s_size - addr;
2757 } else {
2758 ssize = eaddr - addr;
2760 pl = &plist[npages + cnt];
2761 ASSERT(*pl != NULL);
2762 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2763 L_PAGEUNLOCK, rw);
2765 ASSERT(cnt > 0);
2766 AS_LOCK_EXIT(as);
2768 cnt++;
2769 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2773 * unlock pages in a given address range
2775 void
2776 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2777 enum seg_rw rw)
2779 struct seg *seg;
2780 size_t rsize;
2781 caddr_t raddr;
2784 * if the shadow list is NULL, as_pagelock was
2785 * falling back to as_fault
2787 if (pp == NULL) {
2788 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2789 return;
2792 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2793 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2794 (size_t)raddr;
2796 AS_LOCK_ENTER(as, RW_READER);
2797 seg = as_segat(as, raddr);
2798 ASSERT(seg != NULL);
2800 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2801 if (raddr + rsize <= seg->s_base + seg->s_size) {
2802 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2803 } else {
2804 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2805 return;
2807 AS_LOCK_EXIT(as);
2811 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2812 boolean_t wait)
2814 struct seg *seg;
2815 size_t ssize;
2816 caddr_t raddr; /* rounded down addr */
2817 size_t rsize; /* rounded up size */
2818 int error = 0;
2819 size_t pgsz = page_get_pagesize(szc);
2821 setpgsz_top:
2822 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2823 return (EINVAL);
2826 raddr = addr;
2827 rsize = size;
2829 if (raddr + rsize < raddr) /* check for wraparound */
2830 return (ENOMEM);
2832 AS_LOCK_ENTER(as, RW_WRITER);
2833 as_clearwatchprot(as, raddr, rsize);
2834 seg = as_segat(as, raddr);
2835 if (seg == NULL) {
2836 as_setwatch(as);
2837 AS_LOCK_EXIT(as);
2838 return (ENOMEM);
2841 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2842 if (raddr >= seg->s_base + seg->s_size) {
2843 seg = AS_SEGNEXT(as, seg);
2844 if (seg == NULL || raddr != seg->s_base) {
2845 error = ENOMEM;
2846 break;
2849 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2850 ssize = seg->s_base + seg->s_size - raddr;
2851 } else {
2852 ssize = rsize;
2855 retry:
2856 error = segop_setpagesize(seg, raddr, ssize, szc);
2858 if (error == IE_NOMEM) {
2859 error = EAGAIN;
2860 break;
2863 if (error == IE_RETRY) {
2864 AS_LOCK_EXIT(as);
2865 goto setpgsz_top;
2868 if (error == ENOTSUP) {
2869 error = EINVAL;
2870 break;
2873 if (wait && (error == EAGAIN)) {
2875 * Memory is currently locked. It must be unlocked
2876 * before this operation can succeed through a retry.
2877 * The possible reasons for locked memory and
2878 * corresponding strategies for unlocking are:
2879 * (1) Normal I/O
2880 * wait for a signal that the I/O operation
2881 * has completed and the memory is unlocked.
2882 * (2) Asynchronous I/O
2883 * The aio subsystem does not unlock pages when
2884 * the I/O is completed. Those pages are unlocked
2885 * when the application calls aiowait/aioerror.
2886 * So, to prevent blocking forever, cv_broadcast()
2887 * is done to wake up aio_cleanup_thread.
2888 * Subsequently, segvn_reclaim will be called, and
2889 * that will do AS_CLRUNMAPWAIT() and wake us up.
2890 * (3) Long term page locking:
2891 * This is not relevant for as_setpagesize()
2892 * because we cannot change the page size for
2893 * driver memory. The attempt to do so will
2894 * fail with a different error than EAGAIN so
2895 * there's no need to trigger as callbacks like
2896 * as_unmap, as_setprot or as_free would do.
2898 mutex_enter(&as->a_contents);
2899 if (!AS_ISNOUNMAPWAIT(as)) {
2900 if (AS_ISUNMAPWAIT(as) == 0) {
2901 cv_broadcast(&as->a_cv);
2903 AS_SETUNMAPWAIT(as);
2904 AS_LOCK_EXIT(as);
2905 while (AS_ISUNMAPWAIT(as)) {
2906 cv_wait(&as->a_cv, &as->a_contents);
2908 } else {
2910 * We may have raced with
2911 * segvn_reclaim()/segspt_reclaim(). In this
2912 * case clean nounmapwait flag and retry since
2913 * softlockcnt in this segment may be already
2914 * 0. We don't drop as writer lock so our
2915 * number of retries without sleeping should
2916 * be very small. See segvn_reclaim() for
2917 * more comments.
2919 AS_CLRNOUNMAPWAIT(as);
2920 mutex_exit(&as->a_contents);
2921 goto retry;
2923 mutex_exit(&as->a_contents);
2924 goto setpgsz_top;
2925 } else if (error != 0) {
2926 break;
2929 as_setwatch(as);
2930 AS_LOCK_EXIT(as);
2931 return (error);
2935 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
2936 * in its chunk where s_szc is less than the szc we want to set.
2938 static int
2939 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2940 int *retry)
2942 struct seg *seg;
2943 size_t ssize;
2944 int error;
2946 ASSERT(AS_WRITE_HELD(as));
2948 seg = as_segat(as, raddr);
2949 if (seg == NULL) {
2950 panic("as_iset3_default_lpsize: no seg");
2953 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2954 if (raddr >= seg->s_base + seg->s_size) {
2955 seg = AS_SEGNEXT(as, seg);
2956 if (seg == NULL || raddr != seg->s_base) {
2957 panic("as_iset3_default_lpsize: as changed");
2960 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2961 ssize = seg->s_base + seg->s_size - raddr;
2962 } else {
2963 ssize = rsize;
2966 if (szc > seg->s_szc) {
2967 error = segop_setpagesize(seg, raddr, ssize, szc);
2968 /* Only retry on EINVAL segments that have no vnode. */
2969 if (error == EINVAL) {
2970 vnode_t *vp = NULL;
2971 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
2972 (segop_getvp(seg, raddr, &vp) != 0 ||
2973 vp == NULL)) {
2974 *retry = 1;
2975 } else {
2976 *retry = 0;
2979 if (error) {
2980 return (error);
2984 return (0);
2988 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
2989 * pagesize on each segment in its range, but if any fails with EINVAL,
2990 * then it reduces the pagesizes to the next size in the bitmap and
2991 * retries as_iset3_default_lpsize(). The reason why the code retries
2992 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
2993 * match the bigger sizes, and (b) it's hard to get this offset (to begin
2994 * with) to pass to map_pgszcvec().
2996 static int
2997 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2998 uint_t szcvec)
3000 int error;
3001 int retry;
3003 ASSERT(AS_WRITE_HELD(as));
3005 for (;;) {
3006 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3007 if (error == EINVAL && retry) {
3008 szcvec &= ~(1 << szc);
3009 if (szcvec <= 1) {
3010 return (EINVAL);
3012 szc = highbit(szcvec) - 1;
3013 } else {
3014 return (error);
3020 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3021 * segments have a smaller szc than we want to set. For each such area,
3022 * it calls as_iset2_default_lpsize()
3024 static int
3025 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3026 uint_t szcvec)
3028 struct seg *seg;
3029 size_t ssize;
3030 caddr_t setaddr = raddr;
3031 size_t setsize = 0;
3032 int set;
3033 int error;
3035 ASSERT(AS_WRITE_HELD(as));
3037 seg = as_segat(as, raddr);
3038 if (seg == NULL) {
3039 panic("as_iset1_default_lpsize: no seg");
3041 if (seg->s_szc < szc) {
3042 set = 1;
3043 } else {
3044 set = 0;
3047 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3048 if (raddr >= seg->s_base + seg->s_size) {
3049 seg = AS_SEGNEXT(as, seg);
3050 if (seg == NULL || raddr != seg->s_base) {
3051 panic("as_iset1_default_lpsize: as changed");
3053 if (seg->s_szc >= szc && set) {
3054 ASSERT(setsize != 0);
3055 error = as_iset2_default_lpsize(as,
3056 setaddr, setsize, szc, szcvec);
3057 if (error) {
3058 return (error);
3060 set = 0;
3061 } else if (seg->s_szc < szc && !set) {
3062 setaddr = raddr;
3063 setsize = 0;
3064 set = 1;
3067 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3068 ssize = seg->s_base + seg->s_size - raddr;
3069 } else {
3070 ssize = rsize;
3073 error = 0;
3074 if (set) {
3075 ASSERT(setsize != 0);
3076 error = as_iset2_default_lpsize(as, setaddr, setsize,
3077 szc, szcvec);
3079 return (error);
3083 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3084 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3085 * chunk to as_iset1_default_lpsize().
3087 static int
3088 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3089 int type)
3091 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3092 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3093 flags, rtype, 1);
3094 uint_t szc;
3095 uint_t nszc;
3096 int error;
3097 caddr_t a;
3098 caddr_t eaddr;
3099 size_t segsize;
3100 size_t pgsz;
3101 uint_t save_szcvec;
3103 ASSERT(AS_WRITE_HELD(as));
3104 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3105 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3107 szcvec &= ~1;
3108 if (szcvec <= 1) { /* skip if base page size */
3109 return (0);
3112 /* Get the pagesize of the first larger page size. */
3113 szc = lowbit(szcvec) - 1;
3114 pgsz = page_get_pagesize(szc);
3115 eaddr = addr + size;
3116 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3117 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3119 save_szcvec = szcvec;
3120 szcvec >>= (szc + 1);
3121 nszc = szc;
3122 while (szcvec) {
3123 if ((szcvec & 0x1) == 0) {
3124 nszc++;
3125 szcvec >>= 1;
3126 continue;
3128 nszc++;
3129 pgsz = page_get_pagesize(nszc);
3130 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3131 if (a != addr) {
3132 ASSERT(szc > 0);
3133 ASSERT(a < eaddr);
3134 segsize = a - addr;
3135 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3136 save_szcvec);
3137 if (error) {
3138 return (error);
3140 addr = a;
3142 szc = nszc;
3143 szcvec >>= 1;
3146 ASSERT(addr < eaddr);
3147 szcvec = save_szcvec;
3148 while (szcvec) {
3149 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3150 ASSERT(a >= addr);
3151 if (a != addr) {
3152 ASSERT(szc > 0);
3153 segsize = a - addr;
3154 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3155 save_szcvec);
3156 if (error) {
3157 return (error);
3159 addr = a;
3161 szcvec &= ~(1 << szc);
3162 if (szcvec) {
3163 szc = highbit(szcvec) - 1;
3164 pgsz = page_get_pagesize(szc);
3167 ASSERT(addr == eaddr);
3169 return (0);
3173 * Set the default large page size for the range. Called via memcntl with
3174 * page size set to 0. as_set_default_lpsize breaks the range down into
3175 * chunks with the same type/flags, ignores-non segvn segments, and passes
3176 * each chunk to as_iset_default_lpsize().
3179 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3181 struct seg *seg;
3182 caddr_t raddr;
3183 size_t rsize;
3184 size_t ssize;
3185 int rtype, rflags;
3186 int stype, sflags;
3187 int error;
3188 caddr_t setaddr;
3189 size_t setsize;
3190 int segvn;
3192 if (size == 0)
3193 return (0);
3195 AS_LOCK_ENTER(as, RW_WRITER);
3196 again:
3197 error = 0;
3199 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3200 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3201 (size_t)raddr;
3203 if (raddr + rsize < raddr) { /* check for wraparound */
3204 AS_LOCK_EXIT(as);
3205 return (ENOMEM);
3207 as_clearwatchprot(as, raddr, rsize);
3208 seg = as_segat(as, raddr);
3209 if (seg == NULL) {
3210 as_setwatch(as);
3211 AS_LOCK_EXIT(as);
3212 return (ENOMEM);
3214 if (seg->s_ops == &segvn_ops) {
3215 rtype = segop_gettype(seg, addr);
3216 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3217 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3218 segvn = 1;
3219 } else {
3220 segvn = 0;
3222 setaddr = raddr;
3223 setsize = 0;
3225 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3226 if (raddr >= (seg->s_base + seg->s_size)) {
3227 seg = AS_SEGNEXT(as, seg);
3228 if (seg == NULL || raddr != seg->s_base) {
3229 error = ENOMEM;
3230 break;
3232 if (seg->s_ops == &segvn_ops) {
3233 stype = segop_gettype(seg, raddr);
3234 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3235 stype &= (MAP_SHARED | MAP_PRIVATE);
3236 if (segvn && (rflags != sflags ||
3237 rtype != stype)) {
3239 * The next segment is also segvn but
3240 * has different flags and/or type.
3242 ASSERT(setsize != 0);
3243 error = as_iset_default_lpsize(as,
3244 setaddr, setsize, rflags, rtype);
3245 if (error) {
3246 break;
3248 rflags = sflags;
3249 rtype = stype;
3250 setaddr = raddr;
3251 setsize = 0;
3252 } else if (!segvn) {
3253 rflags = sflags;
3254 rtype = stype;
3255 setaddr = raddr;
3256 setsize = 0;
3257 segvn = 1;
3259 } else if (segvn) {
3260 /* The next segment is not segvn. */
3261 ASSERT(setsize != 0);
3262 error = as_iset_default_lpsize(as,
3263 setaddr, setsize, rflags, rtype);
3264 if (error) {
3265 break;
3267 segvn = 0;
3270 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3271 ssize = seg->s_base + seg->s_size - raddr;
3272 } else {
3273 ssize = rsize;
3276 if (error == 0 && segvn) {
3277 /* The last chunk when rsize == 0. */
3278 ASSERT(setsize != 0);
3279 error = as_iset_default_lpsize(as, setaddr, setsize,
3280 rflags, rtype);
3283 if (error == IE_RETRY) {
3284 goto again;
3285 } else if (error == IE_NOMEM) {
3286 error = EAGAIN;
3287 } else if (error == ENOTSUP) {
3288 error = EINVAL;
3289 } else if (error == EAGAIN) {
3290 mutex_enter(&as->a_contents);
3291 if (!AS_ISNOUNMAPWAIT(as)) {
3292 if (AS_ISUNMAPWAIT(as) == 0) {
3293 cv_broadcast(&as->a_cv);
3295 AS_SETUNMAPWAIT(as);
3296 AS_LOCK_EXIT(as);
3297 while (AS_ISUNMAPWAIT(as)) {
3298 cv_wait(&as->a_cv, &as->a_contents);
3300 mutex_exit(&as->a_contents);
3301 AS_LOCK_ENTER(as, RW_WRITER);
3302 } else {
3304 * We may have raced with
3305 * segvn_reclaim()/segspt_reclaim(). In this case
3306 * clean nounmapwait flag and retry since softlockcnt
3307 * in this segment may be already 0. We don't drop as
3308 * writer lock so our number of retries without
3309 * sleeping should be very small. See segvn_reclaim()
3310 * for more comments.
3312 AS_CLRNOUNMAPWAIT(as);
3313 mutex_exit(&as->a_contents);
3315 goto again;
3318 as_setwatch(as);
3319 AS_LOCK_EXIT(as);
3320 return (error);
3324 * Setup all of the uninitialized watched pages that we can.
3326 void
3327 as_setwatch(struct as *as)
3329 struct watched_page *pwp;
3330 struct seg *seg;
3331 caddr_t vaddr;
3332 uint_t prot;
3333 int err, retrycnt;
3335 if (avl_numnodes(&as->a_wpage) == 0)
3336 return;
3338 ASSERT(AS_WRITE_HELD(as));
3340 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3341 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3342 retrycnt = 0;
3343 retry:
3344 vaddr = pwp->wp_vaddr;
3345 if (pwp->wp_oprot != 0 || /* already set up */
3346 (seg = as_segat(as, vaddr)) == NULL ||
3347 segop_getprot(seg, vaddr, 0, &prot) != 0)
3348 continue;
3350 pwp->wp_oprot = prot;
3351 if (pwp->wp_read)
3352 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3353 if (pwp->wp_write)
3354 prot &= ~PROT_WRITE;
3355 if (pwp->wp_exec)
3356 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3357 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3358 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3359 if (err == IE_RETRY) {
3360 pwp->wp_oprot = 0;
3361 ASSERT(retrycnt == 0);
3362 retrycnt++;
3363 goto retry;
3366 pwp->wp_prot = prot;
3371 * Clear all of the watched pages in the address space.
3373 void
3374 as_clearwatch(struct as *as)
3376 struct watched_page *pwp;
3377 struct seg *seg;
3378 caddr_t vaddr;
3379 uint_t prot;
3380 int err, retrycnt;
3382 if (avl_numnodes(&as->a_wpage) == 0)
3383 return;
3385 ASSERT(AS_WRITE_HELD(as));
3387 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3388 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3389 retrycnt = 0;
3390 retry:
3391 vaddr = pwp->wp_vaddr;
3392 if (pwp->wp_oprot == 0 || /* not set up */
3393 (seg = as_segat(as, vaddr)) == NULL)
3394 continue;
3396 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3397 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3398 if (err == IE_RETRY) {
3399 ASSERT(retrycnt == 0);
3400 retrycnt++;
3401 goto retry;
3404 pwp->wp_oprot = 0;
3405 pwp->wp_prot = 0;
3410 * Force a new setup for all the watched pages in the range.
3412 static void
3413 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3415 struct watched_page *pwp;
3416 struct watched_page tpw;
3417 caddr_t eaddr = addr + size;
3418 caddr_t vaddr;
3419 struct seg *seg;
3420 int err, retrycnt;
3421 uint_t wprot;
3422 avl_index_t where;
3424 if (avl_numnodes(&as->a_wpage) == 0)
3425 return;
3427 ASSERT(AS_WRITE_HELD(as));
3429 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3430 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3431 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3433 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3434 retrycnt = 0;
3435 vaddr = pwp->wp_vaddr;
3437 wprot = prot;
3438 if (pwp->wp_read)
3439 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3440 if (pwp->wp_write)
3441 wprot &= ~PROT_WRITE;
3442 if (pwp->wp_exec)
3443 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3444 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3445 retry:
3446 seg = as_segat(as, vaddr);
3447 if (seg == NULL) {
3448 panic("as_setwatchprot: no seg");
3449 /*NOTREACHED*/
3451 err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3452 if (err == IE_RETRY) {
3453 ASSERT(retrycnt == 0);
3454 retrycnt++;
3455 goto retry;
3458 pwp->wp_oprot = prot;
3459 pwp->wp_prot = wprot;
3461 pwp = AVL_NEXT(&as->a_wpage, pwp);
3466 * Clear all of the watched pages in the range.
3468 static void
3469 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3471 caddr_t eaddr = addr + size;
3472 struct watched_page *pwp;
3473 struct watched_page tpw;
3474 uint_t prot;
3475 struct seg *seg;
3476 int err, retrycnt;
3477 avl_index_t where;
3479 if (avl_numnodes(&as->a_wpage) == 0)
3480 return;
3482 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3483 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3484 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3486 ASSERT(AS_WRITE_HELD(as));
3488 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3490 if ((prot = pwp->wp_oprot) != 0) {
3491 retrycnt = 0;
3493 if (prot != pwp->wp_prot) {
3494 retry:
3495 seg = as_segat(as, pwp->wp_vaddr);
3496 if (seg == NULL)
3497 continue;
3498 err = segop_setprot(seg, pwp->wp_vaddr,
3499 PAGESIZE, prot);
3500 if (err == IE_RETRY) {
3501 ASSERT(retrycnt == 0);
3502 retrycnt++;
3503 goto retry;
3507 pwp->wp_oprot = 0;
3508 pwp->wp_prot = 0;
3511 pwp = AVL_NEXT(&as->a_wpage, pwp);
3515 void
3516 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3518 struct proc *p;
3520 mutex_enter(&pidlock);
3521 for (p = practive; p; p = p->p_next) {
3522 if (p->p_as == as) {
3523 mutex_enter(&p->p_lock);
3524 if (p->p_as == as)
3525 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3526 mutex_exit(&p->p_lock);
3529 mutex_exit(&pidlock);
3533 * return memory object ID
3536 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3538 struct seg *seg;
3539 int sts;
3541 AS_LOCK_ENTER(as, RW_READER);
3542 seg = as_segat(as, addr);
3543 if (seg == NULL) {
3544 AS_LOCK_EXIT(as);
3545 return (EFAULT);
3548 sts = segop_getmemid(seg, addr, memidp);
3550 AS_LOCK_EXIT(as);
3551 return (sts);